In [1]:
"""
This script was adapted from:
    https://github.com/bmurauer/authbench/blob/main/scripts/unify_cmcc.py

Dataset Access
    Email authors of: Creating and Using a Correlated Corpora to Glean Communicative Commonalities
    http://www.lrec-conf.org/proceedings/lrec2008/pdf/771_paper.pdf
"""
import os
import re
from glob import glob
from typing import Dict, Any
# from valla.utils.dataset_utils import finalize_cross_dataset, list_dset_to_dict, auth_text_make_unique
from sklearn.model_selection import train_test_split

import pandas as pd
import argparse
import random
import numpy as np
from typing import List, Dict, Union

# Question text for each discussion topic, keyed by a single-letter code.
# process_cmcc() picks the entry using a letter taken from each file name and
# appends it to the genre instruction from `topic` to build the final prompt.
prompt = {
    "C": "Do you feel the Catholic Church needs to change its ways to adapt to life in the 21st Century?",
    "G": "While some states have legalized gay marriage, others are still opposed to it. Do you think either side is right or wrong?",
    "P": "Recently, school officials prevented a school shooting because one of the shooters posted a myspace bulletin. Do you think this was an invasion of privacy?",
    "M": "The city of Denver has decided to legalize small amounts of marijuana for persons over 21. How do you feel about this?",
    "I": "The controversial war in Iraq has made news headlines almost every day since it began. How do you feel about the war?",
    "S": "Do you feel that gender discrimination is still an issue in the present-day United States?"
}


# Instruction prefix per corpus category. Despite the name, the keys are the
# category folder names ("Emails", "Essays"), not discussion topics; the
# question from `prompt` is appended directly after the trailing ": ".
# NOTE(review): "a someone" looks like a typo, but the string is part of the
# generated prompt text, so it is left exactly as is.
topic = {
    "Emails": "Write an email response to a someone who asked you the following: ",
    "Essays": "Write an approximately 500 word essay to the following prompt: "
}


def list_dset_to_dict(data: List[List[Union[int, str]]]) -> Dict:
    """Group a flat list of ``[author, text]`` pairs by author.

    Returns a dict that maps every author to the list of its texts. Authors
    and texts keep the order in which they first appear in ``data``.
    """
    grouped = {}
    for pair in data:
        author, sample = pair
        if author not in grouped:
            grouped[author] = []
        grouped[author].append(sample)
    return grouped


def dict_dset_to_list(data: Dict) -> List[List[Union[int, str]]]:
    """Flatten an ``{author: [text, ...]}`` mapping into ``[author, text]`` pairs.

    Pairs are emitted author by author, in the mapping's iteration order, and
    within an author in the order of that author's list.
    """
    return [
        [author, sample]
        for author, samples in data.items()
        for sample in samples
    ]


def auth_text_make_unique(data: Dict):
    """Remove duplicate texts per author while keeping first-seen order.

    Args:
        data: mapping ``author -> list of texts``. Texts must be hashable.

    Returns:
        A new mapping with the same authors, where each list contains every
        distinct text once, in the order of its first occurrence. The input
        lists are not modified.
    """
    # dict.fromkeys de-duplicates like set() but keeps insertion order, so the
    # result is the same on every run (set order of strings varies per process).
    return {author: list(dict.fromkeys(texts)) for author, texts in data.items()}
        
    
def check_or_fix_dataset_typo(directory: str) -> None:
    """
    Rename the one mis-named corpus file, if it is still present.

    Looks for '<directory>/Discussion/Correlated/S1D113.txt' and, when that
    file exists, renames it to 'S1D1I3.txt' (upper case i instead of the
    digit one) in the same folder. When the file is absent nothing happens.

    Per the original author's note, S1D113.txt is the only file in the corpus
    that does not comply with the naming convention explained in
    FileCodingSchemes3.doc, and the code was tested on CMCCData.zip with a
    md5 checksum of:
        157586057cf4ad3dc1876890e94373a5
    """
    correlated_dir = os.path.join(directory, "Discussion", "Correlated")
    wrong = os.path.join(correlated_dir, "S1D113.txt")
    right = os.path.join(correlated_dir, "S1D1I3.txt")

    if not os.path.isfile(wrong):
        return

    print(f"renaming {wrong} to {right}")
    os.rename(wrong, right)


def process_cmcc(pth: str) -> Dict[str, Dict[Any, Any]]:
    """Read the correlated CMCC e-mails and essays and group them by author.

    Side effects: creates ``<pth>/processed`` if missing, applies
    ``check_or_fix_dataset_typo(pth)``, and prints the number of texts read.

    Every ``*.txt`` file in ``<pth>/Emails/Correlated`` and
    ``<pth>/Essays/Correlated`` is decoded as cp1252 and turned into one
    sample ``{"prompt": ..., "output": ...}``. The prompt is the category
    instruction from ``topic`` followed by the question from ``prompt`` that
    belongs to the topic letter in the file name; the output is the stripped
    file content.

    Args:
        pth: root directory of the CMCC corpus.

    Returns:
        ``{'train': {author_id: [sample, ...]}}`` where ``author_id`` is a
        dense integer assigned in order of first appearance.

    Raises:
        ValueError: a file name does not follow the expected pattern.
        KeyError: the topic letter of a file has no entry in ``prompt``.
        UnicodeDecodeError: a file is not valid cp1252 (its path is printed first).
    """
    processed_dir = os.path.join(pth, "processed")
    os.makedirs(processed_dir, exist_ok=True)

    check_or_fix_dataset_typo(pth)

    # File names: <author letter+digits><genre letter><digits><topic letter><digits>.txt
    # Compiled once; the dot is escaped so it only matches a literal ".".
    pattern = re.compile(
        r"(?P<author>[A-Z]\d+)(?P<genre>[A-Z])\d+(?P<topic>[A-Z])\d+\.txt"
    )
    categories = ["Emails", "Essays"]

    train_posts = {}
    auth_to_id = {}
    train_texts = 0

    for category in categories:
        correlated_dir = os.path.join(pth, category, "Correlated")

        # glob() returns paths in arbitrary, filesystem-dependent order. Sort
        # them so that author ids and per-author sample order are identical on
        # every machine and every run.
        for f in sorted(glob(os.path.join(correlated_dir, "*.txt"))):
            # the files are windows-1252-encoded; read bytes so line endings
            # reach the caller untouched.
            with open(f, "rb") as i_f:
                try:
                    text_raw = i_f.read().decode("cp1252")
                except Exception:
                    print(f)
                    raise

            name = os.path.basename(f)
            match = pattern.match(name)
            if not match:
                raise ValueError("no match found for file: " + f)

            # len(auth_to_id) is evaluated before the insert, so new authors
            # get the next free integer id.
            a = auth_to_id.setdefault(match.group("author"), len(auth_to_id))

            # Use the parsed topic letter rather than a fixed character offset,
            # which breaks as soon as the trailing number has two digits.
            train_posts.setdefault(a, []).append(
                {
                    "prompt": f"{topic[category]}{prompt[match.group('topic')]}",
                    "output": text_raw.strip()
                }
            )
            train_texts += 1

    print(f'there are {train_texts} iid texts')

    return {
        'train': train_posts
    }


# Root folder of the CMCC corpus; process_cmcc() reads
# "<dataset_path>/<category>/Correlated/*.txt" below it.
dataset_path = "./cmcc/"
# Seed used for `random`, numpy and the train_test_split() calls further down.
seed = 0
# Folder the pickled splits are written to. process_cmcc() creates
# "<dataset_path>/processed", which is this same folder.
output_path = "./cmcc/processed"

class Namespace:
    """Bare attribute container: each keyword argument becomes an attribute."""

    def __init__(self, **kwargs):
        for attr_name, attr_value in kwargs.items():
            setattr(self, attr_name, attr_value)

# Bundle the settings in one object; later cells read args.dataset_path and args.seed.
args = Namespace(dataset_path=dataset_path, seed=seed)

# Seed both global RNGs before any data is loaded.
random.seed(args.seed)
np.random.seed(args.seed)

# Result shape: {'train': {author_id: [{"prompt": ..., "output": ...}, ...]}}
all_data = process_cmcc(args.dataset_path)

there are 252 iid texts


In [40]:
# Audit only (nothing is modified): report samples whose output still contains
# the task prompt or participant-header boilerplate, and count them.
# Each sample is reported at most once; the checks run in the order below.

prompts = list(prompt.values())
print(prompts)

count = 0
for author_key, author_data in all_data['train'].items():
    for i, sample in enumerate(author_data):
        output = sample['output']
        lowered = output.lower()

        if any(p in output for p in prompts):
            count += 1
            print(f"prompt found in output, author_id: {author_key}, index: {i}")
        elif "subject number" in lowered:
            count += 1
            print(f"subject number found in output, author_id {author_key}, index: {i}")
        elif "cmc participant" in lowered:
            count += 1
            print(f"cmc participant found in output, author_id {author_key}, index: {i}")
            # TODO: remove line with >

count


['Do you feel the Catholic Church needs to change its ways to adapt to life in the 21st Century?', 'While some states have legalized gay marriage, others are still opposed to it. Do you think either side is right or wrong?', 'Recently, school officials prevented a school shooting because one of the shooters posted a myspace bulletin. Do you think this was an invasion of privacy?', 'The city of Denver has decided to legalize small amounts of marijuana for persons over 21. How do you feel about this?', 'The controversial war in Iraq has made news headlines almost every day since it began. How do you feel about the war?', 'Do you feel that gender discrimination is still an issue in the present-day United States?']
prompt found in output, author_id: 11, index: 6
prompt found in output, author_id: 11, index: 7
prompt found in output, author_id: 11, index: 8
prompt found in output, author_id: 11, index: 9
prompt found in output, author_id: 11, index: 10
prompt found in output, author_id: 11,

8

In [38]:
# Scratch check: str.split() without arguments splits on every run of
# whitespace, newlines included, and yields no empty strings.
print("hi\nhi hi\n".split())

['hi', 'hi', 'hi']


In [32]:
# Build the iid splits: 60% train, 20% validation, 20% test, stratified by
# author. process_cmcc() only returns a 'train' entry, so no cross-topic or
# cross-genre sets are produced here.
iid_data = dict_dset_to_list(all_data['train'])
train_set, eval_and_test_set = train_test_split(iid_data, test_size=0.4, shuffle=True, random_state=args.seed,
                                                stratify=[lbl for lbl, _ in iid_data])
eval_set, test_set = train_test_split(eval_and_test_set, test_size=0.5, shuffle=True, random_state=args.seed,
                                                stratify=[lbl for lbl, _ in eval_and_test_set])

# now finalize the dataset.
# The sample dicts are shared with train_set / eval_set / test_set, so the
# in-place normalisation below is visible in all three splits as well.
original_data = []
for dset_name, dset in all_data.items():
    for auth, texts in dset.items():
        for text in texts:
            # str.split() with no argument splits on any whitespace run
            # (spaces, tabs, CR/LF, non-breaking space), so re-joining with a
            # single space collapses all of them. Tabs, CRs and NBSPs are
            # replaced by a space instead of being deleted, which used to glue
            # together words separated only by one of those characters.
            text["output"] = ' '.join(text["output"].split())
            original_data.append([auth, text])

save_path = os.path.join(args.dataset_path, 'processed')

In [33]:
# Regroup the flat [author, sample] lists into {author: [sample, ...]} dicts.
# `original_data` is rebound from a list to a dict here, so only convert it
# while it is still a list; this keeps the cell safe to run more than once.
if isinstance(original_data, list):
    original_data = list_dset_to_dict(original_data)
train_data = list_dset_to_dict(train_set)
val_data = list_dset_to_dict(eval_set)
test_data = list_dset_to_dict(test_set)

In [34]:
# Show the train split: {author_id: [{"prompt": ..., "output": ...}, ...]}.
train_data

{20: [{'prompt': 'Write an email response to a someone who asked you the following: Do you feel that gender discrimination is still an issue in the present-day United States?',
   'output': "Although it has definitly improved, gender discrimination still exists today. In the workplace, I feel that men harass women and do not even know it. I learned in my social psychology class that even if an inapropriate joke about women is made among male co-workers and a woman over hears and is offended, that is sexual harassment. They can be brought up on charges and potentially loose their jobs. I think that female discrimination is the one that we see in the public eye the most but that does not mean that men are never discriminated against. Little children are taught with falsh cards. They see the male doctor and the female nurse. What is a man did not want to be a full time doctor? What if he wanted to a nurse? We have been conditioned since an early age that this would be breaking some kind o

In [41]:
# Print each author id of the train split and, beneath it, every output of
# that author that still contains one of the prompt texts verbatim.
for author_id, author_data in train_data.items():
    print(author_id)
    for sample in author_data:
        output = sample['output']
        if not any(p in output for p in prompts):
            continue
        print(output)

20
16
17
7
15
10
9
14
6
8
13
19
18
4
1
5
0
11
1. LEGALIZATION OF MARIJUANA: The city of Denver has decided to legalize small amounts of marijuana for persons over 21. How do you feel about this? Do you feel that marijuana and its effects are similar enough to alcohol that it should be legal? I do not really know how I feel about the legalization of marijuana. On the one hand I have heard that studies have come out proving that alcohol is actually more harmful the pot. Under alcohol, one loosed much their inhibition and many a persons have made extremely bad judgment calls because of this. I know personally of friends who have blacked out, gotten sick, and have even had sex when they didn't mean to, all because of alcohol. I am not saying that this can not happen and does not happen with the use of weed, but I have not heard it to this extent. I also know that people can get addicted to alcohol if they are not careful, and can eventually get cirrhosis of the liver. This is a serious pro

In [11]:
import pickle
def write_aa_dataset(data: Dict, file_path: str) -> None:
    """Pickle ``data`` to ``file_path``, creating the parent folder if needed.

    Args:
        data: the object to store (here: an ``{author: [sample, ...]}`` dict).
        file_path: destination file; an existing file is overwritten.

    Raises:
        Whatever ``pickle.dumps`` raises for unpicklable content. In that case
        the destination file is left untouched.
    """
    # Serialise before opening the file: opening with 'wb' truncates it, so a
    # failed dump would otherwise leave an empty or partial file behind.
    payload = pickle.dumps(data)

    parent = os.path.dirname(file_path)
    if parent:  # dirname is '' for a bare file name; makedirs('') would fail
        os.makedirs(parent, exist_ok=True)

    with open(file_path, 'wb') as f:
        f.write(payload)
        
# Persist every split as "<output_path>/cmcc_<split name>.pkl".
dataset_procs = list(zip(
    (train_data, val_data, test_data),
    ("train", "val", "test"),
))

for dataset, name in dataset_procs:
    target_file = f"{output_path}/cmcc_{name}.pkl"
    write_aa_dataset(dataset, target_file)

In [12]:
# Author ids present in the train split.
train_data.keys()

dict_keys([20, 16, 17, 7, 15, 10, 9, 14, 6, 8, 13, 19, 18, 4, 1, 5, 0, 11, 3, 2, 12])

In [13]:
# One line per author of the train split: the number of samples for that author.
for m, samples in train_data.items():
    print(f"XX: {len(samples)}")

XX: 8
XX: 7
XX: 7
XX: 7
XX: 7
XX: 7
XX: 7
XX: 7
XX: 7
XX: 8
XX: 8
XX: 7
XX: 7
XX: 7
XX: 8
XX: 7
XX: 7
XX: 7
XX: 7
XX: 7
XX: 7


In [21]:
import pickle

# Sanity check of the written pickles: for each split and each dataset, print
# the number of authors and then the number of samples per author.
for split in ("train", "val", "test"):
    for n in ("cmcc", "ccat50"):
        print(n)
        pkl_path = f"./{n}/processed/{n}_{split}.pkl"
        # pickle.load can execute arbitrary code; only open files that were
        # produced by a trusted run of this pipeline.
        with open(pkl_path, 'rb') as pickle_file:
            data = pickle.load(pickle_file)

        n_authors = len(data)
        print("# authors: ", n_authors)
        for idx, k in enumerate(data):
            n_samples = len(data[k])
            print(f"{idx}: {n_samples}")

cmcc
# authors:  21
0: 8
1: 7
2: 7
3: 7
4: 7
5: 7
6: 7
7: 7
8: 7
9: 8
10: 8
11: 7
12: 7
13: 7
14: 8
15: 7
16: 7
17: 7
18: 7
19: 7
20: 7
ccat50
# authors:  50
0: 40
1: 40
2: 40
3: 40
4: 40
5: 40
6: 40
7: 40
8: 40
9: 40
10: 40
11: 40
12: 40
13: 40
14: 40
15: 40
16: 40
17: 40
18: 40
19: 40
20: 40
21: 40
22: 40
23: 40
24: 40
25: 40
26: 40
27: 40
28: 40
29: 40
30: 40
31: 40
32: 40
33: 40
34: 40
35: 40
36: 40
37: 40
38: 40
39: 40
40: 40
41: 40
42: 40
43: 40
44: 40
45: 40
46: 40
47: 40
48: 40
49: 40
cmcc
# authors:  21
0: 2
1: 2
2: 3
3: 2
4: 3
5: 2
6: 3
7: 2
8: 3
9: 2
10: 3
11: 2
12: 3
13: 2
14: 2
15: 2
16: 3
17: 2
18: 2
19: 2
20: 3
ccat50
# authors:  50
0: 3
1: 3
2: 3
3: 3
4: 3
5: 3
6: 3
7: 3
8: 3
9: 3
10: 3
11: 3
12: 3
13: 3
14: 3
15: 3
16: 3
17: 3
18: 3
19: 3
20: 3
21: 3
22: 3
23: 3
24: 3
25: 3
26: 3
27: 3
28: 3
29: 3
30: 3
31: 3
32: 3
33: 3
34: 3
35: 3
36: 3
37: 3
38: 3
39: 3
40: 3
41: 3
42: 3
43: 3
44: 3
45: 3
46: 3
47: 3
48: 3
49: 3
cmcc
# authors:  21
0: 3
1: 3
2: 2
3: 3
4: 3
5: 3
6: 2