In [12]:
import re
import string
import pandas as pd
import numpy as np
from collections import Counter
import csv

In [7]:
# Punctuation-to-word lookup: every mark of interest is paired with the short
# word token that stands in for it. '~' is the placeholder that the cleaning
# step substitutes for '...'.
marks_to_wrds = {
    '.': 'prd',
    ',': 'cmma',
    ':': 'coln',
    ';': 'smicln',
    '?': 'qstn',
    '!': 'xclm',
    '-': 'dsh',
    '(': 'lftparen',
    ')': 'rghtparen',
    '"': 'dblqt',
    "'": 'snglqt',
    '~': 'elpss',
}

# The bare marks, in the same order as the table above (dicts keep insertion order).
marks = list(marks_to_wrds)

In [2]:
# Load the corpus. `on_bad_lines='warn'` replaces `error_bad_lines=False`, which
# was deprecated in pandas 1.3 and removed in pandas 2.0. Like the old flag (with
# its default warn_bad_lines=True) it skips malformed rows and emits a warning
# for each one instead of aborting the load.
data = pd.read_csv('stats_clean.csv', on_bad_lines='warn', encoding='latin1')

# Discard every row that has a missing value in any column.
data = data.dropna()

In [3]:
## Data cleaning
# Every str.replace call states `regex=` explicitly: pandas 2.0 changed the
# default to literal matching, under which the regex patterns below would
# silently match nothing.

# Replace '...' with '~' so an ellipsis is a single character downstream
data['text'] = data['text'].str.replace(r"\.\.\.", "~", regex=True)
print("ellipses are replaced")

# Remove URLs written as markdown links, i.e. the whole "[label](http...)" span
data['text'] = data['text'].str.replace(r"\[.*?\]\(https?:\/\/.*?\)", "", regex=True)
print("urls removed")

# Remove blank characters (the literal "&#x200B;" entity; no regex needed)
data['text'] = data['text'].str.replace("&#x200B;", "", regex=False)
print("blank chars removed")

# to lowercase
data['text'] = data['text'].str.lower()
print("changed to lowercase")

ellipses are replaced
urls removed
blank chars removed
changed to lowercase


In [4]:
# Generation labels paired with the upper age edge of their bin; the lowest
# edge is 0. The numeric 'age' column is overwritten with the label of the bin
# each value falls into.
generations = [
    ("GenZ", 24.5),
    ("GenY", 43.5),
    ("GenX", 55.5),
    ("BabyBoomers", 74.5),
    ("Traditionalists", 100),
]
gen_labels = [label for label, _ in generations]
bins = [0] + [upper_edge for _, upper_edge in generations]

data['age'] = pd.cut(x=data['age'], bins=bins, labels=gen_labels)

In [5]:
# Create even sample across gender: within each remaining age group, keep the
# same number of 'Male' and 'Female' rows.

#BBs and trads too small a set
data = data[(data['age']!='BabyBoomers') & (data['age']!='Traditionalists')]

samps = []
for a in data.age.unique():
    m = data[(data['age']==a) & (data['gender']=='Male')]
    f = data[(data['age']==a) & (data['gender']=='Female')]

    # Balance to the smaller of the two groups. Sampling females down to the
    # male count unconditionally raised ValueError whenever a group had fewer
    # females than males.
    n_each = min(len(m), len(f))
    if len(m) > n_each:
        # Only reached when males outnumber females; otherwise all males are
        # kept untouched and in their existing order.
        m = m.sample(n=n_each, random_state=1)
    # Females are always drawn through .sample with a fixed seed so the draw is
    # repeatable.
    f = f.sample(n=n_each, random_state=1)

    samps.append(m)
    samps.append(f)

# pd.concat raises on an empty list, so fall back to an empty frame that keeps
# the columns when no age group is left.
even_data = pd.concat(samps) if samps else data.iloc[0:0]
data = even_data

In [34]:
## Convert text to itemset data
# Scan every comment for runs of four word tokens followed by one punctuation
# mark (at most one whitespace character may sit before the mark). Each match
# becomes one row with columns first/second/third/fourth/punc.

# data to use
df = data

four_words_then_punc = r"(?P<first>[\w']+)\s+(?P<second>[\w']+)\s+(?P<third>[\w']+)\s+(?P<fourth>[\w']+)\b\s?(?P<punc>[.,:;?!\-()\"~])"
punc_pat = df['text'].str.extractall(four_words_then_punc)

# extractall indexes its result by (source row, match number); aligning the
# demographic columns on level 0 repeats each comment's gender/age/author for
# every match found in that comment.
demographics = data[['gender','age','author']]
itemset = punc_pat.join(demographics.reindex(punc_pat.index, level=0))

# Swap the raw punctuation character for its word token.
itemset['punc'] = itemset['punc'].map(marks_to_wrds)

In [30]:
## Build VAD dataset
# Reads a tab-separated lexicon with 'Word', 'Valence', 'Arousal' and
# 'Dominance' columns into a dict mapping each word to a float array ordered
# [valence, arousal, dominance]. A word listed more than once keeps its last entry.

vad_file = "vad.txt"

vad = {}

with open(vad_file, "r") as in_file:
    for entry in csv.DictReader(in_file, delimiter="\t"):
        scores = [float(entry[dim]) for dim in ('Valence', 'Arousal', 'Dominance')]
        vad[entry['Word']] = np.array(scores)

In [96]:
## Main method
# For every (author, punctuation token) pair, average the VAD vectors of the
# lexicon words found in the 'first'/'second'/'third' columns of that pair's rows.
# Each row appended to `out` is [age, gender, punc, valence, arousal, dominance];
# a pair with no lexicon words keeps the all-zero vector.
# NOTE(review): the 'fourth' captured word (the one next to the punctuation
# mark) is not scored here - confirm whether that is intentional.

out = []

# groupby(sort=False) yields authors in order of first appearance (the same
# order .unique() gives) in a single pass, instead of re-filtering the whole
# frame with a boolean mask once per author.
by_author = itemset.groupby('author', sort=False)
size = by_author.ngroups

# Report progress about once per percent. max(1, ...) keeps the modulus
# non-zero when there are fewer than 100 authors (int(size/100) would be 0).
step = max(1, size // 100)

tot_count = 0
for tot_count, (auth, subset) in enumerate(by_author, start=1):
    # Demographics are taken from the author's first row.
    demo_row = [subset.iloc[0]['age'], subset.iloc[0]['gender']]
    if tot_count % step == 0:
        print(f"{int(tot_count/size*100)}% Completed")

    for punc, p_rows in subset.groupby('punc', sort=False):
        val = np.array([float(0), float(0), float(0)])
        count = 0

        p_subset = p_rows[['first','second','third']].values.tolist()

        for row in p_subset:
            for wrd in row:
                if wrd in vad:
                    count += 1
                    # Incremental mean: shift the running average toward the
                    # new vector by 1/count of the gap.
                    val = val + (vad[wrd]-val)/count
        out_row = demo_row + [punc] + val.tolist()
        out.append(out_row)

0% Completed
1% Completed
2% Completed
3% Completed
4% Completed
5% Completed
6% Completed
7% Completed
8% Completed
9% Completed
10% Completed
11% Completed
12% Completed
13% Completed
14% Completed
15% Completed
16% Completed
17% Completed
18% Completed
19% Completed
20% Completed
21% Completed
22% Completed
23% Completed
24% Completed
25% Completed
26% Completed
27% Completed
28% Completed
29% Completed
30% Completed
31% Completed
32% Completed
33% Completed
34% Completed
35% Completed
36% Completed
37% Completed
38% Completed
39% Completed
40% Completed
41% Completed
42% Completed
43% Completed
44% Completed
45% Completed
46% Completed
47% Completed
48% Completed
49% Completed
50% Completed
51% Completed
52% Completed
53% Completed
54% Completed
55% Completed
56% Completed
57% Completed
58% Completed
59% Completed
60% Completed
61% Completed
62% Completed
63% Completed
64% Completed
65% Completed
66% Completed
67% Completed
68% Completed
69% Completed
70% Completed
71% Completed
72

In [79]:
## Main method, any punc
# One row per author: the average VAD vector of the lexicon words found in the
# 'first'/'second'/'third' columns of all of that author's rows, regardless of
# punctuation. Each row appended to `out1` is
# [age, gender, "all", valence, arousal, dominance].
# NOTE(review): the 'fourth' captured word is not scored here - confirm
# whether that is intentional.

out1 = []

# Iterate the authors that actually have rows in `itemset`, so the loop and the
# `size` used for progress refer to the same population. Authors with no
# extracted rows (who could only ever score all zeros) are not emitted.
# groupby(sort=False) keeps first-appearance order and avoids re-filtering the
# whole frame once per author.
by_author = itemset.groupby('author', sort=False)
size = by_author.ngroups

# max(1, ...) keeps the modulus non-zero when there are fewer than 100 authors.
step = max(1, size // 100)

tot_count = 0
for tot_count, (auth, subset) in enumerate(by_author, start=1):
    # Take age/gender from this author's own first row. Reading data.iloc[0]
    # stamped every author with the demographics of the first row of the corpus.
    demo_row = [subset.iloc[0]['age'], subset.iloc[0]['gender']]
    if tot_count % step == 0:
        print(f"{int(tot_count/size*100)}% Completed")

    val = np.array([float(0), float(0), float(0)])
    count = 0

    p_subset = subset[['first','second','third']].values.tolist()

    for row in p_subset:
        for wrd in row:
            if wrd in vad:
                count += 1
                # Incremental mean over every matched word.
                val = val + (vad[wrd]-val)/count
    out_row = demo_row + ["all"] + val.tolist()
    out1.append(out_row)

0% Completed
1% Completed
2% Completed
3% Completed
4% Completed
5% Completed
6% Completed
7% Completed
8% Completed
9% Completed
10% Completed
11% Completed
12% Completed
13% Completed
14% Completed
15% Completed
16% Completed
17% Completed
18% Completed
19% Completed
20% Completed
21% Completed
22% Completed
23% Completed
24% Completed
25% Completed
26% Completed
27% Completed
28% Completed
29% Completed
30% Completed
31% Completed
32% Completed
33% Completed
34% Completed
35% Completed
36% Completed
37% Completed
38% Completed
39% Completed
40% Completed
41% Completed
42% Completed
43% Completed


KeyboardInterrupt: 

In [None]:
## Main method, all authors
# Pools every author together: one row per punctuation token, holding the
# average VAD vector of the lexicon words found in the 'first'/'second'/'third'
# columns of that token's rows. Each row appended to `out2` is
# ["all", "all", punc, valence, arousal, dominance].

out2 = []
size = len(itemset.author.unique())

subset = itemset
demo_row = ["all", "all"]
for punc in subset['punc'].unique():
    word_rows = subset.loc[subset['punc'] == punc, ['first','second','third']]

    val = np.array([0.0, 0.0, 0.0])
    count = 0

    # Walk the words row by row, left to right, skipping anything the lexicon
    # does not know, and fold each hit into the running mean.
    for wrd in (w for row in word_rows.values.tolist() for w in row):
        if wrd not in vad:
            continue
        count += 1
        val = val + (vad[wrd]-val)/count

    out2.append(demo_row + [punc] + val.tolist())

In [98]:
# Work on a shallow copy so that filtering `working` leaves `out` itself untouched.
working = list(out)

In [99]:
# Drop "empty" rows, i.e. those whose valence/arousal/dominance (positions 3-5)
# are all zero because no lexicon word was matched. A row is kept as soon as
# any one score is non-zero. Requiring all three to be non-zero also discarded
# genuine rows whose average happened to be exactly 0 on a single dimension.
working = [row for row in working if any(score != 0 for score in row[3:6])]

In [100]:
# Write the filtered rows to CSV, header line first.
out_filename = "vad_scores_noempty.csv"
head = ["age","gender","punc","valence","arousal","dominance"]

# newline='' hands line-ending control to the csv module.
with open(out_filename, 'w', newline='') as out_file:
    csv.writer(out_file).writerows([head] + working)