In [1]:
import os
import re
import string
from collections import Counter

import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.preprocessing import TransactionEncoder

In [2]:
# Punctuation marks of interest, each mapped to a short alphabetic token.
# The tokens are what later ends up in the 'punc' column and in the output
# file names. '~' is the stand-in character that data cleaning substitutes
# for '...'.
marks_to_wrds = {
    '.': 'prd',
    ',': 'cmma',
    ':': 'coln',
    ';': 'smicln',
    '?': 'qstn',
    '!': 'xclm',
    '-': 'dsh',
    '(': 'lftparen',
    ')': 'rghtparen',
    '"': 'dblqt',
    "'": 'snglqt',
    '~': 'elpss',
}

# The raw marks, in the same order as the keys of the mapping above.
marks = list(marks_to_wrds)

In [3]:
# Function words, grouped by part of speech. Their concatenation
# (filler_words) is what gets stripped from every transaction before mining.
articles = "the a an".split()

pronouns = (
    "that this those these i we me my us ours you yours "
    "he she it him her his hers its they them their here there "
    "who whose whom"
).split()

prepositions = "to of in for on with at by from up about into after over".split()

conjunctions = "as but or and so".split()

# Order is articles, pronouns, prepositions, conjunctions.
filler_words = [*articles, *pronouns, *prepositions, *conjunctions]

In [63]:
# Load the cleaned dataset. Rows the CSV parser cannot handle are skipped
# instead of aborting the load.
try:
    # pandas >= 1.3: `error_bad_lines` was deprecated there and removed in 2.0.
    # 'warn' skips the bad line and reports it, which matches the old
    # error_bad_lines=False behaviour (warn_bad_lines defaulted to True).
    data = pd.read_csv('stats_clean.csv', on_bad_lines='warn', encoding='latin1')
except TypeError:
    # Older pandas does not accept `on_bad_lines`; fall back to the legacy flag.
    data = pd.read_csv('stats_clean.csv', error_bad_lines=False, encoding='latin1')

# Drop every row that has a missing value in any column.
data = data.dropna()

In [64]:
## Data cleaning
# `regex` is passed explicitly on every replace: the default of
# Series.str.replace changed from regex=True to regex=False in pandas 2.0,
# so without it the regex patterns below would be searched for literally
# and match nothing.

# Replace '...' with '~'
data['text'] = data['text'].str.replace(r"\.\.\.", "~", regex=True)
print("ellipses are replaced")

# Remove URLs (markdown links of the form [label](http...); the label goes too)
data['text'] = data['text'].str.replace(r"\[.*?\]\(https?:\/\/.*?\)", "", regex=True)
print("urls removed")

# Remove blank characters (the literal HTML entity for a zero-width space)
data['text'] = data['text'].str.replace("&#x200B;", "", regex=False)
print("blank chars removed")

# to lowercase
data['text'] = data['text'].str.lower()
print("changed to lowercase")

ellipses are replaced
urls removed
blank chars removed
changed to lowercase


In [65]:
# Bucket the numeric age into generation labels. Each label covers the
# interval between two consecutive bin edges (first label <-> 0..24.5, etc.).
# NOTE(review): this overwrites the numeric 'age' column with the labels, so
# the cell is presumably meant to run once per data load - confirm before
# re-running it on already-binned data.
bins = [0, 24.5, 43.5, 55.5, 74.5, 100]
gen_labels = ["GenZ", "GenY", "GenX", "BabyBoomers", "Traditionalists"]
data['age'] = pd.cut(
    x=data['age'],
    bins=bins,
    labels=gen_labels,
)

In [66]:
# Create even sample across gender

#BBs and trads too small a set
data = data[(data['age']!='BabyBoomers') & (data['age']!='Traditionalists')]

# For every remaining age group keep the same number of Male and Female rows.
samps = []
for a in data.age.unique():
    in_group = data['age'] == a
    m = data[in_group & (data['gender']=='Male')]
    f = data[in_group & (data['gender']=='Female')]

    # Downsample whichever gender is larger. Sampling Female down to the Male
    # count unconditionally raises ValueError when a group has fewer Female
    # than Male rows; the first branch is the original behaviour, the second
    # covers that case.
    if len(f) >= len(m):
        f = f.sample(n=len(m), random_state=1)
    else:
        m = m.sample(n=len(f), random_state=1)

    samps.append(m)
    samps.append(f)
even_data = pd.concat(samps)

In [75]:
# From here on, `data` refers to the gender-balanced sample (even_data).
data = even_data

In [76]:
## Convert text to itemset data
# From every comment, extract each run of four whitespace-separated words
# that is followed (optionally after one whitespace character) by one of the
# tracked punctuation marks. Apostrophes count as part of a word, so the
# single quote is not in the punctuation class of the pattern.

# data to use
df = data

punc_pat = df['text'].str.extractall(
    r"(?P<first>[\w']+)\s+(?P<second>[\w']+)\s+(?P<third>[\w']+)\s+(?P<fourth>[\w']+)\b\s?(?P<punc>[.,:;?!\-()\"~])"
)

# Attach the author's gender and age group to every match. The reindex on
# level 0 of the match index repeats the row's traits once per match.
itemset = punc_pat.join(
    df[['gender', 'age']].reindex(punc_pat.index, level=0)
)

# Swap the raw punctuation character for its word token.
itemset['punc'] = itemset['punc'].map(marks_to_wrds)

In [80]:
## Main methods
# Mine frequent itemsets separately for every (punctuation token, age group)
# pair and write each result to its own CSV file.

trait_name = 'age'
trait_itr = itemset.age.unique()
sup = 0.002

# Build output paths with os.path.join so the separator matches the host OS
# (a hard-coded backslash only works on Windows), and make sure the target
# directory exists before the first write.
out_dir = "itemsets"
os.makedirs(out_dir, exist_ok=True)

# Set lookup instead of a list scan; the filter runs once per extracted word.
filler_set = set(filler_words)

for punc in marks_to_wrds.values():
    for trait in trait_itr:

        out_path = os.path.join(out_dir, f"freqitems_{trait}_{punc}.csv")
        print(f"Processing {out_path}")

        subset = itemset[(itemset[trait_name]==trait) & (itemset['punc']==punc)]

        # One transaction per match: the four captured words, minus fillers.
        subset = subset[['first','second','third','fourth']].values.tolist()
        subset = [[i for i in sub if i not in filler_set] for sub in subset]

        # Cap the number of distinct words before encoding (see reduce_words).
        subset = reduce_words(subset)

        out = get_fp(subset, sup)

        out.to_csv(out_path)

Processing itemsets\freqitems_GenY_prd.csv
Subset word count reduced to 3986
Processing itemsets\freqitems_GenX_prd.csv
Subset word count is fine, proceeding
Processing itemsets\freqitems_GenZ_prd.csv
Subset word count reduced to 4508
Processing itemsets\freqitems_GenY_cmma.csv
Subset word count reduced to 8853
Processing itemsets\freqitems_GenX_cmma.csv
Subset word count is fine, proceeding
Processing itemsets\freqitems_GenZ_cmma.csv
Subset word count reduced to 8462
Processing itemsets\freqitems_GenY_coln.csv
Subset word count is fine, proceeding
Processing itemsets\freqitems_GenX_coln.csv
Subset word count is fine, proceeding
Processing itemsets\freqitems_GenZ_coln.csv
Subset word count is fine, proceeding
Processing itemsets\freqitems_GenY_smicln.csv
Subset word count is fine, proceeding
Processing itemsets\freqitems_GenX_smicln.csv
Subset word count is fine, proceeding
Processing itemsets\freqitems_GenZ_smicln.csv
Subset word count is fine, proceeding
Processing itemsets\freqitems

In [77]:
def get_fp(transactions, min_sup = 0.05):
    """Mine frequent itemsets from ``transactions`` with FP-growth.

    Parameters
    ----------
    transactions : list of lists
        One inner list of items per transaction.
    min_sup : float
        Passed to ``fpgrowth`` as ``min_support``.

    Returns
    -------
    The result of ``fpgrowth`` called with ``use_colnames=True``.
    """
    # Encode the transactions as a table with one column per distinct item,
    # which is the input form fpgrowth works on.
    encoder = TransactionEncoder()
    encoder.fit(transactions)
    encoded = pd.DataFrame(
        encoder.transform(transactions),
        columns=encoder.columns_,
    )

    # Run FP-growth on the encoded table.
    return fpgrowth(encoded, min_support=min_sup, use_colnames=True)

In [78]:
def reduce_words(subset, max_cells=6500000000):
    """Restrict ``subset`` to its most frequent words when the vocabulary is too large.

    The number of distinct words allowed is ``int(max_cells / len(subset))``,
    so (number of transactions) x (distinct words) stays at or below
    ``max_cells``.

    Parameters
    ----------
    subset : list of lists
        One inner list of words per transaction.
    max_cells : int
        Upper bound on transactions x distinct words.

    Returns
    -------
    list of lists
        ``subset`` itself when it is empty or already within the limit;
        otherwise a new list in which every transaction keeps only the
        allowed most frequent words, in their original order.
    """
    # Nothing to reduce, and the size computation below would divide by zero.
    if len(subset) == 0:
        return subset

    new_size = int(max_cells / len(subset))

    # Distinct words ordered from most to least frequent.
    counted = Counter(word for transaction in subset for word in transaction)
    ordered = [word for word, _ in counted.most_common()]

    if len(ordered) <= new_size:
        print("Subset word count is fine, proceeding")
        return subset

    print(f"Subset word count reduced to {new_size}")
    # A set makes each membership test O(1); scanning a list of thousands of
    # words for every word of every transaction is needlessly quadratic.
    keep = set(ordered[0:new_size])
    return [[word for word in transaction if word in keep] for transaction in subset]

In [None]:
## Main method, Generate global frequent itemset (all punc)
# Mine frequent itemsets over every extracted match, regardless of
# punctuation mark or author trait, and write the result to a single CSV.
sup = 0.002

# Build the path with os.path.join so the separator matches the host OS
# (a hard-coded backslash only works on Windows), and make sure the target
# directory exists before writing.
out_dir = "itemsets"
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, "freqitems_all.csv")

print(f"Processing {out_path}")

subset = itemset

# One transaction per match: the four captured words, minus fillers.
# Set lookup instead of a list scan; the filter runs once per extracted word.
filler_set = set(filler_words)
subset = subset[['first','second','third','fourth']].values.tolist()
subset = [[i for i in sub if i not in filler_set] for sub in subset]

# Cap the number of distinct words before encoding (see reduce_words).
subset = reduce_words(subset)

out = get_fp(subset, sup)

out.to_csv(out_path)

In [None]:
## Main method, Generate global frequent itemset
# Mine frequent itemsets per punctuation token across all authors and write
# one CSV file per token.
sup = 0.002

# Build output paths with os.path.join so the separator matches the host OS
# (a hard-coded backslash only works on Windows), and make sure the target
# directory exists before the first write.
out_dir = "itemsets"
os.makedirs(out_dir, exist_ok=True)

# Set lookup instead of a list scan; the filter runs once per extracted word.
filler_set = set(filler_words)

for punc in marks_to_wrds.values():

    out_path = os.path.join(out_dir, f"freqitems_all_{punc}.csv")
    print(f"Processing {out_path}")

    subset = itemset[(itemset['punc']==punc)]

    # One transaction per match: the four captured words, minus fillers.
    subset = subset[['first','second','third','fourth']].values.tolist()
    subset = [[i for i in sub if i not in filler_set] for sub in subset]

    # Cap the number of distinct words before encoding (see reduce_words).
    subset = reduce_words(subset)

    out = get_fp(subset, sup)

    out.to_csv(out_path)