In [1]:
import re
import string
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.preprocessing import TransactionEncoder
import numpy as np
from collections import Counter

In [2]:
# Punctuation-to-token lookup: every punctuation mark we track is mapped to a
# short alphabetic code. '~' is the stand-in for an ellipsis ('elpss').

marks_to_wrds = {
    '.': 'prd',
    ',': 'cmma',
    ':': 'coln',
    ';': 'smicln',
    '?': 'qstn',
    '!': 'xclm',
    '-': 'dsh',
    '(': 'lftparen',
    ')': 'rghtparen',
    '"': 'dblqt',
    "'": 'snglqt',
    '~': 'elpss',
}

# The tracked marks themselves, in the same order as the mapping above.
marks = list(marks_to_wrds)

In [3]:
# Word classes that are stripped from every transaction before mining.
articles = ["the", "a", "an"]

pronouns = [
    "that", "this", "those", "these",
    "i", "we", "me", "my", "us", "ours", "you", "yours",
    "he", "she", "it", "him", "her", "his", "hers", "its",
    "they", "them", "their", "here", "there",
    "who", "whose", "whom",
]

prepositions = [
    "to", "of", "in", "for", "on", "with", "at",
    "by", "from", "up", "about", "into", "after", "over",
]

conjunctions = ["as", "but", "or", "and", "so"]

# Single flat list of all of the above, in class order.
filler_words = [*articles, *pronouns, *prepositions, *conjunctions]

In [4]:
# Load the corpus. Malformed rows are skipped with a warning instead of
# aborting the load. `on_bad_lines='warn'` is the replacement for
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in
# pandas 2.0 (where passing it raises a TypeError).
data = pd.read_csv('stats_clean.csv', on_bad_lines='warn', encoding='latin1')

# Discard every row that has a missing value in any column.
data = data.dropna()

In [5]:
## Data cleaning
# `regex=` is passed explicitly on every call: pandas 2.0 changed the default
# of Series.str.replace to regex=False, under which the patterns below would be
# searched for as literal text and silently match nothing.

# Replace '...' with '~'
data['text'] = data['text'].str.replace(r"\.\.\.", "~", regex=True)
print("ellipses are replaced")

# Remove URLs (markdown links of the form [label](http...), label included)
data['text'] = data['text'].str.replace(r"\[.*?\]\(https?:\/\/.*?\)", "", regex=True)
print("urls removed")

# Remove blank characters (the literal HTML entity text "&#x200B;")
data['text'] = data['text'].str.replace("&#x200B;", "", regex=False)
print("blank chars removed")

# to lowercase
data['text'] = data['text'].str.lower()
print("changed to lowercase")

ellipses are replaced
urls removed
blank chars removed
changed to lowercase


In [6]:
# Bucket numeric ages into generation cohorts. pd.cut uses right-closed
# intervals by default, i.e. (0, 24.5], (24.5, 43.5], ... (74.5, 100]; the
# numeric 'age' column is overwritten with the cohort label.
gen_labels = ["GenZ", "GenY", "GenX", "BabyBoomers", "Traditionalists"]
bins = [0, 24.5, 43.5, 55.5, 74.5, 100]
data['age'] = pd.cut(x=data['age'], bins=bins, labels=gen_labels)

In [7]:
## Convert text to itemset data
# Every match is a run of four whitespace-separated words (apostrophes count as
# word characters) followed, optionally after one whitespace character, by one
# of the tracked punctuation marks.

# data to use
df = data

word_punc_regex = r"(?P<first>[\w']+)\s+(?P<second>[\w']+)\s+(?P<third>[\w']+)\s+(?P<fourth>[\w']+)\b\s?(?P<punc>[.,:;?!\-()\"~])"
punc_pat = df['text'].str.extractall(word_punc_regex)

# extractall indexes its result by (source row, match number); aligning the
# author traits on level 0 gives every match its comment's gender and age.
author_traits = df[['gender','age']].reindex(punc_pat.index, level=0)
itemset = punc_pat.join(author_traits)

# Swap the raw punctuation character for its word code.
itemset['punc'] = itemset['punc'].map(marks_to_wrds)

In [38]:
## Main method, Generate global frequent itemset
# One frequent-itemset CSV per punctuation code, over all authors.
# NOTE: this cell calls reduce_words() and get_fp(), which are defined in cells
# further down this notebook; those cells must be run first.
# Assumes the itemsets/ directory already exists (to_csv does not create it).
sup = 0.001

# Set membership is O(1); the filter below runs once per extracted word.
filler_set = set(filler_words)

for punc in marks_to_wrds.values():

    # A forward slash is a valid separator on both Windows and POSIX; the
    # previous hard-coded "\\" only produced a path inside itemsets/ on Windows.
    out_path = f"itemsets/freqitems_all_{punc}.csv"
    print(f"Processing {out_path}")

    subset = itemset[(itemset['punc']==punc)]

    # One transaction per match: its four words minus the filler words.
    subset = subset[['first','second','third','fourth']].values.tolist()
    subset = [[i for i in sub if i not in filler_set] for sub in subset]

    subset = reduce_words(subset)

    out = get_fp(subset, sup)

    out.to_csv(out_path)

Processing itemsets\freqitems_all_prd.csv
Subset word count reduced to 1304
Processing itemsets\freqitems_all_cmma.csv
Subset word count reduced to 2735
Processing itemsets\freqitems_all_coln.csv
Subset word count is fine, proceeding
Processing itemsets\freqitems_all_smicln.csv
Subset word count is fine, proceeding
Processing itemsets\freqitems_all_qstn.csv
Subset word count reduced to 11437
Processing itemsets\freqitems_all_xclm.csv
Subset word count reduced to 18508
Processing itemsets\freqitems_all_dsh.csv
Subset word count reduced to 26955
Processing itemsets\freqitems_all_lftparen.csv
Subset word count reduced to 27352
Processing itemsets\freqitems_all_rghtparen.csv
Subset word count is fine, proceeding
Processing itemsets\freqitems_all_dblqt.csv
Subset word count is fine, proceeding
Processing itemsets\freqitems_all_snglqt.csv
Processing itemsets\freqitems_all_elpss.csv
Subset word count is fine, proceeding


In [39]:
## Main method, Generate global frequent itemset (all punc)
# A single frequent-itemset CSV over every match, regardless of punctuation.
# NOTE: this cell calls reduce_words() and get_fp(), which are defined in cells
# further down this notebook; those cells must be run first.
# Assumes the itemsets/ directory already exists (to_csv does not create it).
sup = 0.001

# A forward slash is a valid separator on both Windows and POSIX; the previous
# hard-coded "\\" only produced a path inside itemsets/ on Windows.
out_path = "itemsets/freqitems_all.csv"
print(f"Processing {out_path}")

subset = itemset

# One transaction per match: its four words minus the filler words.
# Set membership is O(1); the filter runs once per extracted word.
filler_set = set(filler_words)
subset = subset[['first','second','third','fourth']].values.tolist()
subset = [[i for i in sub if i not in filler_set] for sub in subset]

subset = reduce_words(subset)

out = get_fp(subset, sup)

out.to_csv(out_path)

Processing itemsets\freqitems_all.csv
Subset word count reduced to 692


In [10]:
## Main methods
# One frequent-itemset CSV per (trait value, punctuation code) pair.
# NOTE: this cell calls reduce_words() and get_fp(), which are defined in cells
# further down this notebook; those cells must be run first.
# Assumes the itemsets/ directory already exists (to_csv does not create it).

trait_name = 'age'
# Read the values from the column named by trait_name, so changing trait_name
# alone retargets the whole loop (this was hard-coded to itemset.age before).
trait_itr = itemset[trait_name].unique()
sup = 0.005

# Set membership is O(1); the filter below runs once per extracted word.
filler_set = set(filler_words)

for punc in marks_to_wrds.values():
    for trait in trait_itr:

        # A forward slash is a valid separator on both Windows and POSIX; the
        # previous hard-coded "\\" only produced a path inside itemsets/ on Windows.
        out_path = f"itemsets/freqitems_{trait}_{punc}.csv"
        print(f"Processing {out_path}")

        subset = itemset[(itemset[trait_name]==trait) & (itemset['punc']==punc)]

        # One transaction per match: its four words minus the filler words.
        subset = subset[['first','second','third','fourth']].values.tolist()
        subset = [[i for i in sub if i not in filler_set] for sub in subset]

        subset = reduce_words(subset)

        out = get_fp(subset, sup)

        out.to_csv(out_path)

Processing itemsets\freqitems_GenY_prd.csv
Subset word count reduced to 2449
Processing itemsets\freqitems_GenX_prd.csv
Subset word count is fine, proceeding
Processing itemsets\freqitems_GenZ_prd.csv
Subset word count reduced to 2942
Processing itemsets\freqitems_BabyBoomers_prd.csv
Subset word count is fine, proceeding
Processing itemsets\freqitems_Traditionalists_prd.csv
Subset word count is fine, proceeding
Processing itemsets\freqitems_GenY_cmma.csv
Subset word count reduced to 5527
Processing itemsets\freqitems_GenX_cmma.csv
Subset word count is fine, proceeding
Processing itemsets\freqitems_GenZ_cmma.csv
Subset word count reduced to 5621
Processing itemsets\freqitems_BabyBoomers_cmma.csv
Subset word count is fine, proceeding
Processing itemsets\freqitems_Traditionalists_cmma.csv
Subset word count is fine, proceeding
Processing itemsets\freqitems_GenY_coln.csv
Subset word count is fine, proceeding
Processing itemsets\freqitems_GenX_coln.csv
Subset word count is fine, proceeding
P

In [8]:
def get_fp(transactions, min_sup = 0.05):
    """Mine frequent itemsets from a list of transactions with FP-growth.

    Parameters
    ----------
    transactions : list of lists
        Each inner list holds the items of one transaction.
    min_sup : float, default 0.05
        Forwarded to ``fpgrowth`` as ``min_support``.

    Returns
    -------
    The result of ``fpgrowth`` called with ``use_colnames=True``, so itemsets
    are reported by item name rather than by column index.
    """
    # One-hot encode the transactions: one row per transaction, one column per
    # distinct item, which is the input layout fpgrowth is given here.
    encoder = TransactionEncoder()
    encoder.fit(transactions)
    onehot = pd.DataFrame(encoder.transform(transactions), columns=encoder.columns_)

    return fpgrowth(onehot, min_support=min_sup, use_colnames=True)

In [9]:
def reduce_words(subset, max_cells=6500000000):
    """Cap the vocabulary of a transaction list to its most frequent words.

    The allowed number of distinct words is ``int(max_cells / len(subset))``,
    i.e. transactions x distinct words stays within ``max_cells``
    (presumably to bound the one-hot matrix built downstream -- confirm).

    Parameters
    ----------
    subset : list of lists
        Transactions; each inner list holds the words of one transaction.
    max_cells : number, default 6500000000
        Budget for (number of transactions) x (number of distinct words).

    Returns
    -------
    list of lists
        ``subset`` itself when it is empty or already within budget; otherwise
        a new list in which every transaction keeps only the most frequent
        words, in their original order (transactions may become empty).
    """
    # Nothing to reduce, and the division below would fail on an empty list.
    if len(subset) == 0:
        return subset

    new_size = int(max_cells / len(subset))

    counted = Counter(word for transaction in subset for word in transaction)

    if len(counted) <= new_size:
        print("Subset word count is fine, proceeding")
        return subset

    print(f"Subset word count reduced to {new_size}")
    ordered = [word for word, _ in counted.most_common()]
    # A set makes each membership test O(1); scanning a list of up to new_size
    # words for every word of every transaction was accidentally quadratic.
    freq_words = set(ordered[:new_size])
    return [[word for word in transaction if word in freq_words] for transaction in subset]

In [35]:
# Rows whose generation label is 'BabyBoomers'; displayed as the cell output.
is_boomer = data['age'] == 'BabyBoomers'
bb = data.loc[is_boomer]
bb

Unnamed: 0,age,gender,author,full_link,text
305,BabyBoomers,Female,usernamenew123,https://www.reddit.com/r/relationships/comment...,"go on, tell me how im not considerate.@ sugges..."
373,BabyBoomers,Male,SawTheLightOfReason,https://www.reddit.com/r/relationships/comment...,> the only way to help it is ~.. constant mind...
443,BabyBoomers,Female,NotmyTable11,https://www.reddit.com/r/relationships/comment...,yeah its like i spend everyday all day with th...
488,BabyBoomers,Female,WatchOutItsAFeminist,https://www.reddit.com/r/relationships/comment...,i'm worried that's coming to my paper. no adve...
660,BabyBoomers,Male,martinPravda,https://www.reddit.com/r/relationships/comment...,"when my depression kicks in, i do the same thi..."
1208,BabyBoomers,Female,mixter-revolution,https://www.reddit.com/r/relationships/comment...,i'm interested in polishing up my spanish as w...
1370,BabyBoomers,Female,GingyTheCatt,https://www.reddit.com/r/relationships/comment...,hitler? oh hunny. look at obamas little follow...
1961,BabyBoomers,Male,PotomacPicnic,https://www.reddit.com/r/relationships/comment...,"with age, i am much nicer than i used to be to..."
2421,BabyBoomers,Male,Newengland933,https://www.reddit.com/r/relationships/comment...,dont care~. fuck em@ black at it again@ lol fu...
2769,BabyBoomers,Male,bill1024,https://www.reddit.com/r/relationships/comment...,> everyone has nose hair. almost. my buddy ha...


In [34]:
# Print the full text of every BabyBoomers comment, one per line.
for _, comment_text in bb['text'].items():
    print(comment_text)

go on, tell me how im not considerate.@ suggestions? no i dont like everything i do in my home being question.   its the little things like why have you taken the bins out a certain way (back gate over the front gate).   shes also got bleach over my bathroom unit, she hasnt told me.  she also barged past me when i told/asked her to not take her bike in the front door because it was full of a kitchen delivery i wanted. i told her the night she came back and the second time she kicked up a fuss. its completely unreasonable as she always wanted to come in the side gate and when i asked her because of a legitimate reason she told me no and barged past me. its also a side gate that she wouldnt be able to lock when exit so we dont use it.   so again, im completely reasonable.@ its my home though, i own the property, i paid for the cups. i think its petty to have a conversation where the cups will be, because ultimate its my decision. there is no wrong of right way to put them and i cant thin