In [40]:
import re
import string
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.preprocessing import TransactionEncoder

In [41]:
# Punctuation-to-word conversion tables.
# marks_to_wrds maps each space-padded word token to the punctuation mark it stands in for.

marks_to_wrds = {
    ' prd ': '.',
    ' cmma ': ',',
    ' coln ': ':',
    ' smicln ': ';',
    ' qstn ': '?',
    ' xclm ': '!',
    ' dsh ': '-',
    ' lftparen ': '(',
    ' rghtparen ': ')',
    ' dblqt ': '"',
    ' snglqt ': "'",
    ' elpss ': '~',  # NOTE(review): '~' is tokenised as 'elpss' -- confirm this is the intended mark.
}
# The marks of interest, in the same order as the dictionary entries above.
marks = list(marks_to_wrds.values())

In [42]:
def pre_fp(df):
    """Turn the comments in ``df["text"]`` into punctuation-context transactions.

    The comments are joined with single spaces into one string, every mark in
    ``marks`` is replaced by its space-padded word token from ``marks_to_wrds``,
    all remaining ``string.punctuation`` characters are removed, and the text is
    split on whitespace. Each punctuation token found at word position 3 or
    later yields one transaction made of the three preceding words followed by
    the token itself.

    Because the comments are joined before splitting, a transaction can contain
    words from the end of one comment and the start of the next.

    Parameters
    ----------
    df : DataFrame or mapping
        ``df["text"]`` must be an iterable of strings.

    Returns
    -------
    list of list of str
        One 4-item list per recorded punctuation token, in text order.
    """
    # Concatenate all the comments into one long string.
    fqt_pat_sample = ' '.join(df["text"])

    # Invert the word -> mark dictionary once. setdefault keeps the first word
    # listed for a mark, which is the entry a first-match search would pick.
    mark_to_word = {}
    for punc_word, mark in marks_to_wrds.items():
        mark_to_word.setdefault(mark, punc_word)

    # str.replace already rewrites every occurrence, so one call per mark is
    # enough; looping over each character of the text only repeated no-op
    # replaces over the whole string.
    for mark in marks:
        if mark in fqt_pat_sample:
            fqt_pat_sample = fqt_pat_sample.replace(mark, mark_to_word[mark])

    # Remove all remaining punctuation from fqt_pat_sample.
    fqt_pat_sample = fqt_pat_sample.translate(str.maketrans('', '', string.punctuation))
    fqt_pat_sample_split = fqt_pat_sample.split()

    transactions = []
    for wrd_pos, word in enumerate(tqdm(fqt_pat_sample_split, desc = 'Word Loop')):
        # The dictionary keys carry a space on each side, so pad the word the same way.
        # Tokens with fewer than three words before them are skipped.
        if wrd_pos >= 3 and ' ' + word + ' ' in marks_to_wrds:
            # Three preceding words plus the punctuation token itself.
            transactions.append(fqt_pat_sample_split[wrd_pos - 3:wrd_pos + 1])

    return transactions

In [43]:
def get_fp(transactions, min_sup = 0.05):
    """Run FP-growth over a list of transactions.

    The transactions are encoded with ``TransactionEncoder``, the encoded rows
    are wrapped in a DataFrame whose columns are the encoder's ``columns_``,
    and that frame is passed to ``fpgrowth``.

    Parameters
    ----------
    transactions : list of list
        The transactions to encode (for example the output of ``pre_fp``).
    min_sup : float, default 0.05
        Forwarded to ``fpgrowth`` as ``min_support``.

    Returns
    -------
    The result of ``fpgrowth(..., use_colnames=True)`` on the encoded frame.
    """
    # fpgrowth takes a dataframe, so encode the raw transactions into one first.
    encoder = TransactionEncoder()
    encoder.fit(transactions)
    encoded_rows = encoder.transform(transactions)
    encoded_df = pd.DataFrame(encoded_rows, columns = encoder.columns_)

    # Apply FPgrowth to the transactional dataframe.
    return fpgrowth(encoded_df, min_support = min_sup, use_colnames = True)

In [44]:
# Load both CSVs as latin1 and drop every row that has a missing value.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0
# (replacement: on_bad_lines='skip'); confirm the pandas version before upgrading.

samp = pd.read_csv('stats_sample.csv', error_bad_lines = False, encoding = 'latin1').dropna()

data = pd.read_csv('stats_clean.csv', error_bad_lines = False, encoding = 'latin1').dropna()

In [46]:
# Construct a dataset in the form of stats_clean.csv, but with the transactions in place of comments in the 'text' column.

# Transactions for every row of samp, in row order.
transactions_by_row = []

for comments in tqdm(samp['text'], desc = 'Overall Progress'):

    # pre_fp already does the punctuation-to-word conversion and grabs the three words
    # before each punctuation token, so give it this row's text on its own instead of
    # repeating that logic here.
    transactions = pre_fp(pd.DataFrame({'text': [comments]}))

    # Keep this row's result; rebinding `transactions` alone would leave only the last row's.
    transactions_by_row.append(transactions)

# Same rows and columns as samp, with each 'text' value replaced by that row's list of transactions.
# The Series reuses samp's index so the values line up with the rows kept after dropna().
# `transactions` is left holding the last row's transactions.
samp_transactions = samp.assign(text = pd.Series(transactions_by_row, index = samp.index, dtype = object))
        
        

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until
Widget Javascript not detected.  It may not be installed or enabled properly.


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if sys.path[0] == '':
Widget Javascript not detected.  It may not be installed or enabled properly.





Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.


Widget Javascript not detected.  It may not be installed or enabled properly.








Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.


Widget Javascript not detected.  It may not be installed or enabled properly.








Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.






In [47]:
transactions

[['not', 'telling', 'you', 'prd'],
 ['an', 'open', 'relationship', 'prd'],
 ['them', 'think', 'you', 'snglqt'],
 ['exclusive', 'when', 'you', 'snglqt'],
 ['snglqt', 're', 'not', 'cmma'],
 ['not', 'cmma', 'that', 'snglqt'],
 ['s', 'messed', 'up', 'prd'],
 ['up', 'prd', 'I', 'snglqt'],
 ['not', 'trust', 'him', 'prd'],
 ['hide', 'and', 'you', 'snglqt'],
 ['stages', 'of', 'dating', 'prd'],
 ['dating', 'prd', 'He', 'snglqt'],
 ['a', 'big', 'dealbreaker', 'prd'],
 ['choose', 'to', 'do', 'xclm'],
 ['do', 'xclm', 'Yes', 'cmma'],
 ['Yes', 'cmma', 'it', 'snglqt'],
 ['to', 'be', 'ghosted', 'coln'],
 ['be', 'ghosted', 'coln', 'prd'],
 ['coln', 'prd', 'I', 'snglqt'],
 ['leave', 'it', 'be', 'prd'],
 ['prd', 'If', 'you', 'snglqt'],
 ['illegal', 'or', 'non', 'dsh'],
 ['dsh', 'consensual', 'stuff', 'cmma'],
 ['to', 'a', 'therapist', 'prd'],
 ['consensual', 'and', 'legal', 'cmma'],
 ['should', 'be', 'okay', 'prd'],
 ['your', 'sexual', 'desires', 'lftparen'],
 ['a', 'bad', 'person', 'cmma'],
 ['person', 

In [49]:
len(transactions)

695

In [51]:
t = pre_fp(samp)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.
Widget Javascript not detected.  It may not be installed or enabled properly.





Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
Widget Javascript not detected.  It may not be installed or enabled properly.





In [52]:
len(t)

7190

In [53]:
t

[['are', 'coming', 'from', 'dsh'],
 ['It', 'just', 'doesn', 'snglqt'],
 ['t', 'explain', 'today', 'snglqt'],
 ['from', 'her', 'bf', 'qstn'],
 ['Like', 'I', 'said', 'cmma'],
 ['as', 'fuck', 'action', 'prd'],
 ['them', 'then', 'yes', 'prd'],
 ['yes', 'prd', 'Derp', 'prd'],
 ['was', 'pretty', 'awful', 'prd'],
 ['a', '15km', 'range', 'prd'],
 ['and', 'needed', 'help', 'prd'],
 ['back', 'and', 'forth', 'prd'],
 ['want', 'to', 'say', 'cmma'],
 ['Facebook', 'and', 'Google', 'prd'],
 ['she', 'never', 'helped', 'cmma'],
 ['as', 'a', 'friend', 'cmma'],
 ['so', 'much', 'trouble', 'prd'],
 ['in', 'the', 'house', 'prd'],
 ['prd', 'But', 'hey', 'cmma'],
 ['to', 'be', 'attracted', 'prd'],
 ['be', 'attracted', 'prd', 'prd'],
 ['attracted', 'prd', 'prd', 'prd'],
 ['prd', 'prd', 'Edit', 'coln'],
 ['in', 'the', 'statecountry', 'prd'],
 ['change', 'of', 'everything', 'prd'],
 ['didnt', 'work', 'out', 'prd'],
 ['been', '8', 'years', 'prd'],
 ['kids', 'or', 'married', 'prd'],
 ['is', 'your', 'destiny', 'prd