In [1]:
import re
import string
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.preprocessing import TransactionEncoder

In [2]:
# Small hand-built frame for experimenting: five rows with the columns
# 'Age', 'Gender', 'Redditor', 'URL' and 'Comments'. Each 'Comments' value is a
# multi-sentence string in which '@' appears between what look like separate
# comments.
# NOTE(review): pre_fp as written reads a 'text' column, whereas the text column
# here is named 'Comments' -- confirm which schema is intended before passing
# testdf to pre_fp.
# NOTE(review): the last comment string spans a physical line break; a raw
# newline is not valid inside a double-quoted Python literal -- confirm whether
# the notebook cell actually holds an escaped \n at that point.
testdf = pd.DataFrame([[1, 'F', 'test1', 'www.test1.com', "35, I have a pretty good job but I don't love it. I'm still trying to figure out what I want to be when I grow up. Situation could be a lot worse but I'm at a point where I want to do something I enjoy. @ My family (live with my real mom and step dad) act like me being outed by older sister never happened. And on top of all of this I'm struggling trying to study for my math ged which I failed six times with being 1 point shy from passing."], [2, 'M', 'test2', 'www.test2.com', "If I fail it again, its another two months of waiting to retake it. @ I'm really not sure how i'm supposed to feel anymore knowing that most of my life I was just wasting my time. @ I'm an IT apprentice and was about a month away from finishing my level 4 course. My training provider has just closed down due to a fraud investigation so now im in a position of trying to find a provider for my specific course so I can finish."], [3, 'M', 'test3', 'www.test3.com', "And there's no guarantee my years work can be transferred so I may not even be able to get my certificate without another years of work. @ 29 and I don't like being a mum @ I was going to complain about being a 24yo male who's single and doesn't want to be but some of these responses make mine a drop in the bucket compared to the waters some of you face."], [4, 'F', 'test4', 'www.test4.com', "Best of luck to all of you. @ Split up with my long term so of 5 years and decided to go for a complete career change. Going to pack in this life and go travel for a bit. Experience the world and figure out what it is I want. @ I'm 30, diagnosed last year with ADHS, failed university this year, just needed to write my thesis, no intimacy with my girlfriend of 2 years anymore."], [5, 'F', 'test5', 'www.test5.com', "Yeah Life! @ 32, alcoholic, narcissistic, sociopath, broke, in debt, and i feel eventually my finances will magically fix themselves. 
I’m also a cocky piece of shit who doesn’t take advice or think there’s anything wrong with me. I truthfully think I’m ok. I need help. So much. @ My dad has lung cancer and is refusing treatment, which is fine. He's a 68 year old man who loves to drink and smoke and has accepted the hand he was dealt. He doesn't have to go through treatment if he doesn't want too."]], columns = ['Age', 'Gender', 'Redditor', 'URL', 'Comments'])

In [3]:
# Placeholder word (padded with one space on each side) -> the punctuation
# mark it stands in for. Note the direction: keys are the words, values are
# the marks.
marks_to_wrds = {
    ' prd ': '.',
    ' cmma ': ',',
    ' coln ': ':',
    ' smicln ': ';',
    ' qstn ': '?',
    ' xclm ': '!',
    ' dsh ': '-',
    ' lftparen ': '(',
    ' rghtparen ': ')',
    ' dblqt ': '"',
    ' snglqt ': "'",
    ' elpss ': '~',
}

# The punctuation marks on their own, in the same order as the mapping.
marks = list(marks_to_wrds.values())

In [4]:
def pre_fp(df, text_col = "text"):
    """Build punctuation-context transactions from a column of comments.

    All strings in ``df[text_col]`` are joined (space-separated) into one
    text. Every punctuation mark listed in the module-level ``marks`` is
    replaced by its placeholder word from ``marks_to_wrds`` (for example
    ``.`` becomes ``prd``), all remaining ``string.punctuation`` characters
    are removed, and the text is split on whitespace. For each placeholder
    token that has at least three tokens before it, one transaction
    ``[token-3, token-2, token-1, placeholder]`` is emitted.

    Because the comments are joined before tokenising, a window can span the
    boundary between two comments, and the preceding tokens can themselves be
    placeholders.

    Parameters
    ----------
    df : pandas.DataFrame or any mapping
        ``df[text_col]`` must be an iterable of strings.
    text_col : str, default "text"
        Name of the column that holds the comment text.

    Returns
    -------
    list of list of str
        One four-token transaction per qualifying placeholder, in text order.

    Raises
    ------
    KeyError
        If a mark in ``marks`` has no placeholder in ``marks_to_wrds``.
    """
    # The progress bar is a convenience only: use tqdm when it is installed,
    # otherwise iterate silently instead of failing with a NameError.
    try:
        from tqdm.auto import tqdm
    except ImportError:
        def tqdm(iterable, **_kwargs):
            return iterable

    # marks_to_wrds is keyed by placeholder, so invert it once up front.
    # setdefault keeps the first placeholder if a mark were ever listed twice.
    word_for_mark = {}
    for placeholder, mark in marks_to_wrds.items():
        word_for_mark.setdefault(mark, placeholder)

    text = ' '.join(df[text_col])

    # One replace per mark rather than one per character of the text.
    # The order is irrelevant because placeholders contain no punctuation.
    for mark in marks:
        text = text.replace(mark, word_for_mark[mark])

    # Remove all remaining punctuation, then tokenise on whitespace.
    tokens = text.translate(str.maketrans('', '', string.punctuation)).split()

    transactions = []
    for pos, token in enumerate(tqdm(tokens, desc = '2')):
        # Keys of marks_to_wrds carry one space of padding on each side.
        if pos >= 3 and ' ' + token + ' ' in marks_to_wrds:
            transactions.append(tokens[pos - 3:pos + 1])

    return transactions

In [5]:
def get_fp(transactions, min_sup = 0.01):
    """Mine frequent itemsets from transactions with FP-growth.

    Parameters
    ----------
    transactions : iterable of iterable of hashable
        One inner iterable of items per transaction, e.g. the output of
        ``pre_fp``. A one-shot iterator is accepted.
    min_sup : float, default 0.01
        Passed to ``fpgrowth`` as ``min_support``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``fpgrowth`` (columns ``support`` and
        ``itemsets``, with item names because ``use_colnames=True``). If
        there are no transactions, an empty frame with those two columns.
    """
    # fit() and transform() each walk the transactions, so a one-shot
    # iterator would be exhausted before transform() saw anything.
    transactions = list(transactions)

    # Nothing to mine: return an empty result without calling the encoder.
    if not transactions:
        return pd.DataFrame({
            'support': pd.Series(dtype = float),
            'itemsets': pd.Series(dtype = object),
        })

    # One-hot encode the transactions; FP-growth needs a boolean frame with
    # one column per distinct item.
    te = TransactionEncoder()
    te_transformed = te.fit(transactions).transform(transactions)
    onehot = pd.DataFrame(te_transformed, columns = te.columns_)

    # Apply FP-growth to the encoded frame.
    return fpgrowth(onehot, min_support = min_sup, use_colnames = True)

In [6]:
# Reference copy of the earlier single-function implementation (orig_fp).
# It sits inside a bare triple-quoted string, so none of it is executed or
# defined; running this cell only echoes the string back as the cell output.
'''
# Original FPgrowth function for reference.

def orig_fp(writing_sample):
    
    transactions = []
    fqt_pat_sample = writing_sample

    for character in fqt_pat_sample:
        if character in marks:
            fqt_pat_sample = fqt_pat_sample.replace(character, list(marks_to_wrds.keys())[list(marks_to_wrds.values()).index(character)])

    wrd_pos = 0
    for word in fqt_pat_sample.split():
        grab_three = []
        word_formatted = ' ' + word + ' '
        if word_formatted in marks_to_wrds.keys():
            if wrd_pos >= 3:
                grab_three.append(fqt_pat_sample.split()[wrd_pos - 3])
                grab_three.append(fqt_pat_sample.split()[wrd_pos - 2])
                grab_three.append(fqt_pat_sample.split()[wrd_pos - 1])
                grab_three.append(fqt_pat_sample.split()[wrd_pos])
            
                transactions.append(grab_three) 
        wrd_pos += 1
    
    # Put the transactions into dataframe form, this allows us to perform FPgrowth on the resultant dataframe.
    te = TransactionEncoder()
    te_transformed = te.fit(transactions).transform(transactions)
    df = pd.DataFrame(te_transformed, columns = te.columns_)

    # Apply FPgrowth to the transactional dataframe.
    freq_pats = fpgrowth(df, min_support = 0.5, use_colnames = True)
    return freq_pats
'''

"\n# Original FPgrowth function for reference.\n\ndef orig_fp(writing_sample):\n    \n    transactions = []\n    fqt_pat_sample = writing_sample\n\n    for character in fqt_pat_sample:\n        if character in marks:\n            fqt_pat_sample = fqt_pat_sample.replace(character, list(marks_to_wrds.keys())[list(marks_to_wrds.values()).index(character)])\n\n    wrd_pos = 0\n    for word in fqt_pat_sample.split():\n        grab_three = []\n        word_formatted = ' ' + word + ' '\n        if word_formatted in marks_to_wrds.keys():\n            if wrd_pos >= 3:\n                grab_three.append(fqt_pat_sample.split()[wrd_pos - 3])\n                grab_three.append(fqt_pat_sample.split()[wrd_pos - 2])\n                grab_three.append(fqt_pat_sample.split()[wrd_pos - 1])\n                grab_three.append(fqt_pat_sample.split()[wrd_pos])\n            \n                transactions.append(grab_three) \n        wrd_pos += 1\n    \n    # Put the transactions into dataframe form, this allo

In [7]:
# Load stats_clean.csv (latin1-encoded) and drop every row that has a missing
# value in any column.
# on_bad_lines = 'warn' skips malformed lines and emits a warning for each. It
# replaces error_bad_lines = False, which was deprecated in pandas 1.3 and
# removed in pandas 2.0 (where it raises TypeError).
samp = pd.read_csv('stats_clean.csv', on_bad_lines = 'warn', encoding = 'latin1').dropna()

In [8]:
# Build the punctuation-context transactions from the comments in samp.
transactions = pre_fp(samp)

KeyboardInterrupt: 

In [None]:
# Mine frequent itemsets from the transactions using get_fp's default minimum
# support; the resulting frame is shown as the cell output.
get_fp(transactions)

In [70]:
# Display the loaded sample frame (pandas truncates the middle rows).
samp

Unnamed: 0,age,gender,author,full_link,text
0,28,Female,lunabuddy,https://www.reddit.com/r/relationships/comment...,I feel bad but I got pissed when these world l...
1,55,Male,Peenutbuttjellytime,https://www.reddit.com/r/relationships/comment...,I wouldn't say that being a serial monogamist ...
2,24,Male,DatingAdviceTA1224,https://www.reddit.com/r/relationships/comment...,I understand exactly where you are coming from...
3,18,Male,Purple_Cherry_Hunter,https://www.reddit.com/r/relationships/comment...,It's just a generalization stereotype that peo...
4,21,Female,kafkascockroachgrace,https://www.reddit.com/r/relationships/comment...,U@ Me too@ First and foremost just know that l...
...,...,...,...,...,...
26640,26,Female,bonky24,https://www.reddit.com/r/relationship_advice/c...,Honestly its hard but you both deserve someone...
26641,19,Female,Rmonie01,https://www.reddit.com/r/relationship_advice/c...,"Me too@ Hi, I'm 19 and I'm dating a 28 year ol..."
26642,24,Female,thiccvegan_,https://www.reddit.com/r/relationship_advice/c...,I completed my first ever 24 hour fast today! ...
26643,23,Male,markrediot,https://www.reddit.com/r/relationship_advice/c...,Lol why do you make a new account every week a...


In [10]:
# NOTE(review): no notebook-level variable named fqt_pat_sample is defined in
# the visible cells, so this cell raises NameError on a fresh kernel (as in the
# recorded output). Looks like a leftover debugging cell -- confirm and remove.
fqt_pat_sample

NameError: name 'fqt_pat_sample' is not defined