In [211]:
"""Logistic regression on AITA data for two classes: YTA, NTA"""

import pandas as pd
import re

import nltk
# pretrained tokenizer
nltk.download('punkt')
from nltk.tokenize import word_tokenize 

# import stemmer
from nltk.stem import PorterStemmer 

# remove stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords

data = pd.read_csv('data.csv')  

def format_dataframe(data):
    data = data.rename(columns = {"link_flair_text":"label"})

    data['label'] = data['label'].map({'Asshole': 1, 'Not the A-hole': 0})

    return data

data = format_dataframe(data)

data

[nltk_data] Downloading package punkt to /Users/sander/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sander/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,title,body,label
0,AITA for refusing to give my parking spot to a...,I live in an apartment building that comes wit...,0
1,"AITA for not agreeing to be used to ""keep thin...","Me (37f), boyfriend (42m), his daughter (13f)....",0
2,WIBTA if i start putting vaseline on the front...,so my apartment unit is on the first floor of ...,1
3,AITA for asking my friend to stop talking abou...,I'm going to start off by saying: I'm 100% pro...,0
4,AITA for asking my friend to stop the way she ...,I am a plus sized gal who is really into fashi...,0
...,...,...,...
799,WIBTA if I ban a man from the children’s libra...,Bit of a pickle here. I am a children’s librar...,1
800,AITA for walking out on a patient and refusing...,"Hello, I have had conflicting feedback about m...",0
801,AITA for literally ignoring my mother when she...,"My wife (30f) and I (32f, we're both women) ar...",0
802,AITA for wanting to name my son for a video ga...,My wife (26F) and I (27M) are both big gamers....,0


In [212]:
# PREPROCESSING
pp_cols = ["title", "body"]

# stemmer
ps = PorterStemmer() 

def prep_dataframe(data):
    # make all lowercase
    data[pp_cols] = data[pp_cols].applymap(lambda s:s.lower())

    # Remove all the special characters
    data[pp_cols] = data[pp_cols].applymap(lambda s:re.sub(r'\W', ' ',s))

    # remove all single characters
    data[pp_cols] = data[pp_cols].applymap(lambda s:re.sub(r'\s+[a-zA-Z]\s+', ' ',s))

    # Remove single characters from the start (no space before them, but space after)
    data[pp_cols] = data[pp_cols].applymap(lambda s:re.sub(r'^[a-zA-Z]\s+', ' ',s))

    # Removing prefixed 'b'
    data[pp_cols] = data[pp_cols].applymap(lambda s:re.sub(r'^b\s+', ' ',s,flags=re.I))

    # remove all of: x200b (zero-width space)
    data[pp_cols] = data[pp_cols].applymap(lambda s:re.sub(r'x200b', ' ',s))

    # Substituting multiple spaces with single space
    data[pp_cols] = data[pp_cols].applymap(lambda s:re.sub(r'\s+', ' ',s,flags=re.I))



    # tokenize data
    data[pp_cols] = data[pp_cols].applymap(lambda s:word_tokenize(s))

    # perform stemming
    data[pp_cols] = data[pp_cols].applymap(lambda s: [ps.stem(word) for word in s])

    # remove stopwords
    data[pp_cols] = data[pp_cols].applymap(lambda s: [word for word in s if word not in stopwords.words('english')])
    
prep_dataframe(data)

In [213]:
posFreqs = {}
negFreqs = {}
words = set(())
totalPos = 0
totalNeg = 0
# calculate frequencies
for index, row in data.iterrows():
    dic = posFreqs if row['label'] == 0 else negFreqs
    words_in_body = len(row['body'])
    if row['label'] == 0:
        totalPos += words_in_body
    else:
        totalNeg += words_in_body

    for word in row['body']:
        if word in dic:
            dic[word] += 1.0
        else:
            dic[word] = 1.0
        words.add(word)

def LaplacianSmoothFrequencies(dic, total):
    for key in dic:
        val = dic[key] + 1
        val /= (len(words) + total)

LaplacianSmoothFrequencies(posFreqs, totalPos)
LaplacianSmoothFrequencies(negFreqs, totalNeg)

def test_freqs(dataframe, preprocess = True):
    if preprocess:
        prep_dataframe(dataframe)
    pred_labels = []
    for index, row in dataframe.iterrows():
        count = 1
        for word in row['body']:
            if word in posFreqs:
                count *= posFreqs[word]
            if word in negFreqs:
                count /= negFreqs[word]
        
        if count > 0:
            pred_labels.append(0)
        elif count < 0:
            pred_labels.append(1)
        else:
            pred_labels.append(-1)
    dataframe['pred_label'] = pred_labels
    
# test_dframe = pd.DataFrame({'title': ["HE hit me"], 'body': ["not my fault"], "link_flair_text":"Asshole"})
test_freqs(data, False)
    

In [215]:
len(data.query("label != pred_label"))/len(data)
# test_data = pd.read_csv('data_test.csv')  

# test_data = format_dataframe(test_data)

# prep_dataframe(test_data)

# test_freqs(test_data, False)


0.17164179104477612

In [198]:
# len(test_data.query("label != pred_label"))/len(test_data)
test_dframe = pd.DataFrame({'title': ["AITA for not letting my stepsister wear my wedding dress?"], 'body': ["""I (26f) got married in 2016. My wedding dress was a dress my mom made for me when I was 11. She knew she was sick and she was a dressmaker and so I asked her to make me a dress for prom and my wedding for the future. They were basically the same dress just one was pink and the other was white lol, but they mean the world to me. Both needed to be altered when I reached the point of wearing them but luckily my grandma is also a dressmaker and she did it for me.

My stepsister (24f) is now getting married and wants to wear my wedding dress to hers. She's my stepsister through my dad's marriage to his wife Liz. They got married when I was 7. She didn't know my mom though and they were nothing to each other. So for the sentimental part, and the part where she didn't even know my mom and couldn't share in that side of the sentimentality, I said no. I didn't want to give away my dress. Maybe if I have a daughter, or a daughter in law I'm close to someday, I might offer it there or give it to them if they want it. But for now I want to keep both and not share.

My no has led to fallout though. My stepsister feels like I'm unfair and is hurt that I don't consider her enough of a sister to share it with her regardless of who made it. My dad is pissed because this is the second time she wanted one of them. She wanted my prom dress as well and he said I should have been willing to share one since my stepsister has always looked up to me. Liz is pissed because I wouldn't let her fix my prom dress at the time (she offered with very little experience fitting dresses) and went to my grandma instead and now I won't let her daughter wear the dress. They all say it's been 5 years now and the wedding dress won't be worn again by me (which might not be true, my husband and I have talked about renewing our vows far in the future and wearing the same clothes).

I have been called selfish, mean, a bitch and other stuff and there is a lot of upset over my decision. My stepsister is genuinely sad. She doesn't even care about the sentimentality. She just wanted it because I had worn it and she always always loved getting my hand me downs.

AITA?"""], "link_flair_text":"Asshole"})

test_dframe = format_dataframe(test_dframe)

test_freqs(test_dframe)

In [199]:
len(data)

Unnamed: 0,title,body,label,pred_label
0,"[aita, let, stepsist, wear, wed, dress]","[26f, got, marri, 2016, wed, dress, wa, dress,...",1,0
