# Am I the AHole Dataset Construction

Constructing datasets for the AITA subreddit as an exercise using AllenNLP.

In [1]:
import sys
sys.path.append('.')
sys.path.append('..')

from subreddit_frequency import load_dataframe_from_jsonl
from tqdm import tqdm
tqdm.pandas()

import seaborn as sns
from datetime import datetime
sns.set('paper')

from ipywidgets import interact
import pandas as pd

In [2]:
comments_df = load_dataframe_from_jsonl("../data/RC_2019-08_AmItheAsshole.dump")
submissions_df = load_dataframe_from_jsonl("../data/RS_2019-08_AmItheAsshole.dump")

1454892it [00:47, 30554.14it/s]
34476it [00:01, 2588.00it/s]

Preprocess the comment to have:
* the id of the previous post that it is a comment to
* the absolute value of the score

In [3]:
def get_comments_from_id(df, parent_id):
    cols = ['author_flair_text', 'stickied', 'author', 'body', 'score', 'score_abs', 'judgement']
    return df[df.prev_id == parent_id][cols]

In [4]:
def determine_AH(body):
    """Determines if poster thinks asshole or not asshole."""
    if body.startswith("YTA"):
        return "YTA"
    elif body.startswith("ESH"):
        return "ESH"
    elif body.startswith("NAH"):
        return "NAH"
    else:
        return "UNK"

In [5]:
comments_df['prev_id'] = comments_df.parent_id.map(lambda x: x.split('_')[-1])
comments_df['score_abs'] = comments_df.score.map(abs)
comments_df['judgement'] = comments_df.body.map(determine_AH)
submissions_df['timestamp'] = submissions_df.created_utc.map(datetime.fromtimestamp)
submissions_df = submissions_df.sort_values('num_comments', ascending=False)

Only get the most popular titles with the highest number of comments.

In [6]:
pd.set_option('display.max_rows', 500)
good_submissions_df = submissions_df[['title', 'num_comments', 'id']].head(2000)

In [7]:
display(list(good_submissions_df.head().title))
display(list(good_submissions_df.head().id))

['AITA for refusing to pay back my cousin after my baby ruined her blouse?',
 'AITA for wanting my girlfriend to wear makeup and take care of body hair?',
 'AITA for telling a friend’s friend that he couldn’t keep the “jackpot” that he hit on my antique slot machine? (About $700)',
 'WIBTA if I told a close family friend that her husband cheated on her 4 years ago?',
 'AITA for catfishing my underaged sister on Tinder and humiliating her in order to teach her a lesson?']

['cw43oc', 'cqin60', 'cm0bft', 'cvlkut', 'cn7li5']

In [8]:
def get_label_from_comments(df):
    try:
        return df[df.judgement != "UNK"].groupby('judgement').score.sum().idxmax()
    except ValueError:
        return "UNK"
    
def get_label_from_submission(submission_id):
    df = get_comments_from_id(comments_df, submission_id)
    return get_label_from_comments(df)

In [None]:
good_submissions_df['label'] = good_submissions_df.id.progress_map(get_label_from_submission)

 10%|▉         | 198/2000 [00:29<04:37,  6.50it/s]

Unnamed: 0,title,num_comments,id
29410,AITA for refusing to pay back my cousin after ...,8858,cw43oc
16142,AITA for wanting my girlfriend to wear makeup ...,7316,cqin60
4184,AITA for telling a friend’s friend that he cou...,7180,cm0bft
28267,WIBTA if I told a close family friend that her...,6236,cvlkut
7540,AITA for catfishing my underaged sister on Tin...,5538,cn7li5
...,...,...,...
28902,AITA for throwing out my neighbors laundry to ...,89,cvvwtx
8712,AITA for thinking girls who have sex can't be ...,89,cnorlu
33296,AITA for letting my son dye his hair when he w...,89,cxmrqq
22656,AITA for telling my wife that it's time to put...,89,ct99sj
