# Am I the AHole Dataset Construction

Constructing datasets for the AITA subreddit as an exercise using AllenNLP.

In [1]:
import sys
sys.path.append('.')
sys.path.append('..')

from subreddit_frequency import load_dataframe_from_jsonl
from tqdm import tqdm
tqdm.pandas()

import seaborn as sns
from datetime import datetime
sns.set('paper')

from ipywidgets import interact
import pandas as pd

In [2]:
comments_df = load_dataframe_from_jsonl("../data/RC_2019-08_AmItheAsshole.dump")
submissions_df = load_dataframe_from_jsonl("../data/RS_2019-08_AmItheAsshole.dump")

1454892it [00:46, 31426.76it/s]
34476it [00:01, 25498.06it/s]


Preprocess the comment to have:
* the id of the previous post that it is a comment to
* the absolute value of the score

In [3]:
def get_comments_from_id(df, parent_id):
    cols = ['author_flair_text', 'stickied', 'author', 'body', 'score', 'score_abs', 'judgement']
    return df[df.prev_id == parent_id][cols]

In [4]:
def determine_AH(body):
    """Determines if poster thinks asshole or not asshole."""
    if body.startswith("YTA"):
        return "YTA"
    elif body.startswith("ESH"):
        return "ESH"
    elif body.startswith("NAH"):
        return "NAH"
    else:
        return "UNK"

In [5]:
comments_df['prev_id'] = comments_df.parent_id.map(lambda x: x.split('_')[-1])
comments_df['score_abs'] = comments_df.score.map(abs)
comments_df['judgement'] = comments_df.body.map(determine_AH)
submissions_df['timestamp'] = submissions_df.created_utc.map(datetime.fromtimestamp)
submissions_df = submissions_df.sort_values('num_comments', ascending=False)

Only get the most popular titles with the highest number of comments.

In [6]:
pd.set_option('display.max_rows', 500)
good_submissions_df = submissions_df.head(2000)

In [7]:
display(list(good_submissions_df.head().title))
display(list(good_submissions_df.head().id))

['AITA for refusing to pay back my cousin after my baby ruined her blouse?',
 'AITA for wanting my girlfriend to wear makeup and take care of body hair?',
 'AITA for telling a friend’s friend that he couldn’t keep the “jackpot” that he hit on my antique slot machine? (About $700)',
 'WIBTA if I told a close family friend that her husband cheated on her 4 years ago?',
 'AITA for catfishing my underaged sister on Tinder and humiliating her in order to teach her a lesson?']

['cw43oc', 'cqin60', 'cm0bft', 'cvlkut', 'cn7li5']

In [8]:
def get_label_from_comments(df):
    try:
        return df[df.judgement != "UNK"].groupby('judgement').score.sum().idxmax()
    except ValueError:
        return "UNK"
    
def get_label_from_submission(submission_id):
    df = get_comments_from_id(comments_df, submission_id)
    return get_label_from_comments(df)

In [9]:
good_submissions_df['label'] = good_submissions_df.id.progress_map(get_label_from_submission)

 10%|▉         | 198/2000 [00:32<04:35,  6.55it/s]

In [10]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
good_submissions_df[['title', 'label']].head()

Unnamed: 0,title,label
29410,AITA for refusing to pay back my cousin after ...,ESH
16142,AITA for wanting my girlfriend to wear makeup ...,YTA
4184,AITA for telling a friend’s friend that he cou...,YTA
28267,WIBTA if I told a close family friend that her...,YTA
7540,AITA for catfishing my underaged sister on Tin...,YTA


In [11]:
good_submissions_df.label.value_counts()

YTA    901
NAH    640
ESH    423
UNK     36
Name: label, dtype: int64

In [12]:
dataset_df = good_submissions_df[good_submissions_df.label != 'UNK']

In [13]:
test_dataset_df = dataset_df.sample(frac=0.1)

In [14]:
traindev_dataset_df = dataset_df.drop(test_dataset_df.index)

In [15]:
train_dataset_df = traindev_dataset_df.sample(frac=0.8)
dev_dataset_df = traindev_dataset_df.drop(train_dataset_df.index)

In [16]:
dev_dataset_df.to_pickle('../aita/aita-dev.pkl')
train_dataset_df.to_pickle('../aita/aita-train.pkl')
test_dataset_df.to_pickle('../aita/aita-test.pkl')