# Am I the AHole Dataset Construction

Constructing datasets for the AITA subreddit as an exercise using AllenNLP.

In [3]:
import sys
sys.path.append('.')
sys.path.append('..')

from subreddit_frequency import load_dataframe_from_jsonl
from tqdm import tqdm
tqdm.pandas()

import seaborn as sns
from datetime import datetime
sns.set('paper')

from ipywidgets import interact
import pandas as pd
from pathlib import Path
from matplotlib import pyplot as plt

## Data Loading

Loads the data from sharded files, need to rebuild and concatonate.
May take up a lot of memory, so runon a computer with enough RAM.

In [4]:
submissions_df = pd.concat([
    load_dataframe_from_jsonl(a)
    for a in Path("../data").glob("RS_*")
],axis=0)

30953it [00:01, 19820.90it/s]
30467it [00:01, 19997.87it/s]
0it [00:00, ?it/s]
27406it [00:01, 15987.40it/s]
27375it [00:00, 30716.96it/s]
28643it [00:01, 21418.59it/s]
27735it [00:01, 17225.57it/s]
7561it [00:00, 25363.85it/s]
30027it [00:01, 17168.54it/s]
29665it [00:01, 15451.61it/s]
6479it [00:00, 24207.00it/s]
11577it [00:00, 26507.17it/s]
35220it [00:02, 15621.15it/s]
34476it [00:01, 28917.54it/s]
26443it [00:00, 30869.78it/s]
25888it [00:02, 12167.12it/s]
406it [00:00, 20271.02it/s]
23976it [00:00, 30038.18it/s]
15621it [00:00, 25766.78it/s]
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


In [5]:
comments_df = pd.concat([
    load_dataframe_from_jsonl(a)
    for a in Path("../data").glob("RC_*")
],axis=0)

1120561it [00:30, 36538.71it/s]
714672it [00:16, 54845.95it/s]

MemoryError: 

## Dataset Statistics

In [None]:
#print(comments_df.created_utc.conve.describe())
sns.distplot(submissions_df.num_comments,bins=5000, kde=True);
plt.xlim(0,200)
plt.show()

In [None]:
submissions_df.score.describe()

In [None]:
sns.regplot(x='score', y='num_comments', data=submissions_df)
plt.show()

Preprocess the comment to have:
* the id of the previous post that it is a comment to
* the absolute value of the score

## Helper Functions

Helper functions to:
1. Get comments from ID
2. Determine if someone is an AH

In [None]:
def get_comments_from_id(df, parent_id):
    cols = ['author_flair_text', 'stickied', 'author', 'body', 'score', 'score_abs', 'judgement']
    return df[df.prev_id == parent_id][cols]

In [None]:
def determine_AH(body):
    """Determines if poster thinks asshole or not asshole."""
    if body.startswith("YTA"):
        return "YTA"
    elif body.startswith("ESH"):
        return "ESH"
    elif body.startswith("NAH"):
        return "NAH"
    elif body.startswith("NTA"):
        return "NTA"
    else:
        return "UNK"

In [None]:
comments_df['prev_id'] = comments_df.parent_id.map(lambda x: x.split('_')[-1])
comments_df['score_abs'] = comments_df.score.map(abs)
comments_df['judgement'] = comments_df.body.map(determine_AH)
submissions_df['timestamp'] = submissions_df.created_utc.map(datetime.fromtimestamp)
submissions_df = submissions_df.sort_values('num_comments', ascending=False)

Only get the most popular titles with the highest number of comments.

In [None]:
pd.set_option('display.max_rows', 500)
good_submissions_df = submissions_df[submissions_df.num_comments > 30]

In [None]:
len(good_submissions_df)

In [None]:
display(list(good_submissions_df.head().title))
display(list(good_submissions_df.head().id))

For matching, filter by if the comment actually makes a judgement.

In [None]:
comments_df.judgement.value_counts()

In [None]:
judgement_df = comments_df[comments_df.judgement != "UNK"].reset_index()

Since it's too slow to join by parentid or query the correct comments from the post id, we will first aggregate by parent id when computing judgements on the judgement dataframe directly before joining it on.

In [None]:
judgement_df.prev_id.value_counts().nlargest(5)

In [None]:
judgement_df.prev_id.nunique()

In [None]:
vote_df = judgement_df.groupby('prev_id').judgement.agg(pd.Series.mode)

In [None]:
vote_df = vote_df.reset_index()
vote_df.judgement = vote_df.judgement.astype(str)

In [None]:
classes = ['ESH', 'NAH', 'NTA', 'YTA']
vote_df = vote_df[vote_df.judgement.isin(classes)]

In [None]:
vote_df = vote_df.set_index('prev_id')

When the mode is returned, there will sometimes be ties.
So let's get rid of the ties.

In [None]:
def get_label_from_comments(df):
    try:
        return df.groupby('judgement').score.sum().idxmax()
    except ValueError:
        return "UNK"
    
def get_label_from_submission(submission_id):
    df = get_comments_from_id(judgement_df, submission_id)
    return get_label_from_comments(df)

def get_label_from_vote_id(submission_id):
    try:
        return vote_df.loc[submission_id].judgement
    except KeyError:
        return "UNK"

In [None]:
good_submissions_df.head().id

In [None]:
good_submissions_df['label'] = good_submissions_df.id.progress_map(get_label_from_vote_id)

In [None]:
good_submissions_df.label.value_counts()

In [None]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
good_submissions_df[['title', 'label']].head()
print(good_submissions_df[good_submissions_df.label == 'UNK'].title)

In [None]:
comments_df.head()

In [None]:
label_counts = good_submissions_df.label.value_counts().to_frame()
label_counts.columns = ['counts']
label_counts['label'] = label_counts.index
sns.barplot(y='label', x='counts', data=label_counts)
plt.plot()

In [None]:
good_submissions_df.head()

## Train Test Split

Eliminates the posts that have an unknown label, and splits our huge dataset into a train/val/test split.

In [47]:
dataset_df = good_submissions_df[good_submissions_df.label != 'UNK']

In [48]:
test_dataset_df = dataset_df.sample(frac=0.1)

In [49]:
traindev_dataset_df = dataset_df.drop(test_dataset_df.index)

In [50]:
train_dataset_df = traindev_dataset_df.sample(frac=0.8)
dev_dataset_df = traindev_dataset_df.drop(train_dataset_df.index)

In [51]:
dev_dataset_df.to_pickle('../aita/aita-dev.pkl')
train_dataset_df.to_pickle('../aita/aita-train.pkl')
test_dataset_df.to_pickle('../aita/aita-test.pkl')