In [39]:
import sys
sys.path.append('.')
sys.path.append('..')

from subreddit_frequency import load_dataframe_from_jsonl
from tqdm import tqdm
tqdm.pandas()

import seaborn as sns
from datetime import datetime
sns.set('paper')

from ipywidgets import interact
import pandas as pd

In [25]:
comments_df = load_dataframe_from_jsonl("../data/RC_2019-08_AmItheAsshole.dump")
submissions_df = load_dataframe_from_jsonl("../data/RS_2019-08_AmItheAsshole.dump")

1454892it [00:45, 32194.47it/s]
34476it [00:01, 24617.49it/s]


Preprocess the comment to have:
* the id of the previous post that it is a comment to
* the absolute value of the score

In [69]:
def get_comments_from_id(df, parent_id):
    cols = ['author_flair_text', 'stickied', 'author', 'body', 'score', 'score_abs', 'judgement']
    return df[df.prev_id == parent_id][cols]

In [70]:
def determine_AH(body):
    """Determines if poster thinks asshole or not asshole."""
    if body.startswith("YTA"):
        return "YTA"
    elif body.startswith("ESH"):
        return "ESH"
    elif body.startswith("NAH"):
        return "NAH"
    else:
        return "UNK"

In [71]:
comments_df['prev_id'] = comments_df.parent_id.map(lambda x: x.split('_')[-1])
comments_df['score_abs'] = comments_df.score.map(abs)
comments_df['judgement'] = comments_df.body.map(determine_AH)
submissions_df['timestamp'] = submissions_df.created_utc.map(datetime.fromtimestamp)
submissions_df = submissions_df.sort_values('num_comments', ascending=False)

Only get the most popular titles with the highest number of comments.

In [72]:
pd.set_option('display.max_rows', 500)
good_submissions_df = submissions_df[['title', 'num_comments', 'id']].head(2000)

In [73]:
display(list(good_submissions_df.head().title))
display(list(good_submissions_df.head().id))

['AITA for refusing to pay back my cousin after my baby ruined her blouse?',
 'AITA for wanting my girlfriend to wear makeup and take care of body hair?',
 'AITA for telling a friend’s friend that he couldn’t keep the “jackpot” that he hit on my antique slot machine? (About $700)',
 'WIBTA if I told a close family friend that her husband cheated on her 4 years ago?',
 'AITA for catfishing my underaged sister on Tinder and humiliating her in order to teach her a lesson?']

['cw43oc', 'cqin60', 'cm0bft', 'cvlkut', 'cn7li5']

In [86]:
def get_label_from_comments(df):
    try:
        return df[df.judgement != "UNK"].groupby('judgement').score.sum().idxmax()
    except ValueError:
        return "UNK"
    
def get_label_from_submission(submission_id):
    df = get_comments_from_id(comments_df, submission_id)
    return get_label_from_comments(df)

In [87]:
good_submissions_df.id.progress_map(get_label_from_submission)


  0%|          | 0/2000 [00:00<?, ?it/s][A
  0%|          | 2/2000 [00:00<03:08, 10.60it/s][A
  0%|          | 3/2000 [00:00<03:52,  8.59it/s][A
  0%|          | 4/2000 [00:00<04:23,  7.57it/s][A
  0%|          | 5/2000 [00:00<04:45,  6.99it/s][A
  0%|          | 6/2000 [00:00<05:00,  6.64it/s][A
  0%|          | 7/2000 [00:01<05:10,  6.41it/s][A
  0%|          | 8/2000 [00:01<05:18,  6.25it/s][A
  0%|          | 9/2000 [00:01<05:23,  6.15it/s][A
  0%|          | 10/2000 [00:01<05:36,  5.91it/s][A
  1%|          | 11/2000 [00:01<05:36,  5.90it/s][A
  1%|          | 12/2000 [00:01<05:38,  5.87it/s][A
  1%|          | 13/2000 [00:02<05:37,  5.89it/s][A
  1%|          | 14/2000 [00:02<05:38,  5.88it/s][A
  1%|          | 15/2000 [00:02<05:36,  5.90it/s][A
  1%|          | 16/2000 [00:02<05:34,  5.92it/s][A
  1%|          | 17/2000 [00:02<05:35,  5.92it/s][A
  1%|          | 18/2000 [00:02<05:33,  5.95it/s][A
  1%|          | 19/2000 [00:03<05:33,  5.94it/s][A
  1%|    

  8%|▊         | 154/2000 [00:25<04:56,  6.23it/s][A
  8%|▊         | 155/2000 [00:25<04:55,  6.24it/s][A
  8%|▊         | 156/2000 [00:25<04:55,  6.24it/s][A
  8%|▊         | 157/2000 [00:25<04:54,  6.26it/s][A
  8%|▊         | 158/2000 [00:25<04:53,  6.27it/s][A
  8%|▊         | 159/2000 [00:25<04:53,  6.27it/s][A
  8%|▊         | 160/2000 [00:26<04:52,  6.28it/s][A
  8%|▊         | 161/2000 [00:26<04:53,  6.27it/s][A
  8%|▊         | 162/2000 [00:26<04:52,  6.28it/s][A
  8%|▊         | 163/2000 [00:26<04:52,  6.27it/s][A
  8%|▊         | 164/2000 [00:26<04:52,  6.28it/s][A
  8%|▊         | 165/2000 [00:26<04:53,  6.26it/s][A
  8%|▊         | 166/2000 [00:27<04:53,  6.25it/s][A
  8%|▊         | 167/2000 [00:27<04:52,  6.26it/s][A
  8%|▊         | 168/2000 [00:27<04:53,  6.24it/s][A
  8%|▊         | 169/2000 [00:27<04:52,  6.25it/s][A
  8%|▊         | 170/2000 [00:27<04:52,  6.26it/s][A
  9%|▊         | 171/2000 [00:27<04:52,  6.24it/s][A
  9%|▊         | 172/2000 [0

 15%|█▌        | 305/2000 [00:49<04:29,  6.29it/s][A
 15%|█▌        | 306/2000 [00:49<04:29,  6.28it/s][A
 15%|█▌        | 307/2000 [00:49<04:28,  6.30it/s][A
 15%|█▌        | 308/2000 [00:49<04:28,  6.29it/s][A
 15%|█▌        | 309/2000 [00:49<04:28,  6.29it/s][A
 16%|█▌        | 310/2000 [00:50<04:37,  6.09it/s][A
 16%|█▌        | 311/2000 [00:50<04:35,  6.13it/s][A
 16%|█▌        | 312/2000 [00:50<04:32,  6.19it/s][A
 16%|█▌        | 313/2000 [00:50<04:31,  6.21it/s][A
 16%|█▌        | 314/2000 [00:50<04:31,  6.22it/s][A
 16%|█▌        | 315/2000 [00:50<04:30,  6.23it/s][A
 16%|█▌        | 316/2000 [00:50<04:29,  6.25it/s][A
 16%|█▌        | 317/2000 [00:51<04:29,  6.24it/s][A
 16%|█▌        | 318/2000 [00:51<04:39,  6.01it/s][A
 16%|█▌        | 319/2000 [00:51<04:36,  6.08it/s][A
 16%|█▌        | 320/2000 [00:51<04:33,  6.14it/s][A
 16%|█▌        | 321/2000 [00:51<04:31,  6.19it/s][A
 16%|█▌        | 322/2000 [00:51<04:29,  6.22it/s][A
 16%|█▌        | 323/2000 [0

 23%|██▎       | 456/2000 [01:13<04:04,  6.31it/s][A
 23%|██▎       | 457/2000 [01:13<04:04,  6.31it/s][A
 23%|██▎       | 458/2000 [01:13<04:04,  6.29it/s][A
 23%|██▎       | 459/2000 [01:13<04:04,  6.30it/s][A
 23%|██▎       | 460/2000 [01:13<04:04,  6.30it/s][A
 23%|██▎       | 461/2000 [01:14<04:04,  6.29it/s][A
 23%|██▎       | 462/2000 [01:14<04:04,  6.30it/s][A
 23%|██▎       | 463/2000 [01:14<04:03,  6.32it/s][A
 23%|██▎       | 464/2000 [01:14<04:02,  6.33it/s][A
 23%|██▎       | 465/2000 [01:14<04:02,  6.34it/s][A
 23%|██▎       | 466/2000 [01:14<04:02,  6.33it/s][A
 23%|██▎       | 467/2000 [01:15<04:01,  6.34it/s][A
 23%|██▎       | 468/2000 [01:15<04:02,  6.33it/s][A
 23%|██▎       | 469/2000 [01:15<04:01,  6.33it/s][A
 24%|██▎       | 470/2000 [01:15<04:01,  6.33it/s][A
 24%|██▎       | 471/2000 [01:15<04:01,  6.32it/s][A
 24%|██▎       | 472/2000 [01:15<04:01,  6.32it/s][A
 24%|██▎       | 473/2000 [01:15<04:01,  6.32it/s][A
 24%|██▎       | 474/2000 [0

 30%|███       | 607/2000 [01:37<03:39,  6.34it/s][A
 30%|███       | 608/2000 [01:37<03:40,  6.33it/s][A
 30%|███       | 609/2000 [01:37<03:39,  6.34it/s][A
 30%|███       | 610/2000 [01:37<03:39,  6.34it/s][A
 31%|███       | 611/2000 [01:37<03:39,  6.33it/s][A
 31%|███       | 612/2000 [01:38<03:39,  6.33it/s][A
 31%|███       | 613/2000 [01:38<03:38,  6.34it/s][A
 31%|███       | 614/2000 [01:38<03:38,  6.34it/s][A
 31%|███       | 615/2000 [01:38<03:37,  6.37it/s][A
 31%|███       | 616/2000 [01:38<03:37,  6.36it/s][A
 31%|███       | 617/2000 [01:38<03:37,  6.36it/s][A
 31%|███       | 618/2000 [01:38<03:36,  6.37it/s][A
 31%|███       | 619/2000 [01:39<03:37,  6.35it/s][A
 31%|███       | 620/2000 [01:39<03:37,  6.34it/s][A
 31%|███       | 621/2000 [01:39<03:37,  6.34it/s][A
 31%|███       | 622/2000 [01:39<03:37,  6.34it/s][A
 31%|███       | 623/2000 [01:39<03:37,  6.32it/s][A
 31%|███       | 624/2000 [01:39<03:39,  6.27it/s][A
 31%|███▏      | 625/2000 [0

 38%|███▊      | 758/2000 [02:01<03:15,  6.35it/s][A
 38%|███▊      | 759/2000 [02:01<03:15,  6.34it/s][A
 38%|███▊      | 760/2000 [02:01<03:15,  6.34it/s][A
 38%|███▊      | 761/2000 [02:01<03:14,  6.36it/s][A
 38%|███▊      | 762/2000 [02:01<03:15,  6.33it/s][A
 38%|███▊      | 763/2000 [02:01<03:16,  6.28it/s][A
 38%|███▊      | 764/2000 [02:02<03:16,  6.29it/s][A
 38%|███▊      | 765/2000 [02:02<03:15,  6.30it/s][A
 38%|███▊      | 766/2000 [02:02<03:15,  6.30it/s][A
 38%|███▊      | 767/2000 [02:02<03:15,  6.31it/s][A
 38%|███▊      | 768/2000 [02:02<03:14,  6.34it/s][A
 38%|███▊      | 769/2000 [02:02<03:14,  6.33it/s][A
 38%|███▊      | 770/2000 [02:03<03:14,  6.32it/s][A
 39%|███▊      | 771/2000 [02:03<03:13,  6.34it/s][A
 39%|███▊      | 772/2000 [02:03<03:13,  6.34it/s][A
 39%|███▊      | 773/2000 [02:03<03:13,  6.33it/s][A
 39%|███▊      | 774/2000 [02:03<03:13,  6.32it/s][A
 39%|███▉      | 775/2000 [02:03<03:13,  6.33it/s][A
 39%|███▉      | 776/2000 [0

 45%|████▌     | 909/2000 [02:25<02:53,  6.29it/s][A
 46%|████▌     | 910/2000 [02:25<02:52,  6.30it/s][A
 46%|████▌     | 911/2000 [02:25<02:52,  6.31it/s][A
 46%|████▌     | 912/2000 [02:25<02:51,  6.33it/s][A
 46%|████▌     | 913/2000 [02:25<02:51,  6.33it/s][A
 46%|████▌     | 914/2000 [02:25<02:51,  6.34it/s][A
 46%|████▌     | 915/2000 [02:26<02:51,  6.34it/s][A
 46%|████▌     | 916/2000 [02:26<02:51,  6.33it/s][A
 46%|████▌     | 917/2000 [02:26<02:51,  6.33it/s][A
 46%|████▌     | 918/2000 [02:26<02:50,  6.33it/s][A
 46%|████▌     | 919/2000 [02:26<02:50,  6.34it/s][A
 46%|████▌     | 920/2000 [02:26<02:50,  6.33it/s][A
 46%|████▌     | 921/2000 [02:26<02:50,  6.34it/s][A
 46%|████▌     | 922/2000 [02:27<02:49,  6.35it/s][A
 46%|████▌     | 923/2000 [02:27<02:49,  6.34it/s][A
 46%|████▌     | 924/2000 [02:27<02:49,  6.35it/s][A
 46%|████▋     | 925/2000 [02:27<02:48,  6.36it/s][A
 46%|████▋     | 926/2000 [02:27<02:48,  6.36it/s][A
 46%|████▋     | 927/2000 [0

 53%|█████▎    | 1059/2000 [02:48<02:28,  6.33it/s][A
 53%|█████▎    | 1060/2000 [02:48<02:28,  6.32it/s][A
 53%|█████▎    | 1061/2000 [02:49<02:28,  6.32it/s][A
 53%|█████▎    | 1062/2000 [02:49<02:28,  6.33it/s][A
 53%|█████▎    | 1063/2000 [02:49<02:27,  6.34it/s][A
 53%|█████▎    | 1064/2000 [02:49<02:27,  6.34it/s][A
 53%|█████▎    | 1065/2000 [02:49<02:27,  6.35it/s][A
 53%|█████▎    | 1066/2000 [02:49<02:27,  6.35it/s][A
 53%|█████▎    | 1067/2000 [02:49<02:27,  6.34it/s][A
 53%|█████▎    | 1068/2000 [02:50<02:27,  6.34it/s][A
 53%|█████▎    | 1069/2000 [02:50<02:26,  6.35it/s][A
 54%|█████▎    | 1070/2000 [02:50<02:26,  6.35it/s][A
 54%|█████▎    | 1071/2000 [02:50<02:26,  6.36it/s][A
 54%|█████▎    | 1072/2000 [02:50<02:25,  6.36it/s][A
 54%|█████▎    | 1073/2000 [02:50<02:25,  6.36it/s][A
 54%|█████▎    | 1074/2000 [02:51<02:25,  6.35it/s][A
 54%|█████▍    | 1075/2000 [02:51<02:25,  6.34it/s][A
 54%|█████▍    | 1076/2000 [02:51<02:25,  6.36it/s][A
 54%|█████

 60%|██████    | 1207/2000 [03:12<02:04,  6.35it/s][A
 60%|██████    | 1208/2000 [03:12<02:04,  6.36it/s][A
 60%|██████    | 1209/2000 [03:12<02:04,  6.37it/s][A
 60%|██████    | 1210/2000 [03:12<02:04,  6.37it/s][A
 61%|██████    | 1211/2000 [03:12<02:03,  6.37it/s][A
 61%|██████    | 1212/2000 [03:12<02:03,  6.38it/s][A
 61%|██████    | 1213/2000 [03:13<02:03,  6.38it/s][A
 61%|██████    | 1214/2000 [03:13<02:03,  6.37it/s][A
 61%|██████    | 1215/2000 [03:13<02:03,  6.37it/s][A
 61%|██████    | 1216/2000 [03:13<02:03,  6.37it/s][A
 61%|██████    | 1217/2000 [03:13<02:02,  6.38it/s][A
 61%|██████    | 1218/2000 [03:13<02:02,  6.38it/s][A
 61%|██████    | 1219/2000 [03:14<02:02,  6.37it/s][A
 61%|██████    | 1220/2000 [03:14<02:02,  6.35it/s][A
 61%|██████    | 1221/2000 [03:14<02:02,  6.34it/s][A
 61%|██████    | 1222/2000 [03:14<02:02,  6.35it/s][A
 61%|██████    | 1223/2000 [03:14<02:02,  6.34it/s][A
 61%|██████    | 1224/2000 [03:14<02:02,  6.34it/s][A
 61%|█████

 68%|██████▊   | 1355/2000 [03:35<01:43,  6.21it/s][A
 68%|██████▊   | 1356/2000 [03:35<01:43,  6.23it/s][A
 68%|██████▊   | 1357/2000 [03:35<01:42,  6.26it/s][A
 68%|██████▊   | 1358/2000 [03:36<01:42,  6.27it/s][A
 68%|██████▊   | 1359/2000 [03:36<01:41,  6.30it/s][A
 68%|██████▊   | 1360/2000 [03:36<01:41,  6.31it/s][A
 68%|██████▊   | 1361/2000 [03:36<01:44,  6.11it/s][A
 68%|██████▊   | 1362/2000 [03:36<01:43,  6.17it/s][A
 68%|██████▊   | 1363/2000 [03:36<01:42,  6.21it/s][A
 68%|██████▊   | 1364/2000 [03:37<01:41,  6.24it/s][A
 68%|██████▊   | 1365/2000 [03:37<01:42,  6.22it/s][A
 68%|██████▊   | 1366/2000 [03:37<01:42,  6.16it/s][A
 68%|██████▊   | 1367/2000 [03:37<01:41,  6.22it/s][A
 68%|██████▊   | 1368/2000 [03:37<01:41,  6.24it/s][A
 68%|██████▊   | 1369/2000 [03:37<01:40,  6.26it/s][A
 68%|██████▊   | 1370/2000 [03:38<01:40,  6.28it/s][A
 69%|██████▊   | 1371/2000 [03:38<01:40,  6.29it/s][A
 69%|██████▊   | 1372/2000 [03:38<01:39,  6.31it/s][A
 69%|█████

 75%|███████▌  | 1503/2000 [03:58<01:18,  6.30it/s][A
 75%|███████▌  | 1504/2000 [03:59<01:19,  6.23it/s][A
 75%|███████▌  | 1505/2000 [03:59<01:19,  6.27it/s][A
 75%|███████▌  | 1506/2000 [03:59<01:18,  6.29it/s][A
 75%|███████▌  | 1507/2000 [03:59<01:18,  6.32it/s][A
 75%|███████▌  | 1508/2000 [03:59<01:17,  6.33it/s][A
 75%|███████▌  | 1509/2000 [03:59<01:17,  6.35it/s][A
 76%|███████▌  | 1510/2000 [04:00<01:17,  6.35it/s][A
 76%|███████▌  | 1511/2000 [04:00<01:16,  6.36it/s][A
 76%|███████▌  | 1512/2000 [04:00<01:16,  6.36it/s][A
 76%|███████▌  | 1513/2000 [04:00<01:16,  6.38it/s][A
 76%|███████▌  | 1514/2000 [04:00<01:16,  6.39it/s][A
 76%|███████▌  | 1515/2000 [04:00<01:16,  6.33it/s][A
 76%|███████▌  | 1516/2000 [04:01<01:17,  6.25it/s][A
 76%|███████▌  | 1517/2000 [04:01<01:16,  6.29it/s][A
 76%|███████▌  | 1518/2000 [04:01<01:16,  6.32it/s][A
 76%|███████▌  | 1519/2000 [04:01<01:16,  6.31it/s][A
 76%|███████▌  | 1520/2000 [04:01<01:15,  6.32it/s][A
 76%|█████

 83%|████████▎ | 1651/2000 [04:22<00:55,  6.34it/s][A
 83%|████████▎ | 1652/2000 [04:22<00:54,  6.35it/s][A
 83%|████████▎ | 1653/2000 [04:22<00:54,  6.37it/s][A
 83%|████████▎ | 1654/2000 [04:22<00:54,  6.35it/s][A
 83%|████████▎ | 1655/2000 [04:22<00:54,  6.31it/s][A
 83%|████████▎ | 1656/2000 [04:23<00:54,  6.30it/s][A
 83%|████████▎ | 1657/2000 [04:23<00:54,  6.30it/s][A
 83%|████████▎ | 1658/2000 [04:23<00:54,  6.28it/s][A
 83%|████████▎ | 1659/2000 [04:23<00:54,  6.30it/s][A
 83%|████████▎ | 1660/2000 [04:23<00:53,  6.31it/s][A
 83%|████████▎ | 1661/2000 [04:23<00:53,  6.32it/s][A
 83%|████████▎ | 1662/2000 [04:24<00:53,  6.32it/s][A
 83%|████████▎ | 1663/2000 [04:24<00:53,  6.34it/s][A
 83%|████████▎ | 1664/2000 [04:24<00:53,  6.34it/s][A
 83%|████████▎ | 1665/2000 [04:24<00:52,  6.35it/s][A
 83%|████████▎ | 1666/2000 [04:24<00:52,  6.36it/s][A
 83%|████████▎ | 1667/2000 [04:24<00:52,  6.36it/s][A
 83%|████████▎ | 1668/2000 [04:24<00:52,  6.37it/s][A
 83%|█████

 90%|████████▉ | 1799/2000 [04:45<00:31,  6.36it/s][A
 90%|█████████ | 1800/2000 [04:45<00:31,  6.36it/s][A
 90%|█████████ | 1801/2000 [04:45<00:31,  6.35it/s][A
 90%|█████████ | 1802/2000 [04:46<00:31,  6.34it/s][A
 90%|█████████ | 1803/2000 [04:46<00:30,  6.36it/s][A
 90%|█████████ | 1804/2000 [04:46<00:30,  6.38it/s][A
 90%|█████████ | 1805/2000 [04:46<00:30,  6.39it/s][A
 90%|█████████ | 1806/2000 [04:46<00:30,  6.41it/s][A
 90%|█████████ | 1807/2000 [04:46<00:30,  6.40it/s][A
 90%|█████████ | 1808/2000 [04:46<00:29,  6.41it/s][A
 90%|█████████ | 1809/2000 [04:47<00:29,  6.40it/s][A
 90%|█████████ | 1810/2000 [04:47<00:29,  6.40it/s][A
 91%|█████████ | 1811/2000 [04:47<00:29,  6.40it/s][A
 91%|█████████ | 1812/2000 [04:47<00:29,  6.40it/s][A
 91%|█████████ | 1813/2000 [04:47<00:29,  6.41it/s][A
 91%|█████████ | 1814/2000 [04:47<00:29,  6.41it/s][A
 91%|█████████ | 1815/2000 [04:48<00:28,  6.39it/s][A
 91%|█████████ | 1816/2000 [04:48<00:28,  6.37it/s][A
 91%|█████

 97%|█████████▋| 1947/2000 [05:08<00:08,  6.35it/s][A
 97%|█████████▋| 1948/2000 [05:08<00:08,  6.36it/s][A
 97%|█████████▋| 1949/2000 [05:09<00:08,  6.34it/s][A
 98%|█████████▊| 1950/2000 [05:09<00:07,  6.36it/s][A
 98%|█████████▊| 1951/2000 [05:09<00:07,  6.35it/s][A
 98%|█████████▊| 1952/2000 [05:09<00:07,  6.36it/s][A
 98%|█████████▊| 1953/2000 [05:09<00:07,  6.36it/s][A
 98%|█████████▊| 1954/2000 [05:09<00:07,  6.37it/s][A
 98%|█████████▊| 1955/2000 [05:10<00:07,  6.37it/s][A
 98%|█████████▊| 1956/2000 [05:10<00:06,  6.35it/s][A
 98%|█████████▊| 1957/2000 [05:10<00:06,  6.36it/s][A
 98%|█████████▊| 1958/2000 [05:10<00:06,  6.36it/s][A
 98%|█████████▊| 1959/2000 [05:10<00:06,  6.37it/s][A
 98%|█████████▊| 1960/2000 [05:10<00:06,  6.38it/s][A
 98%|█████████▊| 1961/2000 [05:10<00:06,  6.39it/s][A
 98%|█████████▊| 1962/2000 [05:11<00:05,  6.39it/s][A
 98%|█████████▊| 1963/2000 [05:11<00:05,  6.39it/s][A
 98%|█████████▊| 1964/2000 [05:11<00:05,  6.39it/s][A
 98%|█████

29410    ESH
16142    YTA
4184     YTA
28267    YTA
7540     YTA
        ... 
9508     YTA
6646     NAH
29158    NAH
6394     YTA
32594    NAH
Name: id, Length: 2000, dtype: object

'YTA'