# Load data

In [1]:
import os
import pandas as pd

import pickle
from collections import Counter
from tqdm import tqdm
import numpy as np
import random

import seaborn as sns
from _00_text_utils import title_process, process_quoting, comment_process, selftext_process

import matplotlib.pyplot as plt
# Notebook-wide plotting defaults: seaborn "darkgrid" style with the "deep" palette.
sns.set_theme(style="darkgrid", palette="deep")

In [2]:
# Both input tables are tab-separated files under ./data, resolved against the
# current working directory.
_data_dir = f'{os.getcwd()}/data'
df = pd.read_csv(f'{_data_dir}/data_active_authorExt_withAbstValues.tsv', sep='\t')
df_ext = pd.read_csv(f'{_data_dir}/data_active_situationExt.tsv', sep='\t')

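Further process `df` and `df_ext`: drop rows whose title or selftext is missing or has been removed/deleted, or whose comment is missing, then clean the text columns.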

In [3]:
# Drop rows of `df` that cannot be used:
#   * title or selftext is missing, or is one of the removed/deleted placeholders;
#   * comment is missing.
# The mask is built column-wise, so each bad row is counted exactly once (the
# previous row loop appended a row twice when both title and selftext were bad)
# and non-string cells no longer break `.strip()`.
_placeholders = ['[removed]', '[deleted by user]', '[deleted]', '[removed by user]']

_invalid_mask = df['comment'].isna()
for _col in ('title', 'selftext'):
    # astype(str) keeps `.str` usable even if the column was parsed as non-object;
    # missing values are already caught by isna().
    _is_placeholder = df[_col].astype(str).str.strip().isin(_placeholders)
    _invalid_mask = _invalid_mask | df[_col].isna() | _is_placeholder

invalid_indices = df.index[_invalid_mask].tolist()
print(len(invalid_indices))
df = df.loc[~_invalid_mask].reset_index(drop=True)

0


In [4]:
# Drop rows of `df_ext` that cannot be used:
#   * title or selftext is missing, or is one of the removed/deleted placeholders;
#   * comment is missing.
# The mask is built column-wise, so each bad row is counted exactly once (the
# previous row loop appended a row twice when both title and selftext were bad)
# and non-string cells no longer break `.strip()`.
_placeholders = ['[removed]', '[deleted by user]', '[deleted]', '[removed by user]']

_invalid_mask = df_ext['comment'].isna()
for _col in ('title', 'selftext'):
    # astype(str) keeps `.str` usable even if the column was parsed as non-object;
    # missing values are already caught by isna().
    _is_placeholder = df_ext[_col].astype(str).str.strip().isin(_placeholders)
    _invalid_mask = _invalid_mask | df_ext[_col].isna() | _is_placeholder

invalid_indices = df_ext.index[_invalid_mask].tolist()
print(len(invalid_indices))
df_ext = df_ext.loc[~_invalid_mask].reset_index(drop=True)

1


In [5]:
# Run each text column of `df` through its cleaner from `_00_text_utils`.
# Order matters for `comment`: `comment_process` runs first, then
# `process_quoting` is applied to its result.
for _col, _cleaner in (
    ('title', title_process),
    ('selftext', selftext_process),
    ('comment', comment_process),
    ('comment', process_quoting),
):
    df[_col] = df[_col].apply(_cleaner)

In [6]:
# Run each text column of `df_ext` through its cleaner from `_00_text_utils`.
# Order matters for `comment`: `comment_process` runs first, then
# `process_quoting` is applied to its result.
for _col, _cleaner in (
    ('title', title_process),
    ('selftext', selftext_process),
    ('comment', comment_process),
    ('comment', process_quoting),
):
    df_ext[_col] = df_ext[_col].apply(_cleaner)

## Determine which authors to fine-tune for baseline

In [20]:
# List of (author, number of rows in `df`) pairs, highest count first.
author_cnt = Counter(df.author.tolist()).most_common()

In [21]:
# Report, for each threshold, how many authors have strictly more comments
# than that threshold.
# Each threshold is evaluated independently. The earlier single-pass loop
# advanced at most one threshold per author, so it over-counted whenever the
# counts fell past several thresholds at once, and it skipped thresholds that
# were never crossed before the list ended.
print(f"Num all authors: {len(author_cnt)}")
thr_list = [2000, 1000, 500, 200, 100]
for thr in thr_list:
    n_above = sum(cnt > thr for _, cnt in author_cnt)
    print(f"{n_above} authors commented more than {thr}")

Num all authors: 226
6 authors commented more than 2000
8 authors commented more than 1000
9 authors commented more than 500
46 authors commented more than 200
219 authors commented more than 100


In [26]:
# Number of distinct situations (unique subID values) in `df`, and the mean
# number of rows (comments) per situation.
_n_situations = len(set(df['subID'].tolist()))
_avg_comments = len(df) / _n_situations
print(f"Num situations: {_n_situations} | Num avg comments/situation: {_avg_comments:.2f}")

Num situations: 17432 | Num avg comments/situation: 3.06


In [13]:
# List of (comment author, number of rows in `df_ext`) pairs, highest count first.
# NOTE: this rebinds `author_cnt`, replacing the list built from `df`.
author_cnt = Counter(df_ext.commAuthor.tolist()).most_common()

In [19]:
# Report, for each threshold, how many authors have strictly more comments
# than that threshold.
# Each threshold is evaluated independently. The earlier single-pass loop
# advanced at most one threshold per author, so it over-counted whenever the
# counts fell past several thresholds at once, and it skipped thresholds that
# were never crossed before the list ended.
print(f"Num all authors: {len(author_cnt)}")
thr_list = [2000, 1000, 500, 200, 100]
for thr in thr_list:
    n_above = sum(cnt > thr for _, cnt in author_cnt)
    print(f"{n_above} authors commented more than {thr}")

Num all authors: 3858
8 authors commented more than 2000
41 authors commented more than 1000
224 authors commented more than 500
1475 authors commented more than 200
3826 authors commented more than 100


In [27]:
# Number of distinct situations (unique subID values) in `df_ext`, and the
# mean number of rows (comments) per situation.
_n_situations = len(set(df_ext['subID'].tolist()))
_avg_comments = len(df_ext) / _n_situations
print(f"Num situations: {_n_situations} | Num avg comments/situation: {_avg_comments:.2f}")

Num situations: 217262 | Num avg comments/situation: 4.10


**Most common topics using kmeans**

In [30]:
# Per-row cluster assignments (columns used below: `subID`, `cluster`).
# The default is a machine-specific absolute path; set the AITA_TOPIC_TSV
# environment variable to point at the file on another machine.
_topic_tsv = os.environ.get(
    "AITA_TOPIC_TSV",
    "/Users/yh/Documents/Research/AbstractiveSubjectivity/data/AITA/directComms_active_RoT_cluster.tsv",
)
df_topic = pd.read_csv(_topic_tsv, sep='\t')

In [35]:
# Topic label for each cluster ID. A label's position in the tuple is its
# cluster ID (0-11); some clusters intentionally share the same label.
_cluster_to_topic = dict(enumerate((
    'conflicts regarding child / family',                # 0
    'conflicts regarding parents / family',              # 1
    'conflicts regarding child / family',                # 2
    'wedding ceremony issues',                           # 3
    'conflicts regarding family',                        # 4
    'conflicts regarding siblings',                      # 5
    'conflicts regarding spouse',                        # 6
    'conflicts regarding siblings',                      # 7
    'home / place of residence / accommodation issues',  # 8
    'food / meal conflicts',                             # 9
    'money conflicts',                                   # 10
    'abortion / pregnancy / baby',                       # 11
)))

In [37]:
# Assign every situation (subID) the cluster of its first row in `df_topic`,
# then print each cluster's share of situations as a percentage.
# Counters are keyed off `_cluster_to_topic` rather than a hard-coded
# range(12), so the two cannot drift apart.
cluster_to_cnt = {cluster_id: 0 for cluster_id in _cluster_to_topic}
subID_to_cluster = {}
# Iterate the two columns directly: faster than iterrows() and avoids its
# per-row dtype coercion.
for sub_id, cluster in zip(df_topic['subID'], df_topic['cluster']):
    if sub_id not in subID_to_cluster:
        subID_to_cluster[sub_id] = cluster
        cluster_to_cnt[int(cluster)] += 1

n_situations = len(subID_to_cluster)
for cluster_id, n_in_cluster in cluster_to_cnt.items():
    print(f"[cluster {cluster_id}] '{_cluster_to_topic[cluster_id]}': {n_in_cluster/n_situations*100:.2f}")

[cluster 0] 'conflicts regarding child / family': 9.01
[cluster 1] 'conflicts regarding parents / family': 8.24
[cluster 2] 'conflicts regarding child / family': 8.14
[cluster 3] 'wedding ceremony issues': 7.50
[cluster 4] 'conflicts regarding family': 11.65
[cluster 5] 'conflicts regarding siblings': 5.74
[cluster 6] 'conflicts regarding spouse': 4.77
[cluster 7] 'conflicts regarding siblings': 6.99
[cluster 8] 'home / place of residence / accommodation issues': 13.79
[cluster 9] 'food / meal conflicts': 6.34
[cluster 10] 'money conflicts': 12.91
[cluster 11] 'abortion / pregnancy / baby': 4.92
