# Renaming and cleaning data for LME analysis

Not every variable can or should be treated as a categorical variable. So the following will go through and rename variables as needed, as well as sorting some in which we assume there will be a linear, hierarchical effect. It will then save the cleaned data to an output file for LME analysis.

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from webscrapers.weaponizedword.api import weaponizedword, query_data
import regex as re
import os

In [2]:
PATH = 'data_aggregate'
TEXTS_DOC = 'islamophobia-texts.tsv'
ANALYSIS_DOC = 'Islamophobia.csv'

### Creating Regex and Searching through dredged up comments

We'll start by creating a query for our two main targets. We'll then use these queries to saerch the data.

In [None]:
targets = {
    'Antisemitism': [
        ('malignant_meaning', 'jew'), 
        ('malignant_meaning', 'Jew'),
        ('malignant_meaning', 'Judaism'),
    ],
    'Islamophobia': [
        ('malignant_meaning', 'Muslim'), 
        ('malignant_meaning', 'Islam'), 
    ]
}

In [None]:
LOAD = True

ww = weaponizedword()

if LOAD:
    ww.load_search()
else:
    ww.search(endpoint_name='get_discriminatory')
    ww.save_search()

In [None]:
query_ = dict()
for k,v in targets.items():
    query = []
    for field, search in v:
        query += [w.lower() for w in ww.create_query_from_results(field, search).replace('"', '').split(' OR ')]
    if k == 'Antisemitism':
        # query += ['soros', 'rothschild', 'jew']
        query_[k] = r'({})'.format('|'.join(set(['(?:^|\s|$){}(?:^|\s|$)'.format(q) for q in query if q.lower() not in ['khazars']]+['soros', 'rothschild', 'jew', 'khazar', 'israeli'])))
    if k == 'Islamophobia':
        # query += ['palestin', 'arab', 'muslim']
        query_[k] = r'({})'.format('|'.join(set(['(?:^|\s|$){}(?:^|\s|$)'.format(q) for q in query]+['palestin', 'arab', 'muslim', 'middle east'])))

In [None]:
# query_

In [None]:
def expert_dictionary_classifier(text):
    array = []
    for k,v in query_.items():
        array+= [[k, len(re.findall(v,text))]]
    array = np.array(array,dtype=object)
    
    if array[:,1].sum() > 0:
        return array[:,0][array[:,1].argmax()]
    else:
        return 'unknown target'

Let's apply this to data now so we can relabel our utterances in the analysis doc and then clean up the data.

In [None]:
dft = pd.read_table(
    os.path.join(PATH, TEXTS_DOC),
    sep='\t'
)
dft.head()

In [None]:
dft['antisemitism_islamophobia'] = dft['body'].apply(lambda x: expert_dictionary_classifier(str(x).lower()))

In [None]:
dft['antisemitism_islamophobia'].value_counts()

In [None]:
dft.to_csv(os.path.join(PATH, TEXTS_DOC), sep='\t', index=False, encoding='utf-8')

### Merging with data and cleaning up files

In [3]:
from datetime import datetime as dt
import warnings
import json

df = pd.read_csv(os.path.join(PATH, ANALYSIS_DOC))
df = df.loc[df['x_user'] != df['y_user']]
df = df.loc[~df['x_user'].isin([np.nan]) & ~df['y_user'].isin([np.nan])]
df['nx'] = df['nx'].astype(int)
df['ny'] = df['ny'].astype(int)
df = df.loc[df['nx'] >= 5]
df.head()

Unnamed: 0,x,y,H,x_subreddit,x_submission_id,x_comment_created_at,x_comment_ups,x_user,x_parent_id,x_probs,...,y_user,y_parent_id,y_probs,y_hate_target,x_about_Jewish_people,x_about_Muslim_people,y_about_Jewish_people,y_about_Muslim_people,nx,ny
0,i6k35f0,i6n4ud0,0.613424,fingmemes,udysna,1651167000.0,7,kkanekii_boi,t3_udysna,0.01426,...,sin1996,t3_udysna,0.07205,Islamophobia,0,0,0,0,5,25
1,i6k35f0,i6kmu34,0.4491,fingmemes,udysna,1651167000.0,7,kkanekii_boi,t3_udysna,0.01426,...,16-18-8-11-11-0-14-8-5-5,t3_udysna,0.000962,Islamophobia,0,0,0,0,5,16
2,i6k35f0,i6mm6om,0.304221,fingmemes,udysna,1651167000.0,7,kkanekii_boi,t3_udysna,0.01426,...,Finallyback_69,t3_udysna,0.100891,Islamophobia,0,0,0,0,5,5
3,i6k35f0,i6lwzhs,0.566974,fingmemes,udysna,1651167000.0,7,kkanekii_boi,t3_udysna,0.01426,...,akirasup3r,t3_udysna,0.00181,Islamophobia,0,0,0,0,5,23
4,i6k35f0,iamodu1,0.475101,fingmemes,udysna,1651167000.0,7,kkanekii_boi,t3_udysna,0.01426,...,mai_nahi_batuga,t3_udysna,0.244305,Islamophobia,0,0,0,0,5,12


In [4]:
df.dtypes

x                         object
y                         object
H                        float64
x_subreddit               object
x_submission_id           object
x_comment_created_at     float64
x_comment_ups              int64
x_user                    object
x_parent_id               object
x_probs                  float64
x_hate_target             object
y_subreddit               object
y_submission_id           object
y_comment_created_at     float64
y_comment_ups              int64
y_user                    object
y_parent_id               object
y_probs                  float64
y_hate_target             object
x_about_Jewish_people      int64
x_about_Muslim_people      int64
y_about_Jewish_people      int64
y_about_Muslim_people      int64
nx                         int64
ny                         int64
dtype: object

#### Adding in factors for if the comment mentions one of the target groups

In [None]:
# comment_indicator_dic = {cid:rating for cid, rating in dft[['comment_id', 'antisemitism_islamophobia']].values}
# 
# df['x_about_Jewish_people'] = 0
# df['x_about_Muslim_people'] = 0
# df['y_about_Jewish_people'] = 0
# df['y_about_Muslim_people'] = 0
# 
# warnings.filterwarnings('ignore')
# for i in tqdm(df.index):
#     x_res, y_res = comment_indicator_dic[df['x'].loc[i]], comment_indicator_dic[df['y'].loc[i]]
#     
#     if x_res =='Antisemitism':
#         df['x_about_Jewish_people'].loc[i] = 1
#     elif x_res == 'Islamophobia':
#         df['x_about_Muslim_people'].loc[i] = 1
#     
#     if y_res =='Antisemitism':
#         df['y_about_Jewish_people'].loc[i] = 1
#     elif y_res == 'Islamophobia':
#         df['y_about_Muslim_people'].loc[i] = 1
# 
# warnings.filterwarnings('default')

In [None]:
# df.to_csv(
#     os.path.join(PATH, ANALYSIS_DOC),
#     index=False, encoding='utf-8'
# )

#### Variable set-up. 

I performed this offline, but the script is below.

In [5]:
# sets up whether the comment x is the parent of the comment y
df['y_parent_id_'] = [pid.split('_')[-1] for pid in tqdm(df['y_parent_id'].values)]
df['is_parent'] = df['x'] == df['y_parent_id_']
del df['y_parent_id_']

df['is_sibling'] = df['x_parent_id'] == df['y_parent_id']

# Calculate absolute difference in time
df['t_delta'] = (df['y_comment_created_at']-df['x_comment_created_at'])
df['t_delta_abs'] = df['t_delta'].abs()


# sets up a categorical variable for the level of hatefulness exhibited by 
#   an x-comment
# hate_level = {0: '.00-.25', 1: '.25-.50', 2: '.50-.75', 3: '.75-1.0'}
# conds = [
#     (df['x_probs'] > .25).astype(int).values.reshape(1,-1),
#     (df['x_probs'] > .5).astype(int).values.reshape(1,-1),
#     (df['x_probs'] > .75).astype(int).values.reshape(1,-1),
# ]
# conds = np.sum(conds, axis=0)
# print(conds.shape)
# df['hate_level'] = conds.reshape(-1)
# df['hate_level'] = df['hate_level'].replace(hate_level)
# 
# 
# # binary variable for whether a comment is hate or not
# df['x_is_hate'] = (df['x_probs'] >= .8).values.astype(float)
# df['y_is_hate'] = (df['y_probs'] >= .8).values.astype(float)


# Calculate the average entropy for any token in an utterance.
df['avgH'] = df['H']/df['nx']

for relabel_col in ['x_user', 'y_user', 'x', 'x_submission_id', 'y_submission_id']:
    s = dt.now()
    
    # sorting the data by averageH
    # relabel = df[['avgH', relabel_col]].groupby(relabel_col).aggregate('mean').sort_values(by=['avgH']).index.values
    
    # no sorting (because random effects shouldn't need it)
    relabel = df[relabel_col].unique()
    relabel = np.random.choice(relabel, size=(len(relabel)), replace=False)
    
    dic = {xu: i for i, xu in enumerate(relabel)}
    print(relabel_col, "dictionary made", dt.now() - s)
    
    with open(os.path.join(PATH, relabel_col+'-'+ANALYSIS_DOC.replace('.csv', '.json')), 'w') as f:
        jso = json.dumps(dic, indent=4)
        f.write(jso)
    f.close()
    
    df[relabel_col] = [dic[v] for v in tqdm(df[relabel_col].values)]
    print(relabel_col, "calc'ed", dt.now() - s)

100%|██████████| 2596277/2596277 [00:00<00:00, 2621824.50it/s]


x_user dictionary made 0:00:00.169758


100%|██████████| 2596277/2596277 [00:00<00:00, 4491802.25it/s]


x_user calc'ed 0:00:01.335162
y_user dictionary made 0:00:00.168143


100%|██████████| 2596277/2596277 [00:00<00:00, 4068647.00it/s]


y_user calc'ed 0:00:01.431768
x dictionary made 0:00:00.256095


100%|██████████| 2596277/2596277 [00:00<00:00, 3937259.90it/s]


x calc'ed 0:00:01.657535
x_submission_id dictionary made 0:00:00.148081


100%|██████████| 2596277/2596277 [00:00<00:00, 4641089.96it/s]


x_submission_id calc'ed 0:00:01.279206
y_submission_id dictionary made 0:00:00.141095


100%|██████████| 2596277/2596277 [00:00<00:00, 4517118.23it/s]


y_submission_id calc'ed 0:00:01.269714


In [6]:
df.head()

Unnamed: 0,x,y,H,x_subreddit,x_submission_id,x_comment_created_at,x_comment_ups,x_user,x_parent_id,x_probs,...,x_about_Muslim_people,y_about_Jewish_people,y_about_Muslim_people,nx,ny,is_parent,is_sibling,t_delta,t_delta_abs,avgH
0,13533,i6n4ud0,0.613424,fingmemes,240,1651167000.0,7,9934,t3_udysna,0.01426,...,0,0,0,5,25,False,True,53712.0,53712.0,0.122685
1,13533,i6kmu34,0.4491,fingmemes,240,1651167000.0,7,9934,t3_udysna,0.01426,...,0,0,0,5,16,False,True,7869.0,7869.0,0.08982
2,13533,i6mm6om,0.304221,fingmemes,240,1651167000.0,7,9934,t3_udysna,0.01426,...,0,0,0,5,5,False,True,39747.0,39747.0,0.060844
3,13533,i6lwzhs,0.566974,fingmemes,240,1651167000.0,7,9934,t3_udysna,0.01426,...,0,0,0,5,23,False,True,27780.0,27780.0,0.113395
4,13533,iamodu1,0.475101,fingmemes,240,1651167000.0,7,9934,t3_udysna,0.01426,...,0,0,0,5,12,False,True,2806914.0,2806914.0,0.09502


Setting up comment_delta variable. Note: this is only useful for visualization. Otherwise, we prefer to use the time_delta variable.

In [7]:
import warnings; warnings.filterwarnings('ignore')
add_new_comment_delta = True

label_by = [
    'treat reply different from same thread', 
    'treat reply same as same thread', 
    'just ordered', 
    'comparisons must be within same parent or a reply',
][-1]

# establishing comment_delta per each x . . . 
df['is_parent'] = df['is_parent'].replace({True: 0, False: 1})
print(df.is_parent.unique())

if add_new_comment_delta:
    df['comment_delta'] = 0
    df['same_thread'] = False
    for xid in tqdm(df['x'].unique()):
        sub = df.loc[df['x'].isin([xid])]
        same_post_sel = sub['x_submission_id'] == sub['y_submission_id']
        baseline_sel = sub['x_submission_id'] != sub['y_submission_id']
        pre_sel = sub['y_comment_created_at'] < sub['x_comment_created_at'].values[0]
        post_sel = sub['y_comment_created_at'] > sub['x_comment_created_at'].values[0]
        
        if label_by == 'treat reply different from same thread':
            ### Assuming separate time dynamics for reply vs. normal
            is_parent = sub.loc[(sub['is_parent']==0)].sort_values(by=['y_comment_created_at']).index.values
            df['comment_delta'].loc[is_parent] = range(len(is_parent))
            
            not_parent = sub.loc[(sub['is_parent']==1)].sort_values(by=['y_comment_created_at']).index.values
            df['comment_delta'].loc[not_parent] = range(len(not_parent))
        
        if label_by == 'treat reply same as same thread':
            ## Assuming interaction of reply vs. normal
            # ct = 0
            # for idx in sub[same_post_sel & pre_sel].sort_values(by=['y_comment_created_at'], ascending=False).index:
            #     ct-=1
            #     df['comment_delta_'].loc[idx] = ct
        
            ct = 0
            for idx in sub[same_post_sel & post_sel].sort_values(by=['is_parent','y_comment_created_at'], ascending=True).index:
                ct+=1
                df['comment_delta'].loc[idx] = ct
        
        if label_by == 'just ordered':
            values = np.concatenate([sub[['x', 'x_comment_created_at']].values,sub[['y', 'y_comment_created_at']].values], axis=0)
            indexes = np.unique(values[:,0].astype(str), return_index=True)[1]
            # print(indexes)
            values = values[indexes]
            values = values[:,0][values[:,0].astype(str).argsort()]
            t = {v: i+1 for i,v in enumerate(values)}
            
            xs = np.array([t[v] for v in sub['x'].values])
            ys = np.array([t[v] for v in sub['y'].values])
            
            df['comment_delta'].loc[sub.index] = ys - xs 
        
        if label_by == 'comparisons must be within same parent or a reply':
            ### Assuming separate time dynamics for reply vs. normal
            is_parent = sub.loc[(sub['is_parent']==0)].sort_values(by=['y_comment_created_at']).index.values
            df['comment_delta'].loc[is_parent] = range(1,len(is_parent)+1)
            df['same_thread'].loc[is_parent] = True
            
            not_parent = sub.loc[ (sub['x_parent_id']==sub['y_parent_id'])].sort_values(by=['y_comment_created_at']).index.values
            df['comment_delta'].loc[not_parent] = range(1,len(not_parent)+1)
            df['same_thread'].loc[not_parent] = True
            
            not_same_thread = sub.loc[(sub['is_parent']==1) & (sub['x_parent_id']!=sub['y_parent_id'])].sort_values(by=['y_comment_created_at']).index.values
            df['comment_delta'].loc[not_same_thread] = range(1,len(not_same_thread)+1)
            
    
    df['comment_delta_abs'] = df['comment_delta'].abs()
    
else:
    k = pd.read_csv(os.path.join(PATH, ANALYSIS_DOC.replace('.csv', '-cleaned.csv')), usecols=['comment_delta', 'comment_delta_abs'])
    df = pd.concat([df,k], axis=1)
    del k

[1 0]


100%|██████████| 29914/29914 [04:09<00:00, 119.71it/s]


In [8]:
df['comment_delta'].value_counts()

1      58719
2      45236
3      40894
4      38561
5      37028
       ...  
449        1
450        1
451        1
447        1
452        1
Name: comment_delta, Length: 453, dtype: int64

In [9]:
df['is_parent'] = df['is_parent'].replace({0: True, 1: False})
df['is_parent'].value_counts()

False    2577120
True       19157
Name: is_parent, dtype: int64

In [10]:
df.head()

Unnamed: 0,x,y,H,x_subreddit,x_submission_id,x_comment_created_at,x_comment_ups,x_user,x_parent_id,x_probs,...,nx,ny,is_parent,is_sibling,t_delta,t_delta_abs,avgH,comment_delta,same_thread,comment_delta_abs
0,13533,i6n4ud0,0.613424,fingmemes,240,1651167000.0,7,9934,t3_udysna,0.01426,...,5,25,False,True,53712.0,53712.0,0.122685,4,True,4
1,13533,i6kmu34,0.4491,fingmemes,240,1651167000.0,7,9934,t3_udysna,0.01426,...,5,16,False,True,7869.0,7869.0,0.08982,1,True,1
2,13533,i6mm6om,0.304221,fingmemes,240,1651167000.0,7,9934,t3_udysna,0.01426,...,5,5,False,True,39747.0,39747.0,0.060844,3,True,3
3,13533,i6lwzhs,0.566974,fingmemes,240,1651167000.0,7,9934,t3_udysna,0.01426,...,5,23,False,True,27780.0,27780.0,0.113395,2,True,2
4,13533,iamodu1,0.475101,fingmemes,240,1651167000.0,7,9934,t3_udysna,0.01426,...,5,12,False,True,2806914.0,2806914.0,0.09502,5,True,5


In [11]:
(df['x_parent_id'] == df['y_parent_id']).sum()

381188

Finish up by saving the document to file.

In [12]:
df.to_csv(
    os.path.join(PATH, ANALYSIS_DOC.replace('.csv', '-cleaned.csv')), 
    index=False, 
    encoding='utf-8'
)