# Group-by-Group Analyses of Social Factors Influencing Entropy Values

### 1. Importing Data

In [None]:
import numpy as np
import pandas as pd
import os

# os.listdir() returns entries in arbitrary order, so sort the names first:
# the last entry is then always the lexicographically greatest file name,
# and the notebook loads the same file on every run and on every machine.
data_file = 'data/' + sorted(os.listdir('data/'))[-1]

df = pd.read_csv(data_file)

# Keep only comparisons between two different users, both of which are known.
# .copy() makes the filtered frame independent of the one read from disk, so the
# column assignment below does not hit pandas' SettingWithCopy behaviour.
known_users = df['x_user'].notna() & df['y_user'].notna()
distinct_users = df['x_user'] != df['y_user']
df = df.loc[known_users & distinct_users].copy()

# Drop very short items: only rows with n > 5 are analysed.
df['n'] = df['n'].astype(int)
df = df.loc[df['n'] > 5].copy()
# Optional stricter filter (disabled): keep only y users that also occur as x users.
# df = df.loc[df['y_user'].isin(df['x_user'].unique())]

We need to set up a few additional columns as well.

In [None]:
# Signed and absolute distance between the two comments' positions
#  (a rough estimate of time. Not used in the current models.)
df['comment_delta'] = df['x_comment_no'] - df['y_comment_no']
df['comment_delta_abs'] = df['comment_delta'].abs()

# Convert user ids to a numerical indicator.
# The mapping is keyed by the string form of each id, so the lookup uses the
# same string form; looking up the raw value raised KeyError for non-string ids.
x_user_str = df['x_user'].astype(str)
y_user_str = df['y_user'].astype(str)
all_users = np.unique(np.concatenate([x_user_str.to_numpy(), y_user_str.to_numpy()]))
user_dic = {x: i for i, x in enumerate(all_users)}
df['x_user'] = x_user_str.map(user_dic)
df['y_user'] = y_user_str.map(user_dic)

# Convert subreddit names to a numerical indicator
#   AND check if two comments are from the same subreddit.
# Ids follow order of first appearance in x_subreddit (as before); subreddits
# that only occur in y_subreddit are appended afterwards instead of raising KeyError.
all_subreddits = pd.concat([df['x_subreddit'], df['y_subreddit']]).unique()
subreddit_dic = {x: i for i, x in enumerate(all_subreddits)}
df['x_subreddit'] = df['x_subreddit'].map(subreddit_dic)
df['y_subreddit'] = df['y_subreddit'].map(subreddit_dic)
# Coded 2 when x and y share a subreddit, 1 otherwise.
df['same_subreddit'] = (df['x_subreddit'] == df['y_subreddit']).astype(int) + 1

# Check if x and y are from the same post.
df['same_post'] = (df['x_post_id'] == df['y_post_id'])

# Signed and absolute difference in time between x and y.
df['t_delta'] = df['xtime'] - df['ytime']
df['t_delta_abs'] = df['t_delta'].abs()

# Average entropy per token: total entropy H divided by n.
df['avgH'] = df['H'] / df['n']

In [None]:
# Number of rows left after filtering, followed by the number of
# comparisons for each (x_subreddit, y_subreddit) pair.
print(df.shape[0])
df.value_counts(subset=['x_subreddit', 'y_subreddit'], sort=False)

### 2. Analyses & Results

In [None]:
# import scipy.stats as stats
# import statsmodels.api as sm
import statsmodels.formula.api as smf
from mod.vis.latex_table import latex_table, format_num_string

import seaborn as sns
import matplotlib.pyplot as plt
# Use seaborn's 'darkgrid' style for the plots drawn below.
sns.set_style('darkgrid')

# Show the subreddit -> integer id mapping; the subsections below select subreddits by these ids.
print(subreddit_dic)

#### 2.1 r/MensLib

Let's kick it off with MensLib (because I'm interested in this group specifically)

Start by restricting our analyses to only those instances in which x and y are from r/MensLib (id=2).

In [None]:
# Keep only comparisons where both x and y come from subreddit id 2.
# (The analysis was also tried with all subreddits included, adding
#  a (2|same_subreddit) term to the model.)
data2 = df.loc[(df['x_subreddit'] == 2) & (df['y_subreddit'] == 2)]

In [None]:
# Average per-token entropy modelled on the full interaction of upvotes on both
# comments, whether they share a post, and the absolute time difference.
# NOTE(review): `(1|x_user)` / `(1|y_user)` is lme4-style syntax. statsmodels
# parses formulas with patsy, which does not document a random-effects operator,
# so these terms are probably NOT fitted as random intercepts (statsmodels takes
# random effects from `groups`, `re_formula` and `vc_formula`). Confirm against
# `mdf.summary()` before relying on the coefficients.
model = (
    "avgH ~ x_comment_ups*y_comment_ups*same_post*t_delta_abs"
    " + (1|x_user) + (1|y_user)"
)

# Observations are grouped by the `x` column of the data.
md = smf.mixedlm(formula=model, data=data2, groups=data2['x'])
mdf = md.fit()
# print(mdf.summary())

In [None]:
# Collect the fitted parameters with their test statistics and p-values,
# one row per model term.
reporting = pd.DataFrame({
    'coefs': mdf.params,
    'stat': mdf.tvalues,
    'p': mdf.pvalues,
})
reporting

In [None]:
# Format into a separate frame so `reporting` keeps its numeric values and this
# cell can be re-run safely (formatting in place turned the columns into strings,
# which made a second run fail inside np.format_float_scientific).
reporting_fmt = reporting[['coefs', 'stat', 'p']].apply(
    lambda col: col.map(
        lambda x: format_num_string(np.format_float_scientific(x, precision=3))
    )
)

# Term names with underscores replaced by hyphens
# (presumably because `_` is special in LaTeX; confirm against latex_table).
reporting_fmt.insert(0, 'var', [name.replace('_', '-') for name in reporting_fmt.index])

table = latex_table(reporting_fmt[['var', 'coefs', 'stat', 'p']], add_hline=True)

# NOTE(review): absolute, machine-specific output path; consider a path relative
# to the project so the notebook runs elsewhere.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('/Volumes/ROY/comp_ling/datasci/SIS/data/reddit_feminism/lme-tables/ml-table.txt', 'w') as f:
    f.write(table)

Creating a point plot (seaborn `catplot`) of average entropy against comment delta (time deltas are too noisy to plot directly).

In [None]:
# Comment pairs fewer than 16 positions apart, excluding a delta of 0.
# .copy() gives an independent frame, so relabelling below neither touches
# `data2` nor relies on chained assignment (which pandas may silently ignore).
data2_ = data2.loc[
    (data2['comment_delta'] > -16)
    & (data2['comment_delta'] < 16)
    & (data2['comment_delta'] != 0)
].copy()

# Reuse `y_subreddit` as the hue label: pairs from the same post vs. all others.
# Every row receives one of the two labels, so no numeric subreddit ids remain
# and the earlier "drop ids 0 and 1" filter before plotting is no longer needed.
data2_['y_subreddit'] = np.where(
    data2_['x_post_id'] == data2_['y_post_id'], 'same-post', 'baseline'
)

ax = sns.catplot(data=data2_, y='avgH', hue='y_subreddit', x='comment_delta', kind='point', capsize=.2, errorbar=None)
plt.show()

Confirming that the direction (sign) of the coefficients estimated by the LME model agrees with the covariances.

In [None]:
# Pairwise covariances between average entropy and the candidate predictors.
data2[['avgH', 'x_comment_ups', 'y_comment_ups', 't_delta_abs', 'same_post', 'same_subreddit']].cov()

#### 2.2 r/MensRights

alright, what about MensRights?

Start by restricting analyses to only those comparisons where x and y are both from r/MensRights (id=1).

In [None]:
# Keep only comparisons where both x and y come from subreddit id 1.
# (Also tried: y from ids 1 or 0, and all subreddits included with
#  a (2|same_subreddit) term added to the model.)
data1 = df.loc[(df['x_subreddit'] == 1) & (df['y_subreddit'] == 1)]

In [None]:
# Same model specification as for the other subreddit, fit on `data1`.
# NOTE(review): `(1|x_user)` / `(1|y_user)` is lme4-style syntax. statsmodels
# parses formulas with patsy, which does not document a random-effects operator,
# so these terms are probably NOT fitted as random intercepts (statsmodels takes
# random effects from `groups`, `re_formula` and `vc_formula`). Confirm against
# `mdf.summary()` before relying on the coefficients.
model = (
    "avgH ~ x_comment_ups*y_comment_ups*same_post*t_delta_abs"
    " + (1|x_user) + (1|y_user)"
)

# Observations are grouped by the `x` column of the data.
md = smf.mixedlm(formula=model, data=data1, groups=data1['x'])
mdf = md.fit()

In [None]:
# Collect the fitted parameters with their test statistics and p-values,
# one row per model term.
reporting = pd.DataFrame({
    'coefs': mdf.params,
    'stat': mdf.tvalues,
    'p': mdf.pvalues,
})
reporting

In [None]:
# Format into a separate frame so `reporting` keeps its numeric values and this
# cell can be re-run safely (formatting in place turned the columns into strings,
# which made a second run fail inside np.format_float_scientific).
reporting_fmt = reporting[['coefs', 'stat', 'p']].apply(
    lambda col: col.map(
        lambda x: format_num_string(np.format_float_scientific(x, precision=3))
    )
)

# Term names with underscores replaced by hyphens
# (presumably because `_` is special in LaTeX; confirm against latex_table).
reporting_fmt.insert(0, 'var', [name.replace('_', '-') for name in reporting_fmt.index])

table = latex_table(reporting_fmt[['var', 'coefs', 'stat', 'p']], add_hline=True)

# NOTE(review): absolute, machine-specific output path; consider a path relative
# to the project so the notebook runs elsewhere.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('/Volumes/ROY/comp_ling/datasci/SIS/data/reddit_feminism/lme-tables/mr-table.txt', 'w') as f:
    f.write(table)

And we'll create a point plot (seaborn `catplot`) of entropy values against comment_delta (again, time differences are too noisy for a readable plot).

In [None]:
# Comment pairs fewer than 16 positions apart, excluding a delta of 0.
# .copy() gives an independent frame, so relabelling below neither touches
# `data1` nor relies on chained assignment (which pandas may silently ignore).
data1_ = data1.loc[
    (data1['comment_delta'] > -16)
    & (data1['comment_delta'] < 16)
    & (data1['comment_delta'] != 0)
].copy()

# Reuse `y_subreddit` as the hue label: pairs from the same post vs. all others.
# Every row receives one of the two labels, so no numeric subreddit ids remain
# and the earlier "drop ids 0 and 2" filter before plotting is no longer needed.
data1_['y_subreddit'] = np.where(
    data1_['x_post_id'] == data1_['y_post_id'], 'same-post', 'baseline'
)

ax = sns.catplot(data=data1_, y='avgH', hue='y_subreddit', x='comment_delta', kind='point', capsize=.2, errorbar=None)
plt.show()

And check the covariance.

In [None]:
# Pairwise covariances between average entropy and the candidate predictors.
# NOTE(review): this uses the comment-window subset `data1_` and
# `comment_delta_abs`, whereas the model above was fit on `data1` with
# `t_delta_abs` -- confirm this difference is intended.
data1_[['avgH', 'x_comment_ups', 'y_comment_ups', 'comment_delta_abs', 'same_post', 'same_subreddit']].cov()

#### 2.3 Comparison of r/MensLib to r/Feminism and r/MensRights


In [None]:
from scipy.stats import ttest_ind as ttest

# x comments from subreddit id 2, y comments from ids 1 or 0, restricted to
# pairs written within the same day of one another (24*60*60, presumably seconds).
within_a_day = df['t_delta_abs'] <= (24*60*60)
data2 = df.loc[
    (df['x_subreddit'] == 2)
    & df['y_subreddit'].isin([1, 0])
    & within_a_day
]
print(data2['y_subreddit'].value_counts())

print(len(data2))

# Two-sample t-test on average entropy, split by the y comment's subreddit.
avgH_feminism = data2.loc[data2['y_subreddit'] == 0, 'avgH'].to_numpy()    # r/Feminism
avgH_mensrights = data2.loc[data2['y_subreddit'] == 1, 'avgH'].to_numpy()  # r/MensRights
ttest(avgH_feminism, avgH_mensrights)

In [None]:
# Comment pairs fewer than 16 positions apart, excluding a delta of 0.
# .copy() gives an independent frame, so relabelling below neither touches
# `data2` nor relies on chained assignment (which pandas may silently ignore).
data2_ = data2.loc[
    (data2['comment_delta'] > -16)
    & (data2['comment_delta'] < 16)
    & (data2['comment_delta'] != 0)
].copy()

# Swap the numeric subreddit ids for readable hue labels in a single step;
# any id not listed here is left unchanged.
data2_['y_subreddit'] = data2_['y_subreddit'].replace({1: 'r/MensRights', 0: 'r/Feminism'})

ax = sns.catplot(data=data2_, y='avgH', hue='y_subreddit', x='comment_delta', kind='point', capsize=.2, errorbar=None)
plt.show()

#### 2.4 Comparison r/Feminism to r/MensLib and r/MensRights


In [None]:
from scipy.stats import ttest_ind as ttest

# x comments from subreddit id 0, y comments from ids 1 or 2, restricted to
# pairs written within the same day of one another (24*60*60, presumably seconds).
within_a_day = df['t_delta_abs'] <= (24*60*60)
data0 = df.loc[
    (df['x_subreddit'] == 0)
    & df['y_subreddit'].isin([1, 2])
    & within_a_day
]
print(data0['y_subreddit'].value_counts())

print(len(data0))

# Two-sample t-test on average entropy, split by the y comment's subreddit.
avgH_menslib = data0.loc[data0['y_subreddit'] == 2, 'avgH'].to_numpy()     # r/MensLib
avgH_mensrights = data0.loc[data0['y_subreddit'] == 1, 'avgH'].to_numpy()  # r/MensRights
ttest(avgH_menslib, avgH_mensrights)

In [None]:
# Comment pairs fewer than 16 positions apart, excluding a delta of 0.
# .copy() gives an independent frame, so relabelling below neither touches
# `data0` nor relies on chained assignment (which pandas may silently ignore).
data0_ = data0.loc[
    (data0['comment_delta'] > -16)
    & (data0['comment_delta'] < 16)
    & (data0['comment_delta'] != 0)
].copy()

# Swap the numeric subreddit ids for readable hue labels in a single step;
# any id not listed here is left unchanged.
data0_['y_subreddit'] = data0_['y_subreddit'].replace({1: 'r/MensRights', 2: 'r/MensLib'})

ax = sns.catplot(data=data0_, y='avgH', hue='y_subreddit', x='comment_delta', kind='point', capsize=.2, errorbar=None)
plt.show()