# Analyzing linguistic adaptation in terms of entropy

In [1]:
import os
import re

DATA_PATH = 'data/results'
DATA_FILE = 'ceda-results.csv'

REPORTING_PATH = 'data/reports'
REPORT_NAME = os.path.join(REPORTING_PATH, 'report-{}.csv')
MODEL_PERFORMANCE_NAME = os.path.join(REPORTING_PATH, 'model-comparison.csv')

lollipop_vis_name = 'lollipop.png'

## Main Analyses and Results

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import statsmodels.formula.api as smf
from datetime import datetime as dt

df = pd.read_csv(os.path.join(DATA_PATH, DATA_FILE))
# df = df.loc[
#     (df['nx'] >= 5)
#     & (df['ny'] >= 5)
#     # & (df['comment_delta_abs'] <= 20)
# ] # limit by comment size
df.shape

(12174954, 31)

In [3]:
unix_time = True

if not unix_time:
    contexts = pd.concat([
        df[['x_context_id', 'x_context_time']].drop_duplicates(),
        df[['y_context_id', 'y_context_time']].drop_duplicates().copy().rename(columns={'y_context_id': 'x_context_id', 'y_context_time': 'x_context_time'}),
    ], ignore_index=True).drop_duplicates().sort_values(by='x_context_time').values
    
    convert_context_times = {context: i+1 for i, context in enumerate(contexts[:,0])}
    
    df['x_context_time'] = [convert_context_times[context] for context in tqdm(df['x_context_id'].values)]
    df['y_context_time'] = [convert_context_times[context] for context in tqdm(df['y_context_id'].values)]
    

# time difference in unix time
df['time_delta'] = df['x_context_time'] - df['y_context_time']

In [4]:
df['tag_in_context'] = [df['x_tag'].loc[i] in df['y_tag'].loc[i] for i in tqdm(df.index)]

100%|██████████| 12174954/12174954 [04:04<00:00, 49821.21it/s]


In [5]:
# df['cc_is_parent'].value_counts()

In [6]:
# df['cc_is_child'].value_counts()

In [7]:
# df['cc_is_sibling'].value_counts()

In [8]:
df.head()

Unnamed: 0,x_submission_id,x_submission_created_at,x_comment_created_at,x_comment_id,x_user,x_tag,x_line_no,y_submission_id,y_submission_created_at,y_comment_created_at,...,y_context_id,same_context,cc_is_child,cc_is_sibling,cc_is_parent,x_context_time,y_context_time,same_author,time_delta,tag_in_context
0,58,1709004000.0,1709073000.0,2087,3967,pro_life,9,170,1457137000.0,1457137000.0,...,w596h,False,False,False,False,1709004000.0,1341606000.0,False,367398347.0,True
1,58,1709004000.0,1709073000.0,2087,3967,pro_life,9,170,1457137000.0,1457137000.0,...,w596h,False,False,False,False,1709004000.0,1341606000.0,False,367398347.0,True
2,40,1726492000.0,1726493000.0,498,1308,pro_life,16,170,1457137000.0,1457137000.0,...,w596h,False,False,False,False,1726492000.0,1341606000.0,False,384885844.0,True
3,40,1726492000.0,1726493000.0,498,1308,pro_life,16,170,1457137000.0,1457137000.0,...,w596h,False,False,False,False,1726492000.0,1341606000.0,False,384885844.0,True
4,40,1726492000.0,1726496000.0,2287,1604,forced_birth|pro_life,17,170,1457137000.0,1457137000.0,...,w596h,False,False,False,False,1726493000.0,1341606000.0,False,384886739.0,False


### Model 1: Linguistic Adaptation as Linear Change Over Time

In [9]:
##########################################
## Main model
##########################################
# model = "Hxy ~ nx + ny + time_delta + x_comment_ups + y_comment_ups +  cc_is_parent + cc_is_sibling + cc_is_child + (1|x_user) + (1|y_user) + (1|y_submission_id)"
model = "Hxy ~ nx + ny + tag_in_context + time_delta + x_comment_ups + y_comment_ups + (1|x_user) + (1|y_user) + (1|y_submission_id)"

##########################################

start = dt.now()
md = smf.mixedlm(model, data=df, groups=df['x_comment_id'])
mdf = md.fit()
print('completed in:', dt.now()-start)

completed in: 0:02:48.292115


Reporting on the model outputs in a dataframe

In [10]:
reporting = pd.DataFrame()
reporting['coefs'] = mdf.params
reporting['stat'] = mdf.tvalues
reporting['p'] = mdf.pvalues
reporting['CI[.025, .975]'] = ['[{}]'.format(', '.join([np.format_float_scientific(x, precision=2) for x in ci.tolist()])) for ci in mdf.conf_int().values]

reporting['coefs'] = reporting['coefs'].apply(lambda x: np.format_float_scientific(x, precision=2))
reporting['stat'] = reporting['stat'].apply(lambda x: np.format_float_scientific(x, precision=2))
reporting['p'] = reporting['p'].apply(lambda x: np.format_float_scientific(x, precision=2))

reporting.head(100)

Unnamed: 0,coefs,stat,p,"CI[.025, .975]"
Intercept,0.207,5.11,3.17e-07,"[1.28e-01, 2.87e-01]"
tag_in_context[T.True],-0.118,-100.0,0.0,"[-1.20e-01, -1.15e-01]"
nx,0.0825,304.0,0.0,"[8.2e-02, 8.31e-02]"
ny,-0.00809,-1510.0,0.0,"[-8.10e-03, -8.08e-03]"
time_delta,-1.54e-10,-37.6,6.07e-309,"[-1.62e-10, -1.46e-10]"
x_comment_ups,9.91e-05,0.688,0.491,"[-1.83e-04, 3.81e-04]"
y_comment_ups,0.00044,235.0,0.0,"[4.36e-04, 4.43e-04]"
1 | x_user,1.05e-08,0.0241,0.981,"[-8.48e-07, 8.69e-07]"
1 | y_user,5.73e-07,2.2,0.028,"[6.19e-08, 1.08e-06]"
1 | y_submission_id,0.00159,319.0,0.0,"[1.58e-03, 1.59e-03]"


In [11]:
model_version = 'linear-time-difference'
REPORT_NAME_ = REPORT_NAME.format(model_version)

reporting.to_csv(REPORT_NAME_, encoding='utf-8')

reporting['Var'] = reporting.index.values
with open(REPORT_NAME_.replace('.csv', '.txt'), 'w') as f:
    txt =  reporting[['Var', 'coefs', 'stat', 'p']].loc[:reporting.index[-2]].to_latex(index=False).replace('\\toprule', '\\hline').replace('\\midrule', '\\hline\\hline').replace('\\bottomrule', '\\hline')
    f.write(txt)
    f.close()

  txt =  reporting[['Var', 'coefs', 'stat', 'p']].loc[:reporting.index[-2]].to_latex(index=False).replace('\\toprule', '\\hline').replace('\\midrule', '\\hline\\hline').replace('\\bottomrule', '\\hline')


saving model performance metrics

In [12]:
llf = mdf.llf

if not os.path.exists(MODEL_PERFORMANCE_NAME):
    dfo = pd.DataFrame([{
        'model': model_version, 
        'LLF': llf, 
        'params': len(reporting), 
        'n': df.shape[0],
        'BIC': (-2 * llf) + (len(reporting) * np.log(df.shape[0]))
    }])
    dfo.to_csv(MODEL_PERFORMANCE_NAME, index=False, encoding='utf-8')
else:
    dfo = pd.DataFrame([{
        'model': model_version, 
        'LLF': llf, 
        'params': len(reporting), 
        'n': df.shape[0],
        'BIC': (-2 * llf) + (len(reporting) * np.log(df.shape[0]))
    }])
    dfo.to_csv(MODEL_PERFORMANCE_NAME, index=False, header=False, encoding='utf-8', mode='a')

### Model 2: Linguistic Adaptation as Temporally Local

In [13]:
df['time_delta_'] = df['time_delta'].abs()
##########################################
## Main model
##########################################
# model = "Hxy ~ nx + ny + time_delta + x_comment_ups + y_comment_ups +  cc_is_parent + cc_is_sibling + cc_is_child + (1|x_user) + (1|y_user) + (1|y_submission_id)"
model = "Hxy ~ nx + ny + tag_in_context + time_delta_ + x_comment_ups + y_comment_ups + (1|x_user) + (1|y_user) + (1|y_submission_id)"

##########################################

start = dt.now()
md = smf.mixedlm(model, data=df, groups=df['x_comment_id'])
mdf = md.fit()
print('completed in:', dt.now()-start)

completed in: 0:02:42.485432


Reporting on the model outputs in a dataframe

In [14]:
reporting = pd.DataFrame()
reporting['coefs'] = mdf.params
reporting['stat'] = mdf.tvalues
reporting['p'] = mdf.pvalues
reporting['CI[.025, .975]'] = ['[{}]'.format(', '.join([np.format_float_scientific(x, precision=2) for x in ci.tolist()])) for ci in mdf.conf_int().values]

reporting['coefs'] = reporting['coefs'].apply(lambda x: np.format_float_scientific(x, precision=2))
reporting['stat'] = reporting['stat'].apply(lambda x: np.format_float_scientific(x, precision=2))
reporting['p'] = reporting['p'].apply(lambda x: np.format_float_scientific(x, precision=2))

reporting.head(100)

Unnamed: 0,coefs,stat,p,"CI[.025, .975]"
Intercept,0.0949,2.33,0.0198,"[1.51e-02, 1.75e-01]"
tag_in_context[T.True],-0.1,-86.4,0.0,"[-1.02e-01, -9.77e-02]"
nx,0.0825,302.0,0.0,"[8.2e-02, 8.30e-02]"
ny,-0.00819,-1560.0,0.0,"[-8.20e-03, -8.18e-03]"
time_delta_,1.13e-09,231.0,0.0,"[1.12e-09, 1.14e-09]"
x_comment_ups,0.000128,0.884,0.377,"[-1.56e-04, 4.12e-04]"
y_comment_ups,0.000381,210.0,0.0,"[3.78e-04, 3.85e-04]"
1 | x_user,8.29e-09,0.019,0.985,"[-8.49e-07, 8.65e-07]"
1 | y_user,2.45e-07,0.941,0.347,"[-2.65e-07, 7.54e-07]"
1 | y_submission_id,0.00136,270.0,0.0,"[1.35e-03, 1.37e-03]"


In [15]:
model_version = 'locally-bound-time-difference'
REPORT_NAME_ = REPORT_NAME.format(model_version)

reporting.to_csv(REPORT_NAME_, encoding='utf-8')

reporting['Var'] = reporting.index.values
with open(REPORT_NAME_.replace('.csv', '.txt'), 'w') as f:
    txt =  reporting[['Var', 'coefs', 'stat', 'p']].loc[:reporting.index[-2]].to_latex(index=False).replace('\\toprule', '\\hline').replace('\\midrule', '\\hline\\hline').replace('\\bottomrule', '\\hline')
    f.write(txt)
    f.close()

  txt =  reporting[['Var', 'coefs', 'stat', 'p']].loc[:reporting.index[-2]].to_latex(index=False).replace('\\toprule', '\\hline').replace('\\midrule', '\\hline\\hline').replace('\\bottomrule', '\\hline')


saving model performance metrics

In [16]:
llf = mdf.llf

if not os.path.exists(MODEL_PERFORMANCE_NAME):
    dfo = pd.DataFrame([{
        'model': model_version, 
        'LLF': llf, 
        'params': len(reporting), 
        'n': df.shape[0],
        'BIC': (-2 * llf) + (len(reporting) * np.log(df.shape[0]))
    }])
    dfo.to_csv(MODEL_PERFORMANCE_NAME, index=False, encoding='utf-8')
else:
    dfo = pd.DataFrame([{
        'model': model_version, 
        'LLF': llf, 
        'params': len(reporting), 
        'n': df.shape[0],
        'BIC': (-2 * llf) + (len(reporting) * np.log(df.shape[0]))
    }])
    dfo.to_csv(MODEL_PERFORMANCE_NAME, index=False, header=False, encoding='utf-8', mode='a')

### Model 3: Linguistic Adaptation as Convergence Behavior Only

In [17]:
##########################################
## Main model
##########################################
model = "Hxy ~ nx + ny + tag_in_context + same_context + x_comment_ups + y_comment_ups + (1|x_user) + (1|y_user) + (1|y_submission_id)"

##########################################

start = dt.now()
md = smf.mixedlm(model, data=df, groups=df['x_comment_id'])
mdf = md.fit()
print('completed in:', dt.now()-start)

completed in: 0:02:44.301646


Reporting on the model outputs in a dataframe

In [18]:
reporting = pd.DataFrame()
reporting['coefs'] = mdf.params
reporting['stat'] = mdf.tvalues
reporting['p'] = mdf.pvalues
reporting['CI[.025, .975]'] = ['[{}]'.format(', '.join([np.format_float_scientific(x, precision=2) for x in ci.tolist()])) for ci in mdf.conf_int().values]

reporting['coefs'] = reporting['coefs'].apply(lambda x: np.format_float_scientific(x, precision=2))
reporting['stat'] = reporting['stat'].apply(lambda x: np.format_float_scientific(x, precision=2))
reporting['p'] = reporting['p'].apply(lambda x: np.format_float_scientific(x, precision=2))

reporting.head(100)

Unnamed: 0,coefs,stat,p,"CI[.025, .975]"
Intercept,0.206,5.09,3.67e-07,"[1.27e-01, 2.86e-01]"
tag_in_context[T.True],-0.109,-94.2,0.0,"[-1.11e-01, -1.07e-01]"
same_context[T.True],-0.873,-204.0,0.0,"[-8.81e-01, -8.64e-01]"
nx,0.0825,303.0,0.0,"[8.2e-02, 8.30e-02]"
ny,-0.00812,-1550.0,0.0,"[-8.13e-03, -8.11e-03]"
x_comment_ups,0.000129,0.895,0.371,"[-1.54e-04, 4.12e-04]"
y_comment_ups,0.000469,258.0,0.0,"[4.66e-04, 4.73e-04]"
1 | x_user,1.07e-08,0.0245,0.98,"[-8.47e-07, 8.68e-07]"
1 | y_user,3.35e-07,1.29,0.198,"[-1.75e-07, 8.44e-07]"
1 | y_submission_id,0.00163,330.0,0.0,"[1.62e-03, 1.64e-03]"


In [19]:
model_version = 'convergence-only-difference'
REPORT_NAME_ = REPORT_NAME.format(model_version)

reporting.to_csv(REPORT_NAME_, encoding='utf-8')

reporting['Var'] = reporting.index.values
with open(REPORT_NAME_.replace('.csv', '.txt'), 'w') as f:
    txt =  reporting[['Var', 'coefs', 'stat', 'p']].loc[:reporting.index[-2]].to_latex(index=False).replace('\\toprule', '\\hline').replace('\\midrule', '\\hline\\hline').replace('\\bottomrule', '\\hline')
    f.write(txt)
    f.close()

  txt =  reporting[['Var', 'coefs', 'stat', 'p']].loc[:reporting.index[-2]].to_latex(index=False).replace('\\toprule', '\\hline').replace('\\midrule', '\\hline\\hline').replace('\\bottomrule', '\\hline')


saving model performance metrics

In [20]:
llf = mdf.llf

if not os.path.exists(MODEL_PERFORMANCE_NAME):
    dfo = pd.DataFrame([{
        'model': model_version, 
        'LLF': llf, 
        'params': len(reporting), 
        'n': df.shape[0],
        'BIC': (-2 * llf) + (len(reporting) * np.log(df.shape[0]))
    }])
    dfo.to_csv(MODEL_PERFORMANCE_NAME, index=False, encoding='utf-8')
else:
    dfo = pd.DataFrame([{
        'model': model_version, 
        'LLF': llf, 
        'params': len(reporting), 
        'n': df.shape[0],
        'BIC': (-2 * llf) + (len(reporting) * np.log(df.shape[0]))
    }])
    dfo.to_csv(MODEL_PERFORMANCE_NAME, index=False, header=False, encoding='utf-8', mode='a')

## Additional Visualizations/Analyses

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import kruskal

### Lolipop effects plot

In [None]:
import pandas as pd
import numpy as np

# reporting = pd.read_csv('data/reports/antisemitism/report.csv')
reporting = pd.read_csv(os.path.join(REPORTING_PATH,REPORT_NAME))
reporting.index = reporting['Unnamed: 0'].values

In [None]:
def lollipop_chart(df, label_col, length_col, save_path=None, aspect=1/15, plot_title='Predicted change in H'):
    sns.set_style('darkgrid')
    plt.hlines(y=df.index, xmin=0, xmax=df[length_col].values)
    plt.plot(df[length_col].values, df.index, 'o')
    plt.yticks(df.index, df[label_col].values, rotation=.45, fontsize='small')
    plt.axvline(color='maroon')

    xlim_delta = df[length_col].__abs__().max() + .1
    plt.xlim(-xlim_delta, xlim_delta)
    plt.gca().set_aspect(aspect)
    plt.tight_layout()
    plt.xlabel(plot_title)
    if save_path:
        plt.savefig(save_path)
    plt.show()

In [None]:
import plotly.graph_objs as go

marker_offset = 0.0004

def offset_signal(signal, marker_offset):
    if abs(signal) <= marker_offset:
        return 0
    return signal - marker_offset if signal > 0 else signal + marker_offset

def plotly_lollipop(df, label_col, length_col, save_path=None, plot_title='', color='blue', marker_size=2):
    points = df[length_col].to_list()
    heights = list(range(len(df)))
    
    data = [
    go.Scatter(
            x=points,
            y=heights,
            mode='markers',
            marker=dict(
                color=color,
                size=marker_size
            )
        )
    ]

    layout = go.Layout(
    shapes=[dict(
            type='line',
            xref='x',
            yref='y',
            y0=i,
            x0=0,
            y1=i,
            x1=offset_signal(points[i], marker_offset),
            line=dict(
                color=color,
                width=1.5
            )
        ) for i in range(len(points))],
    )

    fig = go.Figure(data, layout)

    for idx in range(len(fig.data)):
        fig.data[idx].y = df_param['cond'].to_list()

    fig.add_vline(x=0, line_width=3, line_color="maroon")
    
    return fig

In [None]:
df_param = [
    # x HS
    ['HS', (reporting['coefs'].loc[['x_probs']]).sum()],
    
    # x AHS
    ['AHS', (
        reporting['coefs'].loc[['x_probs', 'x_target','x_probs:x_target']] #* (reporting['p'].loc[['x_probs', 'x_target','x_probs:x_target']] < .01)
    ).sum()],
    
    # x AHS post-october 7th
    ['AHS after Oct. 7, 2023', (
        reporting['coefs'].loc[['x_probs', 'x_target', 'x_probs:x_target','after_october_7', 'after_october_7:x_target', 'after_october_7:x_probs', 'after_october_7:x_probs:x_target']] #* (reporting['p'].loc[['x_probs', 'x_target', 'x_probs:x_target','after_october_7', 'after_october_7:x_target', 'after_october_7:x_probs', 'after_october_7:x_probs:x_target']] < .01)
    ).sum()],
    
    # # Y HS
    # ['Y HS', (reporting['coefs'].loc[['y_probs']]).sum()],
    # 
    # # Y AHS
    # ['Y AHS', (
    #     reporting['coefs'].loc[[ 'y_probs', 'y_target', 'y_probs:y_target']] #* (reporting['p'].loc[[ 'y_probs', 'y_target', 'y_probs:y_target']] < .01)
    # ).sum()],
    # 
    # # Y AHS post-october 7th
    # ['Y AHS after Oct. 7, 2023', (
    #     reporting['coefs'].loc[['y_probs', 'y_target', 'y_probs:y_target','after_october_7', 'after_october_7:y_target', 'after_october_7:y_probs','after_october_7:y_probs:y_target',]] #* (reporting['p'].loc[['y_probs', 'y_target', 'y_probs:y_target','after_october_7', 'after_october_7:y_target', 'after_october_7:y_probs','after_october_7:y_probs:y_target',]] <.01)
    # ).sum()], 
][::-1]

df_param = pd.DataFrame(
    np.array(df_param, dtype=object),
    columns=['cond', '$Delta$ H']
)

In [None]:
# lollipop_chart(
#     df=df_param,
#     label_col='cond',
#     length_col='$Delta$ H',
#     save_path=lollipop_vis_name,
#     aspect=1/3,
#     plot_title=''
# )

In [None]:
fig = plotly_lollipop(
    df=df_param,
    label_col='cond',
    length_col='$Delta$ H',
    marker_size=10
)

fig.show()

In [None]:
fig.write_html('ahs-parent.html')