# Analyzing linguistic adaptation in terms of entropy

In [None]:
import os
import re

# Input CSV read by the analysis (joined as DATA_PATH/DATA_FILE when loading).
DATA_PATH = 'data/results'
DATA_FILE = 'ceda-results.csv'

# Output locations. REPORT_NAME is a template: its `{}` placeholder is filled
# with a model-version label via REPORT_NAME.format(...) before writing.
REPORTING_PATH = 'data/reports'
REPORT_NAME = os.path.join(REPORTING_PATH, 'report-{}.csv')
MODEL_PERFORMANCE_NAME = os.path.join(REPORTING_PATH, 'model-comparison.csv')

# The report-writing cells write straight into REPORTING_PATH and would fail
# on a missing directory, so make sure it exists up front.
os.makedirs(REPORTING_PATH, exist_ok=True)

# File name intended as the save_path of the matplotlib lollipop figure.
lollipop_vis_name = 'lollipop.png'

## Main Analyses and Results

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import statsmodels.formula.api as smf
from datetime import datetime as dt

# Load the results table; every later cell reads its columns from `df`.
df = pd.read_csv(os.path.join(DATA_PATH, DATA_FILE))
# Optional filter (currently disabled): keep only rows with nx >= 5 and ny >= 5.
# df = df.loc[
#     (df['nx'] >= 5)
#     & (df['ny'] >= 5)
#     # & (df['comment_delta_abs'] <= 20)
# ] # limit by comment size
# Show (rows, columns) as a quick check of what was loaded.
df.shape

In [None]:
# Toggle: True keeps the raw context times; False replaces them with ordinal ranks.
unix_time = True

if not unix_time:
    # Gather every distinct (context id, context time) pair from both sides under
    # shared column names, then order the pairs by time.
    contexts = (
        pd.concat(
            [
                df[['x_context_id', 'x_context_time']].drop_duplicates(),
                df[['y_context_id', 'y_context_time']]
                .drop_duplicates()
                .copy()
                .rename(columns={'y_context_id': 'x_context_id', 'y_context_time': 'x_context_time'}),
            ],
            ignore_index=True,
        )
        .drop_duplicates()
        .sort_values(by='x_context_time')
        .values
    )

    # Context id -> 1-based position in the time-ordered list.
    convert_context_times = {
        context: rank for rank, context in enumerate(contexts[:, 0], start=1)
    }

    # Overwrite both time columns with the rank of their context.
    df['x_context_time'] = [
        convert_context_times[context] for context in tqdm(df['x_context_id'].values)
    ]
    df['y_context_time'] = [
        convert_context_times[context] for context in tqdm(df['y_context_id'].values)
    ]

# x time minus y time: in the raw time unit when unix_time is True,
# in rank steps when the conversion above ran.
df['time_delta'] = df['x_context_time'] - df['y_context_time']

In [None]:
# True when the x-side tag is contained in the y-side tag value (Python `in`).
# Walking the two columns in lockstep avoids two label-based `.loc` lookups per
# row, which dominate the runtime on large frames.
df['tag_in_context'] = [
    x_tag in y_tag
    for x_tag, y_tag in tqdm(zip(df['x_tag'], df['y_tag']), total=len(df))
]

In [None]:
# df['cc_is_parent'].value_counts()

In [None]:
# df['cc_is_child'].value_counts()

In [None]:
# df['cc_is_sibling'].value_counts()

In [None]:
# Preview the first rows of `df`, including the columns derived in the cells above.
df.head()

### Model 1: Linguistic Adaptation as Linear Change Over Time

In [None]:
##########################################
## Main model
##########################################
# Earlier specification kept for reference (uses the cc_is_* indicators, no tag_in_context):
# model = "Hxy ~ nx + ny + time_delta + x_comment_ups + y_comment_ups +  cc_is_parent + cc_is_sibling + cc_is_child + (1|x_user) + (1|y_user) + (1|y_submission_id)"
# Model 1: Hxy as a function of nx, ny, tag_in_context, the signed time_delta and both upvote counts.
# NOTE(review): statsmodels' formula interface is not documented to support lme4-style
# `(1|g)` random-intercept terms; its random effects are specified through `groups`,
# `re_formula` and `vc_formula`. Confirm the three `(1|...)` terms behave as intended.
model = "Hxy ~ nx + ny + tag_in_context + time_delta + x_comment_ups + y_comment_ups + (1|x_user) + (1|y_user) + (1|y_submission_id)"

##########################################

# Build and fit the model grouped by x_comment_id, printing the elapsed time.
start = dt.now()
md = smf.mixedlm(model, data=df, groups=df['x_comment_id'])
mdf = md.fit()
print('completed in:', dt.now()-start)

Reporting on the model outputs in a dataframe

In [None]:
# Gather the fitted estimates into one table indexed by parameter name.
reporting = pd.DataFrame()
reporting['coefs'] = mdf.params
reporting['stat'] = mdf.tvalues
reporting['p'] = mdf.pvalues

# Confidence bounds rendered as "[lower, upper]" in 2-digit scientific notation.
reporting['CI[.025, .975]'] = [
    '[{}]'.format(
        ', '.join(np.format_float_scientific(bound, precision=2) for bound in interval.tolist())
    )
    for interval in mdf.conf_int().values
]

# Swap the numeric estimate columns for their scientific-notation strings.
reporting = reporting.assign(**{
    name: reporting[name].apply(lambda value: np.format_float_scientific(value, precision=2))
    for name in ('coefs', 'stat', 'p')
})

reporting.head(100)

In [None]:
model_version = 'linear-time-difference'
REPORT_NAME_ = REPORT_NAME.format(model_version)

# Full report table as CSV.
reporting.to_csv(REPORT_NAME_, encoding='utf-8')

# LaTeX version next to the CSV (same name, .txt extension). The label slice stops
# at the second-to-last index entry, so the final row of `reporting` is left out.
reporting['Var'] = reporting.index.values
latex_rows = reporting[['Var', 'coefs', 'stat', 'p']].loc[:reporting.index[-2]]
txt = (
    latex_rows.to_latex(index=False)
    .replace('\\toprule', '\\hline')
    .replace('\\midrule', '\\hline\\hline')
    .replace('\\bottomrule', '\\hline')
)
# The context manager closes the file, so no explicit close() is needed;
# the encoding matches the CSV written above.
with open(REPORT_NAME_.replace('.csv', '.txt'), 'w', encoding='utf-8') as f:
    f.write(txt)

Saving model performance metrics

In [None]:
llf = mdf.llf

# One summary row for this model. BIC = -2*LLF + k*ln(n), with k taken as the
# number of rows in `reporting` and n as the number of rows in `df`.
n_params = len(reporting)
n_obs = df.shape[0]
dfo = pd.DataFrame([{
    'model': model_version,
    'LLF': llf,
    'params': n_params,
    'n': n_obs,
    'BIC': (-2 * llf) + (n_params * np.log(n_obs)),
}])

# Append a bare row to an existing comparison file; otherwise create it with a header.
if os.path.exists(MODEL_PERFORMANCE_NAME):
    dfo.to_csv(MODEL_PERFORMANCE_NAME, index=False, header=False, encoding='utf-8', mode='a')
else:
    dfo.to_csv(MODEL_PERFORMANCE_NAME, index=False, encoding='utf-8')

### Model 2: Linguistic Adaptation as Temporally Local

In [None]:
# Magnitude of the time difference, ignoring its sign.
df['time_delta_'] = df['time_delta'].abs()
##########################################
## Main model
##########################################
# Earlier specification kept for reference (uses the cc_is_* indicators, no tag_in_context):
# model = "Hxy ~ nx + ny + time_delta + x_comment_ups + y_comment_ups +  cc_is_parent + cc_is_sibling + cc_is_child + (1|x_user) + (1|y_user) + (1|y_submission_id)"
# Model 2: same predictors as Model 1, with time_delta_ (absolute value) in place of time_delta.
# NOTE(review): statsmodels' formula interface is not documented to support lme4-style
# `(1|g)` random-intercept terms; its random effects are specified through `groups`,
# `re_formula` and `vc_formula`. Confirm the three `(1|...)` terms behave as intended.
model = "Hxy ~ nx + ny + tag_in_context + time_delta_ + x_comment_ups + y_comment_ups + (1|x_user) + (1|y_user) + (1|y_submission_id)"

##########################################

# Build and fit the model grouped by x_comment_id, printing the elapsed time.
start = dt.now()
md = smf.mixedlm(model, data=df, groups=df['x_comment_id'])
mdf = md.fit()
print('completed in:', dt.now()-start)

Reporting on the model outputs in a dataframe

In [None]:
# Gather the fitted estimates into one table indexed by parameter name.
reporting = pd.DataFrame()
reporting['coefs'] = mdf.params
reporting['stat'] = mdf.tvalues
reporting['p'] = mdf.pvalues

# Confidence bounds rendered as "[lower, upper]" in 2-digit scientific notation.
reporting['CI[.025, .975]'] = [
    '[{}]'.format(
        ', '.join(np.format_float_scientific(bound, precision=2) for bound in interval.tolist())
    )
    for interval in mdf.conf_int().values
]

# Swap the numeric estimate columns for their scientific-notation strings.
reporting = reporting.assign(**{
    name: reporting[name].apply(lambda value: np.format_float_scientific(value, precision=2))
    for name in ('coefs', 'stat', 'p')
})

reporting.head(100)

In [None]:
model_version = 'locally-bound-time-difference'
REPORT_NAME_ = REPORT_NAME.format(model_version)

# Full report table as CSV.
reporting.to_csv(REPORT_NAME_, encoding='utf-8')

# LaTeX version next to the CSV (same name, .txt extension). The label slice stops
# at the second-to-last index entry, so the final row of `reporting` is left out.
reporting['Var'] = reporting.index.values
latex_rows = reporting[['Var', 'coefs', 'stat', 'p']].loc[:reporting.index[-2]]
txt = (
    latex_rows.to_latex(index=False)
    .replace('\\toprule', '\\hline')
    .replace('\\midrule', '\\hline\\hline')
    .replace('\\bottomrule', '\\hline')
)
# The context manager closes the file, so no explicit close() is needed;
# the encoding matches the CSV written above.
with open(REPORT_NAME_.replace('.csv', '.txt'), 'w', encoding='utf-8') as f:
    f.write(txt)

Saving model performance metrics

In [None]:
llf = mdf.llf

# One summary row for this model. BIC = -2*LLF + k*ln(n), with k taken as the
# number of rows in `reporting` and n as the number of rows in `df`.
n_params = len(reporting)
n_obs = df.shape[0]
dfo = pd.DataFrame([{
    'model': model_version,
    'LLF': llf,
    'params': n_params,
    'n': n_obs,
    'BIC': (-2 * llf) + (n_params * np.log(n_obs)),
}])

# Append a bare row to an existing comparison file; otherwise create it with a header.
if os.path.exists(MODEL_PERFORMANCE_NAME):
    dfo.to_csv(MODEL_PERFORMANCE_NAME, index=False, header=False, encoding='utf-8', mode='a')
else:
    dfo.to_csv(MODEL_PERFORMANCE_NAME, index=False, encoding='utf-8')

### Model 3: Linguistic Adaptation as Convergence Behavior Only

In [None]:
##########################################
## Main model
##########################################
# Model 3: same predictors as Model 1, with same_context in place of the time difference.
# NOTE(review): `same_context` is not created in the visible cells of this notebook --
# presumably it is a column of the input CSV; confirm.
# NOTE(review): statsmodels' formula interface is not documented to support lme4-style
# `(1|g)` random-intercept terms; its random effects are specified through `groups`,
# `re_formula` and `vc_formula`. Confirm the three `(1|...)` terms behave as intended.
model = "Hxy ~ nx + ny + tag_in_context + same_context + x_comment_ups + y_comment_ups + (1|x_user) + (1|y_user) + (1|y_submission_id)"

##########################################

# Build and fit the model grouped by x_comment_id, printing the elapsed time.
start = dt.now()
md = smf.mixedlm(model, data=df, groups=df['x_comment_id'])
mdf = md.fit()
print('completed in:', dt.now()-start)

Reporting on the model outputs in a dataframe

In [None]:
# Gather the fitted estimates into one table indexed by parameter name.
reporting = pd.DataFrame()
reporting['coefs'] = mdf.params
reporting['stat'] = mdf.tvalues
reporting['p'] = mdf.pvalues

# Confidence bounds rendered as "[lower, upper]" in 2-digit scientific notation.
reporting['CI[.025, .975]'] = [
    '[{}]'.format(
        ', '.join(np.format_float_scientific(bound, precision=2) for bound in interval.tolist())
    )
    for interval in mdf.conf_int().values
]

# Swap the numeric estimate columns for their scientific-notation strings.
reporting = reporting.assign(**{
    name: reporting[name].apply(lambda value: np.format_float_scientific(value, precision=2))
    for name in ('coefs', 'stat', 'p')
})

reporting.head(100)

In [None]:
model_version = 'convergence-only-difference'
REPORT_NAME_ = REPORT_NAME.format(model_version)

# Full report table as CSV.
reporting.to_csv(REPORT_NAME_, encoding='utf-8')

# LaTeX version next to the CSV (same name, .txt extension). The label slice stops
# at the second-to-last index entry, so the final row of `reporting` is left out.
reporting['Var'] = reporting.index.values
latex_rows = reporting[['Var', 'coefs', 'stat', 'p']].loc[:reporting.index[-2]]
txt = (
    latex_rows.to_latex(index=False)
    .replace('\\toprule', '\\hline')
    .replace('\\midrule', '\\hline\\hline')
    .replace('\\bottomrule', '\\hline')
)
# The context manager closes the file, so no explicit close() is needed;
# the encoding matches the CSV written above.
with open(REPORT_NAME_.replace('.csv', '.txt'), 'w', encoding='utf-8') as f:
    f.write(txt)

Saving model performance metrics

In [None]:
llf = mdf.llf

# One summary row for this model. BIC = -2*LLF + k*ln(n), with k taken as the
# number of rows in `reporting` and n as the number of rows in `df`.
n_params = len(reporting)
n_obs = df.shape[0]
dfo = pd.DataFrame([{
    'model': model_version,
    'LLF': llf,
    'params': n_params,
    'n': n_obs,
    'BIC': (-2 * llf) + (n_params * np.log(n_obs)),
}])

# Append a bare row to an existing comparison file; otherwise create it with a header.
if os.path.exists(MODEL_PERFORMANCE_NAME):
    dfo.to_csv(MODEL_PERFORMANCE_NAME, index=False, header=False, encoding='utf-8', mode='a')
else:
    dfo.to_csv(MODEL_PERFORMANCE_NAME, index=False, encoding='utf-8')

## Additional Visualizations/Analyses

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import kruskal

### Lollipop effects plot

In [None]:
import pandas as pd
import numpy as np

# Reload a saved report from disk. REPORT_NAME already contains REPORTING_PATH and
# a `{}` placeholder, so it only needs formatting; joining it onto REPORTING_PATH
# again produced a doubled 'data/reports/data/reports/...' path with a literal
# '{}' in the file name.
# reporting = pd.read_csv('data/reports/antisemitism/report.csv')
# NOTE(review): the df_param cell further down looks up rows such as 'x_probs' and
# 'after_october_7', which none of the three formulas above contain -- confirm
# which report file this section is meant to read.
reporting = pd.read_csv(REPORT_NAME.format(model_version))
# The first CSV column holds the parameter names; reuse it as the index.
reporting.index = reporting['Unnamed: 0'].values

In [None]:
def lollipop_chart(df, label_col, length_col, save_path=None, aspect=1/15, plot_title='Predicted change in H'):
    """Draw a horizontal lollipop chart with matplotlib, one stem per row of ``df``.

    Args:
        df: DataFrame to plot; its index values are used as the y positions.
        label_col: Column whose values become the y tick labels.
        length_col: Column holding the stem lengths (x values).
        save_path: If truthy, the figure is also saved to this path.
        aspect: Value passed to ``Axes.set_aspect``.
        plot_title: Text shown as the x-axis label.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    sns.set_style('darkgrid')
    lengths = df[length_col].values

    # Stems run from x=0 to each value; markers sit at the stem ends.
    plt.hlines(y=df.index, xmin=0, xmax=lengths)
    plt.plot(lengths, df.index, 'o')
    # NOTE(review): rotation is given in degrees, so .45 leaves the labels almost
    # horizontal -- confirm that 45 was not intended.
    plt.yticks(df.index, df[label_col].values, rotation=.45, fontsize='small')
    plt.axvline(color='maroon')

    # Symmetric x range around zero with a small margin past the largest stem.
    xlim_delta = df[length_col].abs().max() + .1
    plt.xlim(-xlim_delta, xlim_delta)
    plt.gca().set_aspect(aspect)
    # Set the label before tight_layout so the layout pass can make room for it.
    plt.xlabel(plot_title)
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path)
    plt.show()

In [None]:
import plotly.graph_objs as go

# Amount by which plotly_lollipop shortens each stem (via offset_signal) --
# presumably so the line stops short of its marker; TODO confirm.
marker_offset = 0.0004

def offset_signal(signal, marker_offset):
    """Shrink ``signal`` toward zero by ``marker_offset``.

    Values whose magnitude does not exceed ``marker_offset`` collapse to 0;
    otherwise the offset is subtracted from positive values and added to
    negative ones.
    """
    if abs(signal) <= marker_offset:
        return 0
    if signal > 0:
        return signal - marker_offset
    return signal + marker_offset

def plotly_lollipop(df, label_col, length_col, save_path=None, plot_title='', color='blue', marker_size=2):
    """Build a horizontal lollipop chart as a plotly figure, one stem per row of ``df``.

    Args:
        df: DataFrame to plot.
        label_col: Column whose values label the rows on the y axis.
        length_col: Column holding the stem lengths (x values).
        save_path: Optional output path. Paths ending in ``.html`` are written with
            ``Figure.write_html``; any other path goes through ``Figure.write_image``
            (static export, which needs plotly's image-export backend installed).
        plot_title: Optional figure title; the default empty string leaves it unset.
        color: Color of the markers and stems.
        marker_size: Marker size passed to the scatter trace.

    Returns:
        The plotly ``Figure``.
    """
    points = df[length_col].to_list()
    # Take the labels from the frame that was passed in, not from a notebook global.
    labels = df[label_col].to_list()

    markers = go.Scatter(
        x=points,
        y=labels,
        mode='markers',
        marker=dict(color=color, size=marker_size),
    )

    # One line shape per row, from x=0 to the value shortened by the module-level
    # marker_offset; the y position of a stem is its row number.
    stems = [
        dict(
            type='line',
            xref='x',
            yref='y',
            x0=0,
            y0=row,
            x1=offset_signal(value, marker_offset),
            y1=row,
            line=dict(color=color, width=1.5),
        )
        for row, value in enumerate(points)
    ]

    fig = go.Figure([markers], go.Layout(shapes=stems))
    if plot_title:
        fig.update_layout(title=plot_title)

    # Reference line at zero.
    fig.add_vline(x=0, line_width=3, line_color="maroon")

    if save_path:
        if str(save_path).lower().endswith('.html'):
            fig.write_html(save_path)
        else:
            fig.write_image(save_path)

    return fig

In [None]:
# Build one [label, effect] row per condition. Each effect is the sum of the listed
# coefficient rows of `reporting`; the finished list is reversed with [::-1].
# NOTE(review): assumes `reporting` has rows named x_probs, x_target, after_october_7
# and their interactions -- confirm the loaded report comes from a model with these terms.
df_param = [
    # x HS
    ['HS', (reporting['coefs'].loc[['x_probs']]).sum()],
    
    # x AHS
    ['AHS', (
        reporting['coefs'].loc[['x_probs', 'x_target','x_probs:x_target']] #* (reporting['p'].loc[['x_probs', 'x_target','x_probs:x_target']] < .01)
    ).sum()],
    
    # x AHS post-october 7th
    ['AHS after Oct. 7, 2023', (
        reporting['coefs'].loc[['x_probs', 'x_target', 'x_probs:x_target','after_october_7', 'after_october_7:x_target', 'after_october_7:x_probs', 'after_october_7:x_probs:x_target']] #* (reporting['p'].loc[['x_probs', 'x_target', 'x_probs:x_target','after_october_7', 'after_october_7:x_target', 'after_october_7:x_probs', 'after_october_7:x_probs:x_target']] < .01)
    ).sum()],
    
    # # Y HS
    # ['Y HS', (reporting['coefs'].loc[['y_probs']]).sum()],
    # 
    # # Y AHS
    # ['Y AHS', (
    #     reporting['coefs'].loc[[ 'y_probs', 'y_target', 'y_probs:y_target']] #* (reporting['p'].loc[[ 'y_probs', 'y_target', 'y_probs:y_target']] < .01)
    # ).sum()],
    # 
    # # Y AHS post-october 7th
    # ['Y AHS after Oct. 7, 2023', (
    #     reporting['coefs'].loc[['y_probs', 'y_target', 'y_probs:y_target','after_october_7', 'after_october_7:y_target', 'after_october_7:y_probs','after_october_7:y_probs:y_target',]] #* (reporting['p'].loc[['y_probs', 'y_target', 'y_probs:y_target','after_october_7', 'after_october_7:y_target', 'after_october_7:y_probs','after_october_7:y_probs:y_target',]] <.01)
    # ).sum()], 
][::-1]

# Rebind df_param from the list of rows to a two-column DataFrame; the object-dtype
# array holds the text labels and the summed effects side by side.
# NOTE(review): '$Delta$ H' has no backslash before Delta -- confirm whether
# '$\Delta$ H' was intended (the plotting cells use the same key).
df_param = pd.DataFrame(
    np.array(df_param, dtype=object),
    columns=['cond', '$Delta$ H']
)

In [None]:
# lollipop_chart(
#     df=df_param,
#     label_col='cond',
#     length_col='$Delta$ H',
#     save_path=lollipop_vis_name,
#     aspect=1/3,
#     plot_title=''
# )

In [None]:
# Build the plotly lollipop chart from df_param (labels in 'cond', stem lengths in
# '$Delta$ H') with enlarged markers, then display it.
fig = plotly_lollipop(
    df=df_param,
    label_col='cond',
    length_col='$Delta$ H',
    marker_size=10
)

fig.show()

In [None]:
# Export the figure to an HTML file in the current working directory.
fig.write_html('ahs-parent.html')