# Analyzing linguistic adaptation in terms of entropy

In [None]:
import os
import re

# Directory and file name of the results table read by this notebook.
DATA_PATH, DATA_FILE = 'data/results', 'ceda-results.csv'

# Directory for generated reports, and the report paths inside it.
# REPORT_NAME keeps a `{}` placeholder that is filled in with str.format.
REPORTING_PATH = 'data/reports'
REPORT_NAME, MODEL_PERFORMANCE_NAME = (
    os.path.join(REPORTING_PATH, file_name)
    for file_name in ('report-{}.csv', 'model-comparison.csv')
)

# Presumably the output file name of a lollipop plot -- not used in this part
# of the notebook.
lollipop_vis_name = 'lollipop.png'

## Main Analyses and Results

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import statsmodels.formula.api as smf
from datetime import datetime as dt

# Load the results table from DATA_PATH/DATA_FILE.
df = pd.read_csv(
    filepath_or_buffer=os.path.join(DATA_PATH, DATA_FILE),
)

# Disabled filter, kept for reference: restrict rows by comment size.
# df = df.loc[
#     (df['nx'] >= 5)
#     & (df['ny'] >= 5)
#     # & (df['comment_delta_abs'] <= 20)
# ]

# Cell output: (number of rows, number of columns).
df.shape

In [None]:
# Flag every row whose `GenderDemoComp` is not one of the two same-gender
# pairings.
# NOTE(review): any other value, including a missing one, ends up flagged as
# mixed-gender -- confirm that is intended.
df['mixed_gender'] = ~(
    df['GenderDemoComp'].isin(['Male-Male', 'Female-Female'])
)

In [None]:
# Preview the first five rows, including the new `mixed_gender` column.
df.head(n=5)

### Model 1: Linguistic Adaptation as Linear Change Over Time

In [None]:
##########################################
## Main model
##########################################
# Earlier specification, kept for reference:
# model = "Hxy ~ nx + ny + time_delta + x_comment_ups + y_comment_ups +  cc_is_parent + cc_is_sibling + cc_is_child + (1|x_user) + (1|y_user) + (1|y_submission_id)"
# NOTE(review): statsmodels hands this formula to patsy, which has no
# lme4-style random-effects syntax, so `(1|speaker)`, `(1|speaker2)` and
# `(2|dyad)` are presumably NOT fitted as random effects here. MixedLM takes
# its random structure from `groups`, `re_formula` and `vc_formula` instead
# -- TODO confirm this specification fits the intended model.
model = "Hxy ~ nx + ny + mixed_gender + transition +  (1|speaker) + (1|speaker2) + (2|dyad)"

##########################################

# Time the model construction and fit together.
start = dt.now()
# NOTE(review): `groups=df.index` uses the row labels as the grouping
# variable, i.e. one group per row if the index is unique -- confirm this is
# the intended grouping.
md = smf.mixedlm(model, data=df, groups=df.index)
mdf = md.fit()
print('completed in:', dt.now()-start)

Report the model outputs (coefficients, test statistics, p-values, and confidence intervals) in a dataframe.

In [None]:
def _sci(value):
    """Format *value* in scientific notation with two fractional digits."""
    return np.format_float_scientific(value, precision=2)


# One row per fitted parameter; the index comes from `mdf.params`.
reporting = pd.DataFrame()
reporting['coefs'] = mdf.params
reporting['stat'] = mdf.tvalues
reporting['p'] = mdf.pvalues

# Collapse each confidence interval into a single "[lower, upper]" string.
reporting['CI[.025, .975]'] = [
    '[{}]'.format(', '.join([_sci(bound) for bound in bounds.tolist()]))
    for bounds in mdf.conf_int().values
]

# Replace the numeric columns with their scientific-notation strings.
reporting['coefs'] = reporting['coefs'].apply(_sci)
reporting['stat'] = reporting['stat'].apply(_sci)
reporting['p'] = reporting['p'].apply(_sci)

# Cell output: the formatted table (up to 100 rows).
reporting.head(100)

In [None]:
# Export the coefficient table for this model version, once as CSV and once as
# a LaTeX table saved next to it with a `.txt` extension.
model_version = 'final'
REPORT_NAME_ = REPORT_NAME.format(model_version)

reporting.to_csv(REPORT_NAME_, encoding='utf-8')

# Copy the parameter names into a regular column so they show up in the LaTeX
# table, which is rendered without the index.
reporting['Var'] = reporting.index.values

# Every row except the last one is exported. `.iloc[:-1]` selects them by
# position, so it works for tables with fewer than two rows and does not
# depend on the index labels being unique. The booktabs rules are swapped for
# plain \hline rules.
txt = (
    reporting[['Var', 'coefs', 'stat', 'p']]
    .iloc[:-1]
    .to_latex(index=False)
    .replace('\\toprule', '\\hline')
    .replace('\\midrule', '\\hline\\hline')
    .replace('\\bottomrule', '\\hline')
)

# The table is rendered before the file is opened, so a rendering failure does
# not leave an empty file behind. The encoding matches the CSV written above;
# the `with` block closes the file on exit.
with open(REPORT_NAME_.replace('.csv', '.txt'), 'w', encoding='utf-8') as f:
    f.write(txt)

Saving model performance metrics. The next cell runs an F-test that contrasts the mixed-gender effect with the speaker terms.

In [None]:
def _names_containing(fragment):
    """Return a boolean mask of model parameters whose name contains *fragment*."""
    return np.array([fragment in name for name in mdf.params.keys()])


# individual differences vs. mixed-gender effects
# Contrast weights, one per model parameter: 2 where the name contains
# 'mixed_gender', -1 where it contains 'speaker', 0 elsewhere. The 'speaker'
# weights are written last, so they win if a name matches both.
test_matrix = np.zeros(shape=len(mdf.params))

sel = _names_containing('mixed_gender')
test_matrix[sel] = 2

sel = _names_containing('speaker')
test_matrix[sel] = -1

# Cell output: F-test of the contrast defined above.
mdf.f_test(test_matrix)