# Analyzing linguistic adaptation in terms of entropy

In [1]:
import os
import re

# Location of the results table that the analysis cells load.
DATA_PATH = '../data/results'
DATA_FILE = 'ceda-results-with_fNIRs.csv'

# Directory that receives the generated report files.
REPORTING_PATH = '../data/reports'
# Path template: the `{}` placeholder is filled in later with a model version via str.format.
REPORT_NAME = os.path.join(REPORTING_PATH, 'report-{}.csv')
# Not referenced in the visible cells; presumably the target for model-comparison metrics - TODO confirm.
MODEL_PERFORMANCE_NAME = os.path.join(REPORTING_PATH, 'model-comparison.csv')

# Not referenced in the visible cells; presumably the file name of a lollipop figure - TODO confirm.
lollipop_vis_name = 'lollipop.png'

## Main Analyses and Results

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import statsmodels.formula.api as smf
from datetime import datetime as dt

# Load the results table from the location configured in the first cell.
results_csv = os.path.join(DATA_PATH, DATA_FILE)
df = pd.read_csv(results_csv)

# Optional size filter, currently disabled:
#   df = df.loc[(df['nx'] >= 5) & (df['ny'] >= 5)]
#   (a further candidate condition was df['comment_delta_abs'] <= 20)

# Cell output: (number of rows, number of columns).
df.shape

(12369, 68)

In [3]:
# Flag every row whose GenderDemoComp is not one of the two same-gender labels.
# Note: missing values and any other label therefore also end up flagged True.
same_gender = df['GenderDemoComp'].isin(['Male-Male', 'Female-Female'])
df['mixed_gender'] = ~same_gender

In [4]:
# Preview the first five rows, including the newly added mixed_gender column.
df.head(n=5)

Unnamed: 0,speaker,timestamp,utterance,unedited,overlapping_utterance,file,next_speaker,next_utterance,next_unedited,next_overlapping_utterance,...,R_SPL,R_TPJ,next_L_lPFC,next_mPFC,next_R_lPFC,next_L_SPL,next_L_TPJ,next_R_SPL,next_R_TPJ,mixed_gender
0,R (F),0:02,Okay umm Do you want to start or,"Okay, umm… Do you want to start, or?",False,4040,L (F),Uhh you can you can go,Uhh you can- you can go.,False,...,6.114621,5.460205,7.693702,15.168717,15.93604,8.353804,0.961866,6.03521,5.940956,False
1,L (F),0:04,Uhh you can you can go,Uhh you can- you can go.,False,4040,R (F),Ok I feel like a perfect day for me would be s...,Ok. I feel like a perfect day for me would be…...,False,...,5.894488,6.816861,4.703152,7.161448,6.642062,3.024416,2.671346,2.628057,5.163731,False
2,R (F),0:05,Ok I feel like a perfect day for me would be s...,Ok. I feel like a perfect day for me would be…...,False,4040,L (F),Yeah yours sounds really nice,Yeah yours sounds… ((laughing)) really nice!,False,...,2.567251,5.133704,4.714618,7.023219,6.446579,3.017216,2.747721,2.534276,5.056303,False
3,L (F),0:58,Yeah yours sounds really nice,Yeah yours sounds… ((laughing)) really nice!,False,4040,R (F),Yea,((laughing)) Yea,False,...,1.708186,2.725299,7.261149,9.810425,9.824648,4.639324,4.824461,2.27462,3.93166,False
4,R (F),1:00,Yea,((laughing)) Yea,False,4040,L (F),Ummm I would also sleep in I dont really past ...,Ummm I would also sleep in… I don't really- pa...,False,...,2.446101,4.443919,5.621054,8.876137,8.997023,2.609468,3.074916,1.810273,4.748684,False


### Model 1: Linguistic Adaptation as Linear Change Over Time

In [5]:
# ----------------------------------------------------------------------
# Model 1: main specification
# ----------------------------------------------------------------------
# Earlier specification, kept for reference only:
#   "Hxy ~ nx + ny + time_delta + x_comment_ups + y_comment_ups +  cc_is_parent + cc_is_sibling + cc_is_child + (1|x_user) + (1|y_user) + (1|y_submission_id)"
#
# NOTE(review): the fitted summary lists `1 | speaker`, `1 | speaker2` and
# `2 | dyad` as ordinary coefficient rows, so the lme4-style `(1|...)` terms
# are not acting as random effects here. In addition, groups=df.index appears
# to place every row in its own group (the reported Group Var is 1.0 with nan
# bounds). If random intercepts per speaker/dyad are intended, statsmodels
# expects them via `groups=` / `re_formula=` / `vc_formula=` - TODO confirm
# the intended grouping structure before re-specifying, since later cells
# select parameters by these names.
model = "Hxy ~ nx + ny + mixed_gender + transition + (1|speaker) + (1|speaker2) + (2|dyad)"

# Fit the model and report the wall-clock time the fit took.
start = dt.now()
md = smf.mixedlm(formula=model, data=df, groups=df.index)
mdf = md.fit()
elapsed = dt.now() - start
print('completed in:', elapsed)

completed in: 0:00:00.913868




Report the model outputs (coefficients, test statistics, p-values, and 95% confidence intervals) in a DataFrame.

In [6]:
def _sci(value):
    """Render a number in scientific notation using precision=2."""
    return np.format_float_scientific(value, precision=2)


# Collect the fitted quantities, one row per model parameter.
reporting = pd.DataFrame()
reporting['coefs'] = mdf.params
reporting['stat'] = mdf.tvalues
reporting['p'] = mdf.pvalues

# Confidence bounds are stored as a single "[lower, upper]" string per parameter.
reporting['CI[.025, .975]'] = [
    '[{}]'.format(', '.join(_sci(bound) for bound in bounds.tolist()))
    for bounds in mdf.conf_int().values
]

# Convert the numeric columns to their formatted string form.
for column in ('coefs', 'stat', 'p'):
    reporting[column] = reporting[column].apply(_sci)

reporting.head(100)

Unnamed: 0,coefs,stat,p,"CI[.025, .975]"
Intercept,-0.0569,-1.93,0.0542,"[-1.15e-01, 1.03e-03]"
mixed_gender[T.True],-0.0329,-2.11,0.0349,"[-6.34e-02, -2.34e-03]"
nx,0.0986,295.0,0.0,"[9.8e-02, 9.93e-02]"
ny,-0.0077,-22.4,9.88e-111,"[-8.37e-03, -7.02e-03]"
transition,4.76e-05,0.96,0.337,"[-4.96e-05, 1.45e-04]"
1 | speaker,-0.000947,-2.24,0.0252,"[-1.78e-03, -1.18e-04]"
1 | speaker2,-0.00157,-3.72,0.000201,"[-2.40e-03, -7.44e-04]"
2 | dyad,-0.0009,-0.927,0.354,"[-2.80e-03, 1.00e-03]"
Group Var,1.0,,,"[nan, nan]"


In [8]:
# Persist the report twice: as CSV (all columns) and as a LaTeX table (.txt).
model_version = 'final'
REPORT_NAME_ = REPORT_NAME.format(model_version)

# Create the output directory up front so a fresh checkout can run this cell.
os.makedirs(REPORTING_PATH, exist_ok=True)

# Write the CSV without the helper 'Var' column added below, so that running
# this cell a second time produces the same file as the first run.
reporting.drop(columns='Var', errors='ignore').to_csv(REPORT_NAME_, encoding='utf-8')

# to_latex(index=False) drops the index, so expose the parameter names as a column.
reporting['Var'] = reporting.index.values

# Leave out the last row (positional slice instead of a label lookup) and
# swap the booktabs rules for plain \hline rules.
txt = (
    reporting[['Var', 'coefs', 'stat', 'p']]
    .iloc[:-1]
    .to_latex(index=False)
    .replace('\\toprule', '\\hline')
    .replace('\\midrule', '\\hline\\hline')
    .replace('\\bottomrule', '\\hline')
)

# Swap only the file extension; the context manager closes the file on exit.
latex_path = os.path.splitext(REPORT_NAME_)[0] + '.txt'
with open(latex_path, 'w', encoding='utf-8') as f:
    f.write(txt)

Matplotlib is building the font cache; this may take a moment.


saving model performance metrics

In [9]:
# Contrast: individual (speaker) differences vs. mixed-gender effects.
# Every parameter whose name contains 'speaker' gets weight -1; otherwise a
# name containing 'mixed_gender' gets weight 2; all remaining parameters get 0.
param_names = list(mdf.params.keys())

sel = np.array(['speaker' in name for name in param_names])
test_matrix = np.array([
    -1.0 if 'speaker' in name else (2.0 if 'mixed_gender' in name else 0.0)
    for name in param_names
])

# F-test of the hypothesis that this linear combination of parameters is zero.
mdf.f_test(test_matrix)

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=4.148344393231542, p=0.041712176195110195, df_denom=7.12e+03, df_num=1>