In [1]:
from tqdm.notebook import tqdm
import os
import sys
import random
import numpy as np
import pandas as pd

# Location of the local `isquash` checkout. Set the ISQUASH_GIT_DIR environment
# variable to point at a clone on another machine
# (e.g. "/Users/adityachhabra/Github/isquash"); otherwise the original
# Windows default is used.
git_dir = os.environ.get("ISQUASH_GIT_DIR", r"C:/Users/Aditya/GitHub/isquash")

# Make both the repository root and the package directory importable. Entries
# already on sys.path are skipped so re-running this cell does not grow it.
for import_path in (git_dir, os.path.join(git_dir, 'isquash')):
    if import_path not in sys.path:
        sys.path.append(import_path)

# Show up to 500 columns when displaying wide DataFrames.
pd.set_option('display.max_columns', 500)

In [2]:
from isquash import nlpmodel, preprocessing, evaluation, crossval, tests

In [3]:
# Read the two input tables (paths are relative to the notebook's working
# directory): the bootstrap-enhanced annotation data and the quantitative data
# that is merged onto it later on `uid` / `data_round`.
enhanced_data, quant_df = map(
    pd.read_csv,
    ("data/enhanced_bootstrap_mean.csv", "data/quant_data.csv"),
)

### Variables

In [4]:
# Names of the annotation columns analysed in this notebook.
annotation_vars = [
    "ability_high", "ability_low",
    "awareness_information_high", "awareness_information_low",
    "budget_high", "budget_low",
    "camp_regulations",
    "covid_impacts",
    "education_high", "education_low", "education_neutral", "education_religious",
    "entrepreneur",
    "job_secular",
    "marriage",
    "migration",
    "no_ambition",
    "public_assistance",
    "reliance_on_god",
    "religious",
    "secular",
    "vague_job", "vague_non_specific",
    "vocational_training",
    "worries_anxieties",
]

# Note on suffixes: in this notebook the `<name>_act` columns are read from the
# annotated rows and used as the human annotations, while the unsuffixed
# `<name>` columns are used as the enhanced values.
# (`_act` presumably stands for "actual" -- TODO confirm.)
human_annotation_vars = [f"{name}_act" for name in annotation_vars]

In [5]:
# Identifier columns kept for the human-annotated sample.
id_vars_human = ['uid', 'data_round', 'refugee_status']
# The enhanced sample carries the same identifiers plus the bootstrap run.
id_vars_enhanced = [*id_vars_human, 'bootstrap_run']

### Human Annotated Dataframe (Merged with quantitative data)

In [6]:
# Build the human-annotated sample:
#   1. keep only rows flagged `annotated == 1`, with the id columns and the
#      `_act` annotation columns;
#   2. strip the `_act` suffix so the column names match `annotation_vars`;
#   3. drop duplicate rows;
#   4. left-join the quantitative data on `uid` / `data_round`;
#   5. tag every row with the sample type.
human_df = (
    enhanced_data
    .loc[enhanced_data['annotated'] == 1, id_vars_human + human_annotation_vars]
    .rename(columns={f"{name}_act": name for name in annotation_vars})
    .drop_duplicates()
    .merge(quant_df, how='left', on=['uid', 'data_round'])
    .assign(sample_type='Human')
)

### Enhanced Dataframe (Merged with quantitative data)

In [7]:
# Build the enhanced sample: id columns (including the bootstrap run) plus the
# unsuffixed annotation columns, left-joined with the quantitative data on
# `uid` / `data_round` and tagged with the sample type.
enhanced_df = (
    enhanced_data[id_vars_enhanced + annotation_vars]
    .merge(quant_df, how='left', on=['uid', 'data_round'])
    .assign(sample_type='Enhanced')
)

### Interpretability

In [8]:
# Regressors passed to the interpretability tests as categorical
categorical_vars = [
    'refugee',
    'hh_head_sex',
    'eld_sex',
    'parent_reledu',
    'data_round',
]

# Regressors passed to the interpretability tests as continuous
continuous_vars = [
    'num_child', 'hh_head_age', 'parent_eduyears', 'eld_age',
    'hh_asset_index', 'hh_income', 'int_trauma_exp',
]

### Interpretability tests on Human Data

In [200]:
# Run the interpretability tests for every annotation on the human sample.
interpreter = tests.Interpretability(
    human_df,
    annotation_vars=annotation_vars,
    categorical_regressors=categorical_vars,
    continuous_regressors=continuous_vars,
)
interpreter = interpreter.fit_all()

# Collect the per-annotation results and label them as the human sample.
human_interp_df = interpreter.get_results()
human_interp_df['sample_type'] = 'Human'

human_interp_df.head()

Unnamed: 0,annotation,fstat,log_fstat,pval,sample_type
0,worries_anxieties,5.024955,1.614417,4.787082e-08,Human
1,vocational_training,0.873789,-0.134917,0.5737753,Human
2,vague_non_specific,11.329886,2.427444,6.542234e-21,Human
3,vague_job,5.485263,1.702065,5.618896e-09,Human
4,secular,2.601944,0.956259,0.002148217,Human


### Interpretability tests on Enhanced Data (Looping over Bootstrap runs)

In [10]:
# Run the interpretability tests separately for each bootstrap run of the
# enhanced sample and stack the per-run results into one frame.
enh_interp_dfs = []

bootstrap_runs = enhanced_df['bootstrap_run'].unique()

# Wrapping the iterable lets tqdm advance and finalise the bar by itself; a
# manually driven `tqdm(total=...)` bar has to be closed explicitly, which the
# previous version never did.
pbar = tqdm(bootstrap_runs, desc='Bootstrap runs')

for b in pbar:
    # Rows belonging to the current bootstrap run only.
    run_df = enhanced_df.loc[enhanced_df['bootstrap_run'] == b]

    interpreter = tests.Interpretability(run_df,
                                         annotation_vars=annotation_vars,
                                         categorical_regressors=categorical_vars,
                                         continuous_regressors=continuous_vars,
                    ).fit_all()

    # Tag the results with the run they came from before collecting them.
    interp_df = interpreter.get_results()
    interp_df['bootstrap_run'] = b

    enh_interp_dfs.append(interp_df)

enhanced_interp_df = pd.concat(enh_interp_dfs)
enhanced_interp_df['sample_type'] = 'Enhanced'

  0%|          | 0/25 [00:00<?, ?it/s]

In [11]:
# Stack the human and enhanced results and order them by annotation name
# (descending), renumbering the index from 0.
interpret_data = (
    pd.concat([human_interp_df, enhanced_interp_df])
    .sort_values(by='annotation', ascending=False, ignore_index=True)
)
interpret_data.head()

Unnamed: 0,annotation,fstat,log_fstat,pval,sample_type,bootstrap_run
0,worries_anxieties,10.363416,2.338282,3.5669629999999996e-20,Enhanced,9.0
1,worries_anxieties,10.808631,2.380345,3.47749e-21,Enhanced,1.0
2,worries_anxieties,12.26295,2.506582,1.710019e-24,Enhanced,11.0
3,worries_anxieties,6.972158,1.941925,1.514813e-12,Enhanced,12.0
4,worries_anxieties,12.621941,2.535437,2.604177e-25,Enhanced,13.0


### Calculate Differences

In [49]:
# Mean log F-statistic per annotation across all bootstrap runs of the
# enhanced sample, labelled with its sample type.
enhanced_avg_df = (
    enhanced_interp_df
    .groupby('annotation')['log_fstat']
    .mean()
    .reset_index()
    .assign(sample_type='Enhanced')
)

In [99]:
# Lookup tables: annotation name -> log F-statistic
# (bootstrap mean for the enhanced sample, single value for the human sample).
enhanced_avg_dict = dict(zip(enhanced_avg_df['annotation'], enhanced_avg_df['log_fstat']))
human_avg_dict = dict(zip(human_interp_df['annotation'], human_interp_df['log_fstat']))

### Visualize

In [None]:
import plotly.express as px
import plotly.graph_objects as go

In [208]:
fig = go.Figure()

# Plot annotations in reverse alphabetical order. A sorted *copy* is used so
# that the shared `annotation_vars` list is not reordered in place for the
# other cells that read it.
ordered_annotations = sorted(annotation_vars, reverse=True)

# Difference between the enhanced (bootstrap-mean) and human log F-statistics:
# one horizontal segment per annotation, light green when the enhanced value
# is at least the human value and red otherwise.
for annotation in ordered_annotations:
    enh_value, human_value = enhanced_avg_dict[annotation], human_avg_dict[annotation]
    line_color = 'lightgreen' if enh_value >= human_value else 'red'

    difference_trace = go.Scatter(
        y=[annotation, annotation],
        x=[enh_value, human_value],
        mode='lines',
        line_color=line_color,
        name='Difference',
        showlegend=False,
    )
    fig.add_trace(difference_trace)

# Row selectors for the two samples (each is used for both the x and y values).
human_rows = interpret_data['sample_type'] == 'Human'
enhanced_rows = interpret_data['sample_type'] == 'Enhanced'

# Human data: one log F-statistic marker per annotation.
human_trace = go.Scatter(
    y=interpret_data.loc[human_rows, 'annotation'],
    x=interpret_data.loc[human_rows, 'log_fstat'],
    mode='markers',
    marker=dict(
        size=10, line_width=1, color='black', symbol='cross-thin',
    ),
    name='Human',
)
fig.add_trace(human_trace)

# Enhanced data: log F-statistic markers for every bootstrap run.
enhanced_trace = go.Scatter(
    y=interpret_data.loc[enhanced_rows, 'annotation'],
    x=interpret_data.loc[enhanced_rows, 'log_fstat'],
    mode='markers',
    marker=dict(
        size=5,
        color='black',
        opacity=0.5,
        line_width=1,
        symbol='circle-open',
    ),
    name='Enhanced',
)
fig.add_trace(enhanced_trace)

# Axis and layout styling.
fig.update_yaxes(title=dict(text='Annotation', standoff=10),
                 gridwidth=0.75, gridcolor='lightgray', ticklen=5, ticks='outside', dtick=1,
                 mirror=True, showline=True, linecolor='black', zeroline=True, zerolinecolor='lightgray'
                )
fig.update_xaxes(title=dict(text='log F statistic', standoff=10),
                 gridwidth=0.75, gridcolor='lightgray', ticklen=5, ticks='outside', dtick=1,
                 mirror=True, showline=True, linecolor='black', zeroline=True, zerolinecolor='lightgray'
                )
# `update_layout` returns the figure, so leaving it as the last expression of
# the cell also renders the plot.
fig.update_layout(width=800, height=800,
                  margin=dict(pad=0, l=0, t=100, r=0, b=100),
                  title=dict(text='Figure 9: Interpretability test', x=0.5, font_size=18, font_family='Arial',),
                  legend=dict(title='Sample', title_font_family="Arial", orientation='v'),
                  plot_bgcolor='white',
                 )