In [1]:
import os, sys
# Root of the local iQual checkout. Set the IQUAL_GIT_DIR environment variable to
# point at a different checkout; the original hard-coded location is the fallback.
git_dir = os.environ.get("IQUAL_GIT_DIR", r"C:/Users/Aditya/GitHub/WorldBank/iQual")

# Make the checkout and its `src` folder importable. The membership check keeps
# sys.path free of duplicate entries when this cell is executed more than once.
for _import_path in (git_dir, os.path.join(git_dir, 'src')):
    if _import_path not in sys.path:
        sys.path.append(_import_path)

### Imports

In [2]:
import os
import numpy as np
import pandas as pd

from iqual import nlpmodel, preprocessing, evaluation, crossval, tests

> ### Load datasets

In [3]:
# Folder with the input CSV files, located under `git_dir`.
data_dir = os.path.join(git_dir, "data")

# Enhanced qualitative data
enhanced_csv = os.path.join(data_dir, "enhanced_train_sizes.csv")
enhanced_data = pd.read_csv(enhanced_csv)

# Quantitative data
quant_csv = os.path.join(data_dir, "quant_data.csv")
quant_df = pd.read_csv(quant_csv)

> ### Variables

In [88]:
# Annotation columns analysed in this notebook.
annotation_vars = [
    "ability_low",
    "secular",
]

# Identifier / bookkeeping columns carried alongside the annotations.
id_vars = [
    'uid',
    'split',
    'data_round',
    'refugee_status',
    'bootstrap_run',
    'train_sample_size',
]

> ### Human Annotated Data (Merged with quantitative data)

In [100]:
# Human-annotated training sample sizes: 100, 200, ..., 700.
train_sample_sizes = list(range(100, 701, 100))

# Bootstrap run identifiers 1 through 10.
bootstrap_runs = np.arange(1, 11)

# Machine-annotated sample sizes: 200, 400, ..., 1200.
machine_sample_sizes = np.arange(200, 1400, 200)

In [114]:
# Preview a single configuration: train_sample_size == 100 and bootstrap_run == 1.
enhanced_data.loc[enhanced_data['train_sample_size'].eq(100) & enhanced_data['bootstrap_run'].eq(1)]

Unnamed: 0,uid,refugee_status,bootstrap_run,train_sample_size,split,data_round,religious_act,secular_act,no_ambition_act,vague_job_act,...,ability_high,ability_low,budget_high,budget_low,awareness_information_high,awareness_information_low,camp_regulations,covid_impacts,public_assistance,worries_anxieties
0,C601001009R2,refugee,1,100,test,R2,0.00,0.166667,0.0,0.333333,...,0.0,0.000000,0.0,0.333333,0.000000,0.000000,0.00,0.000000,0.00,0.0
1,C601001009R3,refugee,1,100,test,R3,0.00,0.117647,0.0,0.058824,...,0.0,0.058824,0.0,0.000000,0.000000,0.117647,0.00,0.000000,0.00,0.0
2,C601002009R2,refugee,1,100,test,R2,0.00,0.000000,0.0,0.125000,...,0.0,0.125000,0.0,0.125000,0.000000,0.000000,0.25,0.000000,0.00,0.0
3,C601003005R3,refugee,1,100,test,R3,0.00,0.000000,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.00,0.125000,0.00,0.0
4,C602004004R3,refugee,1,100,test,R3,0.05,0.000000,0.0,0.000000,...,0.1,0.000000,0.0,0.000000,0.000000,0.000000,0.05,0.000000,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2402,H857013004R3,host,1,100,unannotated,R3,,,,,...,0.0,0.045455,0.0,0.000000,0.000000,0.045455,0.00,0.000000,0.00,0.0
2403,H857013008R3,host,1,100,unannotated,R3,,,,,...,0.0,0.000000,0.0,0.066667,0.066667,0.000000,0.00,0.066667,0.00,0.0
2404,H857025002R2,host,1,100,unannotated,R2,,,,,...,0.0,0.000000,0.0,0.000000,0.250000,0.000000,0.00,0.000000,0.25,0.0
2405,H857025003R2,host,1,100,unannotated,R2,,,,,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.00,0.000000,0.00,0.0


In [130]:
# Show only the identifier and annotation columns.
# NOTE(review): `enh_df` is not defined in the cells shown here — confirm it exists on a fresh kernel.
enh_df[id_vars + annotation_vars]

Unnamed: 0,uid,split,data_round,refugee_status,bootstrap_run,train_sample_size,ability_low,secular
0,H532011009R3,unannotated,R3,host,1,100,0.000000,0.0
1,H122002005R2,test,R2,host,1,100,0.000000,0.6
2,C605020012R3,unannotated,R3,refugee,1,100,0.066667,0.0
3,H857025003R3,unannotated,R3,host,1,100,0.000000,0.0
4,H857013002R3,unannotated,R3,host,1,100,0.000000,0.0
...,...,...,...,...,...,...,...,...
195,H001047003R3,test,R3,host,1,100,0.066667,0.0
196,C725156005R3,unannotated,R3,refugee,1,100,0.000000,0.0
197,C723135002R2,unannotated,R2,refugee,1,100,0.250000,0.0
198,C727166003R2,unannotated,R2,refugee,1,100,0.000000,0.0


In [135]:
# Number of rows in each value of the `split` column.
enh_df['split'].value_counts()

unannotated    1618
test            189
Name: split, dtype: int64

> ### Enhanced Dataframe (Merged with quantitative data)

In [17]:
# Keep the identifier and annotation columns, then attach the quantitative
# variables by ('uid', 'data_round'); the left join keeps every enhanced row.
# NOTE(review): `id_vars_enhanced` is not defined in the cells shown here (only
# `id_vars` is) — confirm it is set before this cell on a fresh kernel.
enhanced_columns = [*id_vars_enhanced, *annotation_vars]
enhanced_df = enhanced_data[enhanced_columns].copy().merge(
    quant_df,
    on=['uid', 'data_round'],
    how='left',
)

> ### Interpretability tests on Human Data

In [18]:
# Regressors passed to the interpretability tests as categorical variables.
categorical_vars = [
    'refugee',
    'hh_head_sex',
    'eld_sex',
    'parent_reledu',
]

# Regressors passed to the interpretability tests as continuous variables.
continuous_vars = [
    'num_child', 'hh_head_age', 'parent_eduyears', 'eld_age',
    'hh_asset_index', 'hh_income', 'int_trauma_exp',
]

> ## Human

In [35]:
# Count of distinct (non-missing) `uid` values in the enhanced frame; the
# coefficient loops use it to derive `n_machine` as total_uids - train size.
total_uids = enhanced_df['uid'].nunique(dropna=True)

In [36]:
# Maps each annotation to the regressor whose fitted coefficient is tracked.
coef_checklist = {
    'ability_low': 'refugee',
    'secular': 'eld_sex',
}

In [40]:
# Build one record per (train sample size, bootstrap run, annotation) holding the
# tracked regressor's coefficient for the human-annotated sample.
# NOTE(review): `human_df` is not defined in the cells shown here — confirm it
# exists on a fresh kernel before this cell runs.
human_coefs = []
group_keys = ['train_sample_size', 'bootstrap_run']

for (n_human, run_id), group_df in human_df.groupby(group_keys):

    # Fit the interpretability models for this group via `fit_all`.
    fitted = tests.Interpretability(
        group_df,
        annotation_vars=annotation_vars,
        categorical_regressors=categorical_vars,
        continuous_regressors=continuous_vars,
    ).fit_all()

    for target, term in coef_checklist.items():
        params = fitted.get_model_params(target)
        # Parameter names are matched by substring; `.pop()` takes the last
        # match and raises IndexError when nothing matches.
        matching = [value for name, value in params.items() if term in name]
        human_coefs.append({
            'annotation': target,
            'n_human': n_human,
            'n_machine': total_uids - n_human,
            'bootstrap_run': run_id,
            'regressor': term,
            'coef': matching.pop(),
            'sample': 'Human',
        })

human_coef_data = pd.DataFrame(human_coefs)

In [41]:
# Build one record per (train sample size, bootstrap run, annotation) holding the
# tracked regressor's coefficient for the enhanced sample.
enh_coefs = []
group_keys = ['train_sample_size', 'bootstrap_run']

for (n_human, run_id), group_df in enhanced_df.groupby(group_keys):

    # Fit the interpretability models for this group via `fit_all`.
    fitted = tests.Interpretability(
        group_df,
        annotation_vars=annotation_vars,
        categorical_regressors=categorical_vars,
        continuous_regressors=continuous_vars,
    ).fit_all()

    for target, term in coef_checklist.items():
        params = fitted.get_model_params(target)
        # Parameter names are matched by substring; `.pop()` takes the last
        # match and raises IndexError when nothing matches.
        matching = [value for name, value in params.items() if term in name]
        enh_coefs.append({
            'annotation': target,
            'n_human': n_human,
            'n_machine': total_uids - n_human,
            'bootstrap_run': run_id,
            'regressor': term,
            'coef': matching.pop(),
            'sample': 'Enhanced',
        })

enh_coef_data = pd.DataFrame(enh_coefs)

In [43]:
import plotly.express as px

In [46]:
# Enhanced-sample coefficient records for the 'ability_low' annotation only.
enh_coef_a = enh_coef_data.loc[enh_coef_data['annotation'] == 'ability_low']

In [57]:
# Human-sample coefficient records for the 'ability_low' annotation only.
human_coef_a = human_coef_data.loc[human_coef_data['annotation'] == 'ability_low']

In [None]:
import matplotlib.pyplot as plt

# Grid of panels sharing both axes; annotation variables fill it row by row.
# NOTE(review): `human_interp_data` and `enhanced_interp_data` are not defined in
# the cells shown here; this cell reads their 'annotation', 'train_size' and
# 'log_fstat' columns — confirm they exist on a fresh kernel.
nrows, ncols = 6, 4
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12,12),
                         sharex=True, sharey=True,
                        )
axes_flat = axes.ravel()

for a, annotation_var in enumerate(annotation_vars):

    ax = axes_flat[a]

    # Filter each frame once per annotation and reuse the subset below.
    human_sub = human_interp_data.loc[human_interp_data['annotation']==annotation_var]
    enh_sub   = enhanced_interp_data.loc[enhanced_interp_data['annotation']==annotation_var]

    # Scatter plots: one point per row of each subset
    ax.scatter(human_sub['train_size'].values, human_sub['log_fstat'].values,
               marker="o", color='blue', s=10, alpha=0.2,)
    ax.scatter(enh_sub['train_size'].values, enh_sub['log_fstat'].values,
               marker="o", color='green', s=10, alpha=0.2, )

    # Mean plots: select 'log_fstat' before aggregating so the string
    # 'annotation' column is never averaged (pandas >= 2.0 raises TypeError
    # on non-numeric columns instead of silently dropping them).
    human_mean = human_sub.groupby('train_size')['log_fstat'].mean()
    enh_mean   = enh_sub.groupby('train_size')['log_fstat'].mean()

    ax.plot(human_mean.index, human_mean.values, color='blue', label='Human', lw=1)
    ax.plot(enh_mean.index, enh_mean.values, color='green', label='Enhanced', lw=1)

    ax.set_title(annotation_var, fontsize=10)
    ax.grid(linewidth=0.5, color='gray', linestyle='--')

# Shared axis labels placed on the figure rather than on individual panels.
fig.text(0.5, 0.08, 'Human annotated sample size', ha='center', fontsize=12)
fig.text(0.08, 0.5, 'log F statistic', va='center', rotation='vertical', fontsize=12)

# Legend entries come from the last panel drawn; every panel uses the same two labels.
handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels,title='F-statistic', loc='lower center', ncol=2)

fig.suptitle("Figure 15: F-statistic test for interpretability increases with Nh (Holding N fixed)", fontsize=16, )
fig.subplots_adjust(hspace=0.3, wspace=0.05, )

plt.show()