In [1]:
from tqdm.notebook import tqdm
import os
import sys
import random
import numpy as np
import pandas as pd

# Location of the local isquash checkout. Set the ISQUASH_DIR environment
# variable to point at a clone elsewhere (another machine, another OS); when
# it is unset or empty the original author's path is used as the fallback.
git_dir = os.environ.get("ISQUASH_DIR") or "/Users/adityachhabra/Github/isquash"

# Make both the repository root and the package directory importable.
# Entries already on sys.path are skipped so that re-running this cell does
# not keep appending duplicates.
for _path in (git_dir, os.path.join(git_dir, 'isquash')):
    if _path not in sys.path:
        sys.path.append(_path)

# Allow up to 500 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

In [2]:
from isquash import nlpmodel, preprocessing, evaluation, crossval, tests

In [3]:
# Read the two input CSVs (paths are relative to the working directory):
#   enhanced_df <- data/enhanced_bootstrap_mean.csv
#   quant_df    <- data/quant_data.csv
enhanced_df, quant_df = map(
    pd.read_csv,
    ("data/enhanced_bootstrap_mean.csv", "data/quant_data.csv"),
)

In [4]:
# Keep only the rows flagged annotated == 1. The flag is constant after the
# filter, so it is dropped and the index is renumbered from 0.
is_annotated = enhanced_df['annotated'].eq(1)
annotated_enh_df = (
    enhanced_df[is_annotated]
    .drop(columns='annotated')
    .reset_index(drop=True)
)

In [5]:
# Left join: every annotated row is kept, and the quant_df columns are
# attached wherever (uid, data_round) matches.
merge_keys = ['uid', 'data_round']
data = annotated_enh_df.merge(quant_df, how='left', on=merge_keys)

### Bias

In [6]:
# Columns passed to the bias tests as categorical regressors.
categorical_vars = [
    'refugee',
    'hh_head_sex',
    'eld_sex',
    'parent_reledu',
    'data_round',
]

# Columns passed to the bias tests as continuous regressors.
continuous_vars = [
    'num_child', 'hh_head_age', 'parent_eduyears', 'eld_age',
    'hh_asset_index', 'hh_income', 'int_trauma_exp',
]

# Annotation columns, one bias test per entry (kept in alphabetical order).
annotation_vars = [
    'ability_high', 'ability_low',
    'awareness_information_high', 'awareness_information_low',
    'budget_high', 'budget_low',
    'camp_regulations',
    'covid_impacts',
    'education_high', 'education_low', 'education_neutral', 'education_religious',
    'entrepreneur',
    'job_secular',
    'marriage',
    'migration',
    'no_ambition',
    'public_assistance',
    'reliance_on_god',
    'religious',
    'secular',
    'vague_job', 'vague_non_specific',
    'vocational_training',
    'worries_anxieties',
]

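### Bias tests (for each bootstrap run)

The bias tests are fitted separately on every bootstrap run; for each annotation we keep the F statistic, its log, and the p-value.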

In [7]:
# Fit the bias tests once per bootstrap run and collect one record per
# (bootstrap run, annotation): the F statistic, its natural log, and the
# p-value read from each fitted result.
datasets = []
for b, boot_df in data.groupby('bootstrap_run'):
    bias = tests.Bias(
        boot_df,
        annotation_vars=annotation_vars,
        continuous_regressors=continuous_vars,
        categorical_regressors=categorical_vars,
    )
    bias.fit_all()

    for annotation, result in bias.model_fits.items():
        f_stat = result.fvalue
        datasets.append({
            'bootstrap_run': b,
            'annotation': annotation,
            'fstat_enh': f_stat,
            'log_fstat_enh': np.log(f_stat),
            'pval_enh': result.f_pvalue,
        })

# One row per record, built in a single call rather than grown row by row.
bias_df = pd.DataFrame(datasets)

> ### Significance 

In [8]:
# Label every row with the significance bucket of its p-value:
#   p <= 0.01          -> '<1%'
#   0.01 < p <= 0.05   -> '<5%'
#   everything else    -> '>5%'  (a missing p-value also stays here, because
#                                 comparisons against NaN are False)
pvals = bias_df['pval_enh']
at_1pct = pvals <= 0.01
at_5pct = (pvals > 0.01) & (pvals <= 0.05)

bias_df['sig_level'] = '>5%'
bias_df.loc[at_5pct, 'sig_level'] = '<5%'
bias_df.loc[at_1pct, 'sig_level'] = '<1%'

In [9]:
# Marker colour for each significance bucket.
sig_palette = {'>5%': 'lightgreen', '<5%': 'yellow', '<1%': 'red'}
bias_df['marker_color'] = bias_df['sig_level'].replace(sig_palette)

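### Bias test (average across bootstrap runs)

Here the data are first averaged over the bootstrap runs, and the bias tests are then fitted once on the averaged data.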

In [10]:
# Collapse the bootstrap runs: one row per (uid, data_round, refugee_status)
# holding the mean of every remaining column.
grouped_df = (
    data
    .drop(columns='bootstrap_run')
    .groupby(['uid', 'data_round', 'refugee_status'])
)
avg_data = grouped_df.mean(numeric_only=False).reset_index()

# Fit the bias tests once on the averaged data.
bias = tests.Bias(
    avg_data,
    annotation_vars=annotation_vars,
    continuous_regressors=continuous_vars,
    categorical_regressors=categorical_vars,
)
bias.fit_all()

# One record per annotation: F statistic, its natural log, and the p-value.
avg_records = []
for annotation, result in bias.model_fits.items():
    f_stat = result.fvalue
    avg_records.append({
        'annotation': annotation,
        'fstat_enh': f_stat,
        'log_fstat_enh': np.log(f_stat),
        'pval_enh': result.f_pvalue,
    })

bias_avg_df = pd.DataFrame(avg_records)

> ### Significance 

In [11]:
# Significance bucket of each p-value:
#   p <= 0.01          -> '<1%'
#   0.01 < p <= 0.05   -> '<5%'
#   everything else    -> '>5%'  (a missing p-value also stays here, because
#                                 comparisons against NaN are False)
avg_pvals = bias_avg_df['pval_enh']
avg_at_1pct = avg_pvals <= 0.01
avg_at_5pct = (avg_pvals > 0.01) & (avg_pvals <= 0.05)

bias_avg_df['sig_level'] = '>5%'
bias_avg_df.loc[avg_at_5pct, 'sig_level'] = '<5%'
bias_avg_df.loc[avg_at_1pct, 'sig_level'] = '<1%'

# Marker colour for each significance bucket.
avg_sig_palette = {'>5%': 'lightgreen', '<5%': 'yellow', '<1%': 'red'}
bias_avg_df['marker_color'] = bias_avg_df['sig_level'].replace(avg_sig_palette)

In [27]:
import plotly.graph_objects as go

# Order both frames by annotation name, descending, so the two traces list
# the annotations in the same order.
bias_df = bias_df.sort_values('annotation', ascending=False, ignore_index=True)
bias_avg_df = bias_avg_df.sort_values('annotation', ascending=False, ignore_index=True)

# Bootstrap trace: one small, semi-transparent marker per row of bias_df,
# with fill and outline both coloured by significance bucket.
bootstrap_trace = go.Scatter(
    x=bias_df['log_fstat_enh'],
    y=bias_df['annotation'],
    mode='markers',
    name='Bootstrap',
    legendgroup='Bootstrap',
    marker=dict(
        size=8,
        opacity=0.7,
        color=bias_df['marker_color'],
        line_width=1,
        line_color=bias_df['marker_color'],
    ),
)

# Mean trace: one larger, opaque marker per row of bias_avg_df, outlined in
# black so it stands out from the bootstrap markers.
mean_trace = go.Scatter(
    x=bias_avg_df['log_fstat_enh'],
    y=bias_avg_df['annotation'],
    mode='markers',
    name='Mean',
    legendgroup='Mean',
    marker=dict(
        size=10,
        opacity=1,
        color=bias_avg_df['marker_color'].values,
        line_width=2,
        line_color='black',
    ),
)

fig = go.Figure()
fig.add_trace(bootstrap_trace)
fig.add_trace(mean_trace)

# Both axes share the same frame, grid and tick styling; only the title differs.
axis_style = dict(
    gridwidth=0.75,
    gridcolor='lightgray',
    ticklen=5,
    ticks='outside',
    dtick=1,
    mirror=True,
    showline=True,
    linecolor='black',
    zeroline=True,
    zerolinecolor='lightgray',
)
fig.update_yaxes(title=dict(text='Annotation', standoff=10), **axis_style)
fig.update_xaxes(title=dict(text='log F statistic', standoff=10), **axis_style)

# Kept as the final expression of the cell so the notebook renders the
# returned figure.
fig.update_layout(
    width=800,
    height=800,
    margin=dict(pad=0, l=0, t=100, r=0, b=100),
    title=dict(
        text='Figure 8: Bias test for each annotation',
        x=0.5,
        font_size=18,
        font_family='Arial',
    ),
    legend=dict(title='', title_font_family="Arial", orientation='v'),
    plot_bgcolor='white',
)