### Imports

In [1]:
import os
import numpy as np
import pandas as pd
from iqual import  tests

### Load datasets

In [70]:
# Directory containing the input CSV files, relative to this notebook.
data_dir = "../data"

# Enhanced qualitative data
enhanced_path = os.path.join(data_dir, "enhanced_train_sizes.csv")
bootstrap_df = pd.read_csv(enhanced_path)

# Quantitative data
quant_path = os.path.join(data_dir, "quant_data.csv")
quant_df = pd.read_csv(quant_path)

### Merge Data

In [72]:
# Left-join the quantitative variables onto every enhanced row, matching on
# respondent id and data round; rows without a match keep NaN in those columns.
bootstrap_df = bootstrap_df.merge(
    quant_df,
    how='left',
    on=['uid', 'data_round'],
)

In [73]:
# Dummy variable: 1 for rows whose data_round is 'R3', 0 for every other row.
bootstrap_df['round_R3'] = (bootstrap_df['data_round'] == 'R3').astype('int64')

> ### Select variables of interest

In [74]:
# Quantitative (survey) covariates of interest.
numerical_vars = [
    'round_R3',         # Dummy variable for R3
    'refugee',          # Refugee
    'hh_head_sex',      # Female HH head
    'eld_sex',          # Female eldest child
    'parent_reledu',    # Religiously educated parent
    'num_child',        # Number of children
    'hh_head_age',      # Age of HH head
    'parent_eduyears',  # Parent's years of education
    'eld_age',          # Age of eldest child
    'hh_asset_index',   # HH asset index
    'hh_income',        # HH income
    'int_trauma_exp',   # Trauma experience
]

# Qualitative annotation codes.
annotation_vars = [
    "ability_high",
    "ability_low",
    "awareness_information_high",
    "awareness_information_low",
    "budget_high",
    "budget_low",
    "covid_impacts",
    "education_high",
    "education_low",
    "education_neutral",
    "education_religious",
    "entrepreneur",
    "job_secular",
    "marriage",
    "migration",
    "no_ambition",
    "public_assistance",
    "reliance_on_god",
    "religious",
    "secular",
    "vague_job",
    "vague_non_specific",
    "vocational_training",
    "worries_anxieties",
]

In [75]:
# Inspect the merged frame (quantitative columns joined, round_R3 dummy added).
bootstrap_df

Unnamed: 0,uid,refugee_status,bootstrap_run,train_sample_size,split,data_round,religious_act,secular_act,no_ambition_act,vague_job_act,...,eld_e01_edu_status_,int_sex,int_age,int_eduyears,int_reledu,int_trauma_exp,int_trauma_heard,int_trauma_witness,hh_in_d2,round_R3
0,C601001009R2,refugee,1,100,test,R2,0.0000,0.1667,0.0000,0.3333,...,,0.0000,50.0000,0.0000,1.0000,2.0000,2.0000,5.0000,,0
1,C601001009R3,refugee,1,100,test,R3,0.0000,0.1176,0.0000,0.0588,...,,0.0000,50.0000,0.0000,1.0000,2.0000,2.0000,5.0000,1.0000,1
2,C601002009R2,refugee,1,100,test,R2,0.0000,0.0000,0.0000,0.1250,...,,1.0000,46.0000,0.0000,0.0000,6.0000,3.0000,3.0000,,0
3,C601003005R3,refugee,1,100,test,R3,0.0000,0.0000,0.0000,0.0000,...,,1.0000,26.0000,4.0000,0.0000,0.0000,3.0000,7.0000,1.0000,1
4,C602004004R3,refugee,1,100,test,R3,0.0500,0.0000,0.0000,0.0000,...,,0.0000,46.0000,0.0000,0.0000,0.0000,5.0000,4.0000,1.0000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192555,H857013004R3,host,10,789,unannotated,R3,,,,,...,,1.0000,32.0000,9.0000,0.0000,0.0000,5.0000,0.0000,1.0000,1
192556,H857013008R3,host,10,789,unannotated,R3,,,,,...,,0.0000,32.0000,0.0000,0.0000,3.0000,8.0000,1.0000,1.0000,1
192557,H857025002R2,host,10,789,unannotated,R2,,,,,...,,0.0000,35.0000,7.0000,0.0000,0.0000,2.0000,4.0000,,0
192558,H857025003R2,host,10,789,unannotated,R2,,,,,...,,,,,,,,,,0


In [89]:
# Standard errors of the mean of one annotation code, per bootstrap run.
#
# Within each run the rows are split into
#   * the human sample   : split != 'unannotated' (observed `<annotation>_act`)
#   * the machine sample : split == 'unannotated' (predicted `<annotation>_pred`)
# and the residual variance is estimated on the 'test' split (actual - predicted).
#
#   se_h   = sqrt(sig2_h / n_h)
#   se_m   = sqrt((sig2_m + sig2_eps) / n_m)
#   se_enh = sqrt((n_h*sig2_h + n_m*(sig2_m + sig2_eps)) / (n_h + n_m)**2)
annotation = 'ability_high'
act_col = annotation + '_act'    # observed annotation
pred_col = annotation + '_pred'  # predicted annotation

se_records = []  # one dict per bootstrap run
for b in range(1, 11):
    input_df = bootstrap_df[bootstrap_df.bootstrap_run == b]

    # Build each split mask once per run instead of once per statement.
    is_machine = input_df['split'] == 'unannotated'
    is_human = input_df['split'] != 'unannotated'
    is_test = input_df['split'] == 'test'

    # Sample sizes are counted in unique respondents, not rows.
    n_m = input_df.loc[is_machine, 'uid'].nunique()
    n_h = input_df.loc[is_human, 'uid'].nunique()
    n_sum = n_h + n_m

    # Means in the human and machine samples.
    mu_h = input_df.loc[is_human, act_col].mean()
    mu_m = input_df.loc[is_machine, pred_col].mean()

    # Variance of observed annotations and of predictions. The earlier duplicate
    # `sig2_h` computed with the mask `split != 'annotated'` was immediately
    # overwritten (and matched every row), so it has been dropped.
    sig2_h = input_df.loc[is_human, act_col].var()
    sig2_m = input_df.loc[is_machine, pred_col].var()

    # Variance of the prediction residuals on the held-out test split.
    boot_resid = input_df.loc[is_test, act_col] - input_df.loc[is_test, pred_col]
    sig2_eps = boot_resid.var()

    se_h = np.sqrt(sig2_h / n_h)
    se_m = np.sqrt((sig2_m + sig2_eps) / n_m)
    se_enh = np.sqrt(((n_h * sig2_h) + (n_m * (sig2_m + sig2_eps))) / n_sum**2)

    se_records.append({
        'bootstrap_run': b,
        'n_h': n_h,
        'n_m': n_m,
        'mu_h': mu_h,
        'mu_m': mu_m,
        'sig2_h': sig2_h,
        'sig2_m': sig2_m,
        'sig2_eps': sig2_eps,
        'se_h': se_h,
        'se_m': se_m,
        'se_enh': se_enh,
    })

# Keep every run's results; previously only the last run's values survived the loop.
se_by_run = pd.DataFrame(se_records)

# NOTE: as before, this line reports the final bootstrap run only;
# see `se_by_run` for all runs.
print(f"se_h: {se_h:.4f} se_m: {se_m:.4f} se_enh: {se_enh:.4f}")

se_h: 0.0029 se_m: 0.0028 se_enh: 0.0021


```
mu_h     : Mean in human sample
mu_m     : Mean in machine sample
sig2_h   : Variance of observed annotations in data
sig2_eps : Variance of bootstrapped residuals
sig2_m   : Variance of bootstrapped predictions
se_h     : Standard error in human sample
se_m     : Standard error in machine sample
se_enh   : Standard error in enhanced sample

for annot in qual_vars:
    row   = compute_se(enh_df, annot, use_oob=False, dec_place=4)
    eff_df = pd.concat([eff_df, row])
eff_df = eff_df[["annot", "sig2_h", "sig2_m", "sig2_eps", "se_h", "se_enh"]]
stargazer(as.matrix(eff_df), title="Measurement error variances",
          table.placement="H", label="tab:error_variances")

# Refugee versus host
# Refugees
input_df = filter(enh_df, refugee == 1)
ref_eff_df = pd.DataFrame()
for annot in qual_vars:
  row = compute_se(input_df, annot, use_oob=False, dec_place=4)
  ref_eff_df = pd.concat([ref_eff_df, row])
mean(input_df$ability_high, na.rm=T)
# Hosts
input_df = filter(enh_df, refugee == 0)
host_eff_df = pd.DataFrame()
for annot in qual_vars:
  row = compute_se(input_df, annot, use_oob=False, dec_place=4)
  host_eff_df = pd.concat([host_eff_df, row])
mean(input_df$ability_high, na.rm=T)
```