# Thompson Input Formatting Example

In [1]:
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing.

    Used below as a drop-in replacement for ``warnings.warn`` so that
    warnings raised through that function are silently discarded.
    """
    return None


# Replace the module-level hook: code that calls ``warnings.warn(...)`` after
# this point reaches the no-op above instead of emitting a warning.
warnings.warn = warn

import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import itertools

from numpy.random import randn
from numpy.matlib import repmat

from scipy.stats import norm
from scipy.optimize import fmin
from scipy.special import erf

from patsy import dmatrices

from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.linear_model import LogisticRegressionCV, LinearRegression, LassoCV, RidgeClassifierCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MultiLabelBinarizer
from sklearn.feature_selection import chi2
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Load the raw observations, then keep only rows flagged as year 2 whose
# 6-week employment outcome is recorded as '0' or '1'.
# NOTE(review): both filters compare against strings, so these columns are
# presumably read as text rather than integers -- confirm against the CSV.
job_seekers = (
    pd.read_csv('/home/nolski/Downloads/test.csv')
    .loc[lambda frame: frame['year2'] == '1']
    .loc[lambda frame: frame['employed_6_week'].isin(['0', '1'])]
)

In [3]:
# Levels of each stratification variable. Education levels are strings while
# employment levels are integers.
# NOTE(review): presumably this mirrors the dtypes in `job_seekers` -- confirm.
nationality, gender = ['syrian', 'jordanian'], ['male', 'female']
secondary, work = ['0', '1'], [0, 1]
lists = [nationality, gender, secondary, work]
cols = ['nationality', 'gender', 'above_secondary_edu', 'ever_employed']

# One row per combination of levels (2 * 2 * 2 * 2 = 16 strata).
# itertools.product varies the last factor fastest, which fixes the row order
# and therefore the strata codes derived from row positions.
stratum = pd.DataFrame(
    data=list(itertools.product(nationality, gender, secondary, work)),
    columns=cols,
)
# Persist the strata lookup table. No `index=False` is passed, so the row
# index is written as an extra leading column (pandas default).
stratum.to_csv('/home/nolski/Downloads/stratum.csv')

## Getting Strata Code

Now, to get the code of a stratum, we create a row from the input (nationality, gender, above_secondary_edu, ever_employed) and look it up in our stratum table.

In [4]:
# Display the strata lookup table: one row per combination of covariate levels.
stratum

Unnamed: 0,nationality,gender,above_secondary_edu,ever_employed
0,syrian,male,0,0
1,syrian,male,0,1
2,syrian,male,1,0
3,syrian,male,1,1
4,syrian,female,0,0
5,syrian,female,0,1
6,syrian,female,1,0
7,syrian,female,1,1
8,jordanian,male,0,0
9,jordanian,male,0,1


In [5]:
# A single example job seeker used to demonstrate the strata lookup.
# The value types follow the stratum table: education is the string '0',
# employment is the integer 1.
test = pd.DataFrame(
    data=[['syrian', 'female', '0', 1]],
    columns=['nationality', 'gender', 'above_secondary_edu', 'ever_employed'],
)
test

Unnamed: 0,nationality,gender,above_secondary_edu,ever_employed
0,syrian,female,0,1


Now we will use the function below to get the strata code of `test`.

In [6]:
def set_strata(j, strata=None):
    """Return the 1-based strata code for one job seeker.

    The code is the index label of the first row of the strata table whose
    four covariates all equal those of ``j``, plus one. With the default
    0..n-1 index this is the 1-based row number of the matching stratum.

    Parameters
    ----------
    j : mapping or pandas.Series
        Must provide 'nationality', 'gender', 'above_secondary_edu' and
        'ever_employed'. Values are compared with ``==``, so their types must
        match the strata table (the string '0' does not match the integer 0).
    strata : pandas.DataFrame, optional
        Lookup table with the same four columns. Defaults to the
        notebook-level ``stratum`` table, so existing calls such as
        ``df.apply(set_strata, axis=1)`` behave as before.

    Returns
    -------
    int
        Index label of the first matching row, plus one.

    Raises
    ------
    IndexError
        If no row of the strata table matches ``j``.
    """
    if strata is None:
        # Fall back to the table built earlier in the notebook.
        strata = stratum

    keys = ['nationality', 'gender', 'above_secondary_edu', 'ever_employed']

    # AND together one equality mask per covariate.
    matches = strata[keys[0]] == j[keys[0]]
    for key in keys[1:]:
        matches = matches & (strata[key] == j[key])

    hits = strata[matches].index
    if len(hits) == 0:
        # Same exception type as the bare `.index[0]` lookup would raise, but
        # with a message that names the offending values.
        raise IndexError(
            "no stratum matches {!r}; check the values and their types "
            "(e.g. '0' vs 0)".format([j[key] for key in keys])
        )
    return hits[0] + 1
    

In [7]:
# axis=1 passes each row of `test` to set_strata, giving one strata code per row.
test.apply(set_strata, axis=1)

0    6
dtype: int64

You can see that the strata code given is `6`. If you look at row 5 of the stratum table (the rows are 0-indexed, while Thompson is 1-indexed), you can see that it matches `test`. We do this for every 6-week observation in our database.

In [8]:
def has_intervention(j, i):
    """Return 1 if record ``j`` received intervention ``i``, otherwise 0.

    ``j`` is any mapping-like row exposing 'actual_intervention_received';
    ``i`` is the intervention label to compare against.
    """
    received = j['actual_intervention_received']
    if received == i:
        return 1
    return 0

In [9]:
# Assemble the Thompson input table, one row per retained job seeker:
#   covar1      -- 1-based strata code from set_strata
#   treatment1-3 -- 0/1 indicators for the cash / information / psychological arms
#   outcome     -- the 6-week employment flag, copied as-is
# Every column is derived from `job_seekers`, so they all share its index.
new = pd.DataFrame({
    'covar1': job_seekers.apply(set_strata, axis=1),
    'treatment1': job_seekers.apply(has_intervention, args=('cash',), axis=1),
    'treatment2': job_seekers.apply(has_intervention, args=('information',), axis=1),
    'treatment3': job_seekers.apply(has_intervention, args=('psychological',), axis=1),
    'outcome': job_seekers['employed_6_week'],
})

In [10]:
# Inspect the strata code assigned to each retained job seeker.
new['covar1'].values

array([ 3,  2,  5,  6,  2,  5,  9, 12,  2, 10,  2, 10,  1, 14,  2,  4,  2,
        5,  2,  5, 10,  1, 10,  2, 13,  5,  2,  5,  1,  2,  6,  6,  6,  6,
        5,  1,  9, 11,  5,  2,  5,  2, 10,  1,  2,  5,  2,  5,  1,  5, 14,
        1,  9,  5, 13,  9,  1, 14,  1, 10, 14, 10, 14,  2, 14, 10,  9, 14,
       10,  9, 10, 10, 13, 14, 10,  1, 10,  9,  9,  9, 14, 10,  9,  6,  6,
       13,  9,  5, 14, 10,  9,  5, 12])

In [11]:
# Recode the outcome from '0'/'1' to 'FALSE'/'TRUE'.
# The result is assigned back to the column rather than calling
# `replace(..., inplace=True)` on `new['outcome']`: that form mutates a
# column selection, which under pandas Copy-on-Write leaves `new` unchanged
# (and triggers a chained-assignment warning in pandas 2.x).
new['outcome'] = new['outcome'].replace({'0': 'FALSE', '1': 'TRUE'})

In [12]:
# Write the Thompson input file; index=False omits the row index so only the
# covar1 / treatment* / outcome columns are written.
new.to_csv('/home/nolski/Downloads/thompson-input.csv', index=False)