# Desert Poverty Lab: Measuring Poverty One Country at a Time
## Roadmap
1. Download the data (once)
2. Inspect the data for each country on train and test sets
3. Impute missing values
4. Test baseline model
5. Perform cross validation and feature selection

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool, cv, CatboostIpythonWidget
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
# Shared random seed; handed to CatBoostClassifier (random_seed) and to cv (seed).
rdn = 42
# Country codes; interpolated into the per-country CSV file names when loading.
countries = ['A', 'B', 'C']

In [4]:
# download data (just once!)
# import wget
# url = 'https://s3.amazonaws.com/drivendata/data/50/public/'
# levels = ['hhold', 'indiv']
# types = ['train', 'test']
# wget.download(url + 'submission_format.csv', './input/submission_format.csv')
# [wget.download(f'{url}{x}_{y}_{z}.csv' , f'./input/{x}_{y}_{z}.csv') for x in countries for y in levels for z in types]  

'./input/submission_format.csv'

In [2]:
# Load every CSV into its own DataFrame, one list entry per country.
# The frames are kept separate because the countries have different columns.
hhld_train, hhld_test, indiv_train, indiv_test = [], [], [], []
for x in countries:
    hhld_train.append(pd.read_csv(f'./input/{x}_hhold_train.csv', index_col=['id']))
    hhld_test.append(pd.read_csv(f'./input/{x}_hhold_test.csv', index_col=['id']))
    indiv_train.append(pd.read_csv(f'./input/{x}_indiv_train.csv', index_col=['iid', 'id']))
    indiv_test.append(pd.read_csv(f'./input/{x}_indiv_test.csv', index_col=['iid', 'id']))
template = pd.read_csv('./input/submission_format.csv', index_col=['id'])

In [73]:
# EDA for the train and test data
## Check for missing values (total number of NaN cells per country)
print([x.isnull().sum().sum() for x in hhld_train])
# print([x.isnull().sum().sum() for x in hhld_test])

## inspect poverty distribution
print([x.poor.value_counts() for x in hhld_train])

## inspect integers and range
print([x.describe() for x in hhld_train])

## inspect number of categories (largest cardinality among the object columns)
# The builtin `object` replaces the `np.object` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24; the alias always pointed at the builtin.
print([np.max(x.iloc[:, np.where(x.dtypes == object)[0]].nunique()) for x in hhld_train])

[0, 16784, 0]
[False    4500
True     3703
Name: poor, dtype: int64, False    3004
True      251
Name: poor, dtype: int64, False    5496
True      973
Name: poor, dtype: int64]
[          nEsgxvAq     OMtioXZZ     YFMZwKrU     TiwRslOh
count  8203.000000  8203.000000  8203.000000  8203.000000
mean     -7.590638    17.464464    -2.985615    -4.191028
std       5.810942    10.853654     0.896245     4.472567
min     -70.000000  -127.000000    -4.000000   -31.000000
25%     -10.000000    12.000000    -4.000000    -7.000000
50%      -4.000000    12.000000    -3.000000    -3.000000
75%      -4.000000    21.000000    -2.000000    -1.000000
max      -4.000000   111.000000     1.000000     3.000000,           wJthinfa     ZvEApWrk     vuQrLzvK    FGWqGkmD     qrOrXLPM  \
count  3255.000000  3255.000000  3255.000000  602.000000  3255.000000   
mean     43.381260    96.040860    17.427343   -7.509967    22.203379   
std      22.728441   105.556895    72.057949    9.499141     6.962658   
min    

In [35]:
# Keep only the columns of country B (list position 1) that contain at least
# one NaN, then print a summary of each subset: train first, test second.
b_train = hhld_train[1]
b_train_na_cols = b_train.columns[b_train.isnull().any()]
train_b_na = b_train[b_train_na_cols]
print(train_b_na.info())

b_test = hhld_test[1]
b_test_na_cols = b_test.columns[b_test.isnull().any()]
test_b_na = b_test[b_test_na_cols]
print(test_b_na.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3255 entries, 57071 to 4923
Data columns (total 9 columns):
FGWqGkmD    602 non-null float64
BXOWgPgL    2504 non-null float64
umkFMfvA    890 non-null float64
McFBIGsm    2504 non-null float64
IrxBnWxE    272 non-null float64
BRzuVmyf    1794 non-null float64
dnlnKrAg    532 non-null float64
aAufyreG    909 non-null float64
OSmfjCbE    2504 non-null float64
dtypes: float64(9)
memory usage: 254.3 KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1604 entries, 9135 to 52739
Data columns (total 9 columns):
FGWqGkmD    317 non-null float64
BXOWgPgL    1243 non-null float64
umkFMfvA    441 non-null float64
McFBIGsm    1243 non-null float64
IrxBnWxE    139 non-null float64
BRzuVmyf    881 non-null float64
dnlnKrAg    244 non-null float64
aAufyreG    428 non-null float64
OSmfjCbE    1245 non-null float64
dtypes: float64(9)
memory usage: 125.3 KB
None


The missing values in country B are concentrated in 9 numeric columns. To deal with the NaNs, try the following strategies:
1. drop columns with NA
2. set extreme value -99999
3. impute central tendency (mean, median, etc)

In [1]:
def make_cv(name, train, n):
    """Cross-validate a CatBoost classifier on one training frame.

    Args:
        name: Run name; also the sub-directory of ``models/`` used as
            CatBoost's ``train_dir``.
        train: DataFrame with the features plus the ``poor`` target and a
            ``country`` column (both are removed from the feature matrix).
        n: Number of boosting iterations.

    Returns:
        The result of ``catboost.cv`` for the pool and parameters built here.
    """
    # TODO: scale numerics, drop single level columns, impute missing
    X = train.drop(['poor', 'country'], axis=1)
    y = train.poor
    # Every column whose dtype is neither float nor int is treated as categorical.
    # Builtin `float`/`int` replace the `np.float`/`np.int` aliases, which were
    # deprecated in NumPy 1.20 and removed in 1.24 (they were the builtins).
    is_categorical = np.logical_and(X.dtypes != float, X.dtypes != int)
    cat_ind = np.where(is_categorical)[0]
    pool = Pool(X.values, y.values, cat_features=cat_ind)
    model = CatBoostClassifier(train_dir=f'models/{name}/', task_type='GPU', name=name, iterations=n, loss_function='Logloss', random_seed=rdn)
    # The classifier is only used as a container for the parameter dict given to cv().
    scores = cv(pool, model.get_params(), stratified=True, seed=rdn, logging_level='Verbose')
    return scores

def model_train(name, train, n):
    """Fit a CatBoost classifier on one full training frame.

    Args:
        name: Run name; also the sub-directory of ``models/`` used as
            CatBoost's ``train_dir``.
        train: DataFrame with the features plus the ``poor`` target and a
            ``country`` column (both are removed from the feature matrix).
        n: Number of boosting iterations.

    Returns:
        The fitted ``CatBoostClassifier``.
    """
    X = train.drop(['poor', 'country'], axis=1)
    y = train.poor
    # Every column whose dtype is neither float nor int is treated as categorical.
    # Builtin `float`/`int` replace the `np.float`/`np.int` aliases, which were
    # deprecated in NumPy 1.20 and removed in 1.24 (they were the builtins).
    is_categorical = np.logical_and(X.dtypes != float, X.dtypes != int)
    cat_ind = np.where(is_categorical)[0]
    model = CatBoostClassifier(train_dir=f'models/{name}/', task_type='GPU', name=name, iterations=n, loss_function='Logloss', random_seed=rdn)
    model.fit(X, y, cat_features=cat_ind, verbose=True)
    return model

def pred_make(model, X_test, country):
    """Build a submission-shaped frame of predicted probabilities.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_test: Test features whose index has a level named ``id`` and which
            still carry the ``country`` column (dropped before predicting).
        country: Value written into every row of the ``country`` column.

    Returns:
        DataFrame with columns ``id``, ``country`` and ``poor``; ``poor`` is
        the second column of the ``predict_proba`` output.
    """
    ids = X_test.index.get_level_values('id')
    features = X_test.drop('country', axis=1)
    probabilities = model.predict_proba(features)
    return pd.DataFrame({
        'id': ids,
        'country': country,
        'poor': probabilities[:, 1],
    })

def mean_logloss(scores_a, scores_b, scores_c, weights=None):
    """Return the weighted mean of the three countries' best CV logloss.

    Args:
        scores_a: CV result for country A; must expose a
            ``'Logloss_test_avg'`` sequence.
        scores_b: Same, for country B.
        scores_c: Same, for country C.
        weights: Optional per-country weights in A, B, C order. When omitted,
            the row count of each frame in the module-level ``hhld_test`` is
            used, which is the original behaviour.

    Returns:
        Weighted average of the per-country minimum ``Logloss_test_avg``.
    """
    if weights is None:
        # Default: weight each country by the size of its household test set.
        weights = [frame.shape[0] for frame in hhld_test]
    best_losses = [np.min(scores['Logloss_test_avg']) for scores in (scores_a, scores_b, scores_c)]
    return np.average(best_losses, weights=np.asarray(weights))


In [None]:
# clean train_b and examine effects
## try 1: drop every column of country B that contains a NaN, then cross-validate
scores_b = make_cv(name='hhld_dropna_b', train=hhld_train[1].dropna(axis=1), n=600)

In [None]:
# Cross-validate country A on its household training frame.
scores_a = make_cv(name='hhld_a', train=hhld_train[0], n=2000)

In [None]:
# Cross-validate country C on its household training frame.
scores_c = make_cv(name='hhld_c', train=hhld_train[2], n=500)

In [109]:
scores_a.keys()
# 0-based position of the smallest 'Logloss_test_avg' value in country B's CV result
np.argmin(scores_b['Logloss_test_avg'])

# model_a = model_train('hhld_a', hhld_train[0], )

574

In [None]:
# test train
# np.argmin is the 0-based row of the best CV iteration, so the matching
# number of boosting iterations is argmin + 1 (this also avoids iterations=0).
best_iterations_b = int(np.argmin(scores_b['Logloss_test_avg'])) + 1
model_b = model_train('hhld_dropna_b', hhld_train[1].dropna(axis=1), best_iterations_b)

In [None]:
# np.argmin is the 0-based row of the best CV iteration, so the matching
# number of boosting iterations is argmin + 1 (this also avoids iterations=0).
model_a = model_train('hhld_a', hhld_train[0], int(np.argmin(scores_a['Logloss_test_avg'])) + 1)

In [None]:
# np.argmin is the 0-based row of the best CV iteration, so the matching
# number of boosting iterations is argmin + 1 (this also avoids iterations=0).
model_c = model_train('hhld_c', hhld_train[2], int(np.argmin(scores_c['Logloss_test_avg'])) + 1)

In [116]:
# predict and submit
# One prediction frame per country; NaN-containing test columns are dropped
# before predicting. The frames are stacked and written without the index.
fitted_models = [model_a, model_b, model_c]
per_country = []
for fitted, test_frame, code in zip(fitted_models, hhld_test, countries):
    per_country.append(pred_make(fitted, test_frame.dropna(axis=1), code))
submission = pd.concat(per_country, axis=0)
submission.to_csv('output/submission_b_dropna.csv', index=False)

In [132]:
# calculate score from cv: average of each country's lowest 'Logloss_test_avg',
# weighted by the number of rows in that country's household test set
mean_logloss(scores_a, scores_b, scores_c)


0.17004743833106328

# Incorporate indiv data to hhld

In [117]:
## Check for missing values in indiv data (total number of NaN cells per country)
print([x.isnull().sum().sum() for x in indiv_train])
# print([x.isnull().sum().sum() for x in hhld_test])

## inspect poverty distribution
print([x.poor.value_counts() for x in indiv_train])

## inspect integers and range
print([x.describe() for x in indiv_train])

## inspect number of categories (largest cardinality among the object columns)
# The builtin `object` replaces the `np.object` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24; the alias always pointed at the builtin.
print([np.max(x.iloc[:, np.where(x.dtypes == object)[0]].nunique()) for x in indiv_train])


[6268, 461273, 0]
[True     19684
False    17876
Name: poor, dtype: int64, False    18375
True      1877
Name: poor, dtype: int64, False    22868
True      7045
Name: poor, dtype: int64]
[           OdXpbPGJ      ukWqmeSS
count  31292.000000  37560.000000
mean       8.719129    107.022764
std       21.089956     91.795117
min        4.000000      1.000000
25%        4.000000     36.000000
50%        4.000000     81.000000
75%        4.000000    151.000000
max      214.000000    551.000000,           BoxViLPz     qlLzyqpP     unRAgFtX      TJGiunYp    WmKLEUcd  \
count  5459.000000  1185.000000  1652.000000  12454.000000  354.000000   
mean    -34.296025   -38.275949   -87.351090      0.629758    1.161017   
std      18.357318    30.277305    89.976221      1.801028    3.992848   
min     -68.000000  -177.000000  -644.000000     -1.000000  -19.000000   
25%     -50.000000   -51.000000  -122.000000      0.000000   -1.000000   
50%     -32.000000   -33.000000   -50.000000      0.000000   

In [147]:
# inspect NaN in indiv (train and test same columns of missing data)
# -> number of columns per country that contain at least one NaN
[frame.isna().any().sum() for frame in indiv_train]
# [x.isnull().any().sum() for x in indiv_test]

[1, 28, 0]

In [170]:
# DEBUG: leftjoin indiv to hhld on train A
# indiv_train[0].head()
# pd.Series(['A', 'A', 'B', 'C', 'C']).value_counts().index[0]
# Collapse country A's individual rows to one row per `id`: every object column
# is reduced to its most frequent value within the group.
# The builtin `object` replaces the `np.object` alias removed in NumPy 1.24.
indiv_a_mean = indiv_train[0].loc[:, indiv_train[0].dtypes == object].groupby('id').agg(lambda col: col.value_counts().index[0])
indiv_a_mean.head()
# train_a_concat = pd.concat([hhld_train[0], indiv_a_mean])

Unnamed: 0_level_0,HeUgMnzF,CaukPfUC,MzEtIdUF,gtnNTNam,SWoXNmPc,eXbOkwhI,XONDGWjH,KsFoQcUV,qYRZCuJD,FPQrjGnS,...,XBldkztv,tbgZsPXD,qqVibbSA,MgCoFhXK,rFpoTXAq,RXcLsVAQ,rQWIpTiG,XizJGmbu,xqUooaNJ,country
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14,XJsPz,mOlYV,UFoKR,HIvIU,onRNG,YXCNt,ccbZA,kpkiH,fohru,scxJu,...,tbsMf,yOwsR,QQdHS,uEstx,Hikoa,zQvdC,xUYIC,juMSt,JTCKs,A
18,XJsPz,mOlYV,axSTs,CXizI,onRNG,YXCNt,ccbZA,HgfUG,fohru,scxJu,...,XQevi,yOwsR,QQdHS,uEstx,Hikoa,zQvdC,xUYIC,juMSt,JTCKs,A
36,XJsPz,kzSFB,axSTs,CXizI,onRNG,YXCNt,fOUHD,HgfUG,fohru,HRGCq,...,XQevi,yOwsR,QQdHS,gCSRj,Hikoa,zQvdC,rkLqZ,juMSt,JTCKs,A
39,XJsPz,mOlYV,axSTs,CXizI,onRNG,YXCNt,fOUHD,HgfUG,fohru,scxJu,...,tbsMf,yOwsR,QQdHS,uEstx,Hikoa,zQvdC,rkLqZ,juMSt,JTCKs,A
58,XJsPz,mOlYV,axSTs,CXizI,onRNG,YXCNt,fOUHD,HgfUG,fohru,scxJu,...,tbsMf,yOwsR,QQdHS,uEstx,Hikoa,zQvdC,rkLqZ,FUUXv,JTCKs,A


In [179]:
# Drop NaN-containing columns plus the label/country columns, then collapse the
# individual rows to one row per `id`: object columns -> most frequent value,
# all remaining columns -> mean.
# The builtin `object` replaces the `np.object` alias removed in NumPy 1.24.
indiv_train_dropna = [x.dropna(axis=1).drop(['poor', 'country'], axis=1) for x in indiv_train]
indiv_train_reduced = [
    pd.concat(
        [
            x.loc[:, x.dtypes == object].groupby('id').agg(lambda col: col.value_counts().index[0]),
            x.loc[:, x.dtypes != object].groupby('id').agg('mean'),
        ],
        axis=1,
    )
    for x in indiv_train_dropna
]

In [180]:
# Same reduction as for the training data, on the test frames (no `poor` column
# to drop): object columns -> most frequent value per `id`, others -> mean.
# The builtin `object` replaces the `np.object` alias removed in NumPy 1.24.
indiv_test_dropna = [x.dropna(axis=1).drop(['country'], axis=1) for x in indiv_test]
indiv_test_reduced = [
    pd.concat(
        [
            x.loc[:, x.dtypes == object].groupby('id').agg(lambda col: col.value_counts().index[0]),
            x.loc[:, x.dtypes != object].groupby('id').agg('mean'),
        ],
        axis=1,
    )
    for x in indiv_test_dropna
]

In [210]:
# Put the per-`id` individual aggregates next to the household columns
# (household columns containing NaN are dropped first), country by country.
combined_train = []
for hhld_frame, indiv_frame in zip(hhld_train, indiv_train_reduced):
    combined_train.append(pd.concat([hhld_frame.dropna(axis=1), indiv_frame], axis=1))

combined_test = []
for hhld_frame, indiv_frame in zip(hhld_test, indiv_test_reduced):
    combined_test.append(pd.concat([hhld_frame.dropna(axis=1), indiv_frame], axis=1))

In [211]:
# indiv_train_reduced[0].head()
# hhld_train[0].head()
# combined_train[0].head()
# Number of NaN-containing columns per country, before and after combining,
# for train and then for test.
for frames in (hhld_train, combined_train, hhld_test, combined_test):
    print([frame.isna().any().sum() for frame in frames])

[0, 9, 0]
[0, 0, 0]
[0, 9, 0]
[0, 0, 0]


In [None]:
# Cross-validate each country's combined (household + individual) frame.
cv_run_names = ['combined_a', 'combined_b', 'combined_c']
cv_iterations = [2000, 600, 500]
combined_cvs = [
    make_cv(run_name, frame, n_iter)
    for run_name, frame, n_iter in zip(cv_run_names, combined_train, cv_iterations)
]

In [None]:
# np.argmin is the 0-based row of the best CV iteration, so the matching
# number of boosting iterations is argmin + 1 (this also avoids iterations=0).
combined_models = [model_train(x, y, int(np.argmin(z['Logloss_test_avg'])) + 1) for x, y, z in zip(['combined_a', 'combined_b', 'combined_c'], combined_train, combined_cvs)]

In [228]:
# Predict per country, stack the results, and order the rows like the
# submission template before writing them out with `id` as the index.
per_country = [
    pred_make(fitted, test_frame, code)
    for fitted, test_frame, code in zip(combined_models, combined_test, countries)
]
submission_combined = pd.concat(per_country, axis=0)
submission_combined = submission_combined.set_index('id')
submission_combined = submission_combined.reindex(template.index)
submission_combined.to_csv('output/submission_combined_dropna.csv', index=True)


In [223]:
# Weighted CV score of the combined models (one CV result per country, unpacked in A, B, C order).
mean_logloss(*combined_cvs)

0.1713750497289378

In [None]:
# fillna flow for combined data: NaN -> -99999 sentinel, then per-`id` reduction
# (object columns -> most frequent value, all remaining columns -> median).
# The builtin `object` replaces the `np.object` alias removed in NumPy 1.24.
indiv_train_filled = [x.fillna(-99999).drop(['poor', 'country'], axis=1) for x in indiv_train]
indiv_test_filled = [x.fillna(-99999).drop(['country'], axis=1) for x in indiv_test]
indiv_train_reduced = [
    pd.concat(
        [
            x.loc[:, x.dtypes == object].groupby('id').agg(lambda col: col.value_counts().index[0]),
            x.loc[:, x.dtypes != object].groupby('id').agg('median'),
        ],
        axis=1,
    )
    for x in indiv_train_filled
]
indiv_test_reduced = [
    pd.concat(
        [
            x.loc[:, x.dtypes == object].groupby('id').agg(lambda col: col.value_counts().index[0]),
            x.loc[:, x.dtypes != object].groupby('id').agg('median'),
        ],
        axis=1,
    )
    for x in indiv_test_filled
]
combined_train = [pd.concat([x.fillna(-99999), y], axis=1) for x, y in zip(hhld_train, indiv_train_reduced)]
combined_test = [pd.concat([x.fillna(-99999), y], axis=1) for x, y in zip(hhld_test, indiv_test_reduced)]
# NOTE(review): `combined_cvs` at this point is whatever an earlier cell left
# behind; the CV for the fillna frames is only computed in a later cell.
# np.argmin is the 0-based row of the best CV iteration, hence the + 1.
combined_models = [model_train(x, y, int(np.argmin(z['Logloss_test_avg'])) + 1) for x, y, z in zip(['combined_a_fillna', 'combined_b_fillna', 'combined_c_fillna'], combined_train, combined_cvs)]

In [None]:
# Predict per country, stack the results, and order the rows like the
# submission template before writing them out with `id` as the index.
per_country = [
    pred_make(fitted, test_frame, code)
    for fitted, test_frame, code in zip(combined_models, combined_test, countries)
]
submission_combined = pd.concat(per_country, axis=0)
submission_combined = submission_combined.set_index('id')
submission_combined = submission_combined.reindex(template.index)
submission_combined.to_csv('output/submission_combined_fillna_med.csv', index=True)

In [None]:
# Each country gets its own run name: the name is also the CatBoost train_dir,
# so reusing 'combined_fillna' for B and C made both runs write to the same
# directory. The names now match the ones used when training the fillna models.
combined_cvs = [make_cv(x, y, z) for x, y, z in zip(['combined_a_fillna', 'combined_b_fillna', 'combined_c_fillna'], combined_train, [2000, 600, 500])]


In [236]:
# Weighted CV score for the current `combined_cvs` (one CV result per country, unpacked in A, B, C order).
mean_logloss(*combined_cvs)

0.17273918684293196