# Desert Poverty Lab: Measuring Poverty One Country at a Time
## Roadmap
1. Download the data (once)
2. Inspect the data for each country on train and test sets
3. Impute missing values
4. Test baseline model
5. Perform cross validation and feature selection

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool, cv, CatboostIpythonWidget
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
rdn = 42
countries = ['A', 'B', 'C']

In [4]:
# download data (just once!)
# import wget
# url = 'https://s3.amazonaws.com/drivendata/data/50/public/'
# levels = ['hhold', 'indiv']
# types = ['train', 'test']
# wget.download(url + 'submission_format.csv', './input/submission_format.csv')
# [wget.download(f'{url}{x}_{y}_{z}.csv' , f'./input/{x}_{y}_{z}.csv') for x in countries for y in levels for z in types]  

'./input/submission_format.csv'

In [2]:
# Load every file country by country and keep the frames in lists ordered like
# `countries`: the countries do not share columns, so they stay separate.
hhld_train = [
    pd.read_csv(f'./input/{country}_hhold_train.csv', index_col=['id'])
    for country in countries
]
hhld_test = [
    pd.read_csv(f'./input/{country}_hhold_test.csv', index_col=['id'])
    for country in countries
]
# Individual-level files are indexed by (individual id, household id).
indiv_train = [
    pd.read_csv(f'./input/{country}_indiv_train.csv', index_col=['iid', 'id'])
    for country in countries
]
indiv_test = [
    pd.read_csv(f'./input/{country}_indiv_test.csv', index_col=['iid', 'id'])
    for country in countries
]
# Submission template, indexed by household id.
template = pd.read_csv('./input/submission_format.csv', index_col=['id'])

In [73]:
# EDA for the train and test data
## Check for missing values: total NaN count per country
print([frame.isnull().sum().sum() for frame in hhld_train])
# print([frame.isnull().sum().sum() for frame in hhld_test])

## inspect poverty distribution
print([frame.poor.value_counts() for frame in hhld_train])

## inspect integers and range
print([frame.describe() for frame in hhld_train])

## inspect number of categories: the largest number of distinct levels among
## the object-typed columns of each country.
## The builtin `object` replaces `np.object`, which was only an alias for it
## and was removed in NumPy 1.24.
print([np.max(frame.iloc[:, np.where(frame.dtypes == object)[0]].nunique()) for frame in hhld_train])

[0, 16784, 0]
[False    4500
True     3703
Name: poor, dtype: int64, False    3004
True      251
Name: poor, dtype: int64, False    5496
True      973
Name: poor, dtype: int64]
[          nEsgxvAq     OMtioXZZ     YFMZwKrU     TiwRslOh
count  8203.000000  8203.000000  8203.000000  8203.000000
mean     -7.590638    17.464464    -2.985615    -4.191028
std       5.810942    10.853654     0.896245     4.472567
min     -70.000000  -127.000000    -4.000000   -31.000000
25%     -10.000000    12.000000    -4.000000    -7.000000
50%      -4.000000    12.000000    -3.000000    -3.000000
75%      -4.000000    21.000000    -2.000000    -1.000000
max      -4.000000   111.000000     1.000000     3.000000,           wJthinfa     ZvEApWrk     vuQrLzvK    FGWqGkmD     qrOrXLPM  \
count  3255.000000  3255.000000  3255.000000  602.000000  3255.000000   
mean     43.381260    96.040860    17.427343   -7.509967    22.203379   
std      22.728441   105.556895    72.057949    9.499141     6.962658   
min    

In [35]:
# Keep only the country-B columns that contain at least one NaN and print a
# summary of them, first for the train frame and then for the test frame.
train_b_na = hhld_train[1].loc[:, hhld_train[1].isnull().any()]
print(train_b_na.info())
test_b_na = hhld_test[1].loc[:, hhld_test[1].isnull().any()]
print(test_b_na.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3255 entries, 57071 to 4923
Data columns (total 9 columns):
FGWqGkmD    602 non-null float64
BXOWgPgL    2504 non-null float64
umkFMfvA    890 non-null float64
McFBIGsm    2504 non-null float64
IrxBnWxE    272 non-null float64
BRzuVmyf    1794 non-null float64
dnlnKrAg    532 non-null float64
aAufyreG    909 non-null float64
OSmfjCbE    2504 non-null float64
dtypes: float64(9)
memory usage: 254.3 KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1604 entries, 9135 to 52739
Data columns (total 9 columns):
FGWqGkmD    317 non-null float64
BXOWgPgL    1243 non-null float64
umkFMfvA    441 non-null float64
McFBIGsm    1243 non-null float64
IrxBnWxE    139 non-null float64
BRzuVmyf    881 non-null float64
dnlnKrAg    244 non-null float64
aAufyreG    428 non-null float64
OSmfjCbE    1245 non-null float64
dtypes: float64(9)
memory usage: 125.3 KB
None


The missing values in country B are concentrated in 9 numeric columns. To deal with the NaNs, try the following strategies:
1. drop columns with NA
2. set extreme value -99999
3. impute central tendency (mean, median, etc)

In [1]:
def make_cv(name, train, n):
    """Cross-validate a CatBoost classifier on one country's training frame.

    Args:
        name: Run name; also selects the log directory ``models/{name}/``.
        train: DataFrame with the feature columns plus ``poor`` (target) and
            ``country`` (dropped before fitting).
        n: Number of boosting iterations to run.

    Returns:
        The result of ``catboost.cv``; elsewhere in this notebook its
        ``'Logloss_test_avg'`` entry is read to pick the best iteration.
    """
    # TODO: scale numerics, drop single level columns, impute missing
    X = train.drop(['poor', 'country'], axis=1)
    y = train.poor
    # Positions of all columns that are neither float nor int; these are passed
    # to CatBoost as categorical. The builtins `float`/`int` are used because
    # `np.float`/`np.int` were only aliases for them and were removed in
    # NumPy 1.24.
    cat_ind = np.where(np.logical_and(X.dtypes != float, X.dtypes != int))[0]
    pool = Pool(X.values, y.values, cat_features=cat_ind)
    model = CatBoostClassifier(train_dir=f'models/{name}/', task_type='GPU', name=name, iterations=n, loss_function='Logloss', random_seed=rdn)
    # The classifier is not fitted here; it only supplies the parameter dict.
    scores = cv(pool, model.get_params(), stratified=True, seed=rdn, logging_level='Verbose')
    return scores

def model_train(name, train, n):
    """Fit a CatBoost classifier on one country's full training frame.

    Args:
        name: Run name; also selects the log directory ``models/{name}/``.
        train: DataFrame with the feature columns plus ``poor`` (target) and
            ``country`` (dropped before fitting).
        n: Number of boosting iterations (trees) to train.

    Returns:
        The fitted ``CatBoostClassifier``.
    """
    X = train.drop(['poor', 'country'], axis=1)
    y = train.poor
    # Positions of all columns that are neither float nor int; these are passed
    # to CatBoost as categorical. The builtins `float`/`int` are used because
    # `np.float`/`np.int` were only aliases for them and were removed in
    # NumPy 1.24.
    cat_ind = np.where(np.logical_and(X.dtypes != float, X.dtypes != int))[0]
    model = CatBoostClassifier(train_dir=f'models/{name}/', task_type='GPU', name=name, iterations=n, loss_function='Logloss', random_seed=rdn)
    model.fit(X, y, cat_features=cat_ind, verbose=True)
    return model

def pred_make(model, X_test, country):
    """Build a submission-style frame of poverty probabilities for one country.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_test: Test frame whose index has an ``id`` level and which still
            contains the ``country`` column (removed before predicting).
        country: Value written to the ``country`` column of the result.

    Returns:
        DataFrame with columns ``id``, ``country`` and ``poor``, where ``poor``
        is the second column of ``predict_proba``.
    """
    household_ids = X_test.index.get_level_values('id')
    features = X_test.drop('country', axis=1)

    out = pd.DataFrame()
    out['id'] = household_ids
    out['country'] = country
    # Column 1 of predict_proba is kept as the probability of `poor`.
    out['poor'] = model.predict_proba(features)[:, 1]
    return out

def mean_logloss(scores_a, scores_b, scores_c):
    """Combine the three per-country CV results into one weighted score.

    For each country the minimum of ``'Logloss_test_avg'`` is taken; the three
    minima are averaged with weights equal to the number of rows in the
    corresponding frame of the global ``hhld_test`` list.
    """
    best_per_country = [
        np.min(scores['Logloss_test_avg'])
        for scores in (scores_a, scores_b, scores_c)
    ]
    test_sizes = np.array([frame.country.shape[0] for frame in hhld_test])
    return np.average(best_per_country, weights=test_sizes)


In [93]:
# clean train_b and examine effects
## try 1: by dropna(axis=1), i.e. drop every country-B column that contains a
## NaN, then cross-validate with 600 boosting iterations
scores_b = make_cv('hhld_dropna_b', hhld_train[1].dropna(axis=1), 600)

0:	learn: 0.6596432	test: 0.6597821	best: 0.6597821 (0)	total: 481ms	remaining: 8m
1:	learn: 0.6293600	test: 0.6296646	best: 0.6296646 (1)	total: 910ms	remaining: 7m 34s
2:	learn: 0.6021379	test: 0.6024911	best: 0.6024911 (2)	total: 1.3s	remaining: 7m 13s
3:	learn: 0.5761072	test: 0.5765745	best: 0.5765745 (3)	total: 1.73s	remaining: 7m 9s
4:	learn: 0.5518476	test: 0.5525604	best: 0.5525604 (4)	total: 2.1s	remaining: 6m 58s
5:	learn: 0.5297920	test: 0.5307566	best: 0.5307566 (5)	total: 2.45s	remaining: 6m 45s
6:	learn: 0.5093551	test: 0.5105209	best: 0.5105209 (6)	total: 2.83s	remaining: 6m 40s
7:	learn: 0.4912753	test: 0.4925889	best: 0.4925889 (7)	total: 3.19s	remaining: 6m 36s
8:	learn: 0.4739233	test: 0.4753462	best: 0.4753462 (8)	total: 3.43s	remaining: 6m 18s
9:	learn: 0.4572149	test: 0.4584525	best: 0.4584525 (9)	total: 3.88s	remaining: 6m 23s
10:	learn: 0.4427402	test: 0.4442357	best: 0.4442357 (10)	total: 4.21s	remaining: 6m 18s
11:	learn: 0.4299785	test: 0.4314787	best: 0.431

In [94]:
# Cross-validate country A on its household frame as loaded, 2000 iterations.
scores_a = make_cv('hhld_a', hhld_train[0], 2000)

0:	learn: 0.6746083	test: 0.6751352	best: 0.6751352 (0)	total: 450ms	remaining: 14m 58s
1:	learn: 0.6578568	test: 0.6585895	best: 0.6585895 (1)	total: 817ms	remaining: 13m 36s
2:	learn: 0.6427012	test: 0.6437773	best: 0.6437773 (2)	total: 1.22s	remaining: 13m 30s
3:	learn: 0.6285766	test: 0.6299413	best: 0.6299413 (3)	total: 1.6s	remaining: 13m 18s
4:	learn: 0.6158907	test: 0.6173877	best: 0.6173877 (4)	total: 1.98s	remaining: 13m 10s
5:	learn: 0.6042295	test: 0.6058447	best: 0.6058447 (5)	total: 2.35s	remaining: 12m 59s
6:	learn: 0.5927421	test: 0.5942875	best: 0.5942875 (6)	total: 2.73s	remaining: 12m 57s
7:	learn: 0.5817725	test: 0.5835346	best: 0.5835346 (7)	total: 3.11s	remaining: 12m 54s
8:	learn: 0.5702197	test: 0.5724261	best: 0.5724261 (8)	total: 3.52s	remaining: 12m 59s
9:	learn: 0.5600692	test: 0.5625392	best: 0.5625392 (9)	total: 3.92s	remaining: 12m 59s
10:	learn: 0.5506058	test: 0.5533416	best: 0.5533416 (10)	total: 4.31s	remaining: 12m 59s
11:	learn: 0.5414041	test: 0.54

In [95]:
# Cross-validate country C on its household frame as loaded, 500 iterations.
scores_c = make_cv('hhld_c', hhld_train[2], 500)

0:	learn: 0.6252963	test: 0.6251265	best: 0.6251265 (0)	total: 408ms	remaining: 3m 23s
1:	learn: 0.5637844	test: 0.5637999	best: 0.5637999 (1)	total: 718ms	remaining: 2m 58s
2:	learn: 0.5090783	test: 0.5094080	best: 0.5094080 (2)	total: 1.14s	remaining: 3m 9s
3:	learn: 0.4606022	test: 0.4607026	best: 0.4607026 (3)	total: 1.52s	remaining: 3m 8s
4:	learn: 0.4180109	test: 0.4182962	best: 0.4182962 (4)	total: 1.86s	remaining: 3m 4s
5:	learn: 0.3812194	test: 0.3815109	best: 0.3815109 (5)	total: 2.22s	remaining: 3m 2s
6:	learn: 0.3458806	test: 0.3461966	best: 0.3461966 (6)	total: 2.56s	remaining: 3m
7:	learn: 0.3136790	test: 0.3142403	best: 0.3142403 (7)	total: 2.93s	remaining: 3m
8:	learn: 0.2878043	test: 0.2883062	best: 0.2883062 (8)	total: 3.26s	remaining: 2m 57s
9:	learn: 0.2623623	test: 0.2629233	best: 0.2629233 (9)	total: 3.57s	remaining: 2m 55s
10:	learn: 0.2402066	test: 0.2407269	best: 0.2407269 (10)	total: 3.9s	remaining: 2m 53s
11:	learn: 0.2207031	test: 0.2212773	best: 0.2212773 (

In [109]:
# Look up the metric names in the CV result (this value is not what the cell
# displays; the displayed output is the argmin below).
scores_a.keys()
# 0-based position of the lowest average test Logloss for country B.
np.argmin(scores_b['Logloss_test_avg'])

# model_a = model_train('hhld_a', hhld_train[0], )

574

In [110]:
# test train
# `np.argmin` is the 0-based index of the best CV iteration (the CV log starts
# at iteration 0 with one tree already fitted), so the number of trees needed
# to reach that iteration is the index plus one.
model_b = model_train('hhld_dropna_b', hhld_train[1].dropna(axis=1), int(np.argmin(scores_b['Logloss_test_avg'])) + 1)

0:	learn: 0.6595593	total: 206ms	remaining: 1m 58s
1:	learn: 0.6271793	total: 344ms	remaining: 1m 38s
2:	learn: 0.6010576	total: 402ms	remaining: 1m 16s
3:	learn: 0.5761797	total: 491ms	remaining: 1m 9s
4:	learn: 0.5504871	total: 644ms	remaining: 1m 13s
5:	learn: 0.5274209	total: 751ms	remaining: 1m 11s
6:	learn: 0.5078343	total: 888ms	remaining: 1m 11s
7:	learn: 0.4880655	total: 1.03s	remaining: 1m 13s
8:	learn: 0.4711849	total: 1.17s	remaining: 1m 13s
9:	learn: 0.4552448	total: 1.29s	remaining: 1m 13s
10:	learn: 0.4403574	total: 1.43s	remaining: 1m 13s
11:	learn: 0.4277558	total: 1.54s	remaining: 1m 12s
12:	learn: 0.4143759	total: 1.71s	remaining: 1m 13s
13:	learn: 0.4020295	total: 1.85s	remaining: 1m 13s
14:	learn: 0.3916451	total: 1.99s	remaining: 1m 14s
15:	learn: 0.3822334	total: 2.05s	remaining: 1m 11s
16:	learn: 0.3719916	total: 2.21s	remaining: 1m 12s
17:	learn: 0.3634796	total: 2.38s	remaining: 1m 13s
18:	learn: 0.3556749	total: 2.46s	remaining: 1m 11s
19:	learn: 0.3480717	to

In [111]:
# `np.argmin` is the 0-based index of the best CV iteration, so the number of
# trees needed to reach it is the index plus one.
model_a = model_train('hhld_a', hhld_train[0], int(np.argmin(scores_a['Logloss_test_avg'])) + 1)

0:	learn: 0.6741936	total: 202ms	remaining: 6m 33s
1:	learn: 0.6553440	total: 341ms	remaining: 5m 32s
2:	learn: 0.6422709	total: 576ms	remaining: 6m 13s
3:	learn: 0.6266331	total: 712ms	remaining: 5m 46s
4:	learn: 0.6130985	total: 855ms	remaining: 5m 32s
5:	learn: 0.6003986	total: 1.03s	remaining: 5m 32s
6:	learn: 0.5880272	total: 1.14s	remaining: 5m 17s
7:	learn: 0.5753804	total: 1.27s	remaining: 5m 9s
8:	learn: 0.5657828	total: 1.42s	remaining: 5m 5s
9:	learn: 0.5565074	total: 1.68s	remaining: 5m 26s
10:	learn: 0.5458279	total: 1.84s	remaining: 5m 23s
11:	learn: 0.5386993	total: 1.95s	remaining: 5m 15s
12:	learn: 0.5300494	total: 2.08s	remaining: 5m 10s
13:	learn: 0.5217343	total: 2.24s	remaining: 5m 9s
14:	learn: 0.5141126	total: 2.37s	remaining: 5m 6s
15:	learn: 0.5075972	total: 2.5s	remaining: 5m 3s
16:	learn: 0.5012840	total: 2.73s	remaining: 5m 10s
17:	learn: 0.4954919	total: 2.86s	remaining: 5m 6s
18:	learn: 0.4893707	total: 3.07s	remaining: 5m 12s
19:	learn: 0.4827641	total: 3

In [112]:
# `np.argmin` is the 0-based index of the best CV iteration, so the number of
# trees needed to reach it is the index plus one.
model_c = model_train('hhld_c', hhld_train[2], int(np.argmin(scores_c['Logloss_test_avg'])) + 1)

0:	learn: 0.6242230	total: 195ms	remaining: 59.5s
1:	learn: 0.5626546	total: 256ms	remaining: 39s
2:	learn: 0.5081125	total: 466ms	remaining: 47.3s
3:	learn: 0.4599206	total: 542ms	remaining: 41s
4:	learn: 0.4164797	total: 681ms	remaining: 41.2s
5:	learn: 0.3769172	total: 783ms	remaining: 39.3s
6:	learn: 0.3426255	total: 932ms	remaining: 39.9s
7:	learn: 0.3115238	total: 1.02s	remaining: 38s
8:	learn: 0.2843604	total: 1.1s	remaining: 36.5s
9:	learn: 0.2591663	total: 1.25s	remaining: 37s
10:	learn: 0.2374848	total: 1.35s	remaining: 36.3s
11:	learn: 0.2181540	total: 1.49s	remaining: 36.7s
12:	learn: 0.2012617	total: 1.6s	remaining: 36.2s
13:	learn: 0.1857464	total: 1.69s	remaining: 35.3s
14:	learn: 0.1720374	total: 1.83s	remaining: 35.6s
15:	learn: 0.1599079	total: 1.97s	remaining: 35.8s
16:	learn: 0.1487121	total: 2.12s	remaining: 36.1s
17:	learn: 0.1317047	total: 2.26s	remaining: 36.3s
18:	learn: 0.1235574	total: 2.47s	remaining: 37.4s
19:	learn: 0.1157582	total: 2.58s	remaining: 37s
20

In [116]:
# predict and submit
# Score each country's test households with that country's model (NaN columns
# dropped from the test frame first), then stack the results row-wise.
per_country_predictions = [
    pred_make(model, test_frame.dropna(axis=1), code)
    for model, test_frame, code in zip([model_a, model_b, model_c], hhld_test, countries)
]
submission = pd.concat(per_country_predictions, axis=0)
submission.to_csv('output/submission_b_dropna.csv', index=False)

In [132]:
# calculate score from cv: best average test Logloss per country, averaged
# with the household test-set sizes as weights
mean_logloss(scores_a, scores_b, scores_c)


0.17004743833106328

# Incorporate indiv data into hhld

In [117]:
## Check for missing values in indiv data: total NaN count per country
print([frame.isnull().sum().sum() for frame in indiv_train])
# print([frame.isnull().sum().sum() for frame in indiv_test])

## inspect poverty distribution
print([frame.poor.value_counts() for frame in indiv_train])

## inspect integers and range
print([frame.describe() for frame in indiv_train])

## inspect number of categories: the largest number of distinct levels among
## the object-typed columns of each country.
## The builtin `object` replaces `np.object`, which was only an alias for it
## and was removed in NumPy 1.24.
print([np.max(frame.iloc[:, np.where(frame.dtypes == object)[0]].nunique()) for frame in indiv_train])


[6268, 461273, 0]
[True     19684
False    17876
Name: poor, dtype: int64, False    18375
True      1877
Name: poor, dtype: int64, False    22868
True      7045
Name: poor, dtype: int64]
[           OdXpbPGJ      ukWqmeSS
count  31292.000000  37560.000000
mean       8.719129    107.022764
std       21.089956     91.795117
min        4.000000      1.000000
25%        4.000000     36.000000
50%        4.000000     81.000000
75%        4.000000    151.000000
max      214.000000    551.000000,           BoxViLPz     qlLzyqpP     unRAgFtX      TJGiunYp    WmKLEUcd  \
count  5459.000000  1185.000000  1652.000000  12454.000000  354.000000   
mean    -34.296025   -38.275949   -87.351090      0.629758    1.161017   
std      18.357318    30.277305    89.976221      1.801028    3.992848   
min     -68.000000  -177.000000  -644.000000     -1.000000  -19.000000   
25%     -50.000000   -51.000000  -122.000000      0.000000   -1.000000   
50%     -32.000000   -33.000000   -50.000000      0.000000   

In [147]:
# inspect NaN in indiv: number of columns per country that contain at least
# one NaN (train and test same columns of missing data)
[x.isnull().any().sum() for x in indiv_train]
# [x.isnull().any().sum() for x in indiv_test]

[1, 28, 0]

In [170]:
# DEBUG: leftjoin indiv to hhld on train A
# indiv_train[0].head()
# pd.Series(['A', 'A', 'B', 'C', 'C']).value_counts().index[0]
# NOTE: despite the `_mean` suffix this holds, per household id, the most
# frequent value of every object-typed column (first entry of value_counts).
# The builtin `object` replaces `np.object`, which was removed in NumPy 1.24.
indiv_a_mean = indiv_train[0].loc[:, indiv_train[0].dtypes == object].groupby('id').agg(lambda col: col.value_counts().index[0])
indiv_a_mean.head()
# train_a_concat = pd.concat([hhld_train[0], indiv_a_mean])

Unnamed: 0_level_0,HeUgMnzF,CaukPfUC,MzEtIdUF,gtnNTNam,SWoXNmPc,eXbOkwhI,XONDGWjH,KsFoQcUV,qYRZCuJD,FPQrjGnS,...,XBldkztv,tbgZsPXD,qqVibbSA,MgCoFhXK,rFpoTXAq,RXcLsVAQ,rQWIpTiG,XizJGmbu,xqUooaNJ,country
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14,XJsPz,mOlYV,UFoKR,HIvIU,onRNG,YXCNt,ccbZA,kpkiH,fohru,scxJu,...,tbsMf,yOwsR,QQdHS,uEstx,Hikoa,zQvdC,xUYIC,juMSt,JTCKs,A
18,XJsPz,mOlYV,axSTs,CXizI,onRNG,YXCNt,ccbZA,HgfUG,fohru,scxJu,...,XQevi,yOwsR,QQdHS,uEstx,Hikoa,zQvdC,xUYIC,juMSt,JTCKs,A
36,XJsPz,kzSFB,axSTs,CXizI,onRNG,YXCNt,fOUHD,HgfUG,fohru,HRGCq,...,XQevi,yOwsR,QQdHS,gCSRj,Hikoa,zQvdC,rkLqZ,juMSt,JTCKs,A
39,XJsPz,mOlYV,axSTs,CXizI,onRNG,YXCNt,fOUHD,HgfUG,fohru,scxJu,...,tbsMf,yOwsR,QQdHS,uEstx,Hikoa,zQvdC,rkLqZ,juMSt,JTCKs,A
58,XJsPz,mOlYV,axSTs,CXizI,onRNG,YXCNt,fOUHD,HgfUG,fohru,scxJu,...,tbsMf,yOwsR,QQdHS,uEstx,Hikoa,zQvdC,rkLqZ,FUUXv,JTCKs,A


In [179]:
# Drop every individual-level column containing a NaN, plus the target and
# country columns, before aggregating to household level.
indiv_train_dropna = [frame.dropna(axis=1).drop(['poor', 'country'], axis=1) for frame in indiv_train]
# One row per household id: object-typed columns are reduced to their most
# frequent value, all other columns to their mean.
# The builtin `object` replaces `np.object`, which was removed in NumPy 1.24.
indiv_train_reduced = [
    pd.concat(
        [
            frame.loc[:, frame.dtypes == object].groupby('id').agg(lambda col: col.value_counts().index[0]),
            frame.loc[:, frame.dtypes != object].groupby('id').agg('mean'),
        ],
        axis=1,
    )
    for frame in indiv_train_dropna
]

In [180]:
# Drop every individual-level column containing a NaN, plus the country
# column, before aggregating to household level.
indiv_test_dropna = [frame.dropna(axis=1).drop(['country'], axis=1) for frame in indiv_test]
# One row per household id: object-typed columns are reduced to their most
# frequent value, all other columns to their mean.
# The builtin `object` replaces `np.object`, which was removed in NumPy 1.24.
indiv_test_reduced = [
    pd.concat(
        [
            frame.loc[:, frame.dtypes == object].groupby('id').agg(lambda col: col.value_counts().index[0]),
            frame.loc[:, frame.dtypes != object].groupby('id').agg('mean'),
        ],
        axis=1,
    )
    for frame in indiv_test_dropna
]

In [210]:
# Per country, place the household frame (NaN columns dropped) side by side
# with the household-level aggregates of the individual data.
combined_train = [pd.concat([x.dropna(axis=1), y], axis=1) for x, y in zip(hhld_train, indiv_train_reduced)]
combined_test = [pd.concat([x.dropna(axis=1), y], axis=1) for x, y in zip(hhld_test, indiv_test_reduced)]

In [211]:
# indiv_train_reduced[0].head()
# hhld_train[0].head()
# combined_train[0].head()
# Number of columns with at least one NaN per country, before and after
# combining, for train and then for test.
print([x.isnull().any().sum() for x in hhld_train])
print([x.isnull().any().sum() for x in combined_train])
print([x.isnull().any().sum() for x in hhld_test])
print([x.isnull().any().sum() for x in combined_test])

[0, 9, 0]
[0, 0, 0]
[0, 9, 0]
[0, 0, 0]


In [208]:
# Cross-validate each country on its combined frame, using the same iteration
# budgets as the household-only runs (A: 2000, B: 600, C: 500).
combined_cvs = [make_cv(x, y, z) for x, y, z in zip(['combined_a', 'combined_b', 'combined_c'], combined_train, [2000, 600, 500])]

0:	learn: 0.6765912	test: 0.6770705	best: 0.6770705 (0)	total: 519ms	remaining: 17m 17s
1:	learn: 0.6600259	test: 0.6610324	best: 0.6610324 (1)	total: 948ms	remaining: 15m 46s
2:	learn: 0.6449721	test: 0.6463123	best: 0.6463123 (2)	total: 1.36s	remaining: 15m 8s
3:	learn: 0.6311773	test: 0.6326568	best: 0.6326568 (3)	total: 1.75s	remaining: 14m 33s
4:	learn: 0.6178500	test: 0.6197128	best: 0.6197128 (4)	total: 2.18s	remaining: 14m 28s
5:	learn: 0.6051374	test: 0.6077090	best: 0.6077090 (5)	total: 2.58s	remaining: 14m 15s
6:	learn: 0.5927815	test: 0.5956843	best: 0.5956843 (6)	total: 2.94s	remaining: 13m 57s
7:	learn: 0.5811890	test: 0.5844809	best: 0.5844809 (7)	total: 3.36s	remaining: 13m 56s
8:	learn: 0.5707816	test: 0.5742662	best: 0.5742662 (8)	total: 3.79s	remaining: 13m 57s
9:	learn: 0.5613772	test: 0.5650278	best: 0.5650278 (9)	total: 4.21s	remaining: 13m 58s
10:	learn: 0.5523912	test: 0.5564061	best: 0.5564061 (10)	total: 4.63s	remaining: 13m 56s
11:	learn: 0.5435062	test: 0.54

In [209]:
# Train one model per country on the combined frames. `np.argmin` is the
# 0-based index of the best CV iteration, so the number of trees needed to
# reach it is the index plus one.
combined_models = [
    model_train(name, frame, int(np.argmin(cv_scores['Logloss_test_avg'])) + 1)
    for name, frame, cv_scores in zip(['combined_a', 'combined_b', 'combined_c'], combined_train, combined_cvs)
]

0:	learn: 0.6731417	total: 243ms	remaining: 7m 56s
1:	learn: 0.6570251	total: 381ms	remaining: 6m 13s
2:	learn: 0.6415302	total: 546ms	remaining: 5m 56s
3:	learn: 0.6270311	total: 674ms	remaining: 5m 30s
4:	learn: 0.6138821	total: 842ms	remaining: 5m 29s
5:	learn: 0.6017461	total: 978ms	remaining: 5m 19s
6:	learn: 0.5910368	total: 1.14s	remaining: 5m 17s
7:	learn: 0.5801731	total: 1.31s	remaining: 5m 19s
8:	learn: 0.5705188	total: 1.52s	remaining: 5m 30s
9:	learn: 0.5610652	total: 1.69s	remaining: 5m 29s
10:	learn: 0.5521067	total: 1.97s	remaining: 5m 49s
11:	learn: 0.5444851	total: 2.11s	remaining: 5m 43s
12:	learn: 0.5375588	total: 2.36s	remaining: 5m 53s
13:	learn: 0.5297214	total: 2.65s	remaining: 6m 8s
14:	learn: 0.5221483	total: 2.79s	remaining: 6m 2s
15:	learn: 0.5146534	total: 2.96s	remaining: 5m 59s
16:	learn: 0.5086089	total: 3.11s	remaining: 5m 55s
17:	learn: 0.5018294	total: 3.35s	remaining: 6m 1s
18:	learn: 0.4943200	total: 3.6s	remaining: 6m 8s
19:	learn: 0.4889023	total:

In [228]:
# Predict per country, stack the results, index them by household id and put
# the rows in the order of the submission template before writing.
submission_combined = pd.concat([pred_make(x, y, z) for x, y, z in zip(combined_models, combined_test, countries)], axis=0).set_index('id').reindex(template.index)
submission_combined.to_csv('output/submission_combined_dropna.csv', index=True)


In [223]:
# Weighted CV score of the three combined-data runs.
mean_logloss(*combined_cvs)

0.1713750497289378

In [None]:
# fillna flow for combined data: instead of dropping NaN columns, replace every
# NaN with the sentinel -99999; numeric individual columns are aggregated per
# household by median, object-typed ones by their most frequent value.
# The builtin `object` replaces `np.object`, which was removed in NumPy 1.24.
indiv_train_reduced = [
    pd.concat(
        [
            frame.loc[:, frame.dtypes == object].groupby('id').agg(lambda col: col.value_counts().index[0]),
            frame.loc[:, frame.dtypes != object].groupby('id').agg('median'),
        ],
        axis=1,
    )
    for frame in [raw.fillna(-99999).drop(['poor', 'country'], axis=1) for raw in indiv_train]
]
indiv_test_reduced = [
    pd.concat(
        [
            frame.loc[:, frame.dtypes == object].groupby('id').agg(lambda col: col.value_counts().index[0]),
            frame.loc[:, frame.dtypes != object].groupby('id').agg('median'),
        ],
        axis=1,
    )
    for frame in [raw.fillna(-99999).drop(['country'], axis=1) for raw in indiv_test]
]
combined_train = [pd.concat([hhld.fillna(-99999), indiv], axis=1) for hhld, indiv in zip(hhld_train, indiv_train_reduced)]
combined_test = [pd.concat([hhld.fillna(-99999), indiv], axis=1) for hhld, indiv in zip(hhld_test, indiv_test_reduced)]
# NOTE(review): `combined_cvs` is recomputed for the fillna data only in a
# later cell; if this cell runs first, the iteration counts below come from
# the previous (dropna) CV run — confirm the intended execution order.
# `np.argmin` is a 0-based iteration index, hence the `+ 1` to get a tree count.
combined_models = [
    model_train(name, frame, int(np.argmin(cv_scores['Logloss_test_avg'])) + 1)
    for name, frame, cv_scores in zip(['combined_a_fillna', 'combined_b_fillna', 'combined_c_fillna'], combined_train, combined_cvs)
]

In [None]:
# Predict per country, stack the results, index them by household id and put
# the rows in the order of the submission template before writing.
submission_combined = pd.concat([pred_make(x, y, z) for x, y, z in zip(combined_models, combined_test, countries)], axis=0).set_index('id').reindex(template.index)
submission_combined.to_csv('output/submission_combined_fillna_med.csv', index=True)

In [235]:
# Cross-validate each country on the fillna-combined frames. Every country gets
# its own run name (the same names the training cell uses) so that B and C do
# not share one `models/{name}/` directory.
combined_cvs = [make_cv(x, y, z) for x, y, z in zip(['combined_a_fillna', 'combined_b_fillna', 'combined_c_fillna'], combined_train, [2000, 600, 500])]


0:	learn: 0.6766521	test: 0.6771927	best: 0.6771927 (0)	total: 521ms	remaining: 17m 21s
1:	learn: 0.6608512	test: 0.6618020	best: 0.6618020 (1)	total: 946ms	remaining: 15m 44s
2:	learn: 0.6460807	test: 0.6470587	best: 0.6470587 (2)	total: 1.36s	remaining: 15m 4s
3:	learn: 0.6318749	test: 0.6332224	best: 0.6332224 (3)	total: 1.77s	remaining: 14m 45s
4:	learn: 0.6179143	test: 0.6195324	best: 0.6195324 (4)	total: 2.2s	remaining: 14m 37s
5:	learn: 0.6050026	test: 0.6070453	best: 0.6070453 (5)	total: 2.63s	remaining: 14m 32s
6:	learn: 0.5925184	test: 0.5949752	best: 0.5949752 (6)	total: 3.04s	remaining: 14m 26s
7:	learn: 0.5820480	test: 0.5847087	best: 0.5847087 (7)	total: 3.48s	remaining: 14m 27s
8:	learn: 0.5719317	test: 0.5750834	best: 0.5750834 (8)	total: 3.94s	remaining: 14m 30s
9:	learn: 0.5626237	test: 0.5665168	best: 0.5665168 (9)	total: 4.34s	remaining: 14m 24s
10:	learn: 0.5538645	test: 0.5580107	best: 0.5580107 (10)	total: 4.78s	remaining: 14m 24s
11:	learn: 0.5436466	test: 0.547

KeyboardInterrupt: 

In [236]:
# Weighted CV score of the three runs currently held in `combined_cvs`.
mean_logloss(*combined_cvs)

0.17273918684293196

In [None]:
# OLD
# helpers
# setup classifier for country-separate training
def train_model(df, od=True, sep=True):
    """Fit a CatBoost classifier on ``df`` with or without early stopping.

    Args:
        df: Training frame containing the ``poor`` target and, when ``sep`` is
            true, a ``country`` column.
        od: If true, fit on an 80% stratified split with the remaining 20% as
            eval set and the overfitting detector enabled (up to 2000
            iterations). If false, fit on all rows for 4000 iterations.
        sep: If true, drop ``country`` from the features; otherwise keep it.

    Returns:
        The fitted ``CatBoostClassifier``.
    """
    if sep:
        X = df.drop(['poor', 'country'], axis=1)
    else:
        X = df.drop(['poor'], axis=1)
    y = df.poor
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=rdn)
    # Positions of all columns that are neither float nor int; these are passed
    # to CatBoost as categorical. The builtins `float`/`int` are used because
    # `np.float`/`np.int` were only aliases for them and were removed in
    # NumPy 1.24.
    cat_ind = np.where(np.logical_and(X.dtypes != float, X.dtypes != int))[0]
    if od:
        clf = CatBoostClassifier(iterations=2000, od_type='Iter', od_wait=20, eval_metric='Logloss', random_seed=rdn)
        clf.fit(X_train, y_train, cat_features=cat_ind, eval_set=(X_val, y_val), verbose=True, plot=False)
    else:
        # No hold-out here: the split above is unused and all rows are fitted.
        clf = CatBoostClassifier(iterations=4000, random_seed=rdn)
        clf.fit(X, y, cat_features=cat_ind, verbose=True, plot=False)
    return clf

def make_prediction(model, X_test, country, ind=False):
    """Build a frame of poverty probabilities from ``model`` for ``X_test``.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_test: Feature frame whose index has an ``id`` level; passed to the
            model unchanged.
        country: Value (scalar or array) written to the ``country`` column.
        ind: If true, average the rows per (id, country) pair, keeping the
            order of first appearance.

    Returns:
        DataFrame with columns ``id``, ``country`` and ``poor``; with
        ``ind=True``, ``id`` and ``country`` form the index instead.
    """
    ids = X_test.index.get_level_values('id')
    # Column 1 of predict_proba is kept as the probability of `poor`.
    poor_prob = model.predict_proba(X_test)[:, 1]

    result = pd.DataFrame()
    result['id'] = ids
    result['country'] = country
    result['poor'] = poor_prob

    if not ind:
        return result
    return result.groupby(['id', 'country'], sort=False).mean()

In [None]:
# catboost time! (all countries together)
# NOTE(review): `df` is not assigned by any cell in this notebook, and
# `X_val`/`y_val` are not assigned at notebook level before this cell —
# presumably left over from an earlier version; confirm before running.
clf = train_model(df, sep=False)
clf.score(X_val, y_val)

In [None]:
# DEBUG: note strange country showing NaN issue without .values
# NOTE(review): presumably assigning the `country` Series directly would align
# on its index against the fresh frame built in make_prediction — confirm.
# `X_test` is not assigned by any cell in this notebook.
submission = make_prediction(clf, X_test, X_test.country.values)
submission.to_csv('submission.csv', index=False)

In [None]:
# check submission: first rows, then the NaN count of every column
submission.head()
submission.isnull().sum()

In [None]:
# train separate data setup: one test frame per country, with the `country`
# column removed
# NOTE(review): assumes the groupby order matches `countries` — confirm.
X_test_countries = [x.drop('country', axis=1) for k, x in X_test.groupby('country')]

In [None]:
# train separately for countries: one early-stopped model per `country` group
model_list = [train_model(x) for k, x in df.groupby('country')]

In [None]:
# Predict per country with ind=True (rows averaged per id/country pair), stack
# the results and reorder them to the index of the submission template.
submission_sep = pd.concat([make_prediction(x, y, z, ind=True) for x, y, z in zip(model_list, X_test_countries, countries)], axis=0)
submission_sep = submission_sep.reindex(template.index)
submission_sep.to_csv('submission_sep_ind.csv', index=True)

In [None]:
# use all data to train
# NOTE(review): `sep` defaults to True, so `country` is dropped from the
# training features, while `X_test` is passed to the model with its `country`
# column still present — check that the feature sets match.
clf_simple = train_model(df, od=False)
submission_simple = make_prediction(clf_simple, X_test, X_test.country.values)
submission_simple.to_csv('submission_all_simple.csv', index=False)

In [None]:
# countries separate no od, rewrite in pipelines
# One model per `country` group, fitted on all of its rows without the
# overfitting detector.
model_list_simple = [train_model(x, od=False) for k, x in df.groupby('country')]

In [None]:
# Predict per country with the non-early-stopped models and stack the results.
submission_sep_simple = pd.concat([make_prediction(x, y, z) for x, y, z in zip(model_list_simple, X_test_countries, countries)], axis=0)
submission_sep_simple.to_csv('submission_sep_simple.csv', index=False)

In [None]:
# validation (DEBUG need to futurize)
# from drivendata_validator.drivendata_validator import DrivenDataValidator

# v = DrivenDataValidator()
# v.validate('submission_format.csv', 'submission_sep_ind.csv')

In [None]:
# playground: 80/20 split without stratification
# NOTE(review): `X` and `y` are only assigned in a later cell of this notebook.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=rdn)

In [None]:
# feature selection from model for country A
# X_test_countries[0].shape
# len(model_list[0].feature_importances_)
# selmod = SelectFromModel(model_list[0])
# selmod.transform(X_test_countries[0])
X = df[df.country == 'A'].drop(['poor','country'], axis=1)
y = df[df.country == 'A'].poor
# Columns that are neither float nor int are treated as categorical. The
# builtins `float`/`int` replace `np.float`/`np.int`, which were only aliases
# for them and were removed in NumPy 1.24.
cat_ind = np.where(np.logical_and(X.dtypes != float, X.dtypes != int))[0]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=rdn)
# Early-stopped fit on the 80% split, evaluated on the 20% hold-out.
clf_a = CatBoostClassifier(iterations=2000, od_type='Iter', od_wait=20, eval_metric='Logloss', random_seed=rdn)
clf_a.fit(X_train, y_train, cat_features=cat_ind, eval_set=(X_val, y_val), verbose=True, plot=False)
# Per-feature importances, used by the next cell to select columns.
fea_imp = clf_a.get_feature_importance(X_train, y_train, cat_features=cat_ind)

In [None]:
# Keep only the features whose importance exceeds 1 and refit on that subset.
features20 = np.array(fea_imp) > 1
X20 = X[X.columns[features20]]
# Columns that are neither float nor int are treated as categorical. The
# builtins `float`/`int` replace `np.float`/`np.int`, which were only aliases
# for them and were removed in NumPy 1.24.
cat_ind = np.where(np.logical_and(X20.dtypes != float, X20.dtypes != int))[0]
X_train, X_val, y_train, y_val = train_test_split(X20, y, test_size=0.2, random_state=rdn)
clf_a20 = CatBoostClassifier(iterations=2000, od_type='Iter', od_wait=20, eval_metric='Logloss', random_seed=rdn)
clf_a20.fit(X_train, y_train, cat_features=cat_ind, eval_set=(X_val, y_val), verbose=True, plot=False)

In [None]:
# Score the full-feature model on whatever `X_val`/`y_val` currently hold.
# NOTE(review): if the previous cell ran, these come from the reduced `X20`
# split, not from the columns `clf_a` was fitted on — confirm.
clf_a.score(X_val, y_val)

In [None]:
# Recreate the 80/20 split of the reduced feature set (same random_state as
# the fit) and score the reduced model on the hold-out part.
X_train, X_val, y_train, y_val = train_test_split(X20, y, test_size=0.2, random_state=rdn)
clf_a20.score(X_val, y_val)