In [1]:
#http://drivendata.co/blog/worldbank-poverty-benchmark/

%matplotlib inline

import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
# Let wide and long frames render in full (up to 500 columns / 500 rows).
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# data directory
# Set the WB_POVERTY_DATA_DIR environment variable to run the notebook on
# another machine; without it the original local path is used.
DATA_DIR = os.environ.get(
    'WB_POVERTY_DATA_DIR',
    r'C:\Users\piush\Desktop\Dataset\world_bank_poverty_data\household',
)

In [46]:
# File name of every household CSV, keyed by country and then by split.
_HHOLD_FILES = {
    'A': {'train': 'A_hhold_train.csv', 'test': 'A_hhold_test.csv'},
    'B': {'train': 'B_hhold_train.csv', 'test': 'B_hhold_test.csv'},
    'C': {'train': 'C_hhold_train.csv', 'test': 'C_hhold_test.csv'},
}

# Same nesting, with every file name resolved against DATA_DIR.
data_paths = {
    country: {split: os.path.join(DATA_DIR, filename)
              for split, filename in files.items()}
    for country, files in _HHOLD_FILES.items()
}

In [47]:
# Read the three training sets; each frame is indexed by its 'id' column.
a_train, b_train, c_train = (
    pd.read_csv(data_paths[country]['train'], index_col='id')
    for country in ('A', 'B', 'C')
)

In [48]:
# Read the three test sets; each frame is indexed by its 'id' column.
a_test, b_test, c_test = (
    pd.read_csv(data_paths[country]['test'], index_col='id')
    for country in ('A', 'B', 'C')
)

In [49]:
# Dimensions of the country-B training frame as (rows, columns).
b_train.shape

(3255, 442)

In [50]:
# Dimensions of the country-B test frame as (rows, columns).
b_test.shape

(1604, 441)

In [51]:
# Stack the train rows on top of the test rows so the preprocessing cells
# below operate on both at once.
# NOTE(review): b_test has one column fewer than b_train (441 vs 442),
# presumably the 'poor' label, so that column would be NaN for test rows - confirm.
df = pd.concat([b_train,b_test])

In [52]:
class LabelCount(object):
    """Count (frequency) encoder for a set of DataFrame columns.

    ``fit`` records how often each value occurs in every configured column and
    ``transform`` replaces each value with that recorded count, writing the
    result into the frame it is given.

    Parameters
    ----------
    columns : list of str
        Names of the columns to encode.
    new_column : bool, default False
        When True the counts are written to ``<column>_label_count`` and the
        source column is left untouched; when False the source column is
        overwritten.
    missing : int, default 1
        Count used for values that were not recorded by ``fit``. This includes
        NaN, because ``value_counts`` skips missing values by default.
    """

    def __init__(self, columns, new_column=False, missing=1):
        self.count_dict = {}
        self.columns = columns
        self.new_column = new_column
        self.missing = missing

    def fit(self, df):
        """Record the value counts of every configured column of ``df``.

        Returns
        -------
        LabelCount
            ``self``, so that ``LabelCount(cols).fit(df)`` can be chained.
        """
        for column in self.columns:
            self.count_dict[column] = df[column].value_counts().to_dict()
        return self

    def transform(self, df):
        """Replace values with their fitted counts, modifying ``df`` in place.

        Returns None, like other in-place operations.

        Raises
        ------
        KeyError
            If a configured column has not been fitted yet.
        """
        for column in self.columns:
            if column not in self.count_dict:
                raise KeyError(
                    "column %r has not been fitted; call fit() first" % (column,)
                )
            counts = self.count_dict[column]

            new_column_name = column
            if self.new_column:
                new_column_name = column + "_label_count"

            # `counts` is bound as a default argument so the lookup table is
            # fixed for this column regardless of later loop iterations.
            df[new_column_name] = df[column].apply(
                lambda value, counts=counts: counts.get(value, self.missing)
            )


###### Label Count


In [53]:
# Count-encode every object-typed column of df in place. Columns are encoded
# independently, so a single encoder covering all of them is enough.
object_columns = df.select_dtypes(include=['object']).columns.tolist()
lc = LabelCount(object_columns)
lc.fit(df)
lc.transform(df)
    

In [54]:
# Target labels, taken from the original training frame rather than from df.
y = b_train['poor']

In [55]:
# Remove the label and the country column from the feature frame.
df = df.drop(columns=['poor', 'country'])

#### Find columns with NaNs

In [56]:
# Names of the columns that contain at least one missing value.
has_nan = df.isnull().any()
has_nan[has_nan].index.tolist()

['BRzuVmyf',
 'BXOWgPgL',
 'FGWqGkmD',
 'IrxBnWxE',
 'McFBIGsm',
 'OSmfjCbE',
 'aAufyreG',
 'dnlnKrAg',
 'umkFMfvA']

###### Fill NaNs with the column median

In [57]:
# Impute every column listed above with its own median.
# The filled column is assigned back to df: calling fillna(..., inplace=True)
# on a selected column is chained assignment, which warns on pandas 2.x and
# leaves df unchanged once Copy-on-Write is enabled.
nan_columns = ['BRzuVmyf', 'BXOWgPgL', 'FGWqGkmD', 'McFBIGsm', 'OSmfjCbE',
               'IrxBnWxE', 'aAufyreG', 'dnlnKrAg', 'umkFMfvA']
for nan_column in nan_columns:
    df[nan_column] = df[nan_column].fillna(df[nan_column].median())

In [67]:
# The training rows were concatenated first, so they are the leading
# len(b_train) rows of df.
train = df.iloc[:len(b_train)]

In [68]:
# Everything after the first len(b_train) rows of df is the test portion.
test = df.iloc[len(b_train):]

##### Robust scaler processing

In [93]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
# Fit the robust scaler on the combined frame (train + test rows), then apply
# the same fitted transform to each split.
scaler = RobustScaler().fit(df)
train_data_scaled, test_data_scaled = (
    scaler.transform(split) for split in (train, test)
)

In [94]:
from sklearn import model_selection

#from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


# Hold out a third of the labelled rows for a quick validation score.
seed = 7
test_size = 0.33

X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    train_data_scaled,
    y,
    test_size=test_size,
    random_state=seed,
)

# Alternatives the author left commented out: LogisticRegression(),
# RandomForestClassifier(n_estimators=500, random_state=0), and an
# XGBClassifier with learning_rate=0.1, n_estimators=500, max_depth=2,
# subsample=0.8, colsample_bytree=0.8.
xgb_params = dict(
    max_depth=3,
    n_estimators=300,
    learning_rate=0.05,
    objective="binary:logistic",
    min_child_weight=1,
    scale_pos_weight=1,
)
model = XGBClassifier(**xgb_params)
model.fit(X_train, Y_train)

# Class probabilities for the held-out rows.
y_pred = model.predict_proba(X_test)

In [95]:
from sklearn.metrics import log_loss

# Log loss on the held-out split; y_pred is the predict_proba output from the
# previous cell.
log_loss(Y_test, y_pred)

0.26448992704249769

In [100]:
# k-fold cross-validation of the XGBoost configuration on all labelled rows.
#
# KFold only uses random_state when shuffle=True; passing it without shuffling
# has no effect and raises ValueError on current scikit-learn releases. The
# folds used here are the default contiguous, unshuffled splits, so no
# random_state is given.
kfold = model_selection.KFold(n_splits=5)
#model = LogisticRegression()
model = XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05, objective= "binary:logistic",min_child_weight = 1,scale_pos_weight = 1)


# Mean negative log loss over the 5 folds (closer to 0 is better).
model_selection.cross_val_score(model, train_data_scaled, y, cv=kfold, scoring='neg_log_loss').mean()

-0.21072127179949657

In [None]:
# Refit the model on every labelled row before predicting the test set.
# NOTE(review): this cell has no execution count (In [None]) - confirm it was
# actually run before the predictions below were generated.
model.fit(train_data_scaled,y)

In [96]:
# Class probabilities for the scaled country-B test rows; make_country_sub
# later keeps column 1 as the 'poor' probability.
b_preds = model.predict_proba(test_data_scaled)

In [97]:
def make_country_sub(preds, test_feat, country):
    """Build the submission frame for one country.

    Parameters
    ----------
    preds : 2-D array
        Output of ``predict_proba``; column 1 is taken as the probability of
        the positive ("poor") class.
    test_feat : pandas.DataFrame
        Test features; only its index is used, to label the submission rows.
    country : str
        Country code written to every row. It is not validated here.

    Returns
    -------
    pandas.DataFrame
        Columns ``country`` and ``poor`` (in that order), indexed like
        ``test_feat``.
    """
    # The scalar country code is broadcast to every row of the index.
    country_sub = pd.DataFrame(
        {"country": country, "poor": preds[:, 1]},  # proba p=1
        index=test_feat.index,
    )
    return country_sub[["country", "poor"]]


In [98]:
# Convert the country-B predictions to a submission data frame; b_test
# supplies the row index.
b_sub = make_country_sub(b_preds, b_test, 'B')

In [99]:
# Write the country-B submission (index plus 'country' and 'poor' columns) to
# b_sub.csv in the current working directory.
b_sub.to_csv("b_sub.csv")