###### XGBOOST Parameter tuning
https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [23]:
#http://drivendata.co/blog/worldbank-poverty-benchmark/

%matplotlib inline

import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# data directory
DATA_DIR = os.path.join(r'C:\Users\piush\Desktop\Dataset\world_bank_poverty_data\household')

In [24]:
data_paths = {'A': {'train': os.path.join(DATA_DIR,  'A_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR,  'A_hhold_test.csv')}, 
              
              'B': {'train': os.path.join(DATA_DIR,  'B_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR,  'B_hhold_test.csv')}, 
              
              'C': {'train': os.path.join(DATA_DIR,  'C_hhold_train.csv'), 
                    'test':  os.path.join(DATA_DIR,  'C_hhold_test.csv')}}

In [25]:
# load training data
a_train = pd.read_csv(data_paths['A']['train'], index_col='id')
b_train = pd.read_csv(data_paths['B']['train'], index_col='id')
c_train = pd.read_csv(data_paths['C']['train'], index_col='id')

In [26]:
# load test data
a_test = pd.read_csv(data_paths['A']['test'], index_col='id')
b_test = pd.read_csv(data_paths['B']['test'], index_col='id')
c_test = pd.read_csv(data_paths['C']['test'], index_col='id')

In [27]:
a_train.shape

(8203, 345)

In [28]:
a_test.shape

(4041, 344)

In [29]:
df = pd.concat([a_train,a_test])

In [30]:
class LabelCount(object):

    def __init__(self, columns, new_column=False):
        self.count_dict = {}
        self.columns = columns
        self.new_column = new_column
        
    
    def fit(self, df):

        for column in self.columns:
            count = df[column].value_counts()
            
            self.count_dict[column] = count.to_dict()
        
    def transform(self, df):
        for column in self.columns:
            
            new_column_name = column
            
            if self.new_column:
                new_column_name = column + "_label_count"

            missing = 1
            df[new_column_name] = df[column].apply(lambda x : self.count_dict[column].get(x, missing))            


###### Label Count


In [31]:
for i in list(df.select_dtypes(include=['object']).columns.values):
    lc = LabelCount([i])
    lc.fit(df)
    lc.transform(df)
    

In [32]:
y = a_train['poor']

In [33]:
df = df.drop(['poor','country'], axis = 1)

In [34]:
train_c = df[:len(a_train)]

In [92]:
train_c.shape

(8203, 343)

In [35]:
from sklearn.feature_selection import VarianceThreshold, RFE, SelectKBest, chi2
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

X_minmax = MinMaxScaler(feature_range=(0,1)).fit_transform(train_c)
X_scored = SelectKBest(score_func=chi2, k='all').fit(X_minmax, y)
feature_scoring = pd.DataFrame({
        'feature': train_c.columns,
        'score': X_scored.scores_
    })

feature_scoring.sort_values('score', ascending=False).head(100)['feature'].values

array(['QyBloWXZ', 'ggNglVqE', 'wEbmsuJO', 'IZFarbPw', 'xkUFKUoW',
       'qgxmqJKa', 'pCgBHqsR', 'GhJKwVWC', 'ZRrposmO', 'bMudmjzJ',
       'ZnBLVaqz', 'EuJrVjyG', 'tHFrzjai', 'DxLvCGgv', 'YlZCqMNw',
       'phwExnuQ', 'PWShFLnY', 'AtGRGAYi', 'kLkPtNnh', 'DbUNVFwv',
       'GIMIxlmv', 'YKwvJgoP', 'CIGUXrRQ', 'JwtIxvKg', 'DNAfxPzs',
       'zFkComtB', 'VBjVVDwp', 'SeZULMCT', 'XDDOZFWf', 'hnrnuMte',
       'AlDbXTlZ', 'YFMZwKrU', 'UXhTXbuS', 'dCGNTMiG', 'LjvKYNON',
       'AsEmHUzj', 'uSKnVaKV', 'lQQeVmCa', 'ktBqxSwa', 'CtHqaXhY',
       'cqUmYeAp', 'ULMvnWcn', 'pKPTBZZq', 'UCAmikjV', 'UXfyiodk',
       'IKqsuNvV', 'naDKOzdk', 'JCDeZBXq', 'OKMtkqdQ', 'FlBqizNL',
       'uJYGhXqG', 'ZmJZXnoA', 'ihGjxdDj', 'FmSlImli', 'znHDEHZP',
       'lFcfBRGd', 'jdetlNNF', 'mycoyYwl', 'ucXrHdoC', 'ngwuvaCV',
       'tlxXCDiW', 'CNkSTLvx', 'ptEAnCSs', 'bIBQTaHw', 'XSgHIFXD',
       'CrfscGZl', 'BfGjiYom', 'vRIvQXtC', 'tbsBPHFD', 'rYvVKPAF',
       'ItpCDLDM', 'wxDnGIwN', 'xZBEXWPR', 'mvgxfsRb', 'nGTepf

In [116]:
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier, RandomForestClassifier, AdaBoostClassifier

model = RandomForestClassifier()
model.fit(train_c, y)

feature_imp = pd.DataFrame(model.feature_importances_, index=train_c.columns, columns=["importance"])
list(feature_imp.sort_values("importance", ascending=False).head(200).index)


['TiwRslOh',
 'QyBloWXZ',
 'wEbmsuJO',
 'IZFarbPw',
 'xkUFKUoW',
 'ZRrposmO',
 'qgxmqJKa',
 'GIMIxlmv',
 'bMudmjzJ',
 'ggNglVqE',
 'tHFrzjai',
 'zFkComtB',
 'KjkrfGLD',
 'YKwvJgoP',
 'OMtioXZZ',
 'naDKOzdk',
 'pCgBHqsR',
 'GhJKwVWC',
 'nEsgxvAq',
 'AsEmHUzj',
 'UaXLYMMh',
 'BfGjiYom',
 'NRVuZwXK',
 'vwpsXRGk',
 'bPOwgKnT',
 'lQQeVmCa',
 'ltcNxFzI',
 'YFMZwKrU',
 'qgMygRvX',
 'phwExnuQ',
 'DxLvCGgv',
 'rAkSnhJF',
 'UXfyiodk',
 'ngwuvaCV',
 'PWShFLnY',
 'AtGRGAYi',
 'ktBqxSwa',
 'dkPWxwSF',
 'kLkPtNnh',
 'dCGNTMiG',
 'HHAeIHna',
 'uRFXnNKV',
 'gfmfEyjQ',
 'XDDOZFWf',
 'ihGjxdDj',
 'EuJrVjyG',
 'JbjHTYUM',
 'YTdCRVJt',
 'SzUcfjnr',
 'DbUNVFwv',
 'SlDKnCuu',
 'ptEAnCSs',
 'LjvKYNON',
 'QZiSWCCB',
 'cqUmYeAp',
 'aWlBVrkK',
 'VIRwrkXp',
 'OLpGAaEu',
 'CrfscGZl',
 'glEjrMIg',
 'IKqsuNvV',
 'FlBqizNL',
 'SqGRfEuW',
 'CtHqaXhY',
 'wxDnGIwN',
 'HfKRIwMb',
 'vRIvQXtC',
 'ZnBLVaqz',
 'wakWLjkG',
 'gllMXToa',
 'mycoyYwl',
 'lVHmBCmb',
 'RJFKdmYJ',
 'YlZCqMNw',
 'hnrnuMte',
 'EJgrQqET',
 'VBjVVDwp',

In [122]:
df_c = df[['TiwRslOh',
 'QyBloWXZ',
 'wEbmsuJO',
 'IZFarbPw',
 'xkUFKUoW',
 'ZRrposmO',
 'qgxmqJKa',
 'GIMIxlmv',
 'bMudmjzJ',
 'ggNglVqE',
 'tHFrzjai',
 'zFkComtB',
 'KjkrfGLD',
 'YKwvJgoP',
 'OMtioXZZ',
 'naDKOzdk',
 'pCgBHqsR',
 'GhJKwVWC',
 'nEsgxvAq',
 'AsEmHUzj',
 'UaXLYMMh',
 'BfGjiYom',
 'NRVuZwXK',
 'vwpsXRGk',
 'bPOwgKnT',
 'lQQeVmCa',
 'ltcNxFzI',
 'YFMZwKrU',
 'qgMygRvX',
 'phwExnuQ',
 'DxLvCGgv',
 'rAkSnhJF',
 'UXfyiodk',
 'ngwuvaCV',
 'PWShFLnY',
 'AtGRGAYi',
 'ktBqxSwa',
 'dkPWxwSF',
 'kLkPtNnh',
 'dCGNTMiG',
 'HHAeIHna',
 'uRFXnNKV',
 'gfmfEyjQ',
 'XDDOZFWf',
 'ihGjxdDj',
 'EuJrVjyG',
 'JbjHTYUM',
 'YTdCRVJt',
 'SzUcfjnr',
 'DbUNVFwv',
 'SlDKnCuu',
 'ptEAnCSs',
 'LjvKYNON',
 'QZiSWCCB',
 'cqUmYeAp',
 'aWlBVrkK',
 'VIRwrkXp',
 'OLpGAaEu',
 'CrfscGZl',
 'glEjrMIg',
 'IKqsuNvV',
 'FlBqizNL',
 'SqGRfEuW',
 'CtHqaXhY',
 'wxDnGIwN',
 'HfKRIwMb',
 'vRIvQXtC',
 'ZnBLVaqz',
 'wakWLjkG',
 'gllMXToa',
 'mycoyYwl',
 'lVHmBCmb',
 'RJFKdmYJ',
 'YlZCqMNw',
 'hnrnuMte',
 'EJgrQqET',
 'VBjVVDwp',
 'pjHvJhoZ',
 'CIGUXrRQ',
 'RJQbcmKy',
 'UXhTXbuS',
 'EQKKRGkR',
 'OybQOufM',
 'UsENDgsH',
 'JwtIxvKg',
 'bCYWWTxH',
 'jdetlNNF',
 'UCAmikjV',
 'muIetHMK',
 'DNAfxPzs',
 'wKVwRQIp',
 'QayGNSmS',
 'SeZULMCT',
 'HKMQJANN',
 'znHDEHZP',
 'dqRtXzav',
 'pKPTBZZq',
 'jwEuQQve',
 'AlDbXTlZ',
 'uSKnVaKV',
 'ogHwwdzc',
 'ucXrHdoC',
 'NanLCXEI',
 'nGTepfos',
 'ishdUooQ',
 'lOujHrCk',
 'JCDeZBXq',
 'UXSJUVwD',
 'uJYGhXqG',
 'xZBEXWPR',
 'CNkSTLvx',
 'BCehjxAl',
 'bIBQTaHw',
 'wnESwOiN',
 'kZVpcgJL',
 'kWFVfHWP',
 'OKMtkqdQ',
 'iBQXwnGC',
 'WuwrCsIY',
 'qTginJts',
 'TnWhKowI',
 'tbsBPHFD',
 'mvgxfsRb',
 'qlZMvcWc',
 'MKozKLvT',
 'sslNoPlw',
 'ANBCxZzU',
 'CsGvKKBJ',
 'rYvVKPAF',
 'NrUWfvEq',
 'UjuNwfjv',
 'ZmJZXnoA',
 'bgoWYRMQ',
 'KcArMKAe',
 'ZzUrQSMj',
 'CqqwKRSn',
 'WAFKMNwv',
 'nzTeWUeM',
 'galsfNtg',
 'PXtHzrqw',
 'fpHOwfAs',
 'maLAYXwi',
 'cDkXTaWP',
 'NBfffJUe',
 'FmSlImli',
 'bxKGlBYX',
 'UCnazcxd',
 'yeHQSlwg',
 'KAJOWiiw',
 'WiwmbjGW',
 'gfurxECf',
 'wwfmpuWA',
 'uVnApIlJ',
 'tAYCAXge',
 'zzwlWZZC',
 'eeYoszDM',
 'bEPKkJXP',
 'MARfVwUE',
 'btgWptTG',
 'ErggjCIN',
 'rQAsGegu',
 'JzhdOhzb',
 'eoNxXdlZ',
 'UHGnBrNt',
 'mvGdZZcs',
 'QNLOXNwj',
 'orfSPOJX',
 'nqndbwXP',
 'srPNUgVy',
 'dyGFeFAg',
 'hnmsRSvN',
 'UGbBCHRE',
 'hESBInAl',
 'ytYMzOlW',
 'YXkrVgqt',
 'VXXLUaXP',
 'mRgnuJVE',
 'sFWbFEso',
 'ccAHraiP',
 'fxbqfEWb',
 'gOGWzlYC',
 'NmAVTtfA',
 'lFcfBRGd',
 'dEpQghsA',
]]

In [123]:
train = df_c[:len(a_train)]

In [124]:
test = df_c[len(a_train):]

##### Standard Scalar processing

In [125]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
scaler = MinMaxScaler()
scaler.fit(df_c)

train_data_scaled = scaler.transform(train)
test_data_scaled = scaler.transform(test)

In [126]:
from sklearn import model_selection
from sklearn.metrics import log_loss

#from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from xgboost import XGBClassifier

test_size = 0.33

seed = 7

X_train, X_test, Y_train, Y_test = model_selection.train_test_split(train_data_scaled, y, test_size=test_size, random_state=seed)

#model = LogisticRegression()
#model = RandomForestClassifier(n_estimators=10, random_state=0)
model = XGBClassifier(max_depth=3, learning_rate=0.4, n_estimators=50, 
                      silent=True, objective='binary:logistic',  
                      gamma=0, min_child_weight=1, max_delta_step=1, 
                      subsample=1, colsample_bytree=1, colsample_bylevel=1, 
                      reg_alpha=0, reg_lambda=1, scale_pos_weight= 1, 
                      base_score= 0.3)

#model =  GaussianProcessClassifier(1.0 * RBF(1.0))
model.fit(X_train, Y_train)

y_pred = model.predict_proba(X_test)

log_loss(Y_test, y_pred)

#print("f1_score: {}".format(f1_score(, average='micro')))

0.29471822164754397

In [91]:
# #k-fold 

# num_instances = len(train_data_scaled)
# seed = 7

# kfold = model_selection.KFold(n_splits=5, random_state=seed)
# #model = LogisticRegression()
# model = XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05, objective= "binary:logistic",min_child_weight = 1,scale_pos_weight = 1)


# model_selection.cross_val_score(model, train_data_scaled, y, cv=kfold, scoring='neg_log_loss').mean()

-0.3475704040692198

In [111]:
model.fit(train_data_scaled,y)

XGBClassifier(base_score=0.3, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.4, max_delta_step=1, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=50, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [112]:
a_preds = model.predict_proba(test_data_scaled)

In [113]:
def make_country_sub(preds, test_feat, country):
    # make sure we code the country correctly
    country_codes = ['A', 'B', 'C']
    
    # get just the poor probabilities
    country_sub = pd.DataFrame(data=preds[:, 1],  # proba p=1
                               columns=['poor'], 
                               index=test_feat.index)

    
    # add the country code for joining later
    country_sub["country"] = country
    return country_sub[["country", "poor"]]


In [114]:
# convert preds to data frames
a_sub = make_country_sub(a_preds, a_test, 'A')

In [115]:
a_sub.to_csv("a_sub_6.csv")

In [34]:
# b_sub = pd.read_csv("b_sub.csv")

In [35]:
# b_sub = b_sub.set_index("id")

In [36]:
b_sub

Unnamed: 0_level_0,country,poor
id,Unnamed: 1_level_1,Unnamed: 2_level_1
9135,B,0.035659
117,B,0.064033
29085,B,0.008225
55442,B,0.028141
29281,B,0.008137
63854,B,0.003112
87565,B,0.020042
64033,B,0.007160
395,B,0.009856
64544,B,0.029994


In [37]:
c_sub = pd.read_csv("c_sub.csv")

In [39]:
C_sub = c_sub.set_index("id")

In [40]:
c_sub.tail()

Unnamed: 0,id,country,poor
3182,6775,C,3.4e-05
3183,88300,C,1.3e-05
3184,35424,C,0.000176
3185,81668,C,3.5e-05
3186,98377,C,6.8e-05


In [41]:
submission = pd.concat([a_sub, b_sub, c_sub])

In [42]:
#submission = submission.drop("id", axis = 1)

In [43]:
submission.shape

(8832, 3)

In [45]:
submission.to_csv('submission_X_G_tr2.csv')

In [46]:
submission.tail()

Unnamed: 0,country,id,poor
3182,C,6775.0,3.4e-05
3183,C,88300.0,1.3e-05
3184,C,35424.0,0.000176
3185,C,81668.0,3.5e-05
3186,C,98377.0,6.8e-05
