In [1]:
#http://drivendata.co/blog/worldbank-poverty-benchmark/

%matplotlib inline

import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
# Let pandas render up to 500 columns / rows when a frame is displayed.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Directory holding the household-level CSV files.
# The original absolute path stays the default, so existing runs behave the same;
# set the POVERTY_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
DATA_DIR = os.environ.get(
    'POVERTY_DATA_DIR',
    r'C:\Users\piush\Desktop\Dataset\world_bank_poverty_data\household',
)

In [2]:
# Train/test household CSV locations under DATA_DIR, keyed by country then split.
data_paths = dict(
    A=dict(
        train=os.path.join(DATA_DIR, 'A_hhold_train.csv'),
        test=os.path.join(DATA_DIR, 'A_hhold_test.csv'),
    ),
    B=dict(
        train=os.path.join(DATA_DIR, 'B_hhold_train.csv'),
        test=os.path.join(DATA_DIR, 'B_hhold_test.csv'),
    ),
    C=dict(
        train=os.path.join(DATA_DIR, 'C_hhold_train.csv'),
        test=os.path.join(DATA_DIR, 'C_hhold_test.csv'),
    ),
)

In [3]:
# Read the training CSV of each country, using the 'id' column as the index.
a_train, b_train, c_train = (
    pd.read_csv(data_paths[country]['train'], index_col='id')
    for country in ('A', 'B', 'C')
)

In [4]:
# Read the test CSV of each country, using the 'id' column as the index.
a_test, b_test, c_test = (
    pd.read_csv(data_paths[country]['test'], index_col='id')
    for country in ('A', 'B', 'C')
)

In [5]:
# (rows, columns) of the country C training frame.
c_train.shape

(6469, 164)

In [6]:
# (rows, columns) of the country C test frame.
c_test.shape

(3187, 163)

In [7]:
# Stack the country C training rows on top of the test rows so the encoding and
# scaling steps that follow are applied to both sets at once.
# NOTE(review): the shapes shown above differ by one column (164 vs 163),
# presumably the 'poor' target; that column would be NaN for the test rows
# after the concat — confirm.
df = pd.concat([c_train,c_test])

In [8]:
class LabelCount(object):
    """Count (frequency) encoder for categorical columns.

    Each category is replaced by the number of times it occurred in the frame
    passed to ``fit``. Categories that were not seen during ``fit`` are encoded as 1.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns to encode.
    new_column : bool, default False
        If True the encoded values are written to ``<column>_label_count`` and
        the source column is left untouched; otherwise the source column is
        overwritten.
    """

    def __init__(self, columns, new_column=False):
        # Maps column name -> {category: occurrence count}; filled by fit().
        self.count_dict = {}
        self.columns = columns
        self.new_column = new_column

    def fit(self, df):
        """Learn the per-category occurrence counts of every configured column.

        Returns ``self`` so that calls can be chained (``enc.fit(df).transform(df)``).
        """
        for column in self.columns:
            # value_counts() drops NaN by default, so missing values are never
            # stored and are treated as unseen categories by transform().
            self.count_dict[column] = df[column].value_counts().to_dict()
        return self

    def transform(self, df):
        """Replace categories by their learned counts, modifying ``df`` in place.

        The same (mutated) frame is also returned for convenience.

        Raises
        ------
        KeyError
            If a configured column was never fitted or is absent from ``df``.
        """
        missing = 1  # count assigned to categories that were not seen in fit()
        for column in self.columns:
            counts = self.count_dict[column]

            new_column_name = column
            if self.new_column:
                new_column_name = column + "_label_count"

            # Bind `counts` as a default argument so the lambda never depends on
            # the loop variable.
            df[new_column_name] = df[column].apply(
                lambda x, counts=counts: counts.get(x, missing)
            )
        return df


###### Label Count


In [9]:
# Count-encode every object-typed column of `df` in place, one encoder per column.
for i in df.select_dtypes(include=['object']).columns.tolist():
    lc = LabelCount([i])
    lc.fit(df)
    lc.transform(df)
    

In [10]:
# Target vector, taken from the untouched training frame rather than from the
# encoded `df`.
y = c_train['poor']

In [11]:
# Remove the target and the country column from the feature frame.
df = df.drop(columns=['poor', 'country'])

In [12]:
# The first len(c_train) rows of the stacked frame are the training households.
train = df.iloc[:len(c_train)]

In [13]:
# Every row after the first len(c_train) rows belongs to the test households.
test = df.iloc[len(c_train):]

##### Feature scaling with MinMaxScaler

In [14]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
# Rescale the features with a MinMaxScaler and apply it to both splits.
# NOTE(review): the scaler is fitted on `df`, i.e. on the training and test rows
# together, so the test rows influence the scaling of the training data —
# confirm this is intended.
scaler = MinMaxScaler()
scaler.fit(df)

train_data_scaled = scaler.transform(train)
test_data_scaled = scaler.transform(test)

In [15]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
class Ensemble(object):
    """Two-level stacking ensemble for binary classification.

    Every base model is trained with stratified K-fold cross-validation. Its
    out-of-fold positive-class probabilities become one feature column for the
    stacker, and its test-set probabilities are averaged over the folds.

    Parameters
    ----------
    n_splits : int
        Number of stratified folds used for the base models.
    stacker : estimator
        Second-level model; must provide ``fit`` and ``predict_proba``.
    base_models : sequence of estimators
        First-level models; each must provide ``fit`` and ``predict_proba``.
        They are fitted in place (not cloned), once per fold.
    random_state : int, default 2016
        Seed of the shuffled fold split (previously hard-coded).
    stacker_cv : int, default 3
        Number of folds used to report the stacker's cross-validated score
        (previously hard-coded).
    """

    def __init__(self, n_splits, stacker, base_models, random_state=2016, stacker_cv=3):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models
        self.random_state = random_state
        self.stacker_cv = stacker_cv

    def fit_predict(self, X, y, T):
        """Fit the stack on ``(X, y)`` and return positive-class probabilities for ``T``.

        Parameters
        ----------
        X : array-like
            Training features, one row per sample.
        y : array-like
            Binary training labels aligned with the rows of ``X``.
        T : array-like
            Test features with the same columns as ``X``.

        Returns
        -------
        numpy.ndarray
            Column 1 of the stacker's ``predict_proba`` for every row of ``T``.
        """
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        splitter = StratifiedKFold(n_splits=self.n_splits, shuffle=True,
                                   random_state=self.random_state)
        # Materialise the folds once so every base model sees the same splits.
        folds = list(splitter.split(X, y))

        n_models = len(self.base_models)
        S_train = np.zeros((X.shape[0], n_models))  # out-of-fold predictions
        S_test = np.zeros((T.shape[0], n_models))   # fold-averaged test predictions
        for i, clf in enumerate(self.base_models):

            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]

                print ("Fit %s fold %d" % (str(clf).split('(')[0], j+1))
                clf.fit(X_train, y_train)

                # Each training row is predicted by the one fit that did not see it.
                S_train[test_idx, i] = clf.predict_proba(X_holdout)[:,1]
                S_test_i[:, j] = clf.predict_proba(T)[:,1]
            S_test[:, i] = S_test_i.mean(axis=1)

        # Report the stacker's mean negative log loss on the out-of-fold features.
        results = cross_val_score(self.stacker, S_train, y, cv=self.stacker_cv,
                                  scoring='neg_log_loss')
        print("Stacker score: %.5f" % (results.mean()))

        self.stacker.fit(S_train, y)
        res = self.stacker.predict_proba(S_test)[:,1]
        return res

In [None]:
from sklearn import model_selection
from sklearn.metrics import log_loss

#from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from xgboost import XGBClassifier

from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier


# Hyper-parameter sets for the four LightGBM base models.
lgb_params = {
    'learning_rate': 0.02,
    'n_estimators': 650,
    'max_bin': 10,
    'subsample': 0.8,
    'subsample_freq': 10,
    'colsample_bytree': 0.8,
    'min_child_samples': 500,
    'random_state': 99,
}

lgb_params2 = {
    'n_estimators': 1090,
    'learning_rate': 0.02,
    'colsample_bytree': 0.3,
    'subsample': 0.7,
    'subsample_freq': 2,
    'num_leaves': 16,
    'random_state': 99,
}

lgb_params3 = {
    'n_estimators': 1100,
    'max_depth': 4,
    'learning_rate': 0.02,
    'random_state': 99,
}

# A fourth, hand-picked parameter set.
lgb_params4 = {
    'n_estimators': 1450,
    'max_bin': 20,
    'max_depth': 6,
    'learning_rate': 0.25,       # shrinkage_rate
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'min_data': 500,             # min_data_in_leaf
    'min_hessian': 0.05,         # min_sum_hessian_in_leaf
    'verbose': 0,
}

# First-level (base) models: one LightGBM classifier per parameter set.
lgb_model = LGBMClassifier(**lgb_params)

lgb_model2 = LGBMClassifier(**lgb_params2)

lgb_model3 = LGBMClassifier(**lgb_params3)

lgb_model4 = LGBMClassifier(**lgb_params4)

# Second-level model (the stacker). Despite the name `log_model`, this is an
# XGBoost classifier, not a logistic regression.
log_model = XGBClassifier(max_depth=100, learning_rate=0.5, n_estimators=50, 
                      silent=True, objective='binary:logistic',  
                      gamma=0.1, min_child_weight=1, max_delta_step=0.5, 
                      subsample=1, colsample_bytree=1, colsample_bylevel=1, 
                      reg_alpha=0, reg_lambda=1, scale_pos_weight= 1, 
                      base_score=0.3)
# Run the whole stack once for every fold count from 2 to 39 and keep each run's
# test-set predictions in `a` (one array per run).
# The same estimator objects are passed to every Ensemble, so they are refitted
# on each run; after the loop `y_pred` holds only the last run (n_splits=39).
a = []
for i in range(2,40):
    stack = Ensemble(n_splits= i,
        stacker = log_model,
        base_models = (lgb_model, lgb_model2, lgb_model3, lgb_model4))        
        
    y_pred = stack.fit_predict(train_data_scaled, y, test_data_scaled)
    a.append(y_pred)



Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Stacker score: -0.02443
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Stacker score: -0.02779
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 4
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 4
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 4
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 4
Stacker score: -

In [52]:
# `a` holds one prediction array per stacking run; after the transpose each
# column is one run and each row is one test household.
c_stack_11 = pd.DataFrame(a).T

In [91]:
# Row-wise average of the per-run predictions.
# NOTE(review): the saved output below is in the hundreds/thousands rather than
# in [0, 1], and each value is ~id/40 for the ids shown by the later `lgbsub`
# display (e.g. 6211 / 40 = 155.275). It looks like it was produced from a frame
# that already contained an 'id' column — re-run on a fresh kernel to confirm.
c_stack_11.mean(axis = 1)

0        155.275149
1       1938.725078
2        870.850029
3       1866.250029
4       1634.375532
5       1711.599775
6        154.524778
7       2389.300029
8       1842.424761
9        776.728315
10       225.600050
11       697.325029
12      1854.150029
13      1183.575029
14       532.324777
15      1604.332256
16      1917.550029
17      1773.599787
18      1485.050029
19        80.574792
20      1523.500029
21       133.027555
22      1348.025120
23       558.225101
24      1625.449782
25      1558.425029
26       708.925029
27      2363.299780
28      1676.950029
29       390.625250
30      2319.600088
31      2233.200029
32      1013.900048
33      1094.250498
34      1793.625029
35      2252.099787
36      1983.375052
37       447.725029
38       920.075029
39      1824.054447
40      1151.349781
41        99.875066
42      2129.850029
43      1050.774788
44        16.024789
45       327.300030
46      2229.300029
47      1973.124780
48       438.650030
49       648.500925


In [79]:
# Submission frame for country C: one row per test household.
# The original cell assigned `mean`, a name that is not defined in any visible
# cell (NameError on Restart & Run All). Use the row-wise average of the per-run
# stacked predictions instead; `.values` avoids any index alignment.
lgbsub = pd.DataFrame()
lgbsub['id'] = c_test.index
lgbsub['country'] = "C"
lgbsub['poor'] = c_stack_11.mean(axis=1).values

In [81]:
# Display the submission frame as a visual sanity check.
lgbsub

Unnamed: 0,id,country,poor
0,6211,C,155.275149
1,77549,C,1938.725078
2,34834,C,870.850029
3,74650,C,1866.250029
4,65375,C,1634.375532
5,68425,C,1711.599775
6,6142,C,154.524778
7,95572,C,2389.300029
8,73658,C,1842.424761
9,31069,C,776.728315


In [64]:
# Write `c_sub_2` to CSV (the index is written too, since index=False is not passed).
# NOTE(review): `c_sub_2` is not assigned in any visible cell — this relies on
# leftover kernel state and will fail on a fresh run unless it is defined elsewhere.
c_sub_2.to_csv("c_sub_2.csv")

In [37]:
# Write `c_stack` to CSV (the index is written too, since index=False is not passed).
# NOTE(review): `c_stack` is not assigned in any visible cell (only `c_stack_11`
# is) — confirm where it comes from before relying on this cell.
c_stack.to_csv("c_stack.csv")

In [65]:
# Average only the per-run prediction columns. Averaging the whole frame made
# this cell non-idempotent: on a re-run, the 'poor' and 'id' columns added below
# were folded into the mean, giving values of roughly id / n_columns instead of
# probabilities.
helper_cols = ('id', 'country', 'poor')
pred_cols = [col for col in c_stack.columns if col not in helper_cols]
c_stack['poor'] = c_stack[pred_cols].mean(axis=1)
c_stack['id'] = c_test.index
c_stack['country'] = "C"
c_sub_1 = c_stack[['id','country','poor']]
c_sub_1.to_csv("c_sub_1.csv")

In [51]:
# NOTE(review): leftover scratch expression; `c` is not assigned in any visible
# cell, so this fails on a fresh kernel unless it is defined elsewhere.
c

Series([], dtype: float64)

In [37]:
# Build the country C submission from `y_pred` and write it without the index.
lgbsub = pd.DataFrame({
    'id': c_test.index,
    'country': "C",
    'poor': y_pred,
})
lgbsub.to_csv('lgb_esm_submission_C_60.csv', index=False)

In [None]:
# 10 : -0.02
#15 : -.232
# 20 : -0.0281
#25 : -0.2898
#30 : -.0298
#35 : -.0243
#40 : -0.02898
#45: -0.02418
#50 : -.02598
#55: -.02517