In [1]:
import numpy as np
import pandas as pd
import json

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

# This class is for the preprocessing, take csv path as input, and aim to return a pandas data frame for trainning
class PreProcessing:

  # The constructor takes a pandas dataframe as input and save it to self.df
  def __init__(self, csvpath):
    self.df = pd.read_csv(csvpath)
    if "train" in csvpath:
      self.dftype = 1
    elif "test" in csvpath:
      self.dftype = 2
    else:
      self.dftype = -1

  # This method have deal with missing data before merge or drop
  def MissingData(self):
    self.df = self.df.replace(-1, np.NaN) #first, replace -1 to NaN
    #print (self.df.columns[self.df.isnull().any()])
    '''
    'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 
    'ps_reg_03',
    'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_05_cat', 'ps_car_07_cat', 'ps_car_09_cat', 
    'ps_car_11', 'ps_car_12', 'ps_car_14'
    '''
    mean_imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    mdan_imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    mfrq_imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)

    self.df["ps_ind_02_cat"].fillna(-1, inplace=True)
    self.df["ps_ind_04_cat"].fillna(-1, inplace=True)
    self.df["ps_ind_05_cat"].fillna(-1, inplace=True)
    #self.df["ps_reg_03"].fillna(self.df["ps_reg_03"].median(), inplace=True)
    #self.df['ps_reg_03'] = mean_imp.fit_transform(self.df[['ps_reg_03']]).ravel()
    self.df["ps_reg_03"].fillna(2 * self.df['ps_reg_03'].value_counts().idxmax(), inplace=True)
    self.df["ps_car_01_cat"].fillna(-1, inplace=True)
    self.df["ps_car_02_cat"].fillna(-1, inplace=True)
    #self.df["ps_car_03_cat"].fillna(self.df["ps_car_03_cat"].value_counts().idxmax(), inplace=True) # top 1 missing variable, drop
    #self.df["ps_car_05_cat"].fillna(self.df["ps_car_05_cat"].value_counts().idxmax(), inplace=True) # top 2 missing variable, drop
    self.df["ps_car_03_cat"].fillna(-1, inplace=True) # top 1 missing variable
    self.df["ps_car_05_cat"].fillna(-1, inplace=True) # top 2 missing variable
    self.df["ps_car_07_cat"].fillna(-1, inplace=True)
    self.df["ps_car_09_cat"].fillna(-1, inplace=True)
    #self.df["ps_car_11"].fillna(self.df["ps_car_11"].value_counts().idxmax(), inplace=True)
    self.df["ps_car_11"].fillna(-1, inplace=True) # this is catually a cat variable, inverted with ps_car_11_cat
    #self.df['ps_car_11'] = mfrq_imp.fit_transform(self.df[['ps_car_11']]).ravel()
    #self.df["ps_car_12"].fillna(self.df["ps_car_12"].median(), inplace=True)
    #self.df['ps_car_12'] = mean_imp.fit_transform(self.df[['ps_car_12']]).ravel()
    self.df['ps_car_12'].fillna(2 * self.df['ps_car_12'].value_counts().idxmax(), inplace=True)
    #self.df["ps_car_14"].fillna(self.df["ps_car_14"].median(), inplace=True)
    #self.df['ps_car_14'] = mean_imp.fit_transform(self.df[['ps_car_14']]).ravel()
    self.df["ps_car_14"].fillna(2 * self.df['ps_car_14'].value_counts().idxmax(), inplace=True)

    #self.df[""].fillna(self.df[""].mean(), inplace=True)
    #self.df[""].fillna(self.df[""].median(), inplace=True)
    #self.df[""].fillna(self.df[""].value_counts().idxmax(), inplace=True)
    return

  # This method drop the original catagory labels and replaced with one hot labels
  def OneHotReplacement(self):

    #onehot_cols = ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_07_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11']
    onehot_cols = ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_07_cat', 'ps_car_09_cat', 'ps_car_11']

    self.df = pd.get_dummies(self.df, columns=onehot_cols, drop_first=True)
    #onehot = pd.get_dummies(self.df['ps_ind_02_cat'])
    #self.df.drop(['ps_ind_02_cat'], axis = 1, inplace = True)
    #self.df = self.df.join(onehot)
    return

  # This method drop or merge variables in dataframe accroding to corr map
  def CorrMergeDrop(self):
    #self.df['ps_ind_06070809_bin'] = self.df.apply(
    #  lambda x: 1 if x['ps_ind_06_bin'] == 1 
    #              else 
    #              (2 if x['ps_ind_07_bin'] == 1 
    #                 else 
    #                 ( 3 if x['ps_ind_08_bin'] == 1 
    #                     else 
    #                     (4 if x['ps_ind_09_bin'] == 1 else 5)
    #                 )
    #              ), axis = 1)
    #self.df.drop(['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin'], axis = 1, inplace = True)

    #self.df['ps_ind_161718_bin'] = self.df.apply(lambda x: 1 if x['ps_ind_16_bin'] == 1 
    #                                                         else (2 if x['ps_ind_17_bin'] == 1 else 3), axis = 1)
    #self.df.drop(['ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin'], axis = 1, inplace = True)
 
    # drop this variable from preprocessing study, top 3 missing data, and not important at all
    #self.df.drop(['ps_car_03_cat'], axis = 1, inplace = True)
    #self.df.drop(['ps_car_05_cat'], axis = 1, inplace = True)
    # drop less important features, random forest
    #'''
    #self.df.drop(['ps_calc_15_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_calc_16_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_calc_17_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_calc_18_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_calc_19_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_calc_20_bin'], axis = 1, inplace = True)
    self.df.drop(['ps_ind_10_bin'], axis = 1, inplace = True)
    self.df.drop(['ps_ind_11_bin'], axis = 1, inplace = True)
    self.df.drop(['ps_ind_12_bin'], axis = 1, inplace = True)
    self.df.drop(['ps_ind_13_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_ind_14'], axis = 1, inplace = True)
    #self.df.drop(['ps_car_04_cat'], axis = 1, inplace = True)
    self.df.drop(['ps_car_10_cat'], axis = 1, inplace = True)
    #'''
    #self.df['ps_car_13'] = (self.df['ps_car_13']*self.df['ps_car_13']*48400).round(0)
    #self.df['ps_car_12'] = (self.df['ps_car_12']*self.df['ps_car_12']).round(4) * 10000
    return

  # scale the features 
  def ScaleFeatures(self):
    scaler = StandardScaler(copy=False)
    if self.dftype == 1:
      scaler.fit_transform(self.df.drop(['id','target'], axis=1))
    elif self.dftype == 2:
      scaler.fit_transform(self.df.drop(['id'], axis=1))
    else:
      print ("neither train nor test!")
    return

  # this method pack all previous preprocessing all together and return the data frame
  def FinalFrameforTrainning(self):
    self.MissingData()
    self.OneHotReplacement()
    self.CorrMergeDrop()
    self.ScaleFeatures()
    #print (self.df)
    return self.df


In [2]:
  preprocessing = PreProcessing('../data/train.csv')
  train_p = preprocessing.FinalFrameforTrainning()
  print ("done with trainning set preprocessing!")
  #train_p.to_csv('train_p.csv', index = False)
  preprocessing = PreProcessing('../data/test.csv')
  test_p = preprocessing.FinalFrameforTrainning()
  print ("done with test set preprocessing!")
  #test_p.to_csv('test_p.csv', index = False)
  #train_p = pd.read_csv('train_p.csv')
  #test_p = pd.read_csv('test_p.csv')

done with trainning set preprocessing!
done with test set preprocessing!


In [3]:
print(train_p.head())
print(train_p.shape)
print(test_p.shape)

   id  target  ps_ind_01  ps_ind_03  ps_ind_06_bin  ps_ind_07_bin  \
0   7       0          2          5              0              1   
1   9       0          1          7              0              0   
2  13       0          5          9              0              0   
3  16       0          0          2              1              0   
4  17       0          0          0              1              0   

   ps_ind_08_bin  ps_ind_09_bin  ps_ind_14  ps_ind_15      ...        \
0              0              0          0         11      ...         
1              1              0          0          3      ...         
2              1              0          0         12      ...         
3              0              0          0          8      ...         
4              0              0          0          9      ...         

   ps_car_07_cat_1.0  ps_car_09_cat_0.0  ps_car_09_cat_1.0  ps_car_09_cat_2.0  \
0                  1                  1                  0             

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
import xgboost

'''
X_train, X_test, y_train, y_test = train_test_split(
                                                    train_p.drop(['id', 'target'],axis=1),
                                                    train_p.target,
                                                    train_size=0.7,
                                                    test_size=0.3,
                                                    random_state=0
                                                   )
#xgb = XGBClassifier(    
#                    n_estimators=1000,
#                    max_depth=6,
#                    objective="binary:logistic",
#                    learning_rate=0.07, 
#                    subsample=.8,
#                    min_child_weight=.77,
#                    colsample_bytree=.8,
#                    scale_pos_weight=1.6,
#                    gamma=10,
#                    reg_alpha=8,
#                    reg_lambda=1.3,
#                    )

xgb = XGBClassifier(n_estimators=1000, objective="binary:logistic", learning_rate=0.014, max_depth=4, min_child_weight=5, gamma=0, subsample=0.8, colsample_bytree=0.8,  reg_alpha=0, reg_lambda=1, nthread=6)
                    
xgb.fit(X_train, y_train)
print("----- Training Done -----")
ysc_pred = xgb.predict_proba(X_test)[:,1]
#print (ysc_pred)
#print (y_test)
gini = 2*roc_auc_score(y_test, ysc_pred)-1
print(gini)
'''

'\nX_train, X_test, y_train, y_test = train_test_split(\n                                                    train_p.drop([\'id\', \'target\'],axis=1),\n                                                    train_p.target,\n                                                    train_size=0.7,\n                                                    test_size=0.3,\n                                                    random_state=0\n                                                   )\n#xgb = XGBClassifier(    \n#                    n_estimators=1000,\n#                    max_depth=6,\n#                    objective="binary:logistic",\n#                    learning_rate=0.07, \n#                    subsample=.8,\n#                    min_child_weight=.77,\n#                    colsample_bytree=.8,\n#                    scale_pos_weight=1.6,\n#                    gamma=10,\n#                    reg_alpha=8,\n#                    reg_lambda=1.3,\n#                    )\n\nxgb = XGBClassifier(n_es

In [8]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

In [9]:
features = train_p.drop(['id', 'target'],axis=1).columns.values
x, y = (list(x) for x in zip(*sorted(zip(xgb.feature_importances_, features), 
                                                            reverse = False)))
#print(y)
indice = 0
for thisy in y:
    if 'ps_ind' in thisy:
        print(indice)
        print(y[indice])
        print(x[indice])
    indice = indice+1

trace2 = go.Bar(
    x=x ,
    y=y,
    marker=dict(
        color=x,
        colorscale = 'Viridis',
        reversescale = True
    ),
    name='XGBoost Feature importance',
    orientation='h',
)

layout = dict(
    title='Feature importances, XGBoost',
     width = 900, height = 2000,
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=True,
#         domain=[0, 0.85],
    ))

fig1 = go.Figure(data=[trace2])
fig1['layout'].update(layout)
py.iplot(fig1, filename='plots')

10
ps_ind_02_cat_4.0
0.000775248
12
ps_ind_05_cat_5.0
0.00098668
13
ps_ind_05_cat_1.0
0.00105716
14
ps_ind_14
0.00119811
17
ps_ind_02_cat_3.0
0.00140954
18
ps_ind_18_bin
0.00148002
24
ps_ind_05_cat_4.0
0.00204384
38
ps_ind_04_cat_1.0
0.00331243
41
ps_ind_05_cat_3.0
0.00373529
47
ps_ind_08_bin
0.00577912
50
ps_ind_02_cat_2.0
0.00690676
51
ps_ind_05_cat_6.0
0.00711819
52
ps_ind_02_cat_1.0
0.0074001
54
ps_ind_04_cat_0.0
0.00761153
58
ps_ind_09_bin
0.0092325
64
ps_ind_05_cat_2.0
0.010783
67
ps_ind_07_bin
0.0116287
72
ps_ind_16_bin
0.0134611
75
ps_ind_06_bin
0.0144478
82
ps_ind_05_cat_0.0
0.030869
83
ps_ind_17_bin
0.0317147
84
ps_ind_01
0.0329833
86
ps_ind_15
0.0435549
89
ps_ind_03
0.0666009


In [89]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
def xgb_param_selection(nfolds):
    #sc_mod = XGBClassifier(n_estimators=100, objective="binary:logistic", learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, nthread=6)
    #sc_mod = XGBClassifier(n_estimators=100, objective="binary:logistic", learning_rate=0.1, max_depth=4, min_child_weight=5, subsample=0.8, colsample_bytree=0.8, nthread=6)
    #sc_mod = XGBClassifier(n_estimators=100, objective="binary:logistic", learning_rate=0.1, max_depth=4, min_child_weight=5, gamma=0, nthread=6)
    #sc_mod = XGBClassifier(n_estimators=100, objective="binary:logistic", learning_rate=0.1, max_depth=4, min_child_weight=5, gamma=0, subsample=0.8, colsample_bytree=0.8, nthread=6)
    #sc_mod = XGBClassifier(objective="binary:logistic", max_depth=4, min_child_weight=5, gamma=0, subsample=0.8, colsample_bytree=0.8, reg_alpha=0, reg_lambda=1, nthread=6)
    sc_mod = XGBClassifier(n_estimators=1000, objective="binary:logistic", learning_rate=0.14, subsample=0.8, colsample_bytree=0.8, nthread=6)

    mchdw = [1,3,5]
    mdeps = [3,4,5,6,8,10,12]
    #mchdw = [1,2,3,4,5,6,7]
    #mdeps = [4]
    #mchdw = [5]
    #mdeps = [4]
    param_grid = {'min_child_weight' : mchdw, 'max_depth': mdeps}
    #mnsps = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25]
    #param_grid = {'gamma' : mnsps}
    #subss = [0.6, 0.7, 0.8, 0.9]
    #colst = [0.6, 0.7, 0.8, 0.9]
    #param_grid = {'subsample' : subss, 'colsample_bytree': colst}
    #regl1 = [0]
    #regl2 = [0, 0.01, 0.1, 1, 10, 100]
    #regl1 = [0, 0.1, 1, 10]
    #regl2 = [0.1, 1]
    #param_grid = {'reg_alpha' : regl1, 'reg_lambda': regl2}
    #nests = [100, 200, 500, 1000]
    #lrate = [0.01, 0.1, 1]
    #nests = [1000]
    #lrate = [0.005, 0.01, 0.015, 0.02]
    #nests = [1000]
    #lrate = [0.011, 0.012, 0.013, 0.014, 0.015]      
    #param_grid = {'n_estimators': nests, 'learning_rate' : lrate}
    grid_search = GridSearchCV(sc_mod, param_grid, scoring='roc_auc', cv=nfolds, verbose=2)
    grid_search.fit(train_p.drop(['id', 'target'],axis=1), train_p.target)
    #print ("CV results")
    #print (grid_search.cv_results_)
    print ("Grid Scores:")
    print (grid_search.grid_scores_)
    print ("Best parameters:")
    print (grid_search.best_params_)
    return grid_search.cv_results_

In [90]:
xgb_param_selection(3)

Fitting 3 folds for each of 21 candidates, totalling 63 fits
[CV] max_depth=3, min_child_weight=1 .................................
[CV] .................. max_depth=3, min_child_weight=1, total= 2.4min
[CV] max_depth=3, min_child_weight=1 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.5min remaining:    0.0s


[CV] .................. max_depth=3, min_child_weight=1, total= 2.4min
[CV] max_depth=3, min_child_weight=1 .................................
[CV] .................. max_depth=3, min_child_weight=1, total= 2.4min
[CV] max_depth=3, min_child_weight=3 .................................
[CV] .................. max_depth=3, min_child_weight=3, total= 2.5min
[CV] max_depth=3, min_child_weight=3 .................................
[CV] .................. max_depth=3, min_child_weight=3, total= 2.5min
[CV] max_depth=3, min_child_weight=3 .................................
[CV] .................. max_depth=3, min_child_weight=3, total= 2.3min
[CV] max_depth=3, min_child_weight=5 .................................
[CV] .................. max_depth=3, min_child_weight=5, total= 2.4min
[CV] max_depth=3, min_child_weight=5 .................................
[CV] .................. max_depth=3, min_child_weight=5, total= 2.3min
[CV] max_depth=3, min_child_weight=5 .................................
[CV] .

[CV] ................. max_depth=12, min_child_weight=3, total= 9.5min
[CV] max_depth=12, min_child_weight=5 ................................
[CV] ................. max_depth=12, min_child_weight=5, total= 9.4min
[CV] max_depth=12, min_child_weight=5 ................................
[CV] ................. max_depth=12, min_child_weight=5, total= 9.0min
[CV] max_depth=12, min_child_weight=5 ................................
[CV] ................. max_depth=12, min_child_weight=5, total= 9.0min


[Parallel(n_jobs=1)]: Done  63 out of  63 | elapsed: 329.5min finished


Grid Scores:
[mean: 0.62869, std: 0.00158, params: {'max_depth': 3, 'min_child_weight': 1}, mean: 0.62898, std: 0.00315, params: {'max_depth': 3, 'min_child_weight': 3}, mean: 0.62923, std: 0.00233, params: {'max_depth': 3, 'min_child_weight': 5}, mean: 0.61896, std: 0.00218, params: {'max_depth': 4, 'min_child_weight': 1}, mean: 0.61754, std: 0.00253, params: {'max_depth': 4, 'min_child_weight': 3}, mean: 0.61794, std: 0.00309, params: {'max_depth': 4, 'min_child_weight': 5}, mean: 0.60410, std: 0.00228, params: {'max_depth': 5, 'min_child_weight': 1}, mean: 0.60461, std: 0.00138, params: {'max_depth': 5, 'min_child_weight': 3}, mean: 0.60611, std: 0.00384, params: {'max_depth': 5, 'min_child_weight': 5}, mean: 0.58832, std: 0.00453, params: {'max_depth': 6, 'min_child_weight': 1}, mean: 0.59153, std: 0.00227, params: {'max_depth': 6, 'min_child_weight': 3}, mean: 0.59297, std: 0.00217, params: {'max_depth': 6, 'min_child_weight': 5}, mean: 0.56696, std: 0.00328, params: {'max_depth':


The grid_scores_ attribute was deprecated in version 0.18 in favor of the more elaborate cv_results_ attribute. The grid_scores_ attribute will not be available from 0.20


You are accessing a training score ('mean_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('split0_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('split1_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('split2_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('std_train_score'), which will not be available by default any m

{'mean_fit_time': array([ 142.2854383 ,  144.17087348,  141.63591385,  171.60373203,
         171.67185489,  179.82225339,  218.90471578,  214.93268283,
         212.76178845,  253.08278759,  251.54947464,  250.70621737,
         333.40255817,  313.80523213,  311.52601902,  438.41375772,
         450.0050981 ,  484.71996299,  561.27947744,  560.57713628,
         540.16661803]),
 'mean_score_time': array([  1.32738924,   1.36574443,   1.32200066,   1.6175789 ,
          1.61877116,   1.68157315,   2.01195558,   2.03653129,
          2.00443363,   2.29109979,   2.28668571,   2.28992796,
          3.26849786,   3.21615688,   3.19642178,   6.45610428,
          5.5384895 ,   5.98205447,  13.48791107,  10.54311673,   8.07341917]),
 'mean_test_score': array([ 0.6286892 ,  0.62898226,  0.62922913,  0.61896037,  0.61753828,
         0.61793601,  0.60410224,  0.60460654,  0.6061116 ,  0.58832175,
         0.59152685,  0.59296595,  0.56696262,  0.56844636,  0.57116509,
         0.56903645,  0.5

In [10]:
# final block, rm after tuning
xgb_final = XGBClassifier(n_estimators=1000, objective="binary:logistic", learning_rate=0.014, max_depth=4, min_child_weight=5, gamma=0, subsample=0.8, colsample_bytree=0.8,  reg_alpha=0, reg_lambda=1, nthread=6)
xgb_final.fit(train_p.drop(['id', 'target'],axis=1), train_p.target)
print("----- Training Done -----")
pred_final = xgb_final.predict_proba(test_p.drop(['id'],axis=1))
print(pred_final)

----- Training Done -----
[[ 0.97268504  0.02731498]
 [ 0.97487372  0.02512625]
 [ 0.9721278   0.0278722 ]
 ..., 
 [ 0.96019721  0.03980282]
 [ 0.97476858  0.02523141]
 [ 0.97046328  0.02953671]]


In [12]:
#generate submission file
sub = pd.DataFrame()
sub['id'] = test_p.id
sub['target'] = pred_final[:,1]
sub.to_csv('../data/res/submit_PreOneHotDrop_XGB_20171127.csv', index=False)
sub.head()

Unnamed: 0,id,target
0,0,0.027315
1,1,0.025126
2,2,0.027872
3,3,0.01516
4,4,0.034867
