In [46]:
import numpy as np
import pandas as pd
import json

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

# This class is for the preprocessing, take csv path as input, and aim to return a pandas data frame for trainning
class PreProcessing:

  # The constructor takes a pandas dataframe as input and save it to self.df
  def __init__(self, csvpath):
    self.df = pd.read_csv(csvpath)
    if "train" in csvpath:
      self.dftype = 1
    elif "test" in csvpath:
      self.dftype = 2
    else:
      self.dftype = -1

  # This method have deal with missing data before merge or drop
  def MissingData(self):
    self.df = self.df.replace(-1, np.NaN) #first, replace -1 to NaN
    #print (self.df.columns[self.df.isnull().any()])
    '''
    'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 
    'ps_reg_03', # we have a good discussion : https://www.kaggle.com/pnagel/reconstruction-of-ps-reg-03, need special reconstruct
    'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_05_cat', 'ps_car_07_cat', 'ps_car_09_cat', 
    'ps_car_11', 'ps_car_12', 'ps_car_14'
    '''
    mean_imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    mdan_imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    mfrq_imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)

    self.df["ps_ind_02_cat"].fillna(-1, inplace=True)
    self.df["ps_ind_04_cat"].fillna(-1, inplace=True)
    self.df["ps_ind_05_cat"].fillna(-1, inplace=True)
    #self.df["ps_reg_03"].fillna(self.df["ps_reg_03"].median(), inplace=True)
    #self.df['ps_reg_03'] = mean_imp.fit_transform(self.df[['ps_reg_03']]).ravel()
    self.df["ps_reg_03"].fillna(-1, inplace=True)
    self.df["ps_car_01_cat"].fillna(-1, inplace=True)
    self.df["ps_car_02_cat"].fillna(-1, inplace=True)
    #self.df["ps_car_03_cat"].fillna(self.df["ps_car_03_cat"].value_counts().idxmax(), inplace=True) # top 1 missing variable, drop
    #self.df["ps_car_05_cat"].fillna(self.df["ps_car_05_cat"].value_counts().idxmax(), inplace=True) # top 2 missing variable, drop
    self.df["ps_car_03_cat"].fillna(-1, inplace=True) # top 1 missing variable
    self.df["ps_car_05_cat"].fillna(-1, inplace=True) # top 2 missing variable
    self.df["ps_car_07_cat"].fillna(-1, inplace=True)
    self.df["ps_car_09_cat"].fillna(-1, inplace=True)
    #self.df["ps_car_11"].fillna(self.df["ps_car_11"].value_counts().idxmax(), inplace=True)
    self.df["ps_car_11"].fillna(-1, inplace=True) # this is catually a cat variable, inverted with ps_car_11_cat
    #self.df['ps_car_11'] = mfrq_imp.fit_transform(self.df[['ps_car_11']]).ravel()
    #self.df["ps_car_12"].fillna(self.df["ps_car_12"].median(), inplace=True)
    #self.df['ps_car_12'] = mean_imp.fit_transform(self.df[['ps_car_12']]).ravel()
    self.df['ps_car_12'].fillna(2 * self.df['ps_car_12'].value_counts().idxmax(), inplace=True)
    #self.df["ps_car_14"].fillna(self.df["ps_car_14"].median(), inplace=True)
    #self.df['ps_car_14'] = mean_imp.fit_transform(self.df[['ps_car_14']]).ravel()
    self.df["ps_car_14"].fillna(2 * self.df['ps_car_14'].value_counts().idxmax(), inplace=True)

    #self.df[""].fillna(self.df[""].mean(), inplace=True)
    #self.df[""].fillna(self.df[""].median(), inplace=True)
    #self.df[""].fillna(self.df[""].value_counts().idxmax(), inplace=True)
    return

  # This method drop the original catagory labels and replaced with one hot labels
  def OneHotReplacement(self):

    #onehot_cols = ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_07_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11']
    onehot_cols = ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_07_cat', 'ps_car_09_cat', 'ps_car_11']

    self.df = pd.get_dummies(self.df, columns=onehot_cols, drop_first=True)
    #onehot = pd.get_dummies(self.df['ps_ind_02_cat'])
    #self.df.drop(['ps_ind_02_cat'], axis = 1, inplace = True)
    #self.df = self.df.join(onehot)
    return

  # This method drop or merge variables in dataframe accroding to corr map
  def CorrMergeDrop(self):
    #self.df['ps_ind_06070809_bin'] = self.df.apply(
    #  lambda x: 1 if x['ps_ind_06_bin'] == 1 
    #              else 
    #              (2 if x['ps_ind_07_bin'] == 1 
    #                 else 
    #                 ( 3 if x['ps_ind_08_bin'] == 1 
    #                     else 
    #                     (4 if x['ps_ind_09_bin'] == 1 else 5)
    #                 )
    #              ), axis = 1)
    #self.df.drop(['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin'], axis = 1, inplace = True)

    #self.df['ps_ind_161718_bin'] = self.df.apply(lambda x: 1 if x['ps_ind_16_bin'] == 1 
    #                                                         else (2 if x['ps_ind_17_bin'] == 1 else 3), axis = 1)
    #self.df.drop(['ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin'], axis = 1, inplace = True)
 
    # drop this variable from preprocessing study, top 3 missing data, and not important at all
    #self.df.drop(['ps_car_03_cat'], axis = 1, inplace = True)
    #self.df.drop(['ps_car_05_cat'], axis = 1, inplace = True)
    # drop less important features, random forest
    #'''
    #self.df.drop(['ps_calc_15_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_calc_16_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_calc_17_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_calc_18_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_calc_19_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_calc_20_bin'], axis = 1, inplace = True)
    self.df.drop(['ps_ind_10_bin'], axis = 1, inplace = True)
    self.df.drop(['ps_ind_11_bin'], axis = 1, inplace = True)
    self.df.drop(['ps_ind_12_bin'], axis = 1, inplace = True)
    self.df.drop(['ps_ind_13_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_ind_14'], axis = 1, inplace = True)
    #self.df.drop(['ps_car_04_cat'], axis = 1, inplace = True)
    self.df.drop(['ps_car_10_cat'], axis = 1, inplace = True)
    #'''
    #self.df['ps_car_13'] = (self.df['ps_car_13']*self.df['ps_car_13']*48400).round(0)
    #self.df['ps_car_12'] = (self.df['ps_car_12']*self.df['ps_car_12']).round(4) * 10000
    return

  def ps_reg_03_reconstruct(self):
    I = np.round((40*self.df['ps_reg_03'])**2)
    I = I.astype(int)
    M = (I - 1) // 27
    F = I - 27*M
    self.df['ps_reg_03_M'] = M
    self.df['ps_reg_03_F'] = F
    self.df[self.df['ps_reg_03'] == -1,('ps_reg_03_F', 'ps_reg_03_M')] = -1
    return 

  # scale the features 
  def ScaleFeatures(self):
    scaler = StandardScaler(copy=False)
    if self.dftype == 1:
      scaler.fit_transform(self.df.drop(['id','target'], axis=1))
    elif self.dftype == 2:
      scaler.fit_transform(self.df.drop(['id'], axis=1))
    else:
      print ("neither train nor test!")
    return

  # this method pack all previous preprocessing all together and return the data frame
  def FinalFrameforTrainning(self):
    self.MissingData()
    self.OneHotReplacement()
    self.CorrMergeDrop()
    #self.ps_reg_03_reconstruct()
    #self.ScaleFeatures()
    #print (self.df)
    return self.df


In [47]:
preprocessing = PreProcessing('../data/train.csv')
train_p = preprocessing.FinalFrameforTrainning()
print ("done with trainning set preprocessing!")
#train_p.to_csv('train_p.csv', index = False)
preprocessing = PreProcessing('../data/test.csv')
test_p = preprocessing.FinalFrameforTrainning()
print ("done with test set preprocessing!")
#test_p.to_csv('test_p.csv', index = False)
#train_p = pd.read_csv('train_p.csv')
#test_p = pd.read_csv('test_p.csv')

TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [43]:
print(train_p.shape)
print(test_p.shape)
train_p.head()

(595212, 94)
(892816, 93)


Unnamed: 0,id,target,ps_ind_01,ps_ind_03,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_14,ps_ind_15,...,ps_car_07_cat_1.0,ps_car_09_cat_0.0,ps_car_09_cat_1.0,ps_car_09_cat_2.0,ps_car_09_cat_3.0,ps_car_09_cat_4.0,ps_car_11_0.0,ps_car_11_1.0,ps_car_11_2.0,ps_car_11_3.0
0,7,0,2,5,0,1,0,0,0,11,...,1,1,0,0,0,0,0,0,1,0
1,9,0,1,7,0,0,1,0,0,3,...,1,0,0,1,0,0,0,0,0,1
2,13,0,5,9,0,0,1,0,0,12,...,1,0,0,1,0,0,0,1,0,0
3,16,0,0,2,1,0,0,0,0,8,...,1,0,0,0,1,0,0,1,0,0
4,17,0,0,0,1,0,0,0,0,9,...,1,0,0,1,0,0,0,0,0,1


In [38]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
import lightgbm


'''
X_train, X_test, y_train, y_test = train_test_split(
                                                    train_p.drop(['id', 'target'],axis=1),
                                                    train_p.target,
                                                    train_size=0.7,
                                                    test_size=0.3,
                                                    random_state=0
                                                   )

#lgb = LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=4, subsample=0.7, subsample_freq=2, num_leaves=16, feature_fraction=0.8, bagging_freq=1)                
lgb = LGBMClassifier(n_estimators=1000, learning_rate=0.01, num_leaves=31, max_depth=8, min_child_weight=7, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=1.0, n_jobs=6)

lgb.fit(X_train, y_train)
print("----- Training Done -----")
ysc_pred = lgb.predict_proba(X_test)[:,1]
#print (ysc_pred)
#print (y_test)
gini = 2*roc_auc_score(y_test, ysc_pred)-1
print(gini)
'''

'\nX_train, X_test, y_train, y_test = train_test_split(\n                                                    train_p.drop([\'id\', \'target\'],axis=1),\n                                                    train_p.target,\n                                                    train_size=0.7,\n                                                    test_size=0.3,\n                                                    random_state=0\n                                                   )\n\n#lgb = LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=4, subsample=0.7, subsample_freq=2, num_leaves=16, feature_fraction=0.8, bagging_freq=1)                \nlgb = LGBMClassifier(n_estimators=1000, learning_rate=0.01, num_leaves=31, max_depth=8, min_child_weight=7, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=1.0, n_jobs=6)\n\nlgb.fit(X_train, y_train)\nprint("----- Training Done -----")\nysc_pred = lgb.predict_proba(X_test)[:,1]\n#print (ysc_pred)\n#print (y_test)\ngini = 2*r

In [39]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

In [40]:
features = train_p.drop(['id', 'target'],axis=1).columns.values
x, y = (list(x) for x in zip(*sorted(zip(lgb.feature_importances_, features), 
                                                            reverse = False)))
#print(y)
indice = 0
for thisy in y:
    if 'ps_calc_' in thisy:
        print(indice)
        print(y[indice])
        print(x[indice])
    indice = indice+1

trace2 = go.Bar(
    x=x ,
    y=y,
    marker=dict(
        color=x,
        colorscale = 'Viridis',
        reversescale = True
    ),
    name='lightGBM Feature importance',
    orientation='h',
)

layout = dict(
    title='Feature importances, LightGBM',
     width = 900, height = 2000,
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=True,
#         domain=[0, 0.85],
    ))

fig1 = go.Figure(data=[trace2])
fig1['layout'].update(layout)
py.iplot(fig1, filename='plots')

15
ps_calc_20_bin
30
20
ps_calc_16_bin
44
21
ps_calc_15_bin
47
25
ps_calc_18_bin
57
36
ps_calc_19_bin
83
39
ps_calc_17_bin
95
56
ps_calc_04
278
57
ps_calc_12
283
60
ps_calc_06
327
61
ps_calc_09
329
63
ps_calc_08
335
64
ps_calc_13
343
66
ps_calc_07
360
69
ps_calc_05
391
70
ps_calc_03
403
73
ps_calc_11
486
74
ps_calc_14
510
75
ps_calc_02
522
76
ps_calc_01
526
78
ps_calc_10
563


In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
def lgb_param_selection(nfolds):
 
    #sc_mod = LGBMClassifier(n_estimators=300, learning_rate=0.03, num_leaves=31, subsample=0.8, colsample_bytree=0.8, n_jobs=6)
    #sc_mod = LGBMClassifier(n_estimators=300, learning_rate=0.03, num_leaves=31, max_depth=8, min_child_weight=7, n_jobs=6)
    #sc_mod = LGBMClassifier(n_estimators=300, learning_rate=0.03, num_leaves=31, max_depth=8, min_child_weight=7, subsample=0.8, colsample_bytree=0.8, n_jobs=6)
    #sc_mod = LGBMClassifier(num_leaves=31, max_depth=8, min_child_weight=7, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=1.0, n_jobs=6)
    #sc_mod = LGBMClassifier(n_estimators=1000, learning_rate=0.01, num_leaves=31, max_depth=8, min_child_weight=7, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=1.0, n_jobs=6)
    sc_mod = LGBMClassifier(boost='gbdt', metric='auc', objective='binary', n_estimators=300, learning_rate=0.03, max_depth=4, colsample_bytree=0.7, reg_alpha=16.7, reg_lambda=0, is_training_metric=True, scale_pos_weight=1.0, n_jobs=6)   

    mchdw = [500]
    numls = [16]
    param_grid = {'min_child_weight' : mchdw, 'num_leaves' : numls}
    #sposw = [1.0, 1.2, 1.4, 1.6, 1.8]
    #param_grid = {'scale_pos_weight': sposw}
    #nests = [300]
    #lrate = [0.01, 0.02, 0.03, 0.04]    
    #nests = [1000]
    #lrate = [0.005, 0.01, 0.015, 0.02, 0.03]      
    #param_grid = {'n_estimators': nests, 'learning_rate' : lrate}
    #mchdw = [6,7,8]
    #mdeps = [7,8,9]
    #param_grid = {'min_child_weight' : mchdw, 'max_depth': mdeps}
    #subss = [0.6, 0.7, 0.8, 0.9]
    #colst = [0.6, 0.7, 0.8, 0.9]
    #subss = [0.85, 0.9, 0.95]
    #colst = [0.55, 0.6, 0.65]   
    #param_grid = {'subsample' : subss, 'colsample_bytree': colst}
    #regl1 = [0.01]
    #regl2 = [0, 0.01, 0.1, 1, 10, 100]
    #regl1 = [0, 0.1, 1, 10]
    #regl2 = [0, 0.1, 1, 10]
    #param_grid = {'reg_alpha' : regl1, 'reg_lambda': regl2}   
    grid_search = GridSearchCV(sc_mod, param_grid, scoring='roc_auc', cv=nfolds, verbose=2)
    grid_search.fit(train_p.drop(['id', 'target'],axis=1), train_p.target)
    #print ("CV results")
    #print (grid_search.cv_results_)
    print ("Grid Scores:")
    print (grid_search.grid_scores_)
    print ("Best parameters:")
    print (grid_search.best_params_)
    return grid_search.cv_results_

In [34]:
lgb_param_selection(3)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] min_child_weight=500, num_leaves=16 .............................
[CV] .............. min_child_weight=500, num_leaves=16, total=  13.0s
[CV] min_child_weight=500, num_leaves=16 .............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   15.5s remaining:    0.0s


[CV] .............. min_child_weight=500, num_leaves=16, total=  13.0s
[CV] min_child_weight=500, num_leaves=16 .............................
[CV] .............. min_child_weight=500, num_leaves=16, total=  13.0s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   46.6s finished


Grid Scores:
[mean: 0.63719, std: 0.00185, params: {'min_child_weight': 500, 'num_leaves': 16}]
Best parameters:
{'min_child_weight': 500, 'num_leaves': 16}



The grid_scores_ attribute was deprecated in version 0.18 in favor of the more elaborate cv_results_ attribute. The grid_scores_ attribute will not be available from 0.20


You are accessing a training score ('mean_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('split0_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('split1_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('split2_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('std_train_score'), which will not be available by default any m

{'mean_fit_time': array([ 11.76973097]),
 'mean_score_time': array([ 1.25444428]),
 'mean_test_score': array([ 0.63719438]),
 'mean_train_score': array([ 0.65439854]),
 'param_min_child_weight': masked_array(data = [500],
              mask = [False],
        fill_value = ?),
 'param_num_leaves': masked_array(data = [16],
              mask = [False],
        fill_value = ?),
 'params': [{'min_child_weight': 500, 'num_leaves': 16}],
 'rank_test_score': array([1], dtype=int32),
 'split0_test_score': array([ 0.63766883]),
 'split0_train_score': array([ 0.655043]),
 'split1_test_score': array([ 0.63918853]),
 'split1_train_score': array([ 0.65285643]),
 'split2_test_score': array([ 0.63472577]),
 'split2_train_score': array([ 0.65529618]),
 'std_fit_time': array([ 0.01964949]),
 'std_score_time': array([ 0.00613264]),
 'std_test_score': array([ 0.00185254]),
 'std_train_score': array([ 0.00109532])}

In [51]:
# final block, rm after tuning
lgb_final = LGBMClassifier(n_estimators=1000, learning_rate=0.01, num_leaves=31, max_depth=8, min_child_weight=7, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=1.0, scale_pos_weight=1.6, n_jobs=6)

lgb_final.fit(train_p.drop(['id', 'target'],axis=1), train_p.target)
print("----- Training Done -----")
pred_final = lgb_final.predict_proba(test_p.drop(['id'],axis=1))
print(pred_final)

----- Training Done -----
[[ 0.97347022  0.02652978]
 [ 0.97501326  0.02498674]
 [ 0.97238529  0.02761471]
 ..., 
 [ 0.96136917  0.03863083]
 [ 0.97587707  0.02412293]
 [ 0.96995808  0.03004192]]


In [52]:
#generate submission file
sub = pd.DataFrame()
sub['id'] = test_p.id
sub['target'] = pred_final[:,1]
sub.to_csv('../data/res/submit_PreOneHotDrop_LGB_20171128.csv', index=False)
sub.head()

Unnamed: 0,id,target
0,0,0.02653
1,1,0.024987
2,2,0.027615
3,3,0.015251
4,4,0.036103
