In [1]:
import numpy as np
import pandas as pd
import json

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

# This class is for the preprocessing, take csv path as input, and aim to return a pandas data frame for trainning
class PreProcessing:

  # The constructor takes a pandas dataframe as input and save it to self.df
  def __init__(self, csvpath):
    self.df = pd.read_csv(csvpath)
    if "train" in csvpath:
      self.dftype = 1
    elif "test" in csvpath:
      self.dftype = 2
    else:
      self.dftype = -1

  # This method have deal with missing data before merge or drop
  def MissingData(self):
    self.df = self.df.replace(-1, np.NaN) #first, replace -1 to NaN
    #print (self.df.columns[self.df.isnull().any()])
    '''
    'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 
    'ps_reg_03',
    'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_05_cat', 'ps_car_07_cat', 'ps_car_09_cat', 
    'ps_car_11', 'ps_car_12', 'ps_car_14'
    '''
    mean_imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    mdan_imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    mfrq_imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)

    self.df["ps_ind_02_cat"].fillna(-1, inplace=True)
    self.df["ps_ind_04_cat"].fillna(-1, inplace=True)
    self.df["ps_ind_05_cat"].fillna(-1, inplace=True)
    #self.df["ps_reg_03"].fillna(self.df["ps_reg_03"].median(), inplace=True)
    #self.df['ps_reg_03'] = mean_imp.fit_transform(self.df[['ps_reg_03']]).ravel()
    self.df["ps_reg_03"].fillna(2 * self.df['ps_reg_03'].value_counts().idxmax(), inplace=True)
    self.df["ps_car_01_cat"].fillna(-1, inplace=True)
    self.df["ps_car_02_cat"].fillna(-1, inplace=True)
    #self.df["ps_car_03_cat"].fillna(self.df["ps_car_03_cat"].value_counts().idxmax(), inplace=True) # top 1 missing variable, drop
    #self.df["ps_car_05_cat"].fillna(self.df["ps_car_05_cat"].value_counts().idxmax(), inplace=True) # top 2 missing variable, drop
    self.df["ps_car_03_cat"].fillna(-1, inplace=True) # top 1 missing variable
    self.df["ps_car_05_cat"].fillna(-1, inplace=True) # top 2 missing variable
    self.df["ps_car_07_cat"].fillna(-1, inplace=True)
    self.df["ps_car_09_cat"].fillna(-1, inplace=True)
    #self.df["ps_car_11"].fillna(self.df["ps_car_11"].value_counts().idxmax(), inplace=True)
    self.df["ps_car_11"].fillna(-1, inplace=True) # this is catually a cat variable, inverted with ps_car_11_cat
    #self.df['ps_car_11'] = mfrq_imp.fit_transform(self.df[['ps_car_11']]).ravel()
    #self.df["ps_car_12"].fillna(self.df["ps_car_12"].median(), inplace=True)
    #self.df['ps_car_12'] = mean_imp.fit_transform(self.df[['ps_car_12']]).ravel()
    self.df['ps_car_12'].fillna(2 * self.df['ps_car_12'].value_counts().idxmax(), inplace=True)
    #self.df["ps_car_14"].fillna(self.df["ps_car_14"].median(), inplace=True)
    #self.df['ps_car_14'] = mean_imp.fit_transform(self.df[['ps_car_14']]).ravel()
    self.df["ps_car_14"].fillna(2 * self.df['ps_car_14'].value_counts().idxmax(), inplace=True)

    #self.df[""].fillna(self.df[""].mean(), inplace=True)
    #self.df[""].fillna(self.df[""].median(), inplace=True)
    #self.df[""].fillna(self.df[""].value_counts().idxmax(), inplace=True)
    return

  # This method drop the original catagory labels and replaced with one hot labels
  def OneHotReplacement(self):

    onehot_cols = ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_07_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11']
    #onehot_cols = ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_07_cat', 'ps_car_09_cat', 'ps_car_11']

    self.df = pd.get_dummies(self.df, columns=onehot_cols, drop_first=True)
    #onehot = pd.get_dummies(self.df['ps_ind_02_cat'])
    #self.df.drop(['ps_ind_02_cat'], axis = 1, inplace = True)
    #self.df = self.df.join(onehot)
    return

  # This method drop or merge variables in dataframe accroding to corr map
  def CorrMergeDrop(self):
    #self.df['ps_ind_06070809_bin'] = self.df.apply(
    #  lambda x: 1 if x['ps_ind_06_bin'] == 1 
    #              else 
    #              (2 if x['ps_ind_07_bin'] == 1 
    #                 else 
    #                 ( 3 if x['ps_ind_08_bin'] == 1 
    #                     else 
    #                     (4 if x['ps_ind_09_bin'] == 1 else 5)
    #                 )
    #              ), axis = 1)
    #self.df.drop(['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin'], axis = 1, inplace = True)

    #self.df['ps_ind_161718_bin'] = self.df.apply(lambda x: 1 if x['ps_ind_16_bin'] == 1 
    #                                                         else (2 if x['ps_ind_17_bin'] == 1 else 3), axis = 1)
    #self.df.drop(['ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin'], axis = 1, inplace = True)
 
    # drop this variable from preprocessing study, top 3 missing data, and not important at all
    #self.df.drop(['ps_car_03_cat'], axis = 1, inplace = True)
    #self.df.drop(['ps_car_05_cat'], axis = 1, inplace = True)
    # drop less important features, random forest
    #'''
    #self.df.drop(['ps_calc_15_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_calc_16_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_calc_17_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_calc_18_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_calc_19_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_calc_20_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_ind_10_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_ind_11_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_ind_12_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_ind_13_bin'], axis = 1, inplace = True)
    #self.df.drop(['ps_ind_14'], axis = 1, inplace = True)
    #self.df.drop(['ps_car_04_cat'], axis = 1, inplace = True)
    #self.df.drop(['ps_car_10_cat'], axis = 1, inplace = True)
    #'''
    #self.df['ps_car_13'] = (self.df['ps_car_13']*self.df['ps_car_13']*48400).round(0)
    #self.df['ps_car_12'] = (self.df['ps_car_12']*self.df['ps_car_12']).round(4) * 10000
    return

  # scale the features 
  def ScaleFeatures(self):
    scaler = StandardScaler(copy=False)
    if self.dftype == 1:
      scaler.fit_transform(self.df.drop(['id','target'], axis=1))
    elif self.dftype == 2:
      scaler.fit_transform(self.df.drop(['id'], axis=1))
    else:
      print ("neither train nor test!")
    return

  # this method pack all previous preprocessing all together and return the data frame
  def FinalFrameforTrainning(self):
    self.MissingData()
    self.OneHotReplacement()
    self.CorrMergeDrop()
    self.ScaleFeatures()
    #print (self.df)
    return self.df


In [2]:
  preprocessing = PreProcessing('../data/train.csv')
  train_p = preprocessing.FinalFrameforTrainning()
  print ("done with trainning set preprocessing!")
  #train_p.to_csv('train_p.csv', index = False)
  preprocessing = PreProcessing('../data/test.csv')
  test_p = preprocessing.FinalFrameforTrainning()
  print ("done with test set preprocessing!")
  #test_p.to_csv('test_p.csv', index = False)
  #train_p = pd.read_csv('train_p.csv')
  #test_p = pd.read_csv('test_p.csv')

done with trainning set preprocessing!
done with test set preprocessing!


In [3]:
print(train_p.head())
print(train_p.shape)
print(test_p.shape)

   id  target  ps_ind_01  ps_ind_03  ps_ind_06_bin  ps_ind_07_bin  \
0   7       0          2          5              0              1   
1   9       0          1          7              0              0   
2  13       0          5          9              0              0   
3  16       0          0          2              1              0   
4  17       0          0          0              1              0   

   ps_ind_08_bin  ps_ind_09_bin  ps_ind_10_bin  ps_ind_11_bin      ...        \
0              0              0              0              0      ...         
1              1              0              0              0      ...         
2              1              0              0              0      ...         
3              0              0              0              0      ...         
4              0              0              0              0      ...         

   ps_car_09_cat_1.0  ps_car_09_cat_2.0  ps_car_09_cat_3.0  ps_car_09_cat_4.0  \
0                  0   

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
import lightgbm


#'''
X_train, X_test, y_train, y_test = train_test_split(
                                                    train_p.drop(['id', 'target'],axis=1),
                                                    train_p.target,
                                                    train_size=0.7,
                                                    test_size=0.3,
                                                    random_state=0
                                                   )

lgb= LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=4, subsample=0.7, subsample_freq=2, num_leaves=16, feature_fraction=0.8, bagging_freq=1)                
lgb.fit(X_train, y_train)
print("----- Training Done -----")
ysc_pred = lgb.predict_proba(X_test)[:,1]
#print (ysc_pred)
#print (y_test)
gini = 2*roc_auc_score(y_test, ysc_pred)-1
print(gini)
#'''

----- Training Done -----
0.278097347261


In [5]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

In [7]:
features = train_p.drop(['id', 'target'],axis=1).columns.values
x, y = (list(x) for x in zip(*sorted(zip(lgb.feature_importances_, features), 
                                                            reverse = False)))
#print(y)
indice = 0
for thisy in y:
    if 'ps_ind' in thisy:
        print(indice)
        print(y[indice])
        print(x[indice])
    indice = indice+1

trace2 = go.Bar(
    x=x ,
    y=y,
    marker=dict(
        color=x,
        colorscale = 'Viridis',
        reversescale = True
    ),
    name='lightGBM Feature importance',
    orientation='h',
)

layout = dict(
    title='Feature importances, LightGBM',
     width = 900, height = 2000,
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=True,
#         domain=[0, 0.85],
    ))

fig1 = go.Figure(data=[trace2])
fig1['layout'].update(layout)
py.iplot(fig1, filename='plots')

10
ps_ind_05_cat_1.0
0
11
ps_ind_05_cat_5.0
0
12
ps_ind_10_bin
0
13
ps_ind_11_bin
0
14
ps_ind_13_bin
0
23
ps_ind_02_cat_3.0
1
24
ps_ind_02_cat_4.0
1
25
ps_ind_12_bin
1
26
ps_ind_14
1
32
ps_ind_05_cat_4.0
2
38
ps_ind_18_bin
3
47
ps_ind_04_cat_1.0
5
48
ps_ind_05_cat_3.0
5
55
ps_ind_02_cat_1.0
9
60
ps_ind_05_cat_6.0
10
61
ps_ind_08_bin
10
68
ps_ind_02_cat_2.0
13
70
ps_ind_04_cat_0.0
14
71
ps_ind_09_bin
15
78
ps_ind_05_cat_2.0
21
79
ps_ind_07_bin
21
82
ps_ind_16_bin
24
84
ps_ind_06_bin
28
88
ps_ind_01
52
91
ps_ind_05_cat_0.0
60
92
ps_ind_15
63
93
ps_ind_17_bin
68
96
ps_ind_03
105


In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
def lgb_param_selection(nfolds):
 
    sc_mod = LGBMClassifier(n_estimators=1000, learning_rate=0.14, subsample=0.8)


    nests = [100]
    lrate = [0.11, 0.12, 0.13]      
    param_grid = {'n_estimators': nests, 'learning_rate' : lrate}
    grid_search = GridSearchCV(sc_mod, param_grid, scoring='roc_auc', cv=nfolds, verbose=2)
    grid_search.fit(train_p.drop(['id', 'target'],axis=1), train_p.target)
    #print ("CV results")
    #print (grid_search.cv_results_)
    print ("Grid Scores:")
    print (grid_search.grid_scores_)
    print ("Best parameters:")
    print (grid_search.best_params_)
    return grid_search.cv_results_

In [12]:
lgb_param_selection(3)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] learning_rate=0.11, n_estimators=100 ............................
[CV] ............. learning_rate=0.11, n_estimators=100, total=   4.8s
[CV] learning_rate=0.11, n_estimators=100 ............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.6s remaining:    0.0s


[CV] ............. learning_rate=0.11, n_estimators=100, total=   4.8s
[CV] learning_rate=0.11, n_estimators=100 ............................
[CV] ............. learning_rate=0.11, n_estimators=100, total=   5.0s
[CV] learning_rate=0.12, n_estimators=100 ............................
[CV] ............. learning_rate=0.12, n_estimators=100, total=   4.7s
[CV] learning_rate=0.12, n_estimators=100 ............................
[CV] ............. learning_rate=0.12, n_estimators=100, total=   5.0s
[CV] learning_rate=0.12, n_estimators=100 ............................
[CV] ............. learning_rate=0.12, n_estimators=100, total=   5.1s
[CV] learning_rate=0.13, n_estimators=100 ............................
[CV] ............. learning_rate=0.13, n_estimators=100, total=   5.1s
[CV] learning_rate=0.13, n_estimators=100 ............................
[CV] ............. learning_rate=0.13, n_estimators=100, total=   4.8s
[CV] learning_rate=0.13, n_estimators=100 ............................
[CV] .

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   51.8s finished


Grid Scores:
[mean: 0.63677, std: 0.00187, params: {'learning_rate': 0.11, 'n_estimators': 100}, mean: 0.63621, std: 0.00257, params: {'learning_rate': 0.12, 'n_estimators': 100}, mean: 0.63436, std: 0.00145, params: {'learning_rate': 0.13, 'n_estimators': 100}]
Best parameters:
{'learning_rate': 0.11, 'n_estimators': 100}



The grid_scores_ attribute was deprecated in version 0.18 in favor of the more elaborate cv_results_ attribute. The grid_scores_ attribute will not be available from 0.20


You are accessing a training score ('mean_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('split0_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('split1_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('split2_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('std_train_score'), which will not be available by default any m

{'mean_fit_time': array([ 4.40564728,  4.46297193,  4.5013241 ]),
 'mean_score_time': array([ 0.43999441,  0.4399062 ,  0.41896176]),
 'mean_test_score': array([ 0.63676587,  0.63620503,  0.63436396]),
 'mean_train_score': array([ 0.70664577,  0.71216872,  0.71632367]),
 'param_learning_rate': masked_array(data = [0.11 0.12 0.13],
              mask = [False False False],
        fill_value = ?),
 'param_n_estimators': masked_array(data = [100 100 100],
              mask = [False False False],
        fill_value = ?),
 'params': [{'learning_rate': 0.11, 'n_estimators': 100},
  {'learning_rate': 0.12, 'n_estimators': 100},
  {'learning_rate': 0.13, 'n_estimators': 100}],
 'rank_test_score': array([1, 2, 3], dtype=int32),
 'split0_test_score': array([ 0.63575523,  0.63541806,  0.63386322]),
 'split0_train_score': array([ 0.70749314,  0.71443966,  0.71856607]),
 'split1_test_score': array([ 0.63938221,  0.63967065,  0.63633349]),
 'split1_train_score': array([ 0.70546162,  0.70907625,  0

In [13]:
# final block, rm after tuning
lgb_final = LGBMClassifier(n_estimators=100, learning_rate=0.11, max_depth=4)
lgb_final.fit(train_p.drop(['id', 'target'],axis=1), train_p.target)
print("----- Training Done -----")
pred_final = lgb_final.predict_proba(test_p.drop(['id'],axis=1))
print(pred_final)

----- Training Done -----
[[ 0.97357593  0.02642407]
 [ 0.97640522  0.02359478]
 [ 0.97133139  0.02866861]
 ..., 
 [ 0.95469704  0.04530296]
 [ 0.97319962  0.02680038]
 [ 0.97068573  0.02931427]]


In [14]:
#generate submission file
sub = pd.DataFrame()
sub['id'] = test_p.id
sub['target'] = pred_final[:,1]
sub.to_csv('submit_PreOneHotDrop_LGB_20171128.csv', index=False)
sub.head()

Unnamed: 0,id,target
0,0,0.026424
1,1,0.023595
2,2,0.028669
3,3,0.015035
4,4,0.035273
