In [88]:
import numpy as np
import pandas as pd
import json

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

# This class is for the preprocessing, take csv path as input, and aim to return a pandas data frame for trainning
class PreProcessing:

  # The constructor takes a pandas dataframe as input and save it to self.df
  def __init__(self, csvpath):
    self.df = pd.read_csv(csvpath)
    if "train" in csvpath:
      self.dftype = 1
    elif "test" in csvpath:
      self.dftype = 2
    else:
      self.dftype = -1

  # This method have deal with missing data before merge or drop
  def MissingData(self):
    self.df = self.df.replace(-1, np.NaN) #first, replace -1 to NaN
    #print (self.df.columns[self.df.isnull().any()])
    '''
    'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 
    'ps_reg_03',
    'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_05_cat', 'ps_car_07_cat', 'ps_car_09_cat', 
    'ps_car_11', 'ps_car_12', 'ps_car_14'
    '''
    mean_imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    mdan_imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    mfrq_imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)

    self.df["ps_ind_02_cat"].fillna(-1, inplace=True)
    self.df["ps_ind_04_cat"].fillna(-1, inplace=True)
    self.df["ps_ind_05_cat"].fillna(-1, inplace=True)
    #self.df["ps_reg_03"].fillna(self.df["ps_reg_03"].median(), inplace=True)
    #self.df['ps_reg_03'] = mean_imp.fit_transform(self.df[['ps_reg_03']]).ravel()
    self.df["ps_reg_03"].fillna(2 * self.df['ps_reg_03'].value_counts().idxmax(), inplace=True)
    self.df["ps_car_01_cat"].fillna(-1, inplace=True)
    self.df["ps_car_02_cat"].fillna(-1, inplace=True)
    #self.df["ps_car_03_cat"].fillna(self.df["ps_car_03_cat"].value_counts().idxmax(), inplace=True) # top 1 missing variable, drop
    #self.df["ps_car_05_cat"].fillna(self.df["ps_car_05_cat"].value_counts().idxmax(), inplace=True) # top 2 missing variable, drop
    self.df["ps_car_03_cat"].fillna(-1, inplace=True) # top 1 missing variable
    self.df["ps_car_05_cat"].fillna(-1, inplace=True) # top 2 missing variable
    self.df["ps_car_07_cat"].fillna(-1, inplace=True)
    self.df["ps_car_09_cat"].fillna(-1, inplace=True)
    #self.df["ps_car_11"].fillna(self.df["ps_car_11"].value_counts().idxmax(), inplace=True)
    self.df["ps_car_11"].fillna(-1, inplace=True) # this is catually a cat variable, inverted with ps_car_11_cat
    #self.df['ps_car_11'] = mfrq_imp.fit_transform(self.df[['ps_car_11']]).ravel()
    #self.df["ps_car_12"].fillna(self.df["ps_car_12"].median(), inplace=True)
    #self.df['ps_car_12'] = mean_imp.fit_transform(self.df[['ps_car_12']]).ravel()
    self.df['ps_car_12'].fillna(2 * self.df['ps_car_12'].value_counts().idxmax(), inplace=True)
    #self.df["ps_car_14"].fillna(self.df["ps_car_14"].median(), inplace=True)
    #self.df['ps_car_14'] = mean_imp.fit_transform(self.df[['ps_car_14']]).ravel()
    self.df["ps_car_14"].fillna(2 * self.df['ps_car_14'].value_counts().idxmax(), inplace=True)

    #self.df[""].fillna(self.df[""].mean(), inplace=True)
    #self.df[""].fillna(self.df[""].median(), inplace=True)
    #self.df[""].fillna(self.df[""].value_counts().idxmax(), inplace=True)
    return

  # This method drop the original catagory labels and replaced with one hot labels
  def OneHotReplacement(self):

    #onehot_cols = ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_07_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11']
    onehot_cols = ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_05_cat', 'ps_car_07_cat', 'ps_car_09_cat', 'ps_car_11']

    self.df = pd.get_dummies(self.df, columns=onehot_cols, drop_first=True)
    #onehot = pd.get_dummies(self.df['ps_ind_02_cat'])
    #self.df.drop(['ps_ind_02_cat'], axis = 1, inplace = True)
    #self.df = self.df.join(onehot)
    return

  # This method drop or merge variables in dataframe accroding to corr map
  def CorrMergeDrop(self):
    #self.df['ps_ind_06070809_bin'] = self.df.apply(
    #  lambda x: 1 if x['ps_ind_06_bin'] == 1 
    #              else 
    #              (2 if x['ps_ind_07_bin'] == 1 
    #                 else 
    #                 ( 3 if x['ps_ind_08_bin'] == 1 
    #                     else 
    #                     (4 if x['ps_ind_09_bin'] == 1 else 5)
    #                 )
    #              ), axis = 1)
    #self.df.drop(['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin'], axis = 1, inplace = True)

    #self.df['ps_ind_161718_bin'] = self.df.apply(lambda x: 1 if x['ps_ind_16_bin'] == 1 
    #                                                         else (2 if x['ps_ind_17_bin'] == 1 else 3), axis = 1)
    #self.df.drop(['ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin'], axis = 1, inplace = True)
 
    # drop this variable from preprocessing study, top 3 missing data, and not important at all
    #self.df.drop(['ps_car_03_cat'], axis = 1, inplace = True)
    #self.df.drop(['ps_car_05_cat'], axis = 1, inplace = True)
    # drop less important features, random forest
    self.df.drop(['ps_calc_15_bin'], axis = 1, inplace = True)
    self.df.drop(['ps_calc_16_bin'], axis = 1, inplace = True)
    self.df.drop(['ps_calc_17_bin'], axis = 1, inplace = True)
    self.df.drop(['ps_calc_18_bin'], axis = 1, inplace = True)
    self.df.drop(['ps_calc_19_bin'], axis = 1, inplace = True)
    self.df.drop(['ps_calc_20_bin'], axis = 1, inplace = True)
    self.df.drop(['ps_ind_10_bin'], axis = 1, inplace = True)
    self.df.drop(['ps_ind_11_bin'], axis = 1, inplace = True)
    self.df.drop(['ps_ind_12_bin'], axis = 1, inplace = True)
    self.df.drop(['ps_ind_13_bin'], axis = 1, inplace = True)
    self.df.drop(['ps_ind_14'], axis = 1, inplace = True)
    self.df.drop(['ps_car_04_cat'], axis = 1, inplace = True)
    self.df.drop(['ps_car_10_cat'], axis = 1, inplace = True)

    #self.df['ps_car_13'] = (self.df['ps_car_13']*self.df['ps_car_13']*48400).round(0)
    #self.df['ps_car_12'] = (self.df['ps_car_12']*self.df['ps_car_12']).round(4) * 10000
    return

  # scale the features 
  def ScaleFeatures(self):
    scaler = StandardScaler(copy=False)
    if self.dftype == 1:
      scaler.fit_transform(self.df.drop(['id','target'], axis=1))
    elif self.dftype == 2:
      scaler.fit_transform(self.df.drop(['id'], axis=1))
    else:
      print ("neither train nor test!")
    return

  # this method pack all previous preprocessing all together and return the data frame
  def FinalFrameforTrainning(self):
    self.MissingData()
    self.OneHotReplacement()
    self.CorrMergeDrop()
    self.ScaleFeatures()
    #print (self.df)
    return self.df


In [89]:
  preprocessing = PreProcessing('../data/train.csv')
  train_p = preprocessing.FinalFrameforTrainning()
  print ("done with trainning set preprocessing!")
  #train_p.to_csv('train_p.csv', index = False)
  preprocessing = PreProcessing('../data/test.csv')
  test_p = preprocessing.FinalFrameforTrainning()
  print ("done with test set preprocessing!")
  #test_p.to_csv('test_p.csv', index = False)
  #train_p = pd.read_csv('train_p.csv')
  #test_p = pd.read_csv('test_p.csv')

done with trainning set preprocessing!
done with test set preprocessing!


In [90]:
print(train_p.head())
print(train_p.shape)
print(test_p.shape)

   id  target  ps_ind_01  ps_ind_03  ps_ind_06_bin  ps_ind_07_bin  \
0   7       0          2          5              0              1   
1   9       0          1          7              0              0   
2  13       0          5          9              0              0   
3  16       0          0          2              1              0   
4  17       0          0          0              1              0   

   ps_ind_08_bin  ps_ind_09_bin  ps_ind_15  ps_ind_16_bin      ...        \
0              0              0         11              0      ...         
1              1              0          3              0      ...         
2              1              0         12              1      ...         
3              0              0          8              1      ...         
4              0              0          9              1      ...         

   ps_car_07_cat_1.0  ps_car_09_cat_0.0  ps_car_09_cat_1.0  ps_car_09_cat_2.0  \
0                  1                  1        

In [91]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

'''
X_train, X_test, y_train, y_test = train_test_split(
                                                    train_p.drop(['id', 'target'],axis=1),
                                                    train_p.target,
                                                    train_size=0.7,
                                                    test_size=0.3,
                                                    random_state=0
                                                   )
'''
'''
kf = KFold(n_splits=5)
#kf.get_n_splits(train_p.loc[train_p['target'] == 0])
train_p_sg = train_p.loc[train_p['target'] == 1]
#print(train_p_sg)
train_p_bg = train_p.loc[train_p['target'] == 0]
#train_p_bg.head(200)
kfres = next(kf.split(train_p_bg))
#print(train_p_bg.iloc[kfres[0]])
print(train_p_sg.shape)
print(train_p_bg.shape)
print(train_p_bg.iloc[kfres[1]].shape)

dfbalanced = pd.concat([train_p_sg, train_p_bg.iloc[kfres[1]]])
from sklearn.utils import shuffle
dfbalanced = shuffle(dfbalanced)
dfbalanced.head(100)

X_train = dfbalanced.drop(['id', 'target'],axis=1)
y_train = dfbalanced.target
'''
'''
rf = RandomForestClassifier(n_estimators=300, criterion='gini', max_features='auto', max_depth=7, min_samples_leaf=4, n_jobs=-1, random_state=0)
rf.fit(X_train, y_train)
print("----- Training Done -----")
ysc_pred = rf.predict_proba(X_test)[:,1]
#print (ysc_pred)
#print (y_test)
gini = 2*roc_auc_score(y_test, ysc_pred)-1
print(gini)
'''

'\nrf = RandomForestClassifier(n_estimators=300, criterion=\'gini\', max_features=\'auto\', max_depth=7, min_samples_leaf=4, n_jobs=-1, random_state=0)\nrf.fit(X_train, y_train)\nprint("----- Training Done -----")\nysc_pred = rf.predict_proba(X_test)[:,1]\n#print (ysc_pred)\n#print (y_test)\ngini = 2*roc_auc_score(y_test, ysc_pred)-1\nprint(gini)\n'

In [92]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

In [93]:
features = train_p.drop(['id', 'target'],axis=1).columns.values
x, y = (list(x) for x in zip(*sorted(zip(rf.feature_importances_, features), 
                                                            reverse = False)))
#print(y)
indice = 0
for thisy in y:
    if 'ps_car_04_cat' in thisy:
        print(indice)
        print(y[indice])
        print(x[indice])
    indice = indice+1

trace2 = go.Bar(
    x=x ,
    y=y,
    marker=dict(
        color=x,
        colorscale = 'Viridis',
        reversescale = True
    ),
    name='Random Forest Feature importance',
    orientation='h',
)

layout = dict(
    title='Feature importances, RandomForest',
     width = 900, height = 2000,
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=True,
#         domain=[0, 0.85],
    ))

fig1 = go.Figure(data=[trace2])
fig1['layout'].update(layout)
py.iplot(fig1, filename='plots')

In [132]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
def rf_param_selection(nfolds):
    sc_mod = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=-1, random_state=0, verbose=True)
    #nmsls = [120]
    #mfeat = [0.3, 0.35, 0.40]
    #mdeps = [14]
    nmsls = [110, 115, 120, 125, 130]
    mfeat = [0.3]
    mdeps = [14]
    #nmsls = [120]
    #mfeat = [0.3]
    #mdeps = [12, 13, 14, 15, 16]
    #nmsls = [120]
    #mfeat = [0.3]
    #mdeps = [14]
    param_grid = {'min_samples_leaf': nmsls, 'max_features' : mfeat, 'max_depth': mdeps}
    grid_search = GridSearchCV(sc_mod, param_grid, scoring='roc_auc', cv=nfolds, verbose=2)
    grid_search.fit(train_p.drop(['id', 'target'],axis=1), train_p.target)
    #print ("CV results")
    #print (grid_search.cv_results_)
    print ("Grid Scores:")
    print (grid_search.grid_scores_)
    print ("Best parameters:")
    print (grid_search.best_params_)
    return grid_search.cv_results_

In [133]:
rf_param_selection(3)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] max_depth=14, max_features=0.3, min_samples_leaf=110 ............


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.3min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.9s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.7s finished


[CV]  max_depth=14, max_features=0.3, min_samples_leaf=110, total= 1.3min
[CV] max_depth=14, max_features=0.3, min_samples_leaf=110 ............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   34.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.5min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.0s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.2s finished


[CV]  max_depth=14, max_features=0.3, min_samples_leaf=110, total= 1.5min
[CV] max_depth=14, max_features=0.3, min_samples_leaf=110 ............


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   35.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.4min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.8s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.6s finished


[CV]  max_depth=14, max_features=0.3, min_samples_leaf=110, total= 1.5min
[CV] max_depth=14, max_features=0.3, min_samples_leaf=115 ............


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.4min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.9s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.8s finished


[CV]  max_depth=14, max_features=0.3, min_samples_leaf=115, total= 1.4min
[CV] max_depth=14, max_features=0.3, min_samples_leaf=115 ............


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   34.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.4min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.5s finished


[CV]  max_depth=14, max_features=0.3, min_samples_leaf=115, total= 1.4min
[CV] max_depth=14, max_features=0.3, min_samples_leaf=115 ............


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.4min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.1s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.0s finished


[CV]  max_depth=14, max_features=0.3, min_samples_leaf=115, total= 1.4min
[CV] max_depth=14, max_features=0.3, min_samples_leaf=120 ............


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   34.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.4min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.8s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.6s finished


[CV]  max_depth=14, max_features=0.3, min_samples_leaf=120, total= 1.4min
[CV] max_depth=14, max_features=0.3, min_samples_leaf=120 ............


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   31.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.4min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.8s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.6s finished


[CV]  max_depth=14, max_features=0.3, min_samples_leaf=120, total= 1.4min
[CV] max_depth=14, max_features=0.3, min_samples_leaf=120 ............


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   34.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.5min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.8s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.0s finished


[CV]  max_depth=14, max_features=0.3, min_samples_leaf=120, total= 1.5min
[CV] max_depth=14, max_features=0.3, min_samples_leaf=125 ............


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.4min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.8s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.6s finished


[CV]  max_depth=14, max_features=0.3, min_samples_leaf=125, total= 1.4min
[CV] max_depth=14, max_features=0.3, min_samples_leaf=125 ............


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.3min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.5s finished


[CV]  max_depth=14, max_features=0.3, min_samples_leaf=125, total= 1.3min
[CV] max_depth=14, max_features=0.3, min_samples_leaf=125 ............


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.3min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.8s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.8s finished


[CV]  max_depth=14, max_features=0.3, min_samples_leaf=125, total= 1.3min
[CV] max_depth=14, max_features=0.3, min_samples_leaf=130 ............


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.3min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.8s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.5s finished


[CV]  max_depth=14, max_features=0.3, min_samples_leaf=130, total= 1.3min
[CV] max_depth=14, max_features=0.3, min_samples_leaf=130 ............


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.3min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.6s finished


[CV]  max_depth=14, max_features=0.3, min_samples_leaf=130, total= 1.3min
[CV] max_depth=14, max_features=0.3, min_samples_leaf=130 ............


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.3min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.5s finished


[CV]  max_depth=14, max_features=0.3, min_samples_leaf=130, total= 1.3min


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 21.2min finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   53.0s


Grid Scores:
[mean: 0.63200, std: 0.00128, params: {'max_depth': 14, 'max_features': 0.3, 'min_samples_leaf': 110}, mean: 0.63220, std: 0.00079, params: {'max_depth': 14, 'max_features': 0.3, 'min_samples_leaf': 115}, mean: 0.63301, std: 0.00183, params: {'max_depth': 14, 'max_features': 0.3, 'min_samples_leaf': 120}, mean: 0.63265, std: 0.00156, params: {'max_depth': 14, 'max_features': 0.3, 'min_samples_leaf': 125}, mean: 0.63277, std: 0.00101, params: {'max_depth': 14, 'max_features': 0.3, 'min_samples_leaf': 130}]
Best parameters:
{'max_depth': 14, 'max_features': 0.3, 'min_samples_leaf': 120}


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.2min finished

The grid_scores_ attribute was deprecated in version 0.18 in favor of the more elaborate cv_results_ attribute. The grid_scores_ attribute will not be available from 0.20


You are accessing a training score ('mean_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('split0_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('split1_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('split2_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training 

{'mean_fit_time': array([ 84.49108879,  82.57688292,  84.68877657,  79.32415358,  78.85622795]),
 'mean_score_time': array([ 1.04220144,  1.07329249,  0.91828918,  0.93705678,  1.03152744]),
 'mean_test_score': array([ 0.63200492,  0.6322001 ,  0.63300883,  0.63265378,  0.63277014]),
 'mean_train_score': array([ 0.7809483 ,  0.77849903,  0.7760967 ,  0.77396004,  0.77152748]),
 'param_max_depth': masked_array(data = [14 14 14 14 14],
              mask = [False False False False False],
        fill_value = ?),
 'param_max_features': masked_array(data = [0.3 0.3 0.3 0.3 0.3],
              mask = [False False False False False],
        fill_value = ?),
 'param_min_samples_leaf': masked_array(data = [110 115 120 125 130],
              mask = [False False False False False],
        fill_value = ?),
 'params': [{'max_depth': 14, 'max_features': 0.3, 'min_samples_leaf': 110},
  {'max_depth': 14, 'max_features': 0.3, 'min_samples_leaf': 115},
  {'max_depth': 14, 'max_features': 0.3, 'min

In [134]:
# final block, rm after tuning
rf_final = RandomForestClassifier(n_estimators=1000, criterion='gini', max_features=0.3, max_depth=14, min_samples_leaf=120, n_jobs=-1, random_state=0)
rf_final.fit(train_p.drop(['id', 'target'],axis=1), train_p.target)
print("----- Training Done -----")
pred_final = rf_final.predict_proba(test_p.drop(['id'],axis=1))
print(pred_final)

----- Training Done -----
[[ 0.97403813  0.02596187]
 [ 0.97648011  0.02351989]
 [ 0.97070244  0.02929756]
 ..., 
 [ 0.96137054  0.03862946]
 [ 0.97604022  0.02395978]
 [ 0.96907262  0.03092738]]


In [139]:
#generate submission file
sub = pd.DataFrame()
sub['id'] = test_p.id
sub['target'] = pred_final[:,1]
sub.to_csv('this_submit.csv', index=False)
sub.head()

Unnamed: 0,id,target
0,0,0.025962
1,1,0.02352
2,2,0.029298
3,3,0.015921
4,4,0.037679
