In [3]:
import numpy as np
import pandas as pd
import json

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

# Preprocessing pipeline: takes a CSV path as input and returns a pandas DataFrame ready for training.
class PreProcessing:
  """Load a CSV and apply missing-value handling, one-hot encoding, column drops and scaling.

  Attributes:
    df: the working DataFrame; every step rebinds or updates it.
    dftype: 1 if the path contains "train", 2 if it contains "test", otherwise -1.
  """

  # Columns whose missing values are kept as an explicit -1 level (treated as categorical).
  # Original author's note: ps_car_11 is actually a categorical variable, inverted with ps_car_11_cat.
  _CAT_FILL_COLS = [
    'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat',
    'ps_car_01_cat', 'ps_car_02_cat',
    'ps_car_03_cat', 'ps_car_05_cat',  # the two columns with the most missing values
    'ps_car_07_cat', 'ps_car_09_cat',
    'ps_car_11',
  ]

  # Columns whose missing values are filled with twice their most frequent value.
  _MODE_FILL_COLS = ['ps_reg_03', 'ps_car_12', 'ps_car_14']

  # Columns replaced by one-hot indicator columns.
  _ONEHOT_COLS = [
    'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat',
    'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
    'ps_car_05_cat', 'ps_car_07_cat', 'ps_car_09_cat', 'ps_car_11',
  ]

  # Columns removed as less important (per the original author's random-forest study).
  _DROPPED_COLS = [
    'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin',
    'ps_car_10_cat',
  ]

  def __init__(self, csvpath):
    """Read `csvpath` into self.df and infer the frame type from the path string.

    Args:
      csvpath: path of the CSV file; "train" / "test" appearing anywhere in it
        selects dftype 1 / 2 ("train" is checked first).
    """
    self.df = pd.read_csv(csvpath)
    if "train" in csvpath:
      self.dftype = 1
    elif "test" in csvpath:
      self.dftype = 2
    else:
      self.dftype = -1

  def MissingData(self):
    """Handle missing data; must run before the one-hot and drop steps.

    Every -1 in the frame is first turned into NaN, then the known incomplete
    columns are filled again:
      * columns in _CAT_FILL_COLS get -1 back, so "missing" stays its own level;
      * columns in _MODE_FILL_COLS get 2 * (most frequent value).

    Raises:
      KeyError: if one of the expected columns is absent.
      ValueError: if a _MODE_FILL_COLS column contains only missing values.
    """
    # np.nan (lowercase) is the spelling that survives NumPy 2.0; np.NaN was removed.
    self.df = self.df.replace(-1, np.nan)

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # self.df[col]: the chained in-place form does not update the frame under
    # pandas Copy-on-Write.
    for col in self._CAT_FILL_COLS:
      self.df[col] = self.df[col].fillna(-1)

    for col in self._MODE_FILL_COLS:
      most_frequent = self.df[col].value_counts().idxmax()
      # NOTE(review): the factor 2 comes from the original code; presumably it moves
      # imputed rows away from the bulk of the distribution - confirm with the author.
      self.df[col] = self.df[col].fillna(2 * most_frequent)
    return

  def OneHotReplacement(self):
    """Replace the categorical columns in _ONEHOT_COLS with one-hot indicator columns.

    drop_first=True omits the indicator of the first level of every column.
    ps_car_10_cat is deliberately not encoded; CorrMergeDrop removes it.
    """
    self.df = pd.get_dummies(self.df, columns=self._ONEHOT_COLS, drop_first=True)
    return

  def CorrMergeDrop(self):
    """Drop the columns listed in _DROPPED_COLS.

    Earlier experiments (merging ps_ind_06..09_bin and ps_ind_16..18_bin into single
    variables, dropping ps_calc_15..20_bin, ps_ind_14, ps_car_03/04/05_cat, rescaling
    ps_car_12 / ps_car_13) are intentionally not applied.

    Raises:
      KeyError: if one of the columns is absent.
    """
    self.df = self.df.drop(columns=self._DROPPED_COLS)
    return

  def ScaleFeatures(self):
    """Standardize every feature column of self.df (zero mean, unit variance).

    'id' (and 'target' for a training frame) are left untouched. When dftype is
    neither train nor test a message is printed and nothing is scaled.

    The scaled values are written back into self.df; previously the result of
    fit_transform on a temporary copy was discarded, so no scaling took effect.

    NOTE(review): the scaler is fitted on whichever frame this instance holds, so
    train and test are standardized with their own statistics.
    """
    if self.dftype == 1:
      excluded = ['id', 'target']
    elif self.dftype == 2:
      excluded = ['id']
    else:
      print ("neither train nor test!")
      return

    feature_cols = self.df.columns.drop(excluded)
    scaler = StandardScaler()
    scaled = scaler.fit_transform(self.df[feature_cols])
    # Wrap in a DataFrame so the assignment replaces the columns (and their dtypes)
    # while keeping the original index and column order.
    self.df[feature_cols] = pd.DataFrame(scaled, index=self.df.index, columns=feature_cols)
    return

  def FinalFrameforTrainning(self):
    """Run all preprocessing steps in order and return the resulting DataFrame."""
    self.MissingData()
    self.OneHotReplacement()
    self.CorrMergeDrop()
    self.ScaleFeatures()
    return self.df


In [4]:
# Run the full preprocessing pipeline on the training file.
preprocessing = PreProcessing('../data/train.csv')
train_p = preprocessing.FinalFrameforTrainning()
print("done with trainning set preprocessing!")

# Same pipeline on the test file; `preprocessing` now refers to the test-set instance.
preprocessing = PreProcessing('../data/test.csv')
test_p = preprocessing.FinalFrameforTrainning()
print("done with test set preprocessing!")

# Optional on-disk cache of the preprocessed frames (disabled):
#train_p.to_csv('train_p.csv', index = False)
#test_p.to_csv('test_p.csv', index = False)
#train_p = pd.read_csv('train_p.csv')
#test_p = pd.read_csv('test_p.csv')

done with trainning set preprocessing!
done with test set preprocessing!


In [5]:
# Sanity check: (rows, columns) of the preprocessed train and test frames, one per line.
print(train_p.shape, test_p.shape, sep="\n")

(595212, 94)
(892816, 93)


In [18]:
#Load related packages
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
import lightgbm
from xgboost import XGBClassifier
import xgboost
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

In [19]:
# Thin wrapper that builds a scikit-learn style classifier from a class and a parameter dict.
class SklearnHelper(object):
  """Instantiate `clf(**params)` and forward the common estimator calls to it."""

  def __init__(self, clf, seed=0, params=None):
    """Build the wrapped estimator.

    Args:
      clf: estimator class (not an instance); called with the keyword arguments in `params`.
      seed: accepted for interface compatibility; it is not forwarded to the
        estimator. Put `random_state` in `params` to seed the model.
      params: dict of constructor keyword arguments; None means "use the
        estimator's defaults" (previously None raised a TypeError).
    """
    #params['random_state'] = seed
    self.clf = clf(**(params or {}))

  def train(self, x_train, y_train):
    """Fit the wrapped estimator; returns None."""
    self.clf.fit(x_train, y_train)

  def predict(self, x):
    """Return the wrapped estimator's predictions for `x`."""
    return self.clf.predict(x)

  def fit(self,x,y):
    """Fit the wrapped estimator and return whatever its `fit` returns."""
    return self.clf.fit(x,y)

  def feature_importances(self,x,y):
    """Refit on (x, y) and print the fitted model's `feature_importances_`; returns None."""
    print(self.clf.fit(x,y).feature_importances_)

  def model(self):
    """Return the wrapped estimator instance."""
    return self.clf

In [20]:
# Stacking ensemble: out-of-fold predictions of the base models are the features of a meta-model.
class Ensemble(object):
  """Two-level stacking for binary classification."""

  def __init__(self, n_splits, stacker, base_models, random_state=None):
    """Store the stacking configuration.

    Args:
      n_splits: number of stratified folds used to build out-of-fold predictions.
      stacker: meta-model; must provide fit and predict_proba.
      base_models: iterable of estimators providing fit and predict_proba.
      random_state: seed for the shuffled fold split. The default None keeps the
        previous behaviour (a different split on every run); pass an int for
        reproducible folds.
    """
    self.n_splits = n_splits
    self.stacker = stacker
    self.base_models = base_models
    self.random_state = random_state

  def fit_predict(self, X, y, T):
    """Fit the stack on (X, y) and return positive-class probabilities for T.

    For each base model and each fold, the model is fitted on the training part
    of the fold; its predictions on the held-out part fill one column of S_train,
    and its predictions on T are averaged over the folds into one column of
    S_test. The stacker is cross-validated (cv=3, ROC AUC) on S_train, fitted on
    all of S_train, and applied to S_test.

    Args:
      X: training features (converted with np.array).
      y: training labels (converted with np.array).
      T: test features (converted with np.array).

    Returns:
      1-D array with one probability of class 1 per row of T.
    """
    X = np.array(X)
    y = np.array(y)
    T = np.array(T)

    splitter = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
    # Materialize the folds once so every base model sees the same split.
    folds = list(splitter.split(X, y))

    S_train = np.zeros((X.shape[0], len(self.base_models)))
    S_test = np.zeros((T.shape[0], len(self.base_models)))

    for i, clf in enumerate(self.base_models):
      S_test_i = np.zeros((T.shape[0], self.n_splits))
      for j, (train_idx, test_idx) in enumerate(folds):
        X_train = X[train_idx]
        y_train = y[train_idx]
        X_holdout = X[test_idx]

        print ("Fit %s fold %d" % (str(clf).split('(')[0], j+1))
        # The estimator is refitted in place on every fold.
        clf.fit(X_train, y_train)

        # Out-of-fold predictions: each training row is predicted by a model that never saw it.
        S_train[test_idx, i] = clf.predict_proba(X_holdout)[:,1]
        S_test_i[:, j] = clf.predict_proba(T)[:,1]
      # Average the per-fold test predictions of this base model.
      S_test[:, i] = S_test_i.mean(axis=1)

    results = cross_val_score(self.stacker, S_train, y, cv=3, scoring='roc_auc')
    print("Stacker score: %.5f" % (results.mean()))

    self.stacker.fit(S_train, y)
    res = self.stacker.predict_proba(S_test)[:,1]
    return res

In [21]:
# Final, tuned base models for the stack.

# LightGBM hyper-parameters.
lgb_params = dict(
    n_estimators=1000,
    learning_rate=0.01,
    num_leaves=31,
    max_depth=8,
    min_child_weight=7,
    subsample=0.8,
    colsample_bytree=0.8,
    #scale_pos_weight=1.6,
    reg_alpha=0.01,
    reg_lambda=1.0,
    n_jobs=6,
)
lgb_final = SklearnHelper(clf=LGBMClassifier, seed=74, params=lgb_params)

# XGBoost hyper-parameters.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.014,
    objective="binary:logistic",
    max_depth=4,
    min_child_weight=5,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    #scale_pos_weight=1.6,
    reg_alpha=0,
    reg_lambda=1,
    nthread=6,
)
xgb_final = SklearnHelper(clf=XGBClassifier, seed=97, params=xgb_params)

# Alternative (disabled): build the estimators directly instead of through SklearnHelper.
#lgb_final = LGBMClassifier(**lgb_params)
#xgb_final = XGBClassifier(**xgb_params)


In [22]:
# Stack the two tuned base models with a logistic-regression meta-model.
log_model = LogisticRegression()
stack = Ensemble(
    n_splits=3,
    stacker=log_model,
    base_models=(lgb_final.model(), xgb_final.model()),
)
pred_final = stack.fit_predict(
    train_p.drop(columns=['id', 'target']),
    train_p['target'],
    test_p.drop(columns=['id']),
)
print(pred_final)

Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit XGBClassifier fold 1
Fit XGBClassifier fold 2
Fit XGBClassifier fold 3
Stacker score: 0.64129
[ 0.02901898  0.02836118  0.02930822 ...,  0.03626242  0.02773826
  0.03178286]


In [23]:
# Generate the submission file: one predicted probability per test id.
sub = pd.DataFrame({'id': test_p['id'], 'target': pred_final})
sub.to_csv('../data/res/submit_PreOnehotDrop_stack3lgb3xgb_20171128.csv', index=False)
sub.head()

Unnamed: 0,id,target
0,0,0.029019
1,1,0.028361
2,2,0.029308
3,3,0.02359
4,4,0.034266
