In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import time
from sklearn.model_selection import cross_val_score

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 1000)

In [2]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [4]:
traintest=train.drop(['id','target'], axis=1).append(test.drop(['id'], axis=1))
cols=traintest.columns

stat_cols= ['nunique','freq1','freq1_val', 'freq2', 'req2_val',
             'freq3', 'freq3_val'] + traintest[cols[0]].describe().index.tolist()[1:]


stat_cols=['feature']+stat_cols

feature_stat=pd.DataFrame(columns=stat_cols)
i=0

for col in cols:
    stat_vals=[]
    
    # get stat value
    stat_vals.append(col)
    stat_vals.append(traintest[col].nunique())
    stat_vals.append(traintest[col].value_counts().index[0])
    stat_vals.append(traintest[col].value_counts().iloc[0])
    stat_vals.append(traintest[col].value_counts().index[1])
    stat_vals.append(traintest[col].value_counts().iloc[1])
    
    if len(traintest[col].value_counts())>2:
        stat_vals.append(traintest[col].value_counts().index[2])
        stat_vals.append(traintest[col].value_counts().iloc[2])
    else:
        stat_vals.append(np.nan)
        stat_vals.append(np.nan)
            
    stat_vals+=traintest[col].describe().tolist()[1:]

    feature_stat.loc[i]=stat_vals
    i+=1

In [5]:
feature_stat[feature_stat['feature'].str.contains("cat")].sort_values(by=['nunique'])

Unnamed: 0,feature,nunique,freq1,freq1_val,freq2,req2_val,freq3,freq3_val,mean,std,min,25%,50%,75%,max
28,ps_car_08_cat,2,1,1238365,0,249663,,,0.832219,0.373672,0.0,1.0,1.0,1.0,1.0
3,ps_ind_04_cat,3,0,866864,1,620936,-1.0,228.0,0.417135,0.493396,-1.0,0.0,0.0,1.0,1.0
22,ps_car_02_cat,3,1,1234979,0,253039,-1.0,10.0,0.829937,0.375706,-1.0,1.0,1.0,1.0,1.0
23,ps_car_03_cat,3,-1,1028142,1,276842,0.0,183044.0,-0.504896,0.788713,-1.0,-1.0,-1.0,0.0,1.0
25,ps_car_05_cat,3,-1,666910,1,431560,0.0,389558.0,-0.158162,0.844506,-1.0,-1.0,0.0,1.0,1.0
27,ps_car_07_cat,3,1,1383070,0,76138,-1.0,28820.0,0.910097,0.347212,-1.0,1.0,1.0,1.0,1.0
30,ps_car_10_cat,3,1,1475460,0,12136,2.0,432.0,0.992135,0.091565,0.0,1.0,1.0,1.0,2.0
1,ps_ind_02_cat,5,1,1079327,2,309747,3.0,70172.0,1.358745,0.663639,-1.0,1.0,1.0,2.0,4.0
29,ps_car_09_cat,6,2,883326,0,486510,1.0,72947.0,1.328302,0.978743,-1.0,0.0,2.0,2.0,4.0
4,ps_ind_05_cat,8,0,1319412,6,51877,4.0,45706.0,0.406955,1.3533,-1.0,0.0,0.0,0.0,6.0


In [6]:
def freq_encoding(cols, train_df, test_df):
    # we are going to store our new dataset in these two resulting datasets
    result_train_df=pd.DataFrame()
    result_test_df=pd.DataFrame()
    
    # loop through each feature column to do this
    for col in cols:
        
        # capture the frequency of a feature in the training set in the form of a dataframe
        col_freq=col+'_freq'
        freq=train_df[col].value_counts()
        freq=pd.DataFrame(freq)
        freq.reset_index(inplace=True)
        freq.columns=[[col,col_freq]]

        # merge ths 'freq' datafarme with the train data
        temp_train_df=pd.merge(train_df[[col]], freq, how='left', on=col)
        temp_train_df.drop([col], axis=1, inplace=True)

        # merge this 'freq' dataframe with the test data
        temp_test_df=pd.merge(test_df[[col]], freq, how='left', on=col)
        temp_test_df.drop([col], axis=1, inplace=True)

        # if certain levels in the test dataset is not observed in the train dataset, 
        # we assign frequency of zero to them
        temp_test_df.fillna(0, inplace=True)
        temp_test_df[col_freq]=temp_test_df[col_freq].astype(np.int32)

        if result_train_df.shape[0]==0:
            result_train_df=temp_train_df
            result_test_df=temp_test_df
        else:
            result_train_df=pd.concat([result_train_df, temp_train_df],axis=1)
            result_test_df=pd.concat([result_test_df, temp_test_df],axis=1)
    
    return result_train_df, result_test_df

cat_cols=['ps_ind_02_cat','ps_car_04_cat', 'ps_car_09_cat',
          'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_11_cat']

# generate dataframe for frequency features for the train and test dataset
train_freq, test_freq=freq_encoding(cat_cols,train, test)

# merge them into the original train and test dataset
train=pd.concat([train, train_freq], axis=1)
test=pd.concat([test,test_freq], axis=1)

In [7]:
def binary_encoding(train_df, test_df, feat):
    # calculate the highest numerical value used for numeric encoding
    train_feat_max = train_df[feat].max()
    test_feat_max = test_df[feat].max()
    if train_feat_max > test_feat_max:
        feat_max = train_feat_max
    else:
        feat_max = test_feat_max
        
    # use the value of feat_max+1 to represent missing value
    train_df.loc[train_df[feat] == -1, feat] = feat_max + 1
    test_df.loc[test_df[feat] == -1, feat] = feat_max + 1
    
    # create a union set of all possible values of the feature
    union_val = np.union1d(train_df[feat].unique(), test_df[feat].unique())

    # extract the highest value from from the feature in decimal format.
    max_dec = union_val.max()
    
    # work out how the ammount of digtis required to be represent max_dev in binary representation
    max_bin_len = len("{0:b}".format(max_dec))
    index = np.arange(len(union_val))
    columns = list([feat])
    
    # create a binary encoding feature dataframe to capture all the levels for the feature
    bin_df = pd.DataFrame(index=index, columns=columns)
    bin_df[feat] = union_val
    
    # capture the binary representation for each level of the feature 
    feat_bin = bin_df[feat].apply(lambda x: "{0:b}".format(x).zfill(max_bin_len))
    
    # split the binary representation into different bit of digits 
    splitted = feat_bin.apply(lambda x: pd.Series(list(x)).astype(np.uint8))
    splitted.columns = [feat + '_bin_' + str(x) for x in splitted.columns]
    bin_df = bin_df.join(splitted)
    
    # merge the binary feature encoding dataframe with the train and test dataset - Done! 
    train_df = pd.merge(train_df, bin_df, how='left', on=[feat])
    test_df = pd.merge(test_df, bin_df, how='left', on=[feat])
    return train_df, test_df

In [8]:
cat_cols=['ps_ind_02_cat','ps_car_04_cat', 'ps_car_09_cat',
          'ps_ind_05_cat', 'ps_car_01_cat']

train, test=binary_encoding(train, test, 'ps_ind_02_cat')
train, test=binary_encoding(train, test, 'ps_car_04_cat')
train, test=binary_encoding(train, test, 'ps_car_09_cat')
train, test=binary_encoding(train, test, 'ps_ind_05_cat')
train, test=binary_encoding(train, test, 'ps_car_01_cat')

In [9]:
train[train.columns[train.columns.str.contains('ps_ind_02_cat')]].head(5)

Unnamed: 0,ps_ind_02_cat,ps_ind_02_cat_bin_0,ps_ind_02_cat_bin_1,ps_ind_02_cat_bin_2
0,2,0,1,0
1,1,0,0,1
2,4,1,0,0
3,1,0,0,1
4,2,0,1,0


In [10]:
col_to_drop = train.columns[train.columns.str.startswith('ps_calc_')]
train.drop(col_to_drop, axis=1, inplace=True)  
test.drop(col_to_drop, axis=1, inplace=True)  

In [11]:
cat_cols=['ps_ind_02_cat','ps_car_04_cat', 'ps_car_09_cat', 'ps_ind_05_cat', 'ps_car_01_cat']
train.drop(cat_cols, axis=1, inplace=True)  
test.drop(cat_cols, axis=1, inplace=True)  

In [12]:
localtrain, localval=train_test_split(train, test_size=0.25, random_state=2017)

drop_cols=['id','target']
y_localtrain=localtrain['target']
x_localtrain=localtrain.drop(drop_cols, axis=1)

y_localval=localval['target']
x_localval=localval.drop(drop_cols, axis=1)

In [13]:
from sklearn.model_selection import StratifiedKFold

In [49]:
class Ensemble(object):
    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, x_localtrain, y_localtrain, T):
        X = np.array(x_localtrain)
        y = np.array(y_localtrain)
        T = np.array(T)
        
        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=2016).split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):

            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
#                y_holdout = y[test_idx]

                print ("Fit %s fold %d" % (str(clf).split('(')[0], j+1))
                clf.fit(X_train, y_train)
#                cross_score = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')
#                print("    cross_score: %.5f" % (cross_score.mean()))
                y_pred = clf.predict_proba(X_holdout)[:,1]                

                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict_proba(T)[:,1]
                S_test[:, i] = S_test_i.mean(axis=1)

        results = cross_val_score(self.stacker, S_train, y, cv=folds, scoring='roc_auc')
        print("Stacker score: %.5f" % (results.mean()))

        self.stacker.fit(S_train, y)
        res = self.stacker.predict_proba(S_test)[:,1]
        return res

In [50]:
# LightGBM params
lgb_params = {}
lgb_params['learning_rate'] = 0.02
lgb_params['n_estimators'] = 650
lgb_params['max_bin'] = 10
lgb_params['subsample'] = 0.8
lgb_params['subsample_freq'] = 10
lgb_params['colsample_bytree'] = 0.8   
lgb_params['min_child_samples'] = 500
lgb_params['seed'] = 99


lgb_params2 = {}
lgb_params2['n_estimators'] = 1090
lgb_params2['learning_rate'] = 0.02
lgb_params2['colsample_bytree'] = 0.3   
lgb_params2['subsample'] = 0.7
lgb_params2['subsample_freq'] = 2
lgb_params2['num_leaves'] = 16
lgb_params2['seed'] = 99


#lgb_params3 = {}
#lgb_params3['n_estimators'] = 1100
#lgb_params3['max_depth'] = 4
#lgb_params3['learning_rate'] = 0.02
#lgb_params3['seed'] = 99


# RandomForest params
#rf_params = {}
#rf_params['n_estimators'] = 200
#rf_params['max_depth'] = 6
#rf_params['min_samples_split'] = 70
#rf_params['min_samples_leaf'] = 30


# ExtraTrees params
#et_params = {}
#et_params['n_estimators'] = 155
#et_params['max_features'] = 0.3
#et_params['max_depth'] = 6
#et_params['min_samples_split'] = 40
#et_params['min_samples_leaf'] = 18


# XGBoost params
xgb_params = {}
xgb_params['objective'] = 'binary:logistic'
xgb_params['learning_rate'] = 0.04
xgb_params['n_estimators'] = 490
xgb_params['max_depth'] = 4
xgb_params['subsample'] = 0.9
xgb_params['colsample_bytree'] = 0.9  
xgb_params['min_child_weight'] = 10


# CatBoost params
#cat_params = {}
#cat_params['iterations'] = 900
#cat_params['depth'] = 8
#cat_params['rsm'] = 0.95
#cat_params['learning_rate'] = 0.03
#cat_params['l2_leaf_reg'] = 3.5  
#cat_params['border_count'] = 8
#cat_params['gradient_iterations'] = 4


# Regularized Greedy Forest params
#rgf_params = {}
#rgf_params['max_leaf'] = 2000
#rgf_params['learning_rate'] = 0.5
#rgf_params['algorithm'] = "RGF_Sib"
#rgf_params['test_interval'] = 100
#rgf_params['min_samples_leaf'] = 3 
#rgf_params['reg_depth'] = 1.0
#rgf_params['l2'] = 0.5  
#rgf_params['sl2'] = 0.005

In [46]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

lgb_model = LGBMClassifier(**lgb_params)
lgb_model2 = LGBMClassifier(**lgb_params2)
#xgb_model = XGBClassifier(**xgb_params)

log_model = LogisticRegression()

In [51]:
target_train = train['target'].values

In [52]:
stack = Ensemble(n_splits=3,
        stacker = log_model ,
        base_models = (lgb_model,lgb_model2))      
        
y_pred = stack.fit_predict(train, target_train, test)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Fit LGBMClassifier fold 1


ValueError: Number of features of the model must match the input. Model n_features_ is 51 and input n_features is 50 

In [21]:
id_test = test['id'].values
id_test

array([      0,       1,       2, ..., 1488024, 1488025, 1488026])

In [28]:
sub = pd.DataFrame()
sub['id'] = id_test
sub['target'] = y_pred
sub.to_csv('stacked_z.csv', index=False)

In [29]:
pd.read_csv('stacked_z.csv')

Unnamed: 0,id,target
0,0,0.000034
1,1,0.000034
2,2,0.000034
3,3,0.000034
4,4,0.000034
5,5,0.000034
6,6,0.000034
7,8,0.000034
8,10,0.000034
9,11,0.000034
