# Model
This notebook is for modeling, we use different base models and use stacking to ensemble them.

## Packages Load

In [1]:
import pandas as pd
import numpy as np
import multiprocessing
from datetime import datetime
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import lightgbm as lgb
import xgboost as xgb
import catboost as cab
import time
from scipy.stats.mstats import winsorize
import matplotlib.pyplot as plt

In [2]:
# Read Files
events = pd.read_pickle('data/events.pkl')
attr = pd.read_pickle('data/attr.pkl')
session = pd.read_pickle('data/session.pkl')
submission = pd.read_csv('data/submission.csv')

## Load Data

In [3]:
training = pd.read_pickle('training.pkl')
prediction = pd.read_pickle('prediction.pkl')

In [4]:
training.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days,event_count,purchase_count,session_count,country,os_name,mean_sessions_duration,spend,...,event_6,event_14,event_4,event_40,event_7,event_41,event_3,event_42,region,purchase_gap
0,e1307a8f59cedf193a6b702914071b354618743ff55f52...,False,False,245,0,12,US,Android OS,7357318.5,0.0,...,7,3,10,6,4,6,6,5,tx,
1,a3fd9faf3fe536411c656fa4286e1f2830f6ab88a7c2bb...,False,False,5,0,2,US,iOS,24242.5,0.0,...,0,0,0,0,0,0,0,0,tn,
2,57fb27e539c679bb5ce5f8a0d975186008c42030061024...,False,False,21,0,1,US,Android OS,0.0,0.0,...,0,3,0,0,1,0,0,0,fl,
3,6c75912a77f358dea7b4cc146931017e5f7aec1d948ed9...,False,False,56,0,1,US,Android OS,0.0,0.0,...,1,0,1,0,2,0,0,0,va,
4,9f8bba6071e8d0f8ecb775c1a5ece35d5f6134e4870198...,False,False,14,0,1,US,Android OS,0.0,0.0,...,0,3,0,0,0,0,0,0,ca,


In [5]:
training.columns

Index(['user_id_hash', 'user_purchase_binary_7_days',
       'user_purchase_binary_14_days', 'event_count', 'purchase_count',
       'session_count', 'country', 'os_name', 'mean_sessions_duration',
       'spend', 'event_gap', 'session_gap', 'life_time', 'city', 'num_places',
       'event_45', 'event_1', 'event_5', 'event_6', 'event_14', 'event_4',
       'event_40', 'event_7', 'event_41', 'event_3', 'event_42', 'region',
       'purchase_gap'],
      dtype='object')

### Create More Features

In [6]:
def create_event_ratio(df):
    
    event_cols=['event_45', 'event_1', 'event_5', 'event_6', 'event_14',
            'event_4', 'event_40', 'event_7', 'event_41', 'event_3', 'event_42']
    
    for event_col in event_cols:
        
        df[event_col+'_ratio'] = df[event_col]/df['event_count']
    df['purchase_ratio'] = df['purchase_count']/df['event_count']
        
    return df
    

In [7]:
training = create_event_ratio(training)
prediction = create_event_ratio(prediction)

### Categorical Feature Encoding

In [8]:
# One-Hot Encoding
def oh_encoding(df, col='os_name', na_value='missing'):
#     df[col] = df[col].fillna(value=na_value)
#     dummy = pd.get_dummies(df[col]).drop(columns = [na_value])
    dummy = pd.get_dummies(df[col])
    dummy['iOS'], dummy['os_android'] = dummy['iOS']+dummy['iPhone OS'], dummy['Android OS']
    dummy = dummy.loc[:,['iOS']]
    df = pd.concat((df, dummy), axis=1)
    return df

In [9]:
training = oh_encoding(training)
prediction = oh_encoding(prediction)

In [10]:
small_country = set(training.country.value_counts()[30:].keys())
small_region = set(training.region.value_counts()[50:].keys())
small_city = set(training.city.value_counts()[50:].keys())

def other_country(df):
    df['country'] = df['country'].apply(lambda x: 'other' if x in small_country else x)
    return df

def other_region(df):
    df['region'] = df['region'].apply(lambda x: 'other' if x in small_region else x)
    return df

def other_city(df):
    df['city'] = df['city'].apply(lambda x: 'other' if x in small_city else x)
    return df

In [11]:
training = other_country(training)
training = other_region(training)
training = other_city(training)

prediction = other_country(prediction)
prediction = other_region(prediction)
prediction = other_city(prediction)

In [12]:
train_y2 = training.user_purchase_binary_14_days
# Mean Encoding Country
def reg_target_encoding(train, col = "country", splits=5):

    train[f'{col}_mean_enc'] = 0
#     train_y2 = training.user_purchase_binary_14_days.values
    skf = StratifiedKFold(splits, shuffle=True, random_state=111)
    
    for computing_index, encoding_index in skf.split(training,train_y2):
        computing, encoding = train.iloc[computing_index], train.iloc[encoding_index]
        means = computing.groupby(col).user_purchase_binary_14_days.mean()
        train.loc[encoding_index,f'{col}_mean_enc'] = encoding[col].map(means)
    return train

In [13]:
training =reg_target_encoding(training, col = "country")
training =reg_target_encoding(training, col = "region")
training =reg_target_encoding(training, col = "city")



In [14]:
def mean_encoding_test(test, train, col = "country"):

    global_mean = train.user_purchase_binary_14_days.mean()
    mean_device_type = train.groupby(col).user_purchase_binary_14_days.mean()
    test[f'{col}_mean_enc'] = test[col].map(mean_device_type)
    test[f'{col}_mean_enc'].fillna(global_mean, inplace=True)
    
    return test
    

def feature_encoding(df, training_df, events_df, session_df):
#     # One-Hot Encoding OS
#     df['os_name'] = df['os_name'].fillna(value='Missing')
#     os = pd.get_dummies(df['os_name']).drop(columns = ['Missing'])
#     os['os_ios'], os['os_android'] = os['iOS']+os['iPhone OS'], os['Android OS']
#     os = os.loc[:,['os_ios', 'os_android']]
#     df = pd.concat((df,os), axis=1)
    
    # Mean Encoding country
    df = mean_encoding_test(df, training_df, col = "country")   
    # Mean Encoding region   
    df = mean_encoding_test(df, training_df, col = "region")    
    # Mean Encoding region   
    df = mean_encoding_test(df, training_df, col = "city")
    return df

In [15]:
prediction = feature_encoding(prediction, training, events, session)

### Numeric Feature Scale

In [16]:
training.describe()

Unnamed: 0,event_count,purchase_count,session_count,mean_sessions_duration,spend,event_gap,session_gap,life_time,num_places,event_45,...,event_40_ratio,event_7_ratio,event_41_ratio,event_3_ratio,event_42_ratio,purchase_ratio,iOS,country_mean_enc,region_mean_enc,city_mean_enc
count,619423.0,619423.0,619423.0,618778.0,619423.0,618766.0,618778.0,618778.0,619423.0,619423.0,...,618766.0,618766.0,618766.0,618766.0,618766.0,618766.0,619423.0,618237.0,618500.0,618502.0
mean,157.847904,0.377758,7.288336,3821014.0,,2098771000.0,2272072000.0,2629768000.0,1.322316,107.419539,...,0.009357,0.025156,0.005905,0.005427,0.004404,0.000666,0.335346,0.009885,0.00988,0.009882
std,441.877253,3.064517,20.213372,12814790.0,,1475462000.0,1514580000.0,1473084000.0,0.956422,314.317186,...,0.013722,0.028377,0.010882,0.010242,0.008808,0.004881,0.472112,0.005825,0.004872,0.00378
min,0.0,0.0,0.0,0.0,0.0,5.0,249.0,31057.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00027,0.0
25%,14.0,0.0,1.0,0.0,0.0,768874600.0,924939900.0,1373587000.0,1.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003887,0.006013,0.009611
50%,36.0,0.0,2.0,44917.25,0.0,1951914000.0,2191941000.0,2613025000.0,1.0,23.0,...,0.0,0.017857,0.0,0.0,0.0,0.0,0.0,0.012795,0.010822,0.009696
75%,109.0,0.0,5.0,2262149.0,0.0,3333494000.0,3536015000.0,3877491000.0,1.0,74.0,...,0.020202,0.041667,0.008621,0.006993,0.001304,0.0,1.0,0.015168,0.014016,0.009709
max,23186.0,445.0,1153.0,811153900.0,1053.0,5245023000.0,5245191000.0,5270399000.0,80.0,18728.0,...,0.285714,0.333333,0.166667,0.142857,0.1,0.5,1.0,0.015402,0.022673,0.026174


Turn milliseconds gap into hour gap

In [17]:
training.columns

Index(['user_id_hash', 'user_purchase_binary_7_days',
       'user_purchase_binary_14_days', 'event_count', 'purchase_count',
       'session_count', 'country', 'os_name', 'mean_sessions_duration',
       'spend', 'event_gap', 'session_gap', 'life_time', 'city', 'num_places',
       'event_45', 'event_1', 'event_5', 'event_6', 'event_14', 'event_4',
       'event_40', 'event_7', 'event_41', 'event_3', 'event_42', 'region',
       'purchase_gap', 'event_45_ratio', 'event_1_ratio', 'event_5_ratio',
       'event_6_ratio', 'event_14_ratio', 'event_4_ratio', 'event_40_ratio',
       'event_7_ratio', 'event_41_ratio', 'event_3_ratio', 'event_42_ratio',
       'purchase_ratio', 'iOS', 'country_mean_enc', 'region_mean_enc',
       'city_mean_enc'],
      dtype='object')

In [18]:
# Turn gap into hours, and duration into minutes, life_time into days
def tranform_gap(df):
    gap_cols = [col for col in df.columns if 'gap' in col]
    df[gap_cols] = df[gap_cols]/3600000
    df['mean_sessions_duration'] = df['mean_sessions_duration']/60000
    df['life_time'] = df['life_time']/3600000/24
    return df

In [19]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [21]:
def fill_missing(df):
    values = {'country': 'other', 
              'os_name': 'other',
              'city': 'other',
              'region': 'other',
              'mean_sessions_duration': 0,
              'event_gap': df['event_gap'].mean(),              
              'session_gap': df['event_gap'].mean(),
              'life_time': df['life_time'].mean(),
              'purchase_gap':  df['purchase_gap'].max()*2,
              'event_45_ratio': df['event_45_ratio'].mean(),
              'event_1_ratio': df['event_1_ratio'].mean(),
              'event_5_ratio': df['event_5_ratio'].mean(),
              'event_6_ratio': df['event_6_ratio'].mean(),
              'event_14_ratio': df['event_14_ratio'].mean(),
              'event_4_ratio': df['event_4_ratio'].mean(),
              'event_40_ratio': df['event_40_ratio'].mean(),
              'event_7_ratio': df['event_7_ratio'].mean(),
              'event_41_ratio': df['event_41_ratio'].mean(),
              'event_3_ratio': df['event_3_ratio'].mean(),
              'event_42_ratio': df['event_42_ratio'].mean(), 
              'purchase_ratio': df['purchase_ratio'].mean(),
              'country_mean_enc': df['country_mean_enc'].mean(),
              'region_mean_enc': df['region_mean_enc'].mean(),
              'city_mean_enc': df['city_mean_enc'].mean()}
    df = df.fillna(value=values, inplace=False)
    
    return df

In [25]:
numeric_feature = ['event_count', 'purchase_count',
                   'session_count', 'mean_sessions_duration',
                   'spend', 'event_gap', 'session_gap', 'life_time', 'num_places',
                   'event_45', 'event_1', 'event_5', 'event_6', 'event_14', 'event_4',
                   'event_40', 'event_7', 'event_41', 'event_3', 'event_42', 'purchase_gap',
                   'event_45_ratio', 'event_1_ratio', 'event_5_ratio', 'event_6_ratio',
                   'event_14_ratio', 'event_4_ratio', 'event_40_ratio', 'event_7_ratio',
                   'event_41_ratio', 'event_3_ratio', 'event_42_ratio', 'purchase_ratio',
                  'country_mean_enc', 'region_mean_enc', 'city_mean_enc']

In [26]:
def feature_processing(df):
    scaler = MinMaxScaler()
    # Winsorization
    for col in numeric_feature:
        df[col] = np.clip(df[col], 0, np.quantile(df[col],0.98))
    
    df['event_count'] = np.log(df['event_count']+1)
    df[numeric_feature] = scaler.fit_transform(df[numeric_feature])
    df[numeric_feature] = fill_missing(df[numeric_feature])
    
    return df

In [27]:
training = feature_processing(training)

  interpolation=interpolation)


In [28]:
prediction = feature_processing(prediction)

  interpolation=interpolation)
  return self.partial_fit(X, y)


In [29]:
training.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days,event_count,purchase_count,session_count,country,os_name,mean_sessions_duration,spend,...,event_40_ratio,event_7_ratio,event_41_ratio,event_3_ratio,event_42_ratio,purchase_ratio,iOS,country_mean_enc,region_mean_enc,city_mean_enc
0,e1307a8f59cedf193a6b702914071b354618743ff55f52...,False,False,0.813545,0.0,0.1875,US,Android OS,0.00907,0.0,...,0.085714,0.04898,0.146939,0.171429,0.204082,0.0,0,0.986266,0.58456,0.370483
1,a3fd9faf3fe536411c656fa4286e1f2830f6ab88a7c2bb...,False,False,0.317933,0.0,0.03125,US,iOS,3e-05,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,0.458322,0.650888
2,57fb27e539c679bb5ce5f8a0d975186008c42030061024...,False,False,0.511288,0.0,0.015625,US,Android OS,0.0,0.0,...,0.0,0.142857,0.0,0.0,0.0,0.0,0,0.982486,0.566719,0.367478
3,6c75912a77f358dea7b4cc146931017e5f7aec1d948ed9...,False,False,0.638136,0.0,0.015625,US,Android OS,0.0,0.0,...,0.0,0.107143,0.0,0.0,0.0,0.0,0,0.98143,0.777737,0.37042
4,9f8bba6071e8d0f8ecb775c1a5ece35d5f6134e4870198...,False,False,0.456936,0.0,0.015625,US,Android OS,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.984835,0.54858,0.370948


In [30]:
prediction.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days,event_count,purchase_count,session_count,country,os_name,mean_sessions_duration,spend,...,event_40_ratio,event_7_ratio,event_41_ratio,event_3_ratio,event_42_ratio,purchase_ratio,iOS,country_mean_enc,region_mean_enc,city_mean_enc
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.01,0.02,0.32578,0.0,0.001427,US,iOS,0.000434,0.0,...,0.0,0.115385,0.0,0.0,0.0,0.0,1,1.0,0.732323,0.49237
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.01,0.02,0.388645,0.0,0.000713,MX,Android OS,0.0,0.0,...,0.0,0.06,0.0,0.0,0.0,0.0,0,0.197869,0.317323,0.49237
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.01,0.02,0.342574,0.0,0.002853,MX,Android OS,0.000474,0.0,...,0.0,0.193548,0.0,0.0,0.0,0.0,0,0.197869,0.317323,0.49237
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.01,0.02,0.527595,0.0,0.007133,other,Android OS,0.011211,0.0,...,0.144928,0.057971,0.144928,0.131401,0.115942,0.0,0,0.257818,0.317323,0.49237
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.01,0.02,0.177108,0.0,0.000713,other,Android OS,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.257818,0.284779,0.49237


## Modeling

In [32]:
X = training.drop(columns =['user_id_hash', 'user_purchase_binary_7_days', 'user_purchase_binary_14_days','country','os_name','city','region','iOS'])
y1, y2 = training['user_purchase_binary_7_days'], training['user_purchase_binary_14_days']
X_test = prediction.drop(columns = ['user_id_hash', 'user_purchase_binary_7_days', 'user_purchase_binary_14_days','country','os_name','city','region', 'iOS'])

### First Layer Model
On the first layer, we will use 3 tree boosting models, 2 neural networks, 2 linear models, 1 naive bayes model, 1 knn model.

In [34]:
from sklearn.neighbors import KNeighborsClassifier

In [35]:
X.columns

Index(['event_count', 'purchase_count', 'session_count',
       'mean_sessions_duration', 'spend', 'event_gap', 'session_gap',
       'life_time', 'num_places', 'event_45', 'event_1', 'event_5', 'event_6',
       'event_14', 'event_4', 'event_40', 'event_7', 'event_41', 'event_3',
       'event_42', 'purchase_gap', 'event_45_ratio', 'event_1_ratio',
       'event_5_ratio', 'event_6_ratio', 'event_14_ratio', 'event_4_ratio',
       'event_40_ratio', 'event_7_ratio', 'event_41_ratio', 'event_3_ratio',
       'event_42_ratio', 'purchase_ratio', 'country_mean_enc',
       'region_mean_enc', 'city_mean_enc'],
      dtype='object')

In [36]:
knn_model = KNeighborsClassifier(n_jobs=4)

In [39]:
def stack_prediction(model, X, y, text_X, fold=5, random_state=42):
    skf = StratifiedKFold(n_splits=fold, shuffle=True, random_state=random_state)
    validation = np.zeros(len(y))
    prediction = np.zeros(len(text_X))
    
    for fold_n, (train_index, valid_index) in enumerate(skf.split(X,y)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        
        model.fit(X_train, y_train)

        prediction[valid_index] = model.predict_proba(X_valid)[:,1]
        
    return prediction

In [None]:
knn_validation_label_1, knn_prediction_label_1 = stack_prediction(knn_model, X, y1, X_test, fold=5, random_state=42)

Fold 0 started at Tue Feb 19 14:43:38 2019


In [None]:
np.save('knn_validation_label_1', knn_validation_label_1)
np.save('knn_prediction_label_1', knn_prediction_label_1)

In [None]:
knn_validation_label_2, knn_prediction_label_2 = stack_prediction(knn_model, X, y2, X_test, fold=5, random_state=42)

In [None]:
np.save('knn_validation_label_2', knn_validation_label_2)
np.save('knn_prediction_label_2', knn_prediction_label_2)

In [33]:
X.isna().apply(sum,axis=0)

event_count               0
purchase_count            0
session_count             0
mean_sessions_duration    0
spend                     0
event_gap                 0
session_gap               0
life_time                 0
num_places                0
event_45                  0
event_1                   0
event_5                   0
event_6                   0
event_14                  0
event_4                   0
event_40                  0
event_7                   0
event_41                  0
event_3                   0
event_42                  0
purchase_gap              0
event_45_ratio            0
event_1_ratio             0
event_5_ratio             0
event_6_ratio             0
event_14_ratio            0
event_4_ratio             0
event_40_ratio            0
event_7_ratio             0
event_41_ratio            0
event_3_ratio             0
event_42_ratio            0
purchase_ratio            0
country_mean_enc          0
region_mean_enc           0
city_mean_enc       