## Import Libraries

In [21]:
import pandas as pd
from xgboost import XGBClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import log_loss
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
import pickle

## Data Import

In [44]:
df_train_events = pd.read_pickle("dataset/train_events.pkl")
df_val_events = pd.read_pickle("dataset/test_events.pkl")

df_train_noevents = pd.read_pickle("dataset/train_noevents.pkl")
df_val_noevents = pd.read_pickle("dataset/test_noevents.pkl")

In [45]:
df_events_train, df_events_test = train_test_split(df_train_events, test_size = 0.2)
df_noevents_train, df_noevents_test = train_test_split(df_train_noevents, test_size = 0.2)

## Utility Function

In [26]:
def hyperparameter_tuning(classifier, hp, x_train, y_train, x_test, y_test):
    """Function to perform Hyperparameter tuning of the models. Takes the model name, hyperparameter name and list 
    along with the train, cross validation and test datasets as input"""
    

    if classifier == "lrb":
        clf = SGDClassifier()
    elif classifier == "svm":
        clf = SGDClassifier()
    elif classifier == "rf":
        clf = RandomForestClassifier()
    
    
    random_clf=RandomizedSearchCV(clf, param_distributions=hp, verbose=10, cv=3, n_jobs=8)
    random_clf.fit(x_train, y_train)
    cv_log_error_array = []
    
    clf_best = random_clf.best_estimator_
    
    sig_clf = CalibratedClassifierCV(clf_best, method="sigmoid")
    sig_clf.fit(x_train, y_train)
    
    predict_y = sig_clf.predict_proba(x_test)
    score = log_loss(y_test, predict_y)
    print("The test log loss is:",score)
    
    return score

## Encoding required variables

In [6]:
brand_vec = TfidfVectorizer()
brand_train = brand_vec.fit_transform(df_noevents_train['brand'])
brand_test = brand_vec.transform(df_noevents_test['brand'])

model_vec = TfidfVectorizer()
model_train = model_vec.fit_transform(df_noevents_train['model'])
model_test = model_vec.transform(df_noevents_test['model'])

## Standardizing required inputs

In [7]:
scaler = StandardScaler()

age_train = scaler.fit_transform(df_noevents_train['pred_age'].values.reshape(-1,1))
age_test = scaler.transform(df_noevents_test['pred_age'].values.reshape(-1,1))

screen_train = scaler.fit_transform(df_noevents_train['screen_size'].values.reshape(-1,1))
screen_test = scaler.transform(df_noevents_test['screen_size'].values.reshape(-1,1))

ram_train = scaler.fit_transform(df_noevents_train['ram_gb'].values.reshape(-1,1))
ram_test = scaler.transform(df_noevents_test['ram_gb'].values.reshape(-1,1))

camera_train = scaler.fit_transform(df_noevents_train['camera'].values.reshape(-1,1))
camera_test = scaler.transform(df_noevents_test['camera'].values.reshape(-1,1))

release_train = scaler.fit_transform(df_noevents_train['release_bin'].values.reshape(-1,1))
release_test = scaler.transform(df_noevents_test['release_bin'].values.reshape(-1,1))

## Input variables

In [8]:
x_train = hstack((brand_train, model_train, df_noevents_train['female_pred'].values.reshape(-1,1), 
                  df_noevents_train['male_pred'].values.reshape(-1,1), age_train, screen_train, ram_train, 
                  camera_train, release_train))
x_test = hstack((brand_test, model_test, df_noevents_test['female_pred'].values.reshape(-1,1), 
                 df_noevents_test['male_pred'].values.reshape(-1,1), age_test, screen_test, ram_test,
                 camera_test, release_test))

## Output variables

In [9]:
y_encoder = LabelEncoder()

y_train = y_encoder.fit_transform(df_noevents_train['group'])
y_test = y_encoder.transform(df_noevents_test['group'])

## Random Model

In [69]:
noevents_model_rand = "No Events - Random Model"

predicted_y = np.zeros((len(y_test),12))
for i in range(len(y_test)):
    rand_probs = np.random.rand(1,12)
    predicted_y[i] = ((rand_probs/sum(sum(rand_probs)))[0])

noevents_rand = log_loss(y_test, predicted_y, labels = list(range(12)), eps=1e-15)

## Logistic Regression

In [38]:
noevents_model_lrb = "No Events - Logistic Regression"

hp = {'alpha':[10 ** x for x in range(-6, 3)]}

noevents_lrb = hyperparameter_tuning("lrb", hp, x_train, y_train, x_test, y_test)

## Support Vector Machines

In [34]:
noevents_model_svm = "No Events - Support Vector Machines"

c = {'c':[10 ** x for x in range(-5, 3)]}

noevents_svm = hyperparameter_tuning("svm", hp, x_train, y_train, x_test, y_test)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    2.8s
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:    6.9s
[Parallel(n_jobs=8)]: Done  15 out of  27 | elapsed:    7.8s remaining:    6.2s
[Parallel(n_jobs=8)]: Done  18 out of  27 | elapsed:    8.7s remaining:    4.3s
[Parallel(n_jobs=8)]: Done  21 out of  27 | elapsed:    9.2s remaining:    2.6s
[Parallel(n_jobs=8)]: Done  24 out of  27 | elapsed:    9.3s remaining:    1.1s
[Parallel(n_jobs=8)]: Done  27 out of  27 | elapsed:   11.3s remaining:    0.0s
[Parallel(n_jobs=8)]: Done  27 out of  27 | elapsed:   11.3s finished


The test log loss is: 2.4178723686862655


## Random Forest

In [37]:
noevents_model_rf = "No Events - Random Forest"

hp = {'n_estimators':[100,200,500,1000,2000],'max_depth':[5, 10]}

noevents_rf = hyperparameter_tuning("rf", hp, x_train, y_train, x_test, y_test)

## XGBoost

In [17]:
noevents_model_xgb = "No Events - XGBoost"

x_clf=XGBClassifier()
prams={
'learning_rate':[0.01,0.03,0.05,0.1,0.15,0.2],
'n_estimators':[100,200,500,1000,2000],
'max_depth':[3,5,10],
'colsample_bytree':[0.1,0.3,0.5,1],
'subsample':[0.1,0.3,0.5,1]
}
random_clf=RandomizedSearchCV(x_clf, param_distributions=prams, verbose=10, cv=3, n_jobs=8, scoring = 'neg_log_loss')
random_clf.fit(x_train, y_train)

x_clf_best = random_clf.best_estimator_
predict_y = x_clf_best.predict_proba(x_test)
noevents_xgb = log_loss(y_test, predict_y)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:   39.0s
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:  3.2min
[Parallel(n_jobs=8)]: Done  19 out of  30 | elapsed:  6.3min remaining:  3.6min
[Parallel(n_jobs=8)]: Done  23 out of  30 | elapsed:  6.8min remaining:  2.1min
[Parallel(n_jobs=8)]: Done  27 out of  30 | elapsed:  7.2min remaining:   48.2s
[Parallel(n_jobs=8)]: Done  30 out of  30 | elapsed: 10.0min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n...
                                           verbosity=None),
                   iid='deprecated', n_iter=10, n_jobs=8,
                   param_distributions={'colsample_bytree': [0

## Neural Network

In [None]:
num_classes=12
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes)

In [16]:
def baseline_model(nodes):

    model = Sequential()
    
    model.add(Dense(2048, input_dim=nodes, init='normal', activation='tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(50, input_dim=nodes, init='normal', activation='tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(12, init='normal', activation='sigmoid'))

    model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])  #logloss
    return model

In [17]:
tf.keras.backend.clear_session()
model=baseline_model(x_train.shape[1])
model.fit(x_train, y_train, epochs=15, batch_size = 64, validation_data=(x_test, y_test), verbose=1)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


  """
  import sys
  if __name__ == '__main__':



Train on 41084 samples, validate on 10271 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x126a6516780>

In [18]:
predict_y = model.predict_proba(x_test)
print("The test log loss is:",log_loss(y_test, predict_y))
noevents_nn = log_loss(y_test, predict_y)

The test log loss is: 2.428108946999212


## Results for models with No Events

In [41]:
#summarizing results
from prettytable import PrettyTable

table = PrettyTable()

table.field_names = ["Model", "LogLoss"]

table.add_row([noevents_model_rand, noevents_rand])
table.add_row([noevents_model_lrb, noevents_lrb])
table.add_row([noevents_model_svm, noevents_svm])
table.add_row([noevents_model_rf, noevents_rf])
table.add_row([noevents_model_xgb, noevents_xgb])

print(table)

+-------------------------------------+--------------------+
|                Model                |      LogLoss       |
+-------------------------------------+--------------------+
|       No Events - Random Model      | 2.770839388865056  |
|   No Events - Logistic Regression   | 2.417876851355263  |
| No Events - Support Vector Machines | 2.4178723686862655 |
|      No Events - Random Forest      | 2.3940721530512206 |
|         No Events - XGBoost         | 2.396952661610476  |
+-------------------------------------+--------------------+


## Encoding required variables

In [46]:
brand_vec = TfidfVectorizer()
train_brand = brand_vec.fit_transform(df_events_train['brand'])
test_brand = brand_vec.transform(df_events_test['brand'])

model_vec = TfidfVectorizer()
train_model = model_vec.fit_transform(df_events_train['model'])
test_model = model_vec.transform(df_events_test['model'])

installed_vec = CountVectorizer()
train_installed = installed_vec.fit_transform(df_events_train['installed_apps_string'])
test_installed = installed_vec.transform(df_events_test['installed_apps_string'])

active_vec = CountVectorizer()
train_active = active_vec.fit_transform(df_events_train['active_apps_string'])
test_active = active_vec.transform(df_events_test['active_apps_string'])

installed_labels_vec = TfidfVectorizer()
train_installed_labels = installed_labels_vec.fit_transform(df_events_train['installed_app_labels'])
test_installed_labels = installed_labels_vec.transform(df_events_test['installed_app_labels'])

active_labels_vec = TfidfVectorizer()
train_active_labels = active_labels_vec.fit_transform(df_events_train['active_app_labels'])
test_active_labels = active_labels_vec.transform(df_events_test['active_app_labels'])

## Standardizing required inputs

In [47]:
scaler = StandardScaler()

age_train = scaler.fit_transform(df_events_train['pred_age'].values.reshape(-1,1))
age_test = scaler.transform(df_events_test['pred_age'].values.reshape(-1,1))

lat_train = scaler.fit_transform(df_events_train['mean_latitude'].values.reshape(-1,1))
lat_test = scaler.transform(df_events_test['mean_latitude'].values.reshape(-1,1))

long_train = scaler.fit_transform(df_events_train['mean_longitude'].values.reshape(-1,1))
long_test = scaler.transform(df_events_test['mean_longitude'].values.reshape(-1,1))

travels_train = scaler.fit_transform(df_events_train['num_travels'].values.reshape(-1,1))
travels_test = scaler.transform(df_events_test['num_travels'].values.reshape(-1,1))

screen_train = scaler.fit_transform(df_events_train['screen_size'].values.reshape(-1,1))
screen_test = scaler.transform(df_events_test['screen_size'].values.reshape(-1,1))

ram_train = scaler.fit_transform(df_events_train['ram_gb'].values.reshape(-1,1))
ram_test = scaler.transform(df_events_test['ram_gb'].values.reshape(-1,1))

camera_train = scaler.fit_transform(df_events_train['camera'].values.reshape(-1,1))
camera_test = scaler.transform(df_events_test['camera'].values.reshape(-1,1))

release_train = scaler.fit_transform(df_events_train['release_bin'].values.reshape(-1,1))
release_test = scaler.transform(df_events_test['release_bin'].values.reshape(-1,1))

## Input Variables

In [57]:
x_train = hstack((train_brand, train_model, train_installed_labels, train_active_labels,
                  df_events_train['female_pred'].values.reshape(-1,1), df_events_train['male_pred'].values.reshape(-1,1), age_train,
                  lat_train, long_train, travels_train, 
                  np.array(df_events_train['activity_hour'].to_list()), np.array(df_events_train['activity_day'].to_list()), 
                  df_events_train['app_usage'].values.reshape(-1,1), df_events_train['app_usage_session'].values.reshape(-1,1), 
                  train_installed, train_active, 
                  np.array(df_events_train['active_app_usage'].to_list()), screen_train, ram_train,
                  camera_train, release_train))

In [58]:
x_test = hstack((test_brand, test_model, test_installed_labels, test_active_labels,
                 df_events_test['female_pred'].values.reshape(-1,1), df_events_test['male_pred'].values.reshape(-1,1), age_test,
                 lat_test, long_test, travels_test, 
                 np.array(df_events_test['activity_hour'].to_list()), np.array(df_events_test['activity_day'].to_list()), 
                 df_events_test['app_usage'].values.reshape(-1,1), df_events_test['app_usage_session'].values.reshape(-1,1), 
                 test_installed, test_active, 
                 np.array(df_events_test['active_app_usage'].to_list()), screen_test, ram_test,
                 camera_test, release_test))

## Output variables

In [50]:
y_encoder = LabelEncoder()

y_train = y_encoder.fit_transform(df_events_train['group'])
y_test = y_encoder.transform(df_events_test['group'])

## Random Model

In [64]:
events_model_rand = "Events - Random Model"

predicted_y = np.zeros((len(y_test),12))
for i in range(len(y_test)):
    rand_probs = np.random.rand(1,12)
    predicted_y[i] = ((rand_probs/sum(sum(rand_probs)))[0])

events_rand = log_loss(y_test, predicted_y, labels = list(range(12)), eps=1e-15)

## Logistic Regression

In [59]:
events_model_lrb = "Events - Logistic Regression"

hp = {'alpha':[10 ** x for x in range(-6, 3)]}

events_lrb = hyperparameter_tuning("lrb", hp, x_train, y_train, x_test, y_test)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:   22.5s
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:   30.6s
[Parallel(n_jobs=8)]: Done  15 out of  27 | elapsed:   34.8s remaining:   27.8s
[Parallel(n_jobs=8)]: Done  18 out of  27 | elapsed:   36.1s remaining:   18.0s
[Parallel(n_jobs=8)]: Done  21 out of  27 | elapsed:   37.7s remaining:   10.7s
[Parallel(n_jobs=8)]: Done  24 out of  27 | elapsed:   38.4s remaining:    4.7s
[Parallel(n_jobs=8)]: Done  27 out of  27 | elapsed:   39.2s remaining:    0.0s
[Parallel(n_jobs=8)]: Done  27 out of  27 | elapsed:   39.2s finished


The test log loss is: 2.018236536480917


## Support Vector Machines

In [60]:
events_model_svm = "Events - Support Vector Machines"

c = {'c':[10 ** x for x in range(-5, 3)]}

events_svm = hyperparameter_tuning("svm", hp, x_train, y_train, x_test, y_test)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:   22.1s
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:   30.6s
[Parallel(n_jobs=8)]: Done  15 out of  27 | elapsed:   34.7s remaining:   27.7s
[Parallel(n_jobs=8)]: Done  18 out of  27 | elapsed:   36.2s remaining:   18.1s
[Parallel(n_jobs=8)]: Done  21 out of  27 | elapsed:   37.6s remaining:   10.7s
[Parallel(n_jobs=8)]: Done  24 out of  27 | elapsed:   38.2s remaining:    4.7s
[Parallel(n_jobs=8)]: Done  27 out of  27 | elapsed:   39.1s remaining:    0.0s
[Parallel(n_jobs=8)]: Done  27 out of  27 | elapsed:   39.1s finished


The test log loss is: 2.0174097570891036


## Random Forest

In [66]:
events_model_rf = "Events - Random Forest"

hp = {'n_estimators':[100,200,500,1000,2000],'max_depth':[5, 10]}

events_rf = hyperparameter_tuning("rf", hp, x_train, y_train, x_test, y_test)

## XGBoost

In [17]:
events_model_xgb = "Events - XGBoost"

x_clf=XGBClassifier()
prams={
'learning_rate':[0.01,0.03,0.05,0.1,0.15,0.2],
'n_estimators':[100,200,500,1000,2000],
'max_depth':[3,5,10],
'colsample_bytree':[0.1,0.3,0.5,1],
'subsample':[0.1,0.3,0.5,1]
}
random_clf=RandomizedSearchCV(x_clf, param_distributions=prams, verbose=10, cv=3, n_jobs=8, scoring = 'neg_log_loss')
random_clf.fit(x_train, y_train)

x_clf_best = random_clf.best_estimator_
predict_y = x_clf_best.predict_proba(x_test)
events_xgb = log_loss(y_test, predict_y)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:  2.1min
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed: 30.1min
[Parallel(n_jobs=8)]: Done  19 out of  30 | elapsed: 70.0min remaining: 40.5min
[Parallel(n_jobs=8)]: Done  23 out of  30 | elapsed: 81.5min remaining: 24.8min
[Parallel(n_jobs=8)]: Done  27 out of  30 | elapsed: 86.6min remaining:  9.6min
[Parallel(n_jobs=8)]: Done  30 out of  30 | elapsed: 102.6min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n...
                                           verbosity=None),
                   iid='deprecated', n_iter=10, n_jobs=8,
                   param_distributions={'colsample_bytree': [0

## Neural Network

In [None]:
num_classes=12
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes)

In [None]:
def baseline_model(nodes):

    model = Sequential()
    
    model.add(Dense(2048, input_dim=nodes, init='normal', activation='tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(50, input_dim=nodes, init='normal', activation='tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(12, init='normal', activation='sigmoid'))

    model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])  #logloss
    return model

In [None]:
model=baseline_model(x_train.shape[1])

In [6]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 2048)              46759936  
_________________________________________________________________
dropout_1 (Dropout)          (None, 2048)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                102450    
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 12)                612       
Total params: 46,862,998
Trainable params: 46,862,998
Non-trainable params: 0
_________________________________________________________________


In [18]:
tf.keras.backend.clear_session()
model=baseline_model(x_train.shape[1])
model.fit(x_train, y_train, epochs=15, batch_size = 64, validation_data=(x_test, y_test), verbose=1)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


  """
  import sys
  if __name__ == '__main__':



Train on 18632 samples, validate on 4658 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x1d73673e240>

In [19]:
predict_y = model.predict_proba(x_test)
print("The test log loss is:",log_loss(y_test, predict_y))
events_nn = log_loss(y_test, predict_y)

The test log loss is: 2.0420681420776043


## Results for models with Events

In [68]:
#summarizing results
from prettytable import PrettyTable

table = PrettyTable()

table.field_names = ["Model", "LogLoss"]

table.add_row([events_model_rand, events_rand])
table.add_row([events_model_lrb, events_lrb])
table.add_row([events_model_svm, events_svm])
table.add_row([events_model_rf, events_rf])
table.add_row([events_model_xgb, events_xgb])

print(table)

+----------------------------------+--------------------+
|              Model               |      LogLoss       |
+----------------------------------+--------------------+
|      Events - Random Model       | 2.7642369749111952 |
|      Events - Random Forest      | 2.018236536480917  |
| Events - Support Vector Machines | 2.0174097570891036 |
|      Events - Random Forest      | 2.043569156077825  |
|         Events - XGBoost         | 1.5679151635707391 |
+----------------------------------+--------------------+


## Overall Results

In [72]:
table = PrettyTable()

table.field_names = ["Model", "No Events", "Events"]

table.add_row(["Random Model", noevents_rand, events_rand])
table.add_row(["Logistic Regression", noevents_lrb, events_lrb])
table.add_row(["Support Vector Machines", noevents_svm, events_svm])
table.add_row(["Random Forest", noevents_rf, events_rf])
table.add_row(["XGBoost", noevents_xgb, events_xgb])
table.add_row(["Neural Network", noevents_nn, events_nn])

print(table)

+-------------------------+--------------------+--------------------+
|          Model          |     No Events      |       Events       |
+-------------------------+--------------------+--------------------+
|       Random Model      | 2.7587559091491074 | 2.7642369749111952 |
|   Logistic Regression   | 2.417876851355263  | 2.018236536480917  |
| Support Vector Machines | 2.4178723686862655 | 2.0174097570891036 |
|      Random Forest      | 2.3940721530512206 | 2.043569156077825  |
|         XGBoost         | 2.396952661610476  | 1.5679151635707391 |
|      Neural Network     | 2.428108946999212  | 2.0420681420776043 |
+-------------------------+--------------------+--------------------+


### Observations:

1. All the models outperform a random model.
2. There is not much improvement between the models for records with no events.
3. But for the records with events, XGBoost drastically outperforms all other models.

## Saving Best models

In [None]:
file_name = "xgb_model_noevents.pkl"
# save
pickle.dump(x_clf_best_noevents, open(file_name, "wb"))

In [19]:
file_name = "xgb_model_events.pkl"
# save
pickle.dump(x_clf_best_events, open(file_name, "wb"))