In [1]:
from datetime import datetime

import pandas as pd
import numpy as np

In [2]:
# Widen pandas' display limits so large frames are shown without truncation.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [3]:
# Load the zipped CSV 'dataset_processed_v4.csv.zip' into `df`
# (path is relative to the notebook's working directory).
df = pd.read_csv('dataset_processed_v4.csv.zip')

In [4]:
# Report the (rows, columns) of the loaded frame.
print(df.shape)

(1286916, 22)


In [5]:
# Peek at five random rows (no random_state is passed, so the rows differ between runs).
df.sample(5)

Unnamed: 0,UniqueID,SeatUniqueID,event_date,isAttended,event_name,PC1,Price,paid,status,isHost,TicketClass,Term,Season,Tenure,Resold,isSTM,acct_type_desc_processed,event_sale_date_diff,plan_event_name_processed,comp_name_processed,class_name_processed,TicketType_processed
9043,CLT21HOU-126-15-10,126-15-10,2021-10-17,Yes,CLT21HOU,F,111.0,Y,SOLD,0,Manifest,,2021,33.0,,0.0,Season,200.0,21FS,Not Comp,OPEN,Renewal
102926,CLT21JAX-436-2-3,436-2-3,2021-11-14,Yes,CLT21JAX,D,130.0,Y,SOLD,0,Manifest,,2021,3.0,,0.0,Season,228.0,21FS,Not Comp,OPEN,Renewal
714739,CLT22JAX-117-16-14,117-16-14,2022-10-16,Yes,CLT22JAX,B,161.0,Y,SOLD,0,Manifest,,2022,7.0,,1.0,Season,328.0,22FS,Not Comp,OPEN,Renewal
818036,CLT22KC-543-3-6,543-3-6,2022-09-25,Yes,CLT22KC,J,149.0,Y,SOLD,0,Manifest,,2022,10.0,,1.0,Season,157.0,22FS,Not Comp,ACCMAN RELO,Renewal
432676,CLT21PRE-604-6-1,604-6-1,2021-08-15,Yes,CLT21PRE,U,20.0,Y,SOLD,0,Manifest,,2021,1.0,,0.0,Personal,46.0,,Not Comp,OTHERS,Group


# Separate Train-Test

In [6]:
# Class balance of the target among rows whose label is known ('???' marks unlabelled rows).
df.loc[df['isAttended'].ne('???'), 'isAttended'].value_counts(normalize=True)

Yes    0.777865
No     0.222135
Name: isAttended, dtype: float64

In [7]:
# Rows labelled '???' have no known outcome and form the frame to be scored;
# every other row is available for training.
train_df = df.loc[df['isAttended'].ne('???')].copy()
test_df = df.loc[df['isAttended'].eq('???')].copy()

# Label Encoding

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
# Columns to integer-encode. An earlier version of this list also contained
# 'SeatType'; it is no longer encoded here.
categorical_columns = [
    'PC1', 'paid', 'status', 'TicketClass', 'Resold',
    'acct_type_desc_processed', 'plan_event_name_processed', 'comp_name_processed',
    'class_name_processed', 'TicketType_processed', 'Season',
]

# Fit one encoder per column on the labelled rows only and keep every fitted
# encoder, keyed by column name. `test_df` is not transformed in this cell.
label_encoder_objects = {}
for column_name in categorical_columns:
    encoder = LabelEncoder()
    train_df[column_name] = encoder.fit_transform(train_df[column_name])
    label_encoder_objects[column_name] = encoder

In [10]:
# Display the fitted encoders, keyed by column name.
label_encoder_objects

{'PC1': LabelEncoder(),
 'paid': LabelEncoder(),
 'status': LabelEncoder(),
 'TicketClass': LabelEncoder(),
 'Resold': LabelEncoder(),
 'acct_type_desc_processed': LabelEncoder(),
 'plan_event_name_processed': LabelEncoder(),
 'comp_name_processed': LabelEncoder(),
 'class_name_processed': LabelEncoder(),
 'TicketType_processed': LabelEncoder(),
 'Season': LabelEncoder()}

In [11]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
# The explicit cast avoids depending on `.replace` silently downcasting an
# object column to int, which newer pandas versions deprecate; without it the
# target could stay object-typed. `.replace` leaves already-encoded 0/1
# untouched, so the cell stays safe to re-run.
train_df['isAttended'] = (
    train_df['isAttended']
    .replace({'Yes': 1, 'No': 0})
    .astype('int64')
)

In [12]:
# Spot-check five random rows after encoding (unseeded sample).
train_df.sample(5)

Unnamed: 0,UniqueID,SeatUniqueID,event_date,isAttended,event_name,PC1,Price,paid,status,isHost,TicketClass,Term,Season,Tenure,Resold,isSTM,acct_type_desc_processed,event_sale_date_diff,plan_event_name_processed,comp_name_processed,class_name_processed,TicketType_processed
105443,CLT21JAX-450-13-12,450-13-12,2021-11-14,1,CLT21JAX,18,0.0,2,1,0,2,,0,1.0,1,0.0,2,3.0,3,1,5,1
495904,CLT21SEA-537-4-11,537-4-11,2021-09-12,1,CLT21SEA,15,105.0,2,3,0,2,,0,36.0,1,0.0,6,164.0,0,0,7,6
510607,CLT21SEA-637-7-2,637-7-2,2021-09-12,1,CLT21SEA,20,80.0,2,3,0,2,,0,1.0,1,0.0,6,164.0,0,0,1,4
1210303,CLT22TEN-616-6-5,616-6-5,2022-10-02,1,CLT22TEN,18,71.0,2,3,0,2,,1,16.0,1,1.0,6,321.0,1,0,7,6
1119200,CLT22PR2-240-10-8,240-10-8,2022-08-27,1,CLT22PR2,0,194.0,2,3,0,2,3.0,1,38.0,1,0.0,0,283.0,1,0,6,6


# Merge Time Series Features

## Event Level

In [13]:
# Load the event-level fill-rate table and report its (rows, columns).
event_level_fill_df = pd.read_csv('event_level_fill_rate.csv')
print(event_level_fill_df.shape)

(20, 4)


In [14]:
# Attach the event-level fill-rate columns to every training row by event date;
# a left join keeps all training rows even when no event-level match exists.
train_df = pd.merge(train_df, event_level_fill_df, how='left', on='event_date')

## Seat Level

In [15]:
# Load the seat-level lagged-attendance table and report its (rows, columns).
seat_level_df = pd.read_csv('seat_level_lagged_isattended.csv.zip')
print(seat_level_df.shape)

(1286916, 5)


In [16]:
# Attach the seat-level lag columns, matching on the (event date, seat) pair;
# a left join keeps all training rows.
train_df = pd.merge(
    train_df,
    seat_level_df,
    how='left',
    on=['event_date', 'SeatUniqueID'],
)

In [17]:
# Spot-check five random rows after both merges (unseeded sample).
train_df.sample(5)

Unnamed: 0,UniqueID,SeatUniqueID,event_date,isAttended,event_name,PC1,Price,paid,status,isHost,TicketClass,Term,Season,Tenure,Resold,isSTM,acct_type_desc_processed,event_sale_date_diff,plan_event_name_processed,comp_name_processed,class_name_processed,TicketType_processed,fill_rate_lag_1,fill_rate_lag_2,fill_rate_lag_3,isAttended_lagg_1,isAttended_lagg_2,isAttended_lagg_3
171925,CLT21LAR-504-2-23,504-2-23,2021-09-19,1,CLT21LAR,23,59.0,2,3,0,2,,0,38.0,1,0.0,4,171.0,0,0,7,6,76.927859,49.401672,,1.0,0.0,
503902,CLT21TB-619-16-4,619-16-4,2021-11-28,1,CLT21TB,25,48.0,2,3,0,2,,0,9.0,1,0.0,3,193.0,3,0,5,0,75.221459,71.858391,75.944115,1.0,0.0,0.0
235019,CLT21NE-452-10-13,452-10-13,2021-12-18,1,CLT21NE,14,100.0,2,3,0,2,,0,11.0,1,0.0,6,261.0,0,0,7,6,89.024959,75.221459,71.858391,1.0,0.0,0.0
686868,CLT22KC-507-1-7,507-1-7,2022-09-25,1,CLT22KC,19,102.0,2,3,0,2,,1,0.0,0,1.0,6,181.0,1,0,7,6,71.177696,53.586859,,1.0,1.0,
100551,CLT21JAX-424-4-2,424-4-2,2021-11-14,1,CLT21JAX,14,89.0,2,3,0,2,,0,16.0,1,0.0,6,227.0,0,0,7,6,71.858391,75.944115,71.609735,1.0,1.0,1.0


# Modelling

## Oversampling

In [None]:
# Remove identifier and date columns before resampling.
# errors='ignore' makes the cell safe to re-run: `train_df` is rebound here,
# so a second execution would otherwise raise KeyError on the missing columns.
train_df = train_df.drop(
    columns=['UniqueID', 'event_name', 'SeatUniqueID', 'event_date'],
    errors='ignore',
).copy()

In [None]:
# Split the training rows by target value: 1 (attended) is the majority class
# and 0 the minority class.
df_majority = train_df.loc[train_df['isAttended'].eq(1)]
df_minority = train_df.loc[train_df['isAttended'].eq(0)]

In [None]:
from sklearn.utils import resample

In [None]:
# Draw minority rows with replacement until there are as many as majority rows
# (seeded so the draw is reproducible).
target_row_count = len(df_majority)
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=target_row_count,
    random_state=1234,
)

# Stack the untouched majority rows on top of the upsampled minority rows.
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Show the class counts after upsampling.
df_upsampled['isAttended'].value_counts()

## Train-Test Split 

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Feature matrix: everything except identifiers, the event date and the target.
# errors='ignore' keeps this cell runnable on a top-to-bottom run: the
# oversampling section already drops the identifier/date columns from
# `train_df`, which would otherwise make this drop raise KeyError.
X = train_df.drop(
    columns=['UniqueID', 'event_name', 'SeatUniqueID', 'event_date', 'isAttended'],
    errors='ignore',
).copy()
# Target kept as a single-column frame; later cells access it as `y_train.isAttended`.
y = train_df[['isAttended']].copy()
# Alternative inputs built from the oversampled frame:
# X = df_upsampled.drop(columns=['isAttended']).copy()
# y = df_upsampled[['isAttended']].copy()

In [20]:
# (rows, feature columns) of the feature matrix.
X.shape

(1158228, 23)

In [21]:
# Hold out 20% of the rows for evaluation; random_state=99 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)

## XGBoost

In [22]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [30]:
def _build_xgb(colsample_bytree, learning_rate, max_depth, seed):
    """Return an unfitted XGBClassifier with 1000 estimators and n_jobs=-1.

    Only the column subsampling ratio, learning rate, tree depth and random
    seed differ between the ensemble members, so those are the parameters.
    """
    return XGBClassifier(
        n_estimators=1000,
        colsample_bytree=colsample_bytree,
        learning_rate=learning_rate,
        max_depth=max_depth,
        n_jobs=-1,
        random_state=seed,
    )


# Four ensemble members: two learning-rate/depth settings, each with two
# column-subsampling ratios and its own seed.
xgb1 = _build_xgb(colsample_bytree=0.7, learning_rate=0.1, max_depth=6, seed=12)
xgb2 = _build_xgb(colsample_bytree=0.6, learning_rate=0.1, max_depth=6, seed=123)
xgb3 = _build_xgb(colsample_bytree=0.7, learning_rate=0.2, max_depth=5, seed=1234)
xgb4 = _build_xgb(colsample_bytree=0.6, learning_rate=0.2, max_depth=5, seed=12345)

In [None]:
# evaluation = [( X_train, y_train), ( X_test, y_test)]

# xgb5.fit(X_train, y_train,
#          eval_set=evaluation, eval_metric="auc",
#          early_stopping_rounds=7)

# xgbc.fit(X_train, y_train)

In [24]:
from sklearn.ensemble import VotingClassifier

In [31]:
# Soft-voting ensemble of the four XGBoost members, fitted on the training
# split passed as plain NumPy arrays.
ensemble_members = [('xgb1', xgb1), ('xgb2', xgb2), ('xgb3', xgb3), ('xgb4', xgb4)]
v_clf = VotingClassifier(estimators=ensemble_members, voting='soft').fit(
    X_train.values,
    y_train.isAttended.values,
)

In [32]:
# Alias the fitted voting ensemble as `xgbc`, the name the evaluation cells use.
xgbc = v_clf

In [33]:
# from sklearn.metrics import roc_auc_score
auc_train = roc_auc_score(y_train, xgbc.predict_proba(X_train)[:,1])
auc_test = roc_auc_score(y_test, xgbc.predict_proba(X_test)[:,1])

print(auc_train, auc_test)

0.852365112388642 0.8377593239629888


In [34]:
# Classifier `score` (mean accuracy in scikit-learn) on the training split.
xgbc.score(X_train, y_train)

0.8638900820434673

In [35]:
# Classifier `score` (mean accuracy in scikit-learn) on the held-out split.
xgbc.score(X_test, y_test)

0.8596435941047978

In [None]:
# `xgbc` is the VotingClassifier here, which has no `feature_importances_`
# attribute of its own. Average the importances of its fitted members
# (`estimators_`) instead; a plain estimator without `estimators_` is used as-is.
fitted_members = getattr(xgbc, 'estimators_', [xgbc])
importances = np.mean(
    [member.feature_importances_ for member in fitted_members],
    axis=0,
)
feature_names = list(X_train.columns)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Bar chart of the feature importances, largest first.
forest_importances = pd.Series(importances, index=feature_names)
forest_importances = forest_importances.sort_values(ascending=False)

fig, ax = plt.subplots()
forest_importances.plot.bar(ax=ax)
# The importances come from XGBoost models, whose `feature_importances_` is
# governed by `importance_type`; it is not a mean decrease in impurity (MDI),
# so the labels must not claim that.
ax.set_title("XGBoost feature importances")
ax.set_ylabel("Importance (XGBoost importance_type)")
fig.tight_layout()

## Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# help(XGBClassifier)

In [None]:
# Search space for the XGBoost grid search: 3 x 3 x 2 x 2 = 36 combinations.
params = dict(
    max_depth=[3, 6, 10],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[500, 1000],
    colsample_bytree=[0.3, 0.7],
    # early_stopping_rounds=[5, 10, 15],
)

In [None]:
# Base estimator for the grid search; the tuned hyper-parameters come from `params`.
# The keyword is `n_jobs` -- the previous spelling `n_job` is not the
# parallelism parameter, so the requested setting was not applied.
# NOTE(review): the grid search itself also runs with n_jobs=-1; confirm that
# parallelising at both levels is intended.
xgbc = XGBClassifier(random_state=99, n_jobs=-1)

In [None]:
# help(GridSearchCV)

In [None]:
# Cross-validated grid search over `params`, recording four metrics per
# candidate. refit=False: no final model is fitted, only `cv_results_` is produced.
grid_search_options = dict(
    estimator=xgbc,
    param_grid=params,
    scoring=('roc_auc', 'f1', 'precision', 'recall'),
    verbose=1,
    n_jobs=-1,
    refit=False,
)
clf = GridSearchCV(**grid_search_options)

In [None]:
# Run the cross-validated grid search.
# NOTE(review): this fits on the full X / y, which also contain the rows held
# out as X_test / y_test -- confirm that tuning on those rows is intended.
clf.fit(X, y)

In [None]:
# Persist the per-candidate cross-validation results to 'cv_results.csv' (without the index column).
pd.DataFrame(clf.cv_results_).to_csv('cv_results.csv', index=False)

# Threshold Experiments

In [None]:
def threshold_exp(y_test, y_pred_prob, thresholds=None):
    """Print the outcome mix obtained at several decision thresholds.

    For each threshold a row is predicted as class 1 when its class-1
    probability is strictly greater than the threshold, and as class 0
    otherwise. The percentage (one decimal) of every observed
    ``y_pred=<p>, y_test=<t>`` combination is then printed.

    Parameters
    ----------
    y_test : array-like
        True labels, used positionally (any pandas index is ignored).
    y_pred_prob : array-like
        Either a 2-D ``predict_proba`` result, of which column 1 is used,
        or a 1-D array holding the class-1 probabilities directly.
    thresholds : iterable of float, optional
        Cut-offs to evaluate. Defaults to 0.1, 0.2, ..., 0.9.

    Returns
    -------
    None
        Results are printed only.
    """
    if thresholds is None:
        # Integer steps divided by 10 give the exact nearest floats;
        # np.arange(0.1, 1.0, 0.1) would yield 0.30000000000000004 and
        # 0.7000000000000001, which leak into both the cut-off and the output.
        thresholds = np.arange(1, 10) / 10

    proba = np.asarray(y_pred_prob)
    positive_scores = proba[:, 1] if proba.ndim == 2 else proba
    # The true labels do not change between thresholds; stringify them once.
    actual_text = pd.Series(np.asarray(y_test).ravel()).astype(str)

    for threshold in thresholds:
        y_pred = np.where(positive_scores <= threshold, 0, 1)
        print(f'threshold = {float(threshold):g}')
        # Vectorised label construction instead of a row-wise apply.
        outcome = 'y_pred=' + pd.Series(y_pred).astype(str) + ', y_test=' + actual_text
        shares = outcome.value_counts(normalize=True, sort=False) * 100
        print(shares.round(1))
        print()


In [None]:
# Score the hold-out rows with the fitted voting ensemble (`v_clf`).
# `xgbc` cannot be used here on a top-to-bottom run: the grid-search section
# rebinds it to a fresh, unfitted XGBClassifier, and that search runs with
# refit=False, so `xgbc.predict_proba` would fail.
threshold_exp(y_test['isAttended'].values, v_clf.predict_proba(X_test))