In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [4]:
# Load the refined per-session feature table from parquet and register it as
# the temp view `session_insight` so that it can be queried with Spark SQL.
# NOTE(review): `spark` is not created anywhere in the visible cells (and the
# execution counter jumps from In [1] to In [4]) — presumably a SparkSession
# built in a cell that is not shown; confirm this survives Restart & Run All.
df = spark.read.parquet('./dataset/refined_feature_dataset.parquet')
df.createOrReplaceTempView('session_insight')

# Bare last expression: display the list of available column names.
df.columns

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

['user_session',
 'event_time_first',
 'event_time_last',
 'duration',
 'num_distinct_category',
 'num_distinct_cart_cat',
 'num_distinct_cart_product',
 'c_cart',
 'c_purchase',
 'c_view',
 'r_cart',
 'r_purchase',
 'r_view',
 'mean_cart',
 'mean_purchase',
 'mean_view',
 'hour',
 'day_name']

In [5]:
# Build the modelling table from the `session_insight` view.
#   * keeps only sessions with a non-null `user_session` and `c_cart <> 0`
#   * derives two log features, each rounded to 2 decimals
#   * `labels` is 1 whenever `c_purchase <> 0`, otherwise the value of
#     `c_purchase` itself
sql_script = """
    SELECT
        ROUND(LOG(duration/c_view),2) AS log_avg_duration,
        ROUND(LOG(mean_cart),2) AS log_mean_cart,
        day_name,
        hour,
        num_distinct_category,
        num_distinct_cart_cat,
        num_distinct_cart_product,
        (CASE
            WHEN c_purchase <> 0 then 1
            ELSE c_purchase
        END) AS labels
    FROM session_insight
    WHERE user_session IS NOT NULL
    AND c_cart <> 0
    ;
"""

dataset = spark.sql(sql_script)

# Sanity check: print a single record in vertical layout.
dataset.show(1, vertical=True)

# Persist the result as parquet, collapsed to a single partition and replacing
# any previous export at the same location.
(
    dataset
    .coalesce(1)
    .write
    .mode('overwrite')
    .save('./temp/dataset.parquet', format='parquet')
)

[Stage 1:>                                                          (0 + 1) / 1]                                                                                

-RECORD 0-------------------------
 log_avg_duration          | 4.78 
 log_mean_cart             | 5.12 
 day_name                  | Sun  
 hour                      | 8    
 num_distinct_category     | 1    
 num_distinct_cart_cat     | 1.0  
 num_distinct_cart_product | 1.0  
 labels                    | 1    
only showing top 1 row



                                                                                

In [6]:
# Pull the Spark export back in as a pandas DataFrame (pyarrow reader).
dataset = pd.read_parquet('./temp/dataset.parquet', engine='pyarrow')

# Replace every missing value, in every column, with 0.
# NOTE(review): missing values presumably come from NULL results of the SQL
# expressions upstream — confirm that 0 is the intended stand-in for them.
dataset = dataset.fillna(0)

dataset.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1743342 entries, 0 to 1743341
Data columns (total 8 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   log_avg_duration           1743342 non-null  float64
 1   log_mean_cart              1743342 non-null  float64
 2   day_name                   1743342 non-null  object 
 3   hour                       1743342 non-null  int32  
 4   num_distinct_category      1743342 non-null  int64  
 5   num_distinct_cart_cat      1743342 non-null  float64
 6   num_distinct_cart_product  1743342 non-null  float64
 7   labels                     1743342 non-null  int64  
dtypes: float64(4), int32(1), int64(2), object(1)
memory usage: 99.8+ MB


In [7]:
# Min-max scale the first two columns of `dataset` (the two log features) and
# keep the result in a new frame whose columns carry a `scaled_` prefix.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed in a later cell, so held-out rows influence the fitted min/max.
scaler = MinMaxScaler()

scaled_numeric_cols = pd.DataFrame(
    scaler.fit_transform(dataset.iloc[:, :2]),
    columns=[f'scaled_{col}' for col in dataset.columns[:2]],
)
scaled_numeric_cols.head()

Unnamed: 0,scaled_log_avg_duration,scaled_log_mean_cart
0,0.38386,0.685121
1,0.313684,0.685121
2,0.342456,0.889273
3,0.310175,0.574394
4,0.322807,0.785467


In [8]:
# One-hot encode the two categorical columns at positions 2-3 of `dataset`
# (`day_name` and `hour`) into a dense indicator frame.
encoder = OneHotEncoder()

categorical_cols = dataset.columns[2:4]
encoder.fit(dataset[categorical_cols])

# `OneHotEncoder.get_feature_names` was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer its replacement `get_feature_names_out` and fall back
# to the old method only on installs that predate it; both yield the same
# `<column>_<category>` names.
feature_name_fn = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names

# `.toarray()` materialises the sparse encoder output as a dense array so it
# can be wrapped in a regular DataFrame.
encoded_categories = pd.DataFrame(
    encoder.transform(dataset[categorical_cols]).toarray(),
    columns=feature_name_fn(categorical_cols),
)
encoded_categories.head()

Unnamed: 0,day_name_Fri,day_name_Mon,day_name_Sat,day_name_Sun,day_name_Thu,day_name_Tue,day_name_Wed,hour_0,hour_1,hour_2,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Assemble the design matrix: scaled numeric features and one-hot indicators
# joined on their row index, plus the three raw distinct-count columns.
X = scaled_numeric_cols.join(encoded_categories, how='inner')

count_cols = ['num_distinct_category', 'num_distinct_cart_cat', 'num_distinct_cart_product']
X[count_cols] = dataset[count_cols]

y = dataset['labels']

# Drop the intermediate frames now that they are part of X. Re-running this
# cell afterwards requires re-running the cells that create them first.
del scaled_numeric_cols, encoded_categories

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)

In [10]:
# Fit an XGBoost classifier on the training split, then report the same set of
# metrics for the training and the held-out split.
clf = XGBClassifier(
    eval_metric='logloss',
    learning_rate=0.5,
    max_depth=6,
    reg_lambda=6,
    n_estimators=200,
    use_label_encoder=False)
clf.fit(X_train, y_train)

y_hat1 = clf.predict(X_train)  # predictions on the training split
y_hat2 = clf.predict(X_test)   # predictions on the held-out split


def _print_split_metrics(split_name, y_true, y_pred):
    """Print accuracy, precision, recall, F1 and the confusion matrix.

    Every line is prefixed with ``split_name`` so a figure can never be shown
    under the wrong split's label (the test F1 used to be printed as
    "train f1").

    Parameters
    ----------
    split_name : str
        Label used as the prefix of every metric line, e.g. "train" or "test".
    y_true : array-like
        Ground-truth labels of the split.
    y_pred : array-like
        Predicted labels of the same split.
    """
    metric_fns = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for metric_name, metric_fn in metric_fns:
        print(f"{split_name} {metric_name}: ", round(metric_fn(y_true, y_pred), 4))
    print(confusion_matrix(y_true, y_pred))


_print_split_metrics("train", y_train, y_hat1)
print('='*10)
_print_split_metrics("test", y_test, y_hat2)

train accuracy:  0.7367
train precision:  0.6562
train recall:  0.6132
train f1:  0.634
[[709370 166613]
 [200634 318056]]
test accuracy:  0.7316
test precision:  0.6485
test recall:  0.6076
train f1:  0.6274
[[176288  42708]
 [ 50884  78789]]
