In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [2]:
df = spark.read.parquet('./dataset/refined_feature_dataset.parquet')
df.createOrReplaceTempView('session_insight')

df.columns

                                                                                

['user_session',
 'event_time_first',
 'event_time_last',
 'duration',
 'num_distinct_category',
 'num_distinct_cart_cat',
 'num_distinct_cart_product',
 'c_cart',
 'c_purchase',
 'c_view',
 'r_cart',
 'r_purchase',
 'r_view',
 'mean_cart',
 'mean_purchase',
 'mean_view',
 'hour',
 'day_name']

In [16]:
sql_script = \
"""
    SELECT
        ROUND(duration/c_view,2) AS avg_duration,
        mean_cart,
        day_name,
        hour,
        num_distinct_category,
        num_distinct_cart_cat,
        num_distinct_cart_product,
        (CASE
            WHEN c_purchase <> 0 then 1
            ELSE c_purchase
        END) AS labels
    FROM session_insight
    WHERE user_session IS NOT NULL
    AND c_cart <> 0
    ;
"""

dataset = spark.sql(sql_script)
dataset.show(1,vertical =True)
dataset.coalesce(1).write.mode('overwrite').save('./temp/dataset.parquet', format='parquet') 

-RECORD 0---------------------------
 avg_duration              | 119.0  
 mean_cart                 | 167.29 
 day_name                  | Sun    
 hour                      | 8      
 num_distinct_category     | 1      
 num_distinct_cart_cat     | 1.0    
 num_distinct_cart_product | 1.0    
 labels                    | 1      
only showing top 1 row



                                                                                

In [18]:
dataset = pd.read_parquet('./temp/dataset.parquet',engine='pyarrow').fillna(0)
dataset.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1743342 entries, 0 to 1743341
Data columns (total 8 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   avg_duration               1743342 non-null  float64
 1   mean_cart                  1743342 non-null  float64
 2   day_name                   1743342 non-null  object 
 3   hour                       1743342 non-null  int32  
 4   num_distinct_category      1743342 non-null  int64  
 5   num_distinct_cart_cat      1743342 non-null  float64
 6   num_distinct_cart_product  1743342 non-null  float64
 7   labels                     1743342 non-null  int64  
dtypes: float64(4), int32(1), int64(2), object(1)
memory usage: 99.8+ MB


In [25]:
print("number of purchase action: ",sum(dataset['labels'].values.tolist()))

number of purchase action:  648363


In [19]:
scaler = MinMaxScaler()

scaler.fit(dataset[dataset.columns[:2]])
scaled_numeric_cols = pd.DataFrame(\
                               scaler.transform(dataset[dataset.columns[:2]]),\
                               columns=['scaled_'+col for col in dataset.columns[:2]])
scaled_numeric_cols.head()

Unnamed: 0,scaled_avg_duration,scaled_mean_cart
0,0.000154,0.06499
1,5.7e-05,0.06499
2,8.5e-05,0.379889
3,5.4e-05,0.024929
4,6.5e-05,0.154448


In [20]:
encoder = OneHotEncoder()

encoder.fit(dataset[dataset.columns[2:4]])
encoded_categories = pd.DataFrame(\
                             encoder.transform(dataset[dataset.columns[2:4]]).toarray(),
                             columns = encoder.get_feature_names_out(dataset.columns[2:4]))
encoded_categories.head()

Unnamed: 0,day_name_Fri,day_name_Mon,day_name_Sat,day_name_Sun,day_name_Thu,day_name_Tue,day_name_Wed,hour_0,hour_1,hour_2,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
X = pd.merge(scaled_numeric_cols,encoded_categories,left_index=True,right_index=True,how='inner')
X[['num_distinct_category','num_distinct_cart_cat','num_distinct_cart_product']] = dataset[['num_distinct_category','num_distinct_cart_cat','num_distinct_cart_product']]
y = dataset['labels']
del scaled_numeric_cols,encoded_categories
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.3, stratify=y)

In [38]:
clf = XGBClassifier(
    eval_metric='logloss',
    learning_rate = 0.1,
    max_depth=12,
    reg_lambda=6,
    n_estimators=100,
    use_label_encoder=False,
    tree_method='gpu_hist',
    predictor='gpu_predictor')
clf.fit(X_train, y_train)

y_hat_train = clf.predict(X_train)
y_hat_test = clf.predict(X_test)

print("train accuracy: ",  round(100*accuracy_score(y_hat_train,y_train),2),'%')
print("test accuracy: ", round(100*accuracy_score(y_hat_test,y_test),2),'%')
print("train precision: ", round(100*precision_score(y_hat_train,y_train),2),'%')
print("test precision: ", round(100*precision_score(y_hat_test,y_test),2),'%')
print("train recall: ", round(100*recall_score(y_hat_train,y_train),2),'%')
print("test recall: ", round(100*recall_score(y_hat_test,y_test),2),'%')
print("train f1: ", round(100*f1_score(y_hat_train,y_train),2),'%')
print("test f1: ", round(100*f1_score(y_hat_test,y_test),2),'%')
print('train cm')
print(confusion_matrix(y_hat_train,y_train))
print('test cm')
print(confusion_matrix(y_hat_test,y_test))

y_hat = clf.predict(X)
print(confusion_matrix(y_hat,y))

train accuracy:  74.13 %
test accuracy:  73.24 %
train precision:  61.92 %
test precision:  60.84 %
train recall:  66.29 %
test recall:  64.97 %
train f1:  64.03 %
test f1:  62.84 %
train cm
[[623552 172822]
 [142933 281032]]
test cm
[[264690  76173]
 [ 63804 118336]]
[[888242 248995]
 [206737 399368]]
