In [104]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('gs://ecommerce-behavior-data/sampling_events', dtype={'event_type': str, 
                                                                        'category_code': str, 'brand': str})

In [3]:
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-02 09:13:57 UTC,view,5000245,2053013566100866035,appliances.sewing_machine,janome,103.81,513657838,faaff184-ac64-44df-8aa5-dce7a06cd3f3
1,2019-10-02 09:44:08 UTC,view,5701086,2053013553970938175,auto.accessories.player,pioneer,129.94,516912983,cb7ee854-bf9e-4719-ac65-a91524b71e57
2,2019-10-02 07:11:24 UTC,view,6200431,2053013552293216471,appliances.environment.air_heater,saturn,10.79,513908207,a8d2f217-baca-479a-8f38-dafe40c5f878
3,2019-10-01 18:51:41 UTC,view,8800998,2053013555573162395,electronics.telephone,bq,8.75,544732539,4218d83a-7f65-43cc-8e95-668d59ce8ca4
4,2019-10-01 18:49:22 UTC,view,8800465,2053013555573162395,electronics.telephone,k-lite,7.7,544732539,4218d83a-7f65-43cc-8e95-668d59ce8ca4


# 1. Feature Engineering
- product_views
- cart_items
- basket_value
- event_weekday
- is_purchased (1,0)

In [4]:
user_sessions = df.drop_duplicates(["user_session"])["user_session"]

In [5]:
# Craeate data target
df_target = pd.DataFrame(user_sessions, columns = ["user_session", "product_views", "cart_items", "basket_value", "event_weekday", "is_purchased"])

# Set 'user_session' as an index
df_target.set_index("user_session", inplace=True)

# View data
df_target.head()

Unnamed: 0_level_0,product_views,cart_items,basket_value,event_weekday,is_purchased
user_session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
faaff184-ac64-44df-8aa5-dce7a06cd3f3,,,,,
cb7ee854-bf9e-4719-ac65-a91524b71e57,,,,,
a8d2f217-baca-479a-8f38-dafe40c5f878,,,,,
4218d83a-7f65-43cc-8e95-668d59ce8ca4,,,,,
7cc9a07c-c59f-4cef-9b3d-6fcc395956a4,,,,,


In [6]:
# Page views count
page_views = df.loc[df["event_type"] == "view"].groupby(["user_session"])["event_type"].count()
df_page_views = pd.DataFrame(page_views, )
df_page_views.columns = ['product_views']

In [7]:
# Cart Items
cart_items = df.loc[df["event_type"] == "cart"].groupby(["user_session"])["event_type"].count()
df_cart_items = pd.DataFrame(cart_items)
df_cart_items.columns = ['cart_items']

In [8]:
# Basket Value
basket_value = df.loc[df["event_type"] == "cart"].groupby(["user_session"])["price"].sum()
df_basket_value = pd.DataFrame(basket_value)

In [9]:
# Event Weekday
df["event_weekday"] = df["event_time"].apply(lambda s: str(datetime.strptime(str(s)[0:10], "%Y-%m-%d").weekday()))
event_weekday = df.groupby(["user_session"])["event_weekday"].first()
df_event_weekday = pd.DataFrame(event_weekday)

In [10]:
# Is Purchase
is_purchase = df.loc[df["event_type"] == "purchase"].groupby(["user_session"])["event_type"].count()
df_is_purchase = pd.DataFrame(is_purchase)
df_is_purchase.columns = ['is_purchased']

In [11]:
# Create datasets
df_target["product_views"] = df_page_views["product_views"]
df_target["cart_items"] = df_cart_items["cart_items"]
df_target["basket_value"] = df_basket_value["price"]
df_target["event_weekday"] = df_event_weekday["event_weekday"]
df_target["is_purchased"] = df_is_purchase["is_purchased"]
df_target["is_purchased"].fillna(value=0, inplace=True)

In [39]:
df_target.head()

Unnamed: 0_level_0,product_views,cart_items,basket_value,event_weekday,is_purchased
user_session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
faaff184-ac64-44df-8aa5-dce7a06cd3f3,7.0,5,519.05,2,0.0
cb7ee854-bf9e-4719-ac65-a91524b71e57,4.0,1,129.94,2,1.0
a8d2f217-baca-479a-8f38-dafe40c5f878,14.0,2,21.58,2,0.0
4218d83a-7f65-43cc-8e95-668d59ce8ca4,69.0,1,8.62,1,0.0
7cc9a07c-c59f-4cef-9b3d-6fcc395956a4,195.0,1,71.71,1,0.0


In [19]:
# Check where session comeback on cart stage and drop it.
df_target.loc[df_target["product_views"].isnull() == True]

Unnamed: 0_level_0,product_views,cart_items,basket_value,event_weekday,is_purchased
user_session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1ff605e3-682e-4042-a256-31ec3f6d12b2,,1,92.64,1,0.0
31bd95eb-1c8a-4050-bbda-4df528ed302a,,2,113.20,5,1.0
5ddd1352-ec67-42b2-8fd8-fec1cbdf820f,,1,45.56,2,1.0
ca3f540e-974c-4e3a-8742-de441e8930a3,,1,384.54,6,1.0
ef067823-a37b-4c25-bd6d-df5251c832bb,,1,128.68,6,1.0
...,...,...,...,...,...
50d4d8b2-173b-4df3-b1bf-5d9d0dc53904,,5,1924.00,5,0.0
b1e0a44c-07b4-415a-ad17-268c7b83b39e,,1,1672.89,0,0.0
0fc88d28-a3ef-41ac-8010-bec3dfba0860,,2,216.18,4,0.0
c888b12c-e8fa-4c3e-8568-9d9a28375330,,3,2161.38,0,0.0


In [24]:
df_target.dropna(inplace=True)

In [25]:
df_target.info()

<class 'pandas.core.frame.DataFrame'>
Index: 199615 entries, faaff184-ac64-44df-8aa5-dce7a06cd3f3 to d3cd965c-af67-42db-9330-aef8ef14fa87
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   product_views  199615 non-null  float64
 1   cart_items     199615 non-null  int64  
 2   basket_value   199615 non-null  float64
 3   event_weekday  199615 non-null  object 
 4   is_purchased   199615 non-null  float64
dtypes: float64(3), int64(1), object(1)
memory usage: 9.1+ MB


In [26]:
# Save new data structure for modeling

df_target.to_csv('training_data.csv')

In [27]:
df_target.info()

<class 'pandas.core.frame.DataFrame'>
Index: 199615 entries, faaff184-ac64-44df-8aa5-dce7a06cd3f3 to d3cd965c-af67-42db-9330-aef8ef14fa87
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   product_views  199615 non-null  float64
 1   cart_items     199615 non-null  int64  
 2   basket_value   199615 non-null  float64
 3   event_weekday  199615 non-null  object 
 4   is_purchased   199615 non-null  float64
dtypes: float64(3), int64(1), object(1)
memory usage: 9.1+ MB


# 2. Resampling Training Set

In [34]:
is_purchased_set = df_target[df_target['is_purchased'] == 1]
is_purchased_set.shape[0]

85884

In [35]:
not_purchased_set = df_target[df_target['is_purchased'] == 0]
not_purchased_set.shape[0]

99615

In [36]:
n_samples = 85000
is_purchase_downsampled = resample(is_purchased_set,
                                  replace = False,
                                  n_samples = n_samples,
                                  random_state=25)
not_purchase_downsampled = resample(not_purchased_set,
                                  replace = False,
                                  n_samples = n_samples,
                                  random_state=25)

In [38]:
downsampled = pd.concat([is_purchase_downsampled, not_purchase_downsampled])
downsampled['is_purchased'].value_counts()

0.0    85000
1.0    85000
Name: is_purchased, dtype: int64

# 3. Split the Data

In [113]:
features = downsampled.loc[:,['product_views','cart_items','basket_value', 'event_weekday']]
features.head()

Unnamed: 0_level_0,product_views,cart_items,basket_value,event_weekday
user_session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2d0100c1-4f38-43ff-a846-5e1218f218d8,1.0,1,51.46,0
5ac09ae9-1ac4-42bf-9f65-4f12b4399b4e,19.0,2,678.98,4
41fc1d89-42da-493c-93d3-2779e0c06a86,1.0,1,727.69,5
700a6a45-cb4f-493a-a4c4-976e799fc475,4.0,2,1081.46,5
e12b55fe-bcb3-49ea-bc19-1b5b9289d973,9.0,2,299.8,6


In [115]:
is_purchased = downsampled['is_purchased']

In [None]:
train_X, test_X, train_Y, test_Y = train_test_split(features, 
                                                   is_purchased,
                                                   test_size=0.3,
                                                   random_state=0)

# 4. Train the model

In [108]:
model = LogisticRegression()
model.fit(train_X, train_Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

# 5. Measuring model accuracy

In [109]:
pred_train_Y = model.predict(train_X)
pred_test_Y = model.predict(test_X)

train_accuracy = accuracy_score(train_Y, pred_train_Y)
test_accuracy = accuracy_score(test_Y, pred_test_Y)

print('Training accuracy: ', round(train_accuracy, 4))
print('Test accuracy: ', round(test_accuracy, 4))

Training accuracy:  0.569
Test accuracy:  0.5715


# 6. Feature importance

In [110]:
model.coef_

array([[-3.39235076e-02,  5.18767128e-03,  1.63710633e-05,
        -3.29688472e-02]])

In [111]:
coefficients = pd.concat([pd.DataFrame(train_X.columns), 
                          pd.DataFrame(np.transpose(model.coef_))], 
                          axis = 1)
coefficients.columns = ['Features', 'Coefficient']
coefficients['Exp_Coefficient'] = np.exp(coefficients['Coefficient'])
coefficients = coefficients[coefficients['Coefficient'] != 0]
print(coefficients.sort_values(by=['Coefficient']))

        Features  Coefficient  Exp_Coefficient
0  product_views    -0.033924         0.966645
3  event_weekday    -0.032969         0.967569
2   basket_value     0.000016         1.000016
1     cart_items     0.005188         1.005201
