# Aula 10 - Recomendação baseada em sessão - exemplos

In [2]:
import pandas as pd
import numpy as np

### Importar base de dados (executar somente se não tiver a base 2019-Oct-sample.csv. Caso contrário, pular para etapa de leitura desse arquivo).

In [3]:
# Fazer download no link: https://www.kaggle.com/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store?resource=download&select=2019-Oct.csv
# !tar -xvzf 2019-Oct.csv.zip

In [4]:
data = pd.read_csv('data/2019-Oct.csv')
data.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:00:00 UTC,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
1,2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2,2019-10-01 00:00:01 UTC,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
3,2019-10-01 00:00:01 UTC,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
4,2019-10-01 00:00:04 UTC,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d


In [5]:
# drop NaN values in specific columns
data = data.dropna(subset=["category_code", "brand", "user_session", "product_id"])

# keep only relevant columns in our dataset
data = data[["event_time", "event_type", "product_id", "category_code", "brand", "user_session"]]
data.head()

Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session
1,2019-10-01 00:00:00 UTC,view,3900821,appliances.environment.water_heater,aqua,9333dfbd-b87a-4708-9857-6336556b0fcc
3,2019-10-01 00:00:01 UTC,view,1307067,computers.notebook,lenovo,7c90fc70-0e80-4590-96f3-13c02c18c713
4,2019-10-01 00:00:04 UTC,view,1004237,electronics.smartphone,apple,c6bd7419-2748-4c56-95b4-8cec9ff8b80d
5,2019-10-01 00:00:05 UTC,view,1480613,computers.desktop,pulser,0d0d91c2-c9c2-4e81-90a5-86594dec0db9
8,2019-10-01 00:00:10 UTC,view,28719074,apparel.shoes.keds,baden,ac1cd4e5-a3ce-4224-a2d7-ff660a105880


In [6]:
data.sort_values(by=['user_session', 'event_time'], inplace=True, ignore_index=True)
data.head()

Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session
0,2019-10-31 06:23:12 UTC,view,1005115,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
1,2019-10-31 06:23:52 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
2,2019-10-31 06:25:30 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
3,2019-10-31 06:26:58 UTC,view,1004858,electronics.smartphone,samsung,00000056-a206-40dd-b174-a072550fa38c
4,2019-10-31 06:28:21 UTC,view,1005104,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c


In [7]:
# select where to split the data
split_at = 2000000

#  make sure the split doesn't cut off session data
while data["user_session"].iloc[split_at-1] == data["user_session"].iloc[split_at]:
    split_at += 1
    
# perform the split
split_range = list(range(0, split_at))
subset = data.iloc[split_range]
subset.shape

(2000006, 6)

In [8]:
subset.to_csv('2019-Oct-sample.csv', index=False, header=['event_time', 'event_type', 'product_id', 'category_code', 'brand', 'user_session'], sep=',')


### Leitura do arquivo 2019-Oct-sample.csv

In [9]:
subset = pd.read_csv('./2019-Oct-sample.csv')
subset.head()

Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session
0,2019-10-31 06:23:12 UTC,view,1005115,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
1,2019-10-31 06:23:52 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
2,2019-10-31 06:25:30 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
3,2019-10-31 06:26:58 UTC,view,1004858,electronics.smartphone,samsung,00000056-a206-40dd-b174-a072550fa38c
4,2019-10-31 06:28:21 UTC,view,1005104,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c


In [10]:
map_items = {item: idx for idx, item in enumerate(subset.product_id.unique())}
map_sessions = {item: idx for idx, item in enumerate(subset.user_session.unique())}
subset['itemId'] = subset['product_id'].map(map_items)
subset['sessionId'] = subset['user_session'].map(map_sessions)
subset.head()

Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
0,2019-10-31 06:23:12 UTC,view,1005115,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,0,0
1,2019-10-31 06:23:52 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,1,0
2,2019-10-31 06:25:30 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,1,0
3,2019-10-31 06:26:58 UTC,view,1004858,electronics.smartphone,samsung,00000056-a206-40dd-b174-a072550fa38c,2,0
4,2019-10-31 06:28:21 UTC,view,1005104,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,3,0


In [11]:
n_items = subset['itemId'].max()+1
print('No. items: ', n_items)
n_sessions = subset['sessionId'].max()+1
print('No. sessions: ', n_sessions)

No. items:  42581
No. sessions:  483508


In [12]:
# create a dataset
# remove sessions with less than 2 items
def create_data(df):
    df.sort_values(by=['sessionId', 'event_time'], inplace=True, ignore_index=True)
    sessions, session = [], []
    for index, value in df.iterrows():
        if index != 0:
            if value["sessionId"] == df.at[index-1, "sessionId"]:
                if value["event_type"] == 'view':
                    session.append(value["itemId"])
            else:
                if len(session) > 1:
                    sessions.append((df.at[index-1, "sessionId"], session))
                session = [value["itemId"]]
        else:
            session.append(value["itemId"])
    return sessions

In [13]:
sessions = create_data(subset)

In [14]:
print('No. sessions: ', len(sessions))
print('Session 1:', sessions[1])
subset.loc[subset.sessionId==1]

No. sessions:  296914
Session 1: (np.int64(1), [6, 7, 8, 9, 10, 11, 12, 9, 13, 9, 0, 14, 1, 15, 16, 17])


Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
7,2019-10-06 11:24:45 UTC,view,1004768,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,6,1
8,2019-10-06 11:25:54 UTC,view,1005098,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,7,1
9,2019-10-06 11:25:59 UTC,view,1005073,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,8,1
10,2019-10-06 11:26:39 UTC,view,1004871,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,9,1
11,2019-10-06 11:26:53 UTC,view,1004751,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,10,1
12,2019-10-06 11:27:05 UTC,view,1004653,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,11,1
13,2019-10-06 11:27:24 UTC,view,1005015,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,12,1
14,2019-10-06 11:28:05 UTC,view,1004871,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,9,1
15,2019-10-06 11:28:34 UTC,view,1003527,electronics.smartphone,xiaomi,00000083-8816-4d58-a9b8-f52f54186edc,13,1
16,2019-10-06 11:28:45 UTC,view,1004871,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,9,1


In [15]:
import random

random.shuffle(sessions)
split = len(sessions) * 0.8
train = sessions[:int(split)]
test = sessions[int(split):]
print('No. train sessions: ', len(train))
print('No. test sessions: ', len(test))

No. train sessions:  237531
No. test sessions:  59383


In [16]:
def jaccard(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

In [17]:
actual_session = test[3]
target = actual_session[1][0:-1]
print(actual_session)
print(target)
subset.loc[subset.sessionId==actual_session[0]]

(np.int64(18614), [4232, 4232, 4232])
[4232, 4232]


Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
77453,2019-10-31 11:35:59 UTC,view,7002252,kids.carriage,wingoffly,00bec70b-2ca6-4fda-b824-04a73ce668f6,4232,18614
77454,2019-10-31 11:37:12 UTC,cart,7002252,kids.carriage,wingoffly,00bec70b-2ca6-4fda-b824-04a73ce668f6,4232,18614
77455,2019-10-31 11:38:34 UTC,cart,7002252,kids.carriage,wingoffly,00bec70b-2ca6-4fda-b824-04a73ce668f6,4232,18614
77456,2019-10-31 11:40:54 UTC,cart,7002252,kids.carriage,wingoffly,00bec70b-2ca6-4fda-b824-04a73ce668f6,4232,18614
77457,2019-10-31 11:41:56 UTC,view,7002252,kids.carriage,wingoffly,00bec70b-2ca6-4fda-b824-04a73ce668f6,4232,18614
77458,2019-10-31 11:42:44 UTC,cart,7002252,kids.carriage,wingoffly,00bec70b-2ca6-4fda-b824-04a73ce668f6,4232,18614
77459,2019-10-31 11:43:11 UTC,view,7002252,kids.carriage,wingoffly,00bec70b-2ca6-4fda-b824-04a73ce668f6,4232,18614
77460,2019-10-31 11:43:48 UTC,cart,7002252,kids.carriage,wingoffly,00bec70b-2ca6-4fda-b824-04a73ce668f6,4232,18614
77461,2019-10-31 11:44:17 UTC,cart,7002252,kids.carriage,wingoffly,00bec70b-2ca6-4fda-b824-04a73ce668f6,4232,18614
77462,2019-10-31 11:45:00 UTC,cart,7002252,kids.carriage,wingoffly,00bec70b-2ca6-4fda-b824-04a73ce668f6,4232,18614


In [18]:
def compute_score(train, target, itemId):
    candidate_sessions = []
    for s in range(len(train)):
        if itemId in train[s][1]:
            candidate_sessions.append(train[s][1])
    
    score = 0
    for n in range(len(candidate_sessions)):
        score += jaccard(candidate_sessions[n], target)
    
    return score
    
compute_score(train=train, target=target, itemId=931)

0.0

In [19]:
categories = subset.loc[subset.sessionId==actual_session[0]]['category_code'].unique().tolist()
candidate_items = subset.loc[subset.category_code.isin(categories)]['itemId'].unique().tolist()
candidate_items

[121,
 140,
 141,
 328,
 331,
 554,
 578,
 757,
 905,
 1207,
 1355,
 1504,
 1773,
 2016,
 2690,
 2720,
 2786,
 2787,
 2788,
 2809,
 3037,
 3063,
 3146,
 3167,
 3249,
 3347,
 3348,
 3349,
 3350,
 3351,
 3352,
 3353,
 3452,
 4050,
 4139,
 4231,
 4232,
 4233,
 4234,
 4235,
 4290,
 4291,
 4292,
 4293,
 4327,
 4328,
 4329,
 4389,
 4438,
 4439,
 4440,
 4441,
 4510,
 4579,
 4633,
 4890,
 4891,
 4918,
 4919,
 4920,
 4921,
 4922,
 4924,
 4952,
 5170,
 5197,
 5356,
 5357,
 5395,
 5396,
 5397,
 5398,
 5402,
 5403,
 5404,
 5405,
 5406,
 5407,
 5429,
 5430,
 5555,
 5556,
 5557,
 5558,
 5704,
 5705,
 5706,
 5721,
 5722,
 5741,
 5742,
 5743,
 5744,
 5965,
 6091,
 6092,
 6255,
 6256,
 6331,
 6332,
 6441,
 6515,
 6516,
 6517,
 6707,
 6708,
 6951,
 6952,
 6953,
 6955,
 6956,
 7001,
 7019,
 7211,
 7212,
 7213,
 7302,
 7594,
 7595,
 7596,
 7668,
 7704,
 7705,
 7927,
 7928,
 8036,
 8061,
 8062,
 8063,
 8064,
 8065,
 8066,
 8067,
 8068,
 8069,
 8070,
 8076,
 8077,
 8211,
 8212,
 8213,
 8214,
 8215,
 8216,
 

In [20]:
ranking = []
for i in range(len(candidate_items)):
    ranking.append((compute_score(train, target, candidate_items[i]), candidate_items[i]))

ranking.sort()
ranking.reverse()
print(ranking[0:10])

[(4.248424320242064, 4232), (1.4281539043834124, 331), (1.1561443779621214, 328), (1.1002169720347155, 6091), (0.9866633366633366, 2786), (0.6568696330991413, 16950), (0.6562881562881564, 2788), (0.6324829087124169, 3063), (0.42134156692980224, 3249), (0.33354978354978354, 1504)]


In [21]:
subset.loc[subset.itemId==1046]

Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
2519,2019-10-12 18:16:54 UTC,view,5100719,electronics.clocks,samsung,000729f8-7f6c-43f6-b1dd-aab45a29f5c1,1046,652
5660,2019-10-01 13:27:59 UTC,view,5100719,electronics.clocks,samsung,000fdfe4-e1f0-4a93-9c22-f04066ad895e,1046,1461
5705,2019-10-01 10:07:25 UTC,view,5100719,electronics.clocks,samsung,000ff41f-8d94-449e-86fc-25e0957da685,1046,1470
5706,2019-10-01 10:08:04 UTC,view,5100719,electronics.clocks,samsung,000ff41f-8d94-449e-86fc-25e0957da685,1046,1470
6330,2019-10-12 19:48:00 UTC,view,5100719,electronics.clocks,samsung,00118404-d57f-4480-b917-df839bc6a188,1046,1642
...,...,...,...,...,...,...,...,...
1998579,2019-10-31 03:54:14 UTC,view,5100719,electronics.clocks,samsung,1347912f-0cb1-4086-b174-d7f10f275ffa,1046,483132
1998631,2019-10-11 16:58:51 UTC,view,5100719,electronics.clocks,samsung,1347c04a-6cfd-4cca-925e-60cead2e1931,1046,483151
1998632,2019-10-11 16:59:10 UTC,view,5100719,electronics.clocks,samsung,1347c04a-6cfd-4cca-925e-60cead2e1931,1046,483151
1998633,2019-10-11 17:00:34 UTC,view,5100719,electronics.clocks,samsung,1347c04a-6cfd-4cca-925e-60cead2e1931,1046,483151
