In [13]:
import numpy as np
import pandas as pd
import torch
# Compact tensor display for the whole notebook: summarized dimensions show
# 2 items at each edge, floats are printed with 2 decimals, and lines wrap
# at 75 characters.
torch.set_printoptions(edgeitems=2, precision=2, linewidth=75)

Data source: https://www.kaggle.com/datasets/asaniczka/amazon-products-dataset-2023-1-4m-products?select=amazon_products.csv

In [16]:
import csv
# Location of the Kaggle CSV, relative to the notebook's working directory.
amazon_product_path = "../data/p1ch4/amazon_products/amazon_products.csv"
# Read the full CSV into a DataFrame; later cells select columns from it.
amazon_product_df = pd.read_csv(amazon_product_path)
# Preview the first five rows.
amazon_product_df.head()

Unnamed: 0,asin,title,imgUrl,productURL,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth
0,B014TMV5YE,"Sion Softside Expandable Roller Luggage, Black...",https://m.media-amazon.com/images/I/815dLQKYIY...,https://www.amazon.com/dp/B014TMV5YE,4.5,0,139.99,0.0,104,False,2000
1,B07GDLCQXV,Luggage Sets Expandable PC+ABS Durable Suitcas...,https://m.media-amazon.com/images/I/81bQlm7vf6...,https://www.amazon.com/dp/B07GDLCQXV,4.5,0,169.99,209.99,104,False,1000
2,B07XSCCZYG,Platinum Elite Softside Expandable Checked Lug...,https://m.media-amazon.com/images/I/71EA35zvJB...,https://www.amazon.com/dp/B07XSCCZYG,4.6,0,365.49,429.99,104,False,300
3,B08MVFKGJM,Freeform Hardside Expandable with Double Spinn...,https://m.media-amazon.com/images/I/91k6NYLQyI...,https://www.amazon.com/dp/B08MVFKGJM,4.6,0,291.59,354.37,104,False,400
4,B01DJLKZBA,Winfield 2 Hardside Expandable Luggage with Sp...,https://m.media-amazon.com/images/I/61NJoaZcP9...,https://www.amazon.com/dp/B01DJLKZBA,4.5,0,174.99,309.99,104,False,400


In [23]:
# Pull out the two things to report, then print them.
first_record = amazon_product_df.iloc[0]   # first row as a Series (one entry per column)
frame_shape = amazon_product_df.shape      # (rows, columns)
print("First row in dataset\n", first_record)
print("Shape of the dataset (number of records x number of features):", frame_shape)

First row in dataset
 asin                                                        B014TMV5YE
title                Sion Softside Expandable Roller Luggage, Black...
imgUrl               https://m.media-amazon.com/images/I/815dLQKYIY...
productURL                        https://www.amazon.com/dp/B014TMV5YE
stars                                                              4.5
reviews                                                              0
price                                                           139.99
listPrice                                                          0.0
category_id                                                        104
isBestSeller                                                     False
boughtInLastMonth                                                 2000
Name: 0, dtype: object
Shape of the dataset (number of records x number of features): (1426337, 11)


In [30]:
# select numerical columns. tensor can include booleans but the tensors would become mixed types which is error-prone
amazon_product = torch.from_numpy(amazon_product_df[['stars', 'reviews', 'price', 'listPrice', 'category_id', 'boughtInLastMonth']].values)

amazon_product.shape, amazon_product.dtype

(torch.Size([1426337, 6]), torch.float64)

In [31]:
# Feature matrix: every row, every column except the last one
# (the last selected column, boughtInLastMonth, is the target).
data = amazon_product[:, :-1] # deselect the last column
data, data.shape

(tensor([[  4.50,   0.00,  ...,   0.00, 104.00],
         [  4.50,   0.00,  ..., 209.99, 104.00],
         ...,
         [  4.50,   0.00,  ...,  57.39, 112.00],
         [  4.90,   0.00,  ...,   0.00, 112.00]], dtype=torch.float64),
 torch.Size([1426337, 5]))

In [32]:
# Target vector: the last column (boughtInLastMonth) for every row.
# Indexing with a single integer drops that dimension, giving a 1-D tensor.
target = amazon_product[:, -1] # select all rows for the last column
target, target.shape

(tensor([2000., 1000.,  ...,    0.,    0.], dtype=torch.float64),
 torch.Size([1426337]))

In [33]:
# Same column as above, cast with .long() to an integer tensor so the values
# can later be used as class labels / indices.
target = amazon_product[:, -1].long()
target

tensor([2000, 1000,  ...,    0,    0])

Transform target column, `boughtInLastMonth`, to 30 target classes.

In [51]:
# Distinct values found in the target column, as a plain Python list.
unique_targets = torch.unique(target).tolist()
print(f"Number of unique target values: {len(unique_targets)}")
print("Unique target values: ", unique_targets)

Number of unique target values: 30
Unique target values:  [0, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000]


In [53]:
# Value-to-index mapping: each distinct boughtInLastMonth value is mapped to
# its position in unique_targets, which becomes its class index.
# A plain dict comprehension replaces `from fastbook import *` +
# `L(...).val2idx()`: the star import was pulled in only for this one helper
# and injected a large, unnamed set of names into the notebook namespace.
# unique_targets stays the plain Python list built in the previous cell;
# integer indexing such as unique_targets[15] works the same on it.
v2i = {value: index for index, value in enumerate(unique_targets)}
print(v2i)

{0: 0, 50: 1, 100: 2, 200: 3, 300: 4, 400: 5, 500: 6, 600: 7, 700: 8, 800: 9, 900: 10, 1000: 11, 2000: 12, 3000: 13, 4000: 14, 5000: 15, 6000: 16, 7000: 17, 8000: 18, 9000: 19, 10000: 20, 20000: 21, 30000: 22, 40000: 23, 50000: 24, 60000: 25, 70000: 26, 80000: 27, 90000: 28, 100000: 29}


In [72]:
# Translate each raw boughtInLastMonth value into its class index.
# Series.map with a dict does the lookup in one call instead of invoking a
# Python lambda once per row.
sales_class = amazon_product_df["boughtInLastMonth"].map(v2i)
# Unlike v2i[x], .map yields NaN for a value missing from the dict, so check
# explicitly to keep the fail-loudly behavior of the original lookup.
assert sales_class.notna().all(), "boughtInLastMonth contains values missing from v2i"
amazon_product_df["boughtInLastMonth_class"] = sales_class.astype("int64")

# Same numeric feature columns as before, with the class index replacing the
# raw boughtInLastMonth value as the last (target) column.
col_list = ['stars', 'reviews', 'price', 'listPrice', 'category_id', 'boughtInLastMonth_class']
amazon_product = torch.from_numpy(amazon_product_df[col_list].values)
# Last column holds the class index; cast it to an integer tensor.
target = amazon_product[:, -1].long()
target

tensor([12, 11,  ...,  0,  0])

Continue with the codebook.

In [62]:
# One row per product, one column per sales class. The column count is taken
# from the value-to-index mapping instead of a hard-coded 30, so it cannot
# drift from the classes actually present.
n_classes = len(v2i)
target_onehot = torch.zeros(target.shape[0], n_classes)

# scatter_ works in place along dim 1: for each row i it writes 1.0 into
# column target[i]. The index tensor needs the same number of dimensions as
# target_onehot, hence the unsqueeze(1).
target_onehot.scatter_(1, target.unsqueeze(1), 1.0)

tensor([[0., 0.,  ..., 0., 0.],
        [0., 0.,  ..., 0., 0.],
        ...,
        [1., 0.,  ..., 0., 0.],
        [1., 0.,  ..., 0., 0.]])

In [63]:
# Add a trailing dimension of size 1, turning the 1-D vector of class indices
# into a single-column 2-D tensor (the form passed to scatter_ as the index).
target_unsqueezed = target.unsqueeze(1)
target_unsqueezed

tensor([[12],
        [11],
        ...,
        [ 0],
        [ 0]])

In [64]:
# Rebuild the feature matrix from the current amazon_product tensor, whose
# last column is now the class index: all rows, every column but the last.
data = amazon_product[:, :-1] # get data again bc amazon_product is updated
data, data.shape

(tensor([[  4.50,   0.00,  ...,   0.00, 104.00],
         [  4.50,   0.00,  ..., 209.99, 104.00],
         ...,
         [  4.50,   0.00,  ...,  57.39, 112.00],
         [  4.90,   0.00,  ...,   0.00, 112.00]], dtype=torch.float64),
 torch.Size([1426337, 5]))

In [65]:
# Per-column mean: reducing over dim 0 (the rows) leaves one value per feature.
data_mean = data.mean(dim=0)
data_mean

tensor([  4.00, 180.75,  43.38,  12.45, 123.74], dtype=torch.float64)

In [66]:
# Per-column variance, reduced over the rows like the mean.
data_var = data.var(dim=0)
data_var

tensor([1.81e+00, 3.10e+06, 1.70e+04, 2.13e+03, 5.35e+03], dtype=torch.float64)

In [67]:
# Standardize every feature column: subtract its mean and divide by its
# standard deviation (square root of the variance). The per-column vectors
# broadcast across all rows.
data_std = data_var.sqrt()
data_normalized = (data - data_mean) / data_std
data_normalized

tensor([[ 0.37, -0.10,  ..., -0.27, -0.27],
        [ 0.37, -0.10,  ...,  4.28, -0.27],
        ...,
        [ 0.37, -0.10,  ...,  0.97, -0.16],
        [ 0.67, -0.10,  ..., -0.27, -0.16]], dtype=torch.float64)

In [68]:
# Element-wise comparison producing a boolean mask: True where the class
# index is at most 15. Summing the mask counts the True entries.
bad_indexes = torch.le(target, 15)
bad_indexes.shape, bad_indexes.dtype, bad_indexes.sum()

(torch.Size([1426337]), torch.bool, tensor(1422153))

Most products (1,422,153 of 1,426,337) fall in classes 0–15, i.e. they sold 5,000 or fewer units last month.

In [69]:
# Look up which raw boughtInLastMonth value class index 15 stands for.
unique_targets[15]

5000

In [70]:
# Boolean-mask indexing: keep only the rows of data where bad_indexes is True.
bad_data = data[bad_indexes]
bad_data.shape

torch.Size([1422153, 5])

In [73]:
# Split the rows into three sales groups by class index (see v2i):
#   bad  : class <= 10       -> boughtInLastMonth up to 900
#   mid  : class 11 .. 19    -> 1,000 up to 9,000
#   good : class >= 20       -> 10,000 and above
is_bad = target <= 10
is_mid = (target > 10) & (target < 20)
is_good = target >= 20

bad_data, mid_data, good_data = data[is_bad], data[is_mid], data[is_good]

# Per-feature mean within each group.
bad_mean, mid_mean, good_mean = (
    group.mean(dim=0) for group in (bad_data, mid_data, good_data)
)

# One line per feature: index, column name, then the three group means.
# zip stops at the shortest input, so the trailing class column in col_list
# (which has no entry in the mean vectors) is not printed.
row_format = '{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'
for i, (column, bad_value, mid_value, good_value) in enumerate(
        zip(col_list, bad_mean, mid_mean, good_mean)):
    print(row_format.format(i, column, bad_value, mid_value, good_value))

 0 stars                  3.98   4.51   4.59
 1 reviews              136.44 1358.77 4357.15
 2 price                 44.09  22.83  18.23
 3 listPrice             12.36  15.12  12.26
 4 category_id          123.47 131.66 128.04


In [81]:
# Predict "sells well" from the review count alone.
# The threshold is the mean review count of the mid sales group. It is read
# from mid_mean instead of being retyped from that cell's printed output
# (1358.77), so it stays correct if the data or the group boundaries change.
# As long as review counts are whole numbers, the resulting mask is the same
# as with the rounded literal.
reviews_col = col_list.index('reviews')   # position of 'reviews' among the feature columns
reviews_threshold = mid_mean[reviews_col].item()
reviews_data = data[:, reviews_col]
# True where a product has strictly more reviews than the threshold.
predicted_indexes = torch.gt(reviews_data, reviews_threshold)

predicted_indexes.shape, predicted_indexes.dtype, predicted_indexes.sum()

(torch.Size([1426337]), torch.bool, tensor(37017))

In [77]:
# Show the value -> class-index mapping again, to read the class thresholds
# used in the next cells.
print(v2i)

{0: 0, 50: 1, 100: 2, 200: 3, 300: 4, 400: 5, 500: 6, 600: 7, 700: 8, 800: 9, 900: 10, 1000: 11, 2000: 12, 3000: 13, 4000: 14, 5000: 15, 6000: 16, 7000: 17, 8000: 18, 9000: 19, 10000: 20, 20000: 21, 30000: 22, 40000: 23, 50000: 24, 60000: 25, 70000: 26, 80000: 27, 90000: 28, 100000: 29}


About 97.5k out of 1.4m products sold 500 or more units last month (class index greater than 5).

In [78]:
# Ground-truth mask: True where the class index is above 5, which per v2i
# means a boughtInLastMonth value of 500 or more.
actual_indexes = torch.gt(target, 5)

actual_indexes.shape, actual_indexes.dtype, actual_indexes.sum()

(torch.Size([1426337]), torch.bool, tensor(97518))

Here we predict sales from whether a product has 1,359 or more reviews (i.e. more than 1358.77). 1358.77 is the mean number of reviews of the products that sold between 1,000 and 9,000 items last month (classes 11–19).
The actual good products are those that sold 500 or more items last month (class index greater than 5).

In [82]:
# Count products flagged by both masks, by the prediction alone, and by the
# ground truth alone. .item() turns each 0-dim tensor into a Python int.
n_matches = (actual_indexes & predicted_indexes).sum().item()
n_predicted = predicted_indexes.sum().item()
n_actual = actual_indexes.sum().item()

# (hits, hits / predicted = precision, hits / actual = recall)
n_matches, n_matches / n_predicted, n_matches / n_actual

(11027, 0.2978901585757895, 0.11307656022477902)