In [13]:
import numpy as np
import pandas as pd
import torch
torch.set_printoptions(edgeitems=2, precision=2, linewidth=75)

Data source: https://www.kaggle.com/datasets/asaniczka/amazon-products-dataset-2023-1-4m-products?select=amazon_products.csv

In [16]:
import csv
amazon_product_path = "../data/p1ch4/amazon_products/amazon_products.csv"
amazon_product_df = pd.read_csv(amazon_product_path)
amazon_product_df.head()

Unnamed: 0,asin,title,imgUrl,productURL,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth
0,B014TMV5YE,"Sion Softside Expandable Roller Luggage, Black...",https://m.media-amazon.com/images/I/815dLQKYIY...,https://www.amazon.com/dp/B014TMV5YE,4.5,0,139.99,0.0,104,False,2000
1,B07GDLCQXV,Luggage Sets Expandable PC+ABS Durable Suitcas...,https://m.media-amazon.com/images/I/81bQlm7vf6...,https://www.amazon.com/dp/B07GDLCQXV,4.5,0,169.99,209.99,104,False,1000
2,B07XSCCZYG,Platinum Elite Softside Expandable Checked Lug...,https://m.media-amazon.com/images/I/71EA35zvJB...,https://www.amazon.com/dp/B07XSCCZYG,4.6,0,365.49,429.99,104,False,300
3,B08MVFKGJM,Freeform Hardside Expandable with Double Spinn...,https://m.media-amazon.com/images/I/91k6NYLQyI...,https://www.amazon.com/dp/B08MVFKGJM,4.6,0,291.59,354.37,104,False,400
4,B01DJLKZBA,Winfield 2 Hardside Expandable Luggage with Sp...,https://m.media-amazon.com/images/I/61NJoaZcP9...,https://www.amazon.com/dp/B01DJLKZBA,4.5,0,174.99,309.99,104,False,400


In [23]:
print("First row in dataset\n", amazon_product_df.iloc[0])
print("Shape of the dataset (number of records x number of features):", amazon_product_df.shape)

First row in dataset
 asin                                                        B014TMV5YE
title                Sion Softside Expandable Roller Luggage, Black...
imgUrl               https://m.media-amazon.com/images/I/815dLQKYIY...
productURL                        https://www.amazon.com/dp/B014TMV5YE
stars                                                              4.5
reviews                                                              0
price                                                           139.99
listPrice                                                          0.0
category_id                                                        104
isBestSeller                                                     False
boughtInLastMonth                                                 2000
Name: 0, dtype: object
Shape of the dataset (number of records x number of features): (1426337, 11)


In [30]:
# select numerical columns. tensor can include booleans but the tensors would become mixed types which is error-prone
amazon_product = torch.from_numpy(amazon_product_df[['stars', 'reviews', 'price', 'listPrice', 'category_id', 'boughtInLastMonth']].values)

amazon_product.shape, amazon_product.dtype

(torch.Size([1426337, 6]), torch.float64)

In [31]:
data = amazon_product[:, :-1] # deselect the last column
data, data.shape

(tensor([[  4.50,   0.00,  ...,   0.00, 104.00],
         [  4.50,   0.00,  ..., 209.99, 104.00],
         ...,
         [  4.50,   0.00,  ...,  57.39, 112.00],
         [  4.90,   0.00,  ...,   0.00, 112.00]], dtype=torch.float64),
 torch.Size([1426337, 5]))

In [32]:
target = amazon_product[:, -1] # select all rows for the last column
target, target.shape

(tensor([2000., 1000.,  ...,    0.,    0.], dtype=torch.float64),
 torch.Size([1426337]))

In [7]:
target = amazon_product[:, -1].long()
target

tensor([6, 6,  ..., 7, 6])

In [8]:
target_onehot = torch.zeros(target.shape[0], 10)

target_onehot.scatter_(1, target.unsqueeze(1), 1.0)

tensor([[0., 0.,  ..., 0., 0.],
        [0., 0.,  ..., 0., 0.],
        ...,
        [0., 0.,  ..., 0., 0.],
        [0., 0.,  ..., 0., 0.]])

In [9]:
target_unsqueezed = target.unsqueeze(1)
target_unsqueezed

tensor([[6],
        [6],
        ...,
        [7],
        [6]])

In [10]:
data_mean = torch.mean(data, dim=0)
data_mean

tensor([6.85e+00, 2.78e-01, 3.34e-01, 6.39e+00, 4.58e-02, 3.53e+01,
        1.38e+02, 9.94e-01, 3.19e+00, 4.90e-01, 1.05e+01])

In [11]:
data_var = torch.var(data, dim=0)
data_var

tensor([7.12e-01, 1.02e-02, 1.46e-02, 2.57e+01, 4.77e-04, 2.89e+02,
        1.81e+03, 8.95e-06, 2.28e-02, 1.30e-02, 1.51e+00])

In [12]:
data_normalized = (data - data_mean) / torch.sqrt(data_var)
data_normalized

tensor([[ 1.72e-01, -8.18e-02,  ..., -3.49e-01, -1.39e+00],
        [-6.57e-01,  2.16e-01,  ...,  1.35e-03, -8.24e-01],
        ...,
        [-1.61e+00,  1.17e-01,  ..., -9.63e-01,  1.86e+00],
        [-1.01e+00, -6.77e-01,  ..., -1.49e+00,  1.04e+00]])

In [13]:
bad_indexes = target <= 3 # <1>
bad_indexes.shape, bad_indexes.dtype, bad_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(20))

In [14]:
bad_data = data[bad_indexes]
bad_data.shape

torch.Size([20, 11])

In [15]:
bad_data = data[target <= 3]
mid_data = data[(target > 3) & (target < 7)] # <1>
good_data = data[target >= 7]

bad_mean = torch.mean(bad_data, dim=0)
mid_mean = torch.mean(mid_data, dim=0)
good_mean = torch.mean(good_data, dim=0)

for i, args in enumerate(zip(col_list, bad_mean, mid_mean, good_mean)):
    print('{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'.format(i, *args))

 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.26  11.42


In [16]:
total_sulfur_threshold = 141.83
total_sulfur_data = data[:,6]
predicted_indexes = torch.lt(total_sulfur_data, total_sulfur_threshold)

predicted_indexes.shape, predicted_indexes.dtype, predicted_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(2727))

In [17]:
actual_indexes = target > 5

actual_indexes.shape, actual_indexes.dtype, actual_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(3258))

In [18]:
n_matches = torch.sum(actual_indexes & predicted_indexes).item()
n_predicted = torch.sum(predicted_indexes).item()
n_actual = torch.sum(actual_indexes).item()

n_matches, n_matches / n_predicted, n_matches / n_actual

(2018, 0.74000733406674, 0.6193984039287906)