In [1]:
from pathlib import Path
import sys
import pandas as pd

PROJECT_ROOT = Path("..").resolve()
sys.path.append(str(PROJECT_ROOT))

from src.load_data import load_review, load_meta
from src.preprocessing import filter_users_items, time_split, expand_dict_column
from ml_utils.eda import basic_overview

In [2]:
meta = load_meta()
basic_overview(meta)

Shape: (84819, 19)

Info:
<class 'pandas.DataFrame'>
RangeIndex: 84819 entries, 0 to 84818
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   category         84819 non-null  object
 1   tech1            84819 non-null  str   
 2   description      84819 non-null  object
 3   fit              84819 non-null  str   
 4   title            84819 non-null  str   
 5   also_buy         84819 non-null  object
 6   tech2            84819 non-null  str   
 7   brand            84819 non-null  str   
 8   feature          84819 non-null  object
 9   rank             84819 non-null  object
 10  also_view        84819 non-null  object
 11  main_cat         84819 non-null  str   
 12  similar_item     84819 non-null  str   
 13  date             84819 non-null  str   
 14  price            84819 non-null  str   
 15  asin             84819 non-null  str   
 16  imageURL         84819 non-null  object
 17  imageURLHighRes 

In [3]:
df = load_review()
df = df.drop("image", axis=1)

# Overview

In [4]:
basic_overview(df)

Shape: (2565349, 11)

Info:
<class 'pandas.DataFrame'>
RangeIndex: 2565349 entries, 0 to 2565348
Data columns (total 11 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   overall         int64 
 1   verified        bool  
 2   reviewTime      str   
 3   reviewerID      str   
 4   asin            str   
 5   reviewerName    str   
 6   reviewText      str   
 7   summary         str   
 8   unixReviewTime  int64 
 9   vote            str   
 10  style           object
dtypes: bool(1), int64(2), object(1), str(7)
memory usage: 198.2+ MB
None

Missing values:
overall                 0
verified                0
reviewTime              0
reviewerID              0
asin                    0
reviewerName          253
reviewText           1715
summary               811
unixReviewTime          0
vote              2122633
style             1245377
dtype: int64

Describe:
            overall  unixReviewTime
count  2.565349e+06    2.565349e+06
mean   4.022095e+00    1.388666e+

In [5]:
n_users = df.reviewerID.nunique()
n_items = df.asin.nunique()

avg_interactions_per_user = len(df) / n_users
avg_interactions_per_item = len(df) / n_items


print(f"Users: {n_users}")
print(f"Items: {n_items}")
print(f"Interactions: {len(df)}")
print(f"Average interactions per user: {avg_interactions_per_user:.2f}")
print(f"Average interactions per item: {avg_interactions_per_item:.2f}")

Users: 1540618
Items: 71982
Interactions: 2565349
Average interactions per user: 1.67
Average interactions per item: 35.64


In [6]:
user_counts = df.groupby('reviewerID')['asin'].count()
(user_counts).describe()

count    1.540618e+06
mean     1.665143e+00
std      2.892944e+00
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      2.000000e+00
max      8.880000e+02
Name: asin, dtype: float64

In [7]:
df = filter_users_items(df)
df.shape

(549623, 11)

In [8]:
train, val, test = time_split(df)

n_users = df.reviewerID.nunique()
n_items = df.asin.nunique()
n_interactions = len(df)

sparsity = 1 - (n_interactions / (n_users * n_items))

print(f"Users: {n_users}\nItems: {n_items}\nInteractions: {len(df)}")
print("Sparsity:", sparsity)

Split sizes -> Train: 384785, Val: 32279, Test: 21537
Users: 64040
Items: 25481
Interactions: 549623
Sparsity: 0.9996631805927778


In [9]:
assert val['reviewerID'].isin(train['reviewerID']).all()
assert val['asin'].isin(train['asin']).all()

assert test['reviewerID'].isin(train['reviewerID']).all()
assert test['asin'].isin(train['asin']).all()

In [10]:
df.duplicated(["reviewerID", "asin", "reviewTime"]).sum()

np.int64(24781)

In [11]:
df = df.sort_values("reviewTime").drop_duplicates(["reviewerID", "asin"], keep="last")
print("Shape:", df.shape)
print("Duplicates:", df.duplicated(subset=["reviewerID", "asin"]).sum())

Shape: (524424, 11)
Duplicates: 0


After inspecting the dataset, I found ~24k exact duplicates. For simplicity and to focus on the latest user-item interactions, I kept only the most recent review per user-item. Remaining data has unique reviewer-item pairs suitable for collaborative filtering

In [12]:
basic_overview(df)

Shape: (524424, 11)

Info:
<class 'pandas.DataFrame'>
Index: 524424 entries, 21468 to 2562396
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   overall         524424 non-null  int64         
 1   verified        524424 non-null  bool          
 2   reviewTime      524424 non-null  datetime64[us]
 3   reviewerID      524424 non-null  str           
 4   asin            524424 non-null  str           
 5   reviewerName    524341 non-null  str           
 6   reviewText      524251 non-null  str           
 7   summary         524304 non-null  str           
 8   unixReviewTime  524424 non-null  int64         
 9   vote            111653 non-null  str           
 10  style           293887 non-null  object        
dtypes: bool(1), datetime64[us](1), int64(2), object(1), str(6)
memory usage: 44.5+ MB
None

Missing values:
overall                0
verified               0
reviewTime             0


In [13]:
df["style"].apply(type).value_counts()

style
<class 'dict'>     293887
<class 'float'>    230537
Name: count, dtype: int64

In [14]:
df["style"].isna().mean()

np.float64(0.4396003996765976)

In [17]:
res = expand_dict_column(df, "style")

In [19]:
all_keys = set()
for d in df['style']:
    if isinstance(d, dict):
        all_keys.update(d.keys())

print(all_keys)

{'Size:', 'Style:', 'Content:', 'Item Package Quantity:', 'Format:', 'Style Name:', 'Package Quantity:', 'Length:', 'Edition:', 'Configuration:', 'Color:', 'Platform for Display:', 'Color Name:', 'Pattern:', 'Denomination:', 'Platform:', 'Package Type:', 'Offer Type:', 'Subscription Length:'}


In [20]:
res.isna().sum()

overall                        0
verified                       0
reviewTime                     0
reviewerID                     0
asin                           0
reviewerName                  83
reviewText                   173
summary                      120
unixReviewTime                 0
vote                      412771
Size:                     519566
Style:                    513259
Content:                  524379
Item Package Quantity:    524373
Format:                   390191
Style Name:               524178
Package Quantity:         523883
Length:                   522945
Edition:                  458158
Configuration:            524142
Color:                    500918
Platform for Display:     524123
Color Name:               524400
Pattern:                  524244
Denomination:             524332
Platform:                 415562
Package Type:             523365
Offer Type:               524416
Subscription Length:      522997
dtype: int64

In [22]:
for col in res.columns:
    df[col] = df.groupby('asin')[col].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else 'unknown'))

KeyError: 'Column not found: Size:'

In [None]:
### build function extender ###

# Preparation

reviews summary:
- separate verified and unverified as features for overall asin score
- fix column "reviewTime" and set type datetime.
- column "reviewerID" have huge amount of duplicates paired with asin,
  remove duplicates. keep sum of votes as a new feature.
  add condition if duplicates paired "reviwerID", "asin"
  keep "overall" by "reviewTime" and unique by "reviewerID", "asin" pair.
- handle missing values from reviewerName with reviewerID if Names needed
- set type "vote" to int where NaN = 0
- expend column "style" type dict. Contains missing values.

## Cleaning

- feature how old is review