In [122]:
from pathlib import Path
import sys
import pandas as pd

# Absolute path of the parent of the current working directory.
# NOTE(review): presumably the notebook is run from a sub-folder of the repo — confirm.
PROJECT_ROOT = Path("..").resolve()
# Make packages under PROJECT_ROOT importable; this must run before the `src` import below.
sys.path.append(str(PROJECT_ROOT))

from src.load_data import load_review, load_meta
from ml_utils.eda import basic_overview

In [123]:
# Load the reviews and discard the "image" column in one step.
df = load_review().drop(columns=["image"])

# Overview

In [124]:
# Show the overview report for the review frame (helper imported from ml_utils.eda).
basic_overview(df)

Shape: (2565349, 11)

Info:
<class 'pandas.DataFrame'>
RangeIndex: 2565349 entries, 0 to 2565348
Data columns (total 11 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   overall         int64 
 1   verified        bool  
 2   reviewTime      str   
 3   reviewerID      str   
 4   asin            str   
 5   reviewerName    str   
 6   reviewText      str   
 7   summary         str   
 8   unixReviewTime  int64 
 9   vote            str   
 10  style           object
dtypes: bool(1), int64(2), object(1), str(7)
memory usage: 198.2+ MB
None

Missing values:
overall                 0
verified                0
reviewTime              0
reviewerID              0
asin                    0
reviewerName          253
reviewText           1715
summary               811
unixReviewTime          0
vote              2122633
style             1245377
dtype: int64

Describe:
            overall  unixReviewTime
count  2.565349e+06    2.565349e+06
mean   4.022095e+00    1.388666e+

In [125]:
# Number of distinct reviewers and distinct items in the review frame.
n_users = df['reviewerID'].nunique()
n_items = df['asin'].nunique()

# Mean number of review rows per reviewer and per item.
avg_interactions_per_user = len(df) / n_users
avg_interactions_per_item = len(df) / n_items

# Per-reviewer count of non-null "asin" entries; inspected in the next cell.
user_counts = df.groupby('reviewerID')['asin'].count()

# One print call, one statistic per line.
print(
    f"Users: {n_users}",
    f"Items: {n_items}",
    f"Interactions: {len(df)}",
    f"Average interactions per user: {avg_interactions_per_user:.2f}",
    f"Average interactions per item: {avg_interactions_per_item:.2f}",
    sep="\n",
)

Users: 1540618
Items: 71982
Interactions: 2565349
Average interactions per user: 1.67
Average interactions per item: 35.64


In [126]:
# Summary statistics of the number of reviews per reviewer.
user_counts.describe()

count    1.540618e+06
mean     1.665143e+00
std      2.892944e+00
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      2.000000e+00
max      8.880000e+02
Name: asin, dtype: float64

# Preparation

In [127]:
# Two sequential activity filters:
#   1. keep rows whose reviewer has at least 5 reviews,
#   2. then keep rows whose item has at least 10 reviews among the remaining rows.
# The item filter runs after the user filter and user counts are not re-checked,
# so a reviewer can end up with fewer than 5 rows in the result.
df = (
    df
    .loc[lambda frame: frame.groupby('reviewerID')['asin'].transform('count') >= 5]
    .loc[lambda frame: frame.groupby('asin')['reviewerID'].transform('count') >= 10]
)

print(f"Users: {df.reviewerID.nunique()}\nItems: {df.asin.nunique()}\nInteractions: {len(df)}")

Users: 63778
Items: 11388
Interactions: 486104


reviews summary:
- Separate verified and unverified reviews as features for the overall per-"asin" score.
- Fix the format of column "reviewTime" and convert it to datetime type.
- Column "reviewerID" has a huge number of duplicates when paired with "asin":
  remove the duplicates, keeping the sum of votes as a new feature.
  When a ("reviewerID", "asin") pair is duplicated,
  pick the "overall" value to keep by "reviewTime", so that each ("reviewerID", "asin") pair is unique.
- If names are needed, fill missing "reviewerName" values using "reviewerID".
- Convert column "vote" to int, treating NaN as 0.
- Expand column "style" (dict type) into separate columns; it contains missing values.

## Cleaning

- Add a feature for how old each review is (review age).