### 0. Import packages

In [1]:
import logging
import missingno as msno
import numpy as np
import pandas as pd
import warnings

from pandarallel import pandarallel
from pathlib import Path
from pprint import pprint
from src.data import load_dataset
from src.features import build_features, preprocessing
from tqdm import tqdm

# Initialise pandarallel; this must run before any `.parallel_apply` call further down.
pandarallel.initialize()
# Register tqdm's `progress_apply` on pandas objects (train_test_split uses it on a groupby).
tqdm.pandas()
# NOTE(review): this silences every warning for the rest of the session, including
# deprecation warnings from pandas -- consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


### 1. Load datasets

In [2]:
# global variables
# Directory the intermediate artefacts are read from / written to
# (HDF5 store and pickle below). NOTE: the name is re-bound to
# "data/processed/" in the final cell of this notebook.
DATA_PATH = Path('data/interim/')
# Product category handled by this notebook; also used as the HDF5 key and file-name prefix.
CATEGORY = 'Sports_and_Outdoors'

In [None]:
%%time
# Load the product metadata and the reviews for CATEGORY from DATA_PATH.
# The helper returns the pair in (metadata, reviews) order.
# NOTE(review): both are presumably DataFrames sharing an 'asin' column (they are
# merged on it below) -- confirm against src.data.load_dataset.
prod_meta, prod_reviews = load_dataset.load_dataset(CATEGORY, DATA_PATH)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping
# it in print() only appended a stray "None" line to the output.
prod_reviews.info(memory_usage="deep")
prod_reviews.head()

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping
# it in print() only appended a stray "None" line to the output.
prod_meta.info(memory_usage="deep")
prod_meta.head()

In [None]:
# Merge reviews and metadata on the product id. The inner join keeps only
# reviews whose 'asin' also appears in the metadata (and vice versa).
prod_merged = pd.merge(prod_reviews, prod_meta, how='inner', on='asin')

# info() prints its own report and returns None, so it is not wrapped in print().
prod_merged.info(memory_usage="deep")
prod_merged.head()

In [None]:
# Visualise where nulls occur, column by column, in the merged frame.
# The trailing semicolon keeps the Axes repr out of the cell output.
msno.matrix(prod_merged);

In [None]:
# NOTE: drop every row that is missing a value in overall, reviewText or title
prod_merged = prod_merged.dropna(subset=['title', 'overall', 'reviewText'], axis=0)
# First attempt at catching nulls disguised as another value: turn empty strings
# into NaN (in every column), then drop rows whose title/reviewText became null.
prod_merged = prod_merged.replace({'': np.nan})
prod_merged = prod_merged.dropna(subset=['title', 'reviewText'], axis=0)
# The same review can be repeated once per item variant; keep only the first copy.
prod_merged = prod_merged.drop_duplicates(subset=['asin', 'overall', 'reviewerID','reviewText'], keep='first')

# info() prints its own report and returns None, so it is not wrapped in print().
prod_merged.info(memory_usage="deep")
prod_merged.head()

### 2. Conduct text processing

In [None]:
%%time
# Apply build_features.text_preprocess to every review text via pandarallel's
# parallel_apply and store the result in a new 'processedReviewText' column.
# Relies on pandarallel.initialize() having been run in the import cell.
prod_merged['processedReviewText'] = prod_merged['reviewText'].parallel_apply(build_features.text_preprocess)

In [None]:
# checking `processedReviewText`
print(prod_merged.shape)

# First and last five rows. DataFrame.append was deprecated and then removed in
# pandas 2.0; pd.concat produces the same stacked preview on every pandas version.
pd.concat([prod_merged.head(), prod_merged.tail()])

In [None]:
# conduct second round of null value removal
# NOTE(review): the removal step itself is currently disabled (line below), so only
# the index reset runs -- confirm whether that is intentional.
# prod_merged = preprocessing.removing_missing_reviews(prod_merged, 'processedReviewText')
# Move the current row labels into a regular column; the resulting 'index' column is
# what train_test_split later uses as the review identifier.
# NOTE(review): in-place and not idempotent -- each re-run of this cell inserts the
# row labels as a column again, so run it exactly once per fresh `prod_merged`.
prod_merged.reset_index(inplace=True)

In [None]:
# Sanity check: (rows, columns) after the index reset above.
print(prod_merged.shape)

### 3. Creating `HDFStore` to store our intermediary dataframe

In [None]:
# Open (or create) the HDF5 store 'processed_overall.h5' under DATA_PATH and write
# the processed frame under the key CATEGORY. The context manager closes the store
# even if the write raises, so the file handle is never left open.
with pd.HDFStore(str(DATA_PATH / 'processed_overall.h5')) as data_store:
    data_store[CATEGORY] = prod_merged

In [None]:
%%time
# temporary store as pkl
# Written next to the HDF5 store as "<DATA_PATH>/<CATEGORY>_processed.pkl"; the
# next cell reloads it so the expensive preprocessing above can be skipped.
prod_merged.to_pickle(f"{DATA_PATH}/{CATEGORY}_processed.pkl")

In [3]:
%%time
# loading processed dataframe from pkl
# NOTE(review): unpickling executes arbitrary code -- only load files produced by
# this notebook. DATA_PATH must still point at the interim directory here; it is
# re-bound to "data/processed/" in the last cell, so re-running this cell after
# that one looks in the wrong folder.
prod_merged = pd.read_pickle(f"{DATA_PATH}/{CATEGORY}_processed.pkl")

CPU times: user 1.63 s, sys: 834 ms, total: 2.47 s
Wall time: 2.49 s


In [4]:
print(prod_merged.shape)

# First and last five rows. DataFrame.append was deprecated and then removed in
# pandas 2.0; pd.concat produces the same stacked preview on every pandas version.
pd.concat([prod_merged.head(), prod_merged.tail()])

(2677396, 8)


Unnamed: 0,index,title,brand,asin,overall,reviewerID,reviewText,processedReviewText
0,0,Adult Ballet Tutu Yellow,BubuBibi,0000032034,5.0,A180LQZBUWVOLF,What a spectacular tutu! Very slimming.,what spectacular tutu very slimming
1,1,Adult Ballet Tutu Yellow,BubuBibi,0000032034,1.0,ATMFGKU5SVEYY,What the heck? Is this a tutu for nuns? I know...,what heck is tutu nun know cut still also laye...
2,2,Adult Ballet Tutu Yellow,BubuBibi,0000032034,5.0,A1QE70QBJ8U6ZG,Exactly what we were looking for!,exactly look
3,3,Adult Ballet Tutu Yellow,BubuBibi,0000032034,5.0,A22CP6Z73MZTYU,I used this skirt for a Halloween costume and ...,skirt halloween costume glue bunch feather it ...
4,4,Adult Ballet Tutu Yellow,BubuBibi,0000032034,4.0,A22L28G8NRNLLN,This is thick enough that you can't see throug...,this long sure check dimension end cut shorter
2677391,2677391,2XU Men's Core Compression Shorts,2XU,B01HJGAJ9O,3.0,AK3DMNXEFD97W,I was not impressed by these compression short...,impress compression short they like good quali...
2677392,2677392,2XU Men's Core Compression Shorts,2XU,B01HJGAJ9O,5.0,A3VUVANLER4L19,2XU makes great compression shorts. A bit more...,xu make great compression short bit expensive ...
2677393,2677393,2XU Men's Core Compression Shorts,2XU,B01HJGAJ9O,5.0,A3060N0IGEMVZT,These are my favorite compression shorts for r...,these favorite compression short run thin mate...
2677394,2677394,2XU Men's Core Compression Shorts,2XU,B01HJGAJ9O,5.0,A3TN0U64HONOPB,"Love the shorts.. amazing comfortable, perfect...",love short amaze comfortable perfect compressi...
2677395,2677395,2XU Men's Core Compression Shorts,2XU,B01HJGAJ9O,5.0,AJDQLM8PT3YWT,Superb quality,superb quality


### 4. Generating train/test split

In [5]:
def train_test_split(df, sample_rate, random_state=None):
    """Split reviews into train/test by holding out a share of each user's reviews.

    For every ``reviewerID``, ``int(n_reviews * sample_rate)`` of that user's
    reviews are drawn without replacement and moved to the test set; the rest
    stays in train. Because the count is truncated, users with too few reviews
    contribute nothing to the test set. Test rows whose product or user never
    occurs in train are then discarded, so ``len(train) + len(test)`` may be
    smaller than ``len(df)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``'index'``, ``'reviewerID'`` and ``'asin'``. Rows are
        selected by their ``'index'`` value, so it should be unique per row.
    sample_rate : float
        Fraction of each user's reviews to hold out, within ``[0, 1]``.
    random_state : int or numpy.random.Generator, optional
        Seed (or generator) for a reproducible split. With the default ``None``
        the global ``np.random`` state is used, exactly as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``.

    Raises
    ------
    ValueError
        If ``sample_rate`` lies outside ``[0, 1]``.
    """
    if not 0 <= sample_rate <= 1:
        raise ValueError(f"sample_rate must be within [0, 1], got {sample_rate!r}")

    # Legacy global RNG unless the caller asks for a seeded, isolated generator.
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    # generating user-review mapping
    grouped = df.groupby(['reviewerID'])['index']
    # progress_apply only exists once tqdm.pandas() has been called; fall back to
    # a plain apply so the function does not depend on that hidden notebook state.
    collect = getattr(grouped, 'progress_apply', grouped.apply)
    user_review_map = collect(list).to_dict()

    test_review_index = set()
    for review_ids in user_review_map.values():
        sample_number = int(len(review_ids) * sample_rate)
        test_review_index.update(rng.choice(review_ids, sample_number, replace=False))

    # Compute the membership mask once and reuse it for both halves.
    in_test = df['index'].isin(test_review_index)
    train = df[~in_test]
    test = df[in_test]

    # removing any user/items that is in test but NOT in train
    train_prods = train['asin'].unique()
    train_users = train['reviewerID'].unique()

    test = test[(test['asin'].isin(train_prods)) & (test['reviewerID'].isin(train_users))]

    return train, test

In [6]:
# train_test_split samples through NumPy's global RNG; seed it so the hold-out
# (and everything derived from it) is identical on every Restart & Run All.
np.random.seed(42)
train, test = train_test_split(prod_merged, .2)

100%|██████████| 332231/332231 [00:09<00:00, 35647.56it/s]


#### Checking train/test dataset

In [7]:
print(train.shape)
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same head+tail preview.
pd.concat([train.head(), train.tail()])

(2242666, 8)


Unnamed: 0,index,title,brand,asin,overall,reviewerID,reviewText,processedReviewText
0,0,Adult Ballet Tutu Yellow,BubuBibi,0000032034,5.0,A180LQZBUWVOLF,What a spectacular tutu! Very slimming.,what spectacular tutu very slimming
1,1,Adult Ballet Tutu Yellow,BubuBibi,0000032034,1.0,ATMFGKU5SVEYY,What the heck? Is this a tutu for nuns? I know...,what heck is tutu nun know cut still also laye...
2,2,Adult Ballet Tutu Yellow,BubuBibi,0000032034,5.0,A1QE70QBJ8U6ZG,Exactly what we were looking for!,exactly look
3,3,Adult Ballet Tutu Yellow,BubuBibi,0000032034,5.0,A22CP6Z73MZTYU,I used this skirt for a Halloween costume and ...,skirt halloween costume glue bunch feather it ...
5,5,Adult Ballet Tutu Yellow,BubuBibi,0000032034,3.0,A3O0KC7YKT5P9T,I had to add some designs to the tutu worked w...,add designs tutu work hold wear tear
2677391,2677391,2XU Men's Core Compression Shorts,2XU,B01HJGAJ9O,3.0,AK3DMNXEFD97W,I was not impressed by these compression short...,impress compression short they like good quali...
2677392,2677392,2XU Men's Core Compression Shorts,2XU,B01HJGAJ9O,5.0,A3VUVANLER4L19,2XU makes great compression shorts. A bit more...,xu make great compression short bit expensive ...
2677393,2677393,2XU Men's Core Compression Shorts,2XU,B01HJGAJ9O,5.0,A3060N0IGEMVZT,These are my favorite compression shorts for r...,these favorite compression short run thin mate...
2677394,2677394,2XU Men's Core Compression Shorts,2XU,B01HJGAJ9O,5.0,A3TN0U64HONOPB,"Love the shorts.. amazing comfortable, perfect...",love short amaze comfortable perfect compressi...
2677395,2677395,2XU Men's Core Compression Shorts,2XU,B01HJGAJ9O,5.0,AJDQLM8PT3YWT,Superb quality,superb quality


In [8]:
print(test.shape)
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same head+tail preview.
pd.concat([test.head(), test.tail()])

(434692, 8)


Unnamed: 0,index,title,brand,asin,overall,reviewerID,reviewText,processedReviewText
4,4,Adult Ballet Tutu Yellow,BubuBibi,0000032034,4.0,A22L28G8NRNLLN,This is thick enough that you can't see throug...,this long sure check dimension end cut shorter
13,13,Delorme New York State Atlas &amp; Gazetteer,Garmin,0899332757,5.0,A1LACH6MLQWZ,If you're interested in exploring any area of ...,if interested explore area new york metropolit...
17,17,Delorme New York State Atlas &amp; Gazetteer,Garmin,0899332757,5.0,A1EEXQY23P5XSE,"Really, 4.5/5 stars. Missing a little detail w...",really star missing little come access road pa...
18,18,Delorme New York State Atlas &amp; Gazetteer,Garmin,0899332757,5.0,A1IQJSM0YNQJTT,Arrived in tact and as expected,arrived tact expect
19,19,Delorme New York State Atlas &amp; Gazetteer,Garmin,0899332757,5.0,AXUAY9M189EJ1,"These are the best maps, I get one for any sta...",these best map state travel need pick florida ...
2677368,2677368,"Soft Cooling Towel for Instant Relief, 48inch ...",Pusdon,B01HJDGJ1E,5.0,A1V3TVLKZXNB31,works well and was well made.,work
2677379,2677379,"Soft Cooling Towel for Instant Relief, 48inch ...",Pusdon,B01HJDGJ1E,5.0,A1KJG1DXMN6YY9,My husband and I live in the High Desert in So...,my husband live high desert southern californi...
2677380,2677380,"Soft Cooling Towel for Instant Relief, 48inch ...",Pusdon,B01HJDGJ1E,5.0,A1HCLZ57P6O0YM,The shipping was fast and I'm not sure how lon...,the shipping fast sure long suppose stay cold ...
2677385,2677385,T&amp;s Shell Catcher Beretta A400 Multitarget...,STS,B01HJHHBHG,5.0,A3QK5ZLRE2KHLL,Works every time,works time
2677386,2677386,T&amp;s Shell Catcher Beretta A400 Multitarget...,STS,B01HJHHBHG,5.0,A3VDML80KNR9QQ,I have a briley bolt release paddle installed ...,briley bolt release paddle instal xplor sport ...


#### Saving train/test data to `data/processed/`

In [9]:
# NOTE: this re-binds DATA_PATH (previously "data/interim/"); cells above that read
# from DATA_PATH will look in the processed folder if re-run after this one.
DATA_PATH = Path("data/processed/")
# to_csv does not create missing directories, so make sure the target exists first.
DATA_PATH.mkdir(parents=True, exist_ok=True)

# Write both splits without the row labels; the 'index' column already carries the review id.
train.to_csv(f"{DATA_PATH}/{CATEGORY}_train.csv", index=False)
test.to_csv(f"{DATA_PATH}/{CATEGORY}_test.csv", index=False)