## Get to know the data

In [None]:
#| export
import os
import pandas as pd
import numpy as np
# Root folder of the dataset; paths below are built as
# <data_dir>/<category>/<split>/<file>.json
data_dir = 'devided_dataset_v2'
# Category names, used as sub-folder names under data_dir.
categories = ['CDs_and_Vinyl', 'Grocery_and_Gourmet_Food', 'Toys_and_Games']

: 

In [None]:
#| export
# Path to the product training file of the first category.
file_path = os.path.join(data_dir, categories[0], 'train', 'product_training.json')

In [None]:
#| export
# Load the product training table of the first category into a DataFrame.
c0_product_train = pd.read_json(file_path)
# Last expression of the cell: display the table.
c0_product_train

In [None]:
# Rebind file_path to the review training file of the same category
# (this overwrites the product path set earlier).
file_path = os.path.join(data_dir, categories[0], 'train', 'review_training.json')
# Load the review training table into a DataFrame.
c0_review_train = pd.read_json(file_path)
# Last expression of the cell: display the table.
c0_review_train

In [None]:
# (rows, columns) of the review training table.
c0_review_train.shape

In [None]:
# Inspect a single review record (the row at position 3).
c0_review_train.iloc[3]

In [None]:
# Show the reviewText field of that same record.
c0_review_train.iloc[3].reviewText

In [None]:
# Load the product file of the first category's 'test1' split.
c0_product_test1 = pd.read_json(os.path.join(data_dir, categories[0], 'test1', 'product_test.json'))

In [None]:
# Display the 'test1' product table.
c0_product_test1

## Merge review and product data into one dataframe

In [None]:
# Attach the product columns to every review row, matching on 'asin'.
# A left join keeps each review even when no product row matches it.
c0_train = c0_review_train.merge(
    right=c0_product_train,
    on='asin',
    how='left',
)

## Use sklearn's train_test_split to split into training and validation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the merged rows for validation; the fixed seed makes the
# split reproducible.
train, val = train_test_split(c0_train, random_state=888, test_size=0.15)

# Target: the 'awesomeness' column of the training part.
Y_train = train['awesomeness']
# Features: every remaining column except the 'asin' join key and the target.
X_train = train.drop(columns=['asin', 'awesomeness'])

## Clean up the data

In [None]:
# Some reviewText / summary values are missing; replace them with empty
# strings so that every row holds a string in both text columns.
for text_col in ('reviewText', 'summary'):
    X_train[text_col] = X_train[text_col].fillna('')

## Convert reviewText and summary into vectors
`reviewText` and `summary` are text, so they cannot be used directly in calculations; they must first be converted into numeric vectors.

In [None]:

from sklearn.feature_extraction.text import CountVectorizer
# Token-count vectorizer that drops English stop words.
c_vectorizer = CountVectorizer(stop_words='english')

# Learn the vocabulary and build the count matrix for reviewText in one call.
# This will take a long time.
reviewText_matrix = c_vectorizer.fit_transform(X_train['reviewText'])

##########################################
# Converting the result to a dense array does not work because it uses
#   too much memory: it reported needing 1.5 TB of RAM
###########################################
# reviewText_matrix = reviewText_matrix.toarray()

In [None]:
# (rows, columns) of the count matrix built from reviewText.
reviewText_matrix.shape

In [None]:
# Feature names learned by the count vectorizer (one per matrix column).
ft = c_vectorizer.get_feature_names_out()
len(ft)
# We got 309935 features.
# That is too many, so switch to HashingVectorizer.

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer

# Hash every token into one of 30000 fixed columns, so the matrix width no
# longer depends on the size of the vocabulary.
h_vectorizer = HashingVectorizer(n_features=30000)

# The hashing vectorizer is stateless, so there is nothing to fit; the text is
# transformed directly. This will take 30s to 2min.
reviewText_matrix = h_vectorizer.transform(X_train['reviewText'])
reviewText_matrix.shape

## ~~convert sparse matrix to array~~
This does not work: the dense array does not fit in memory.

In [None]:
# Converting everything at once requires 146 GB of memory
#reviewText_dense = reviewText_matrix.toarray()

# Converting row by row still raises MemoryError
## Define a generator to iterate over the rows of the sparse matrix
# def row_generator(X):
#    for i in range(X.shape[0]):
#        yield X[i,:].toarray()
        
## Use the generator to convert the sparse matrix to a list of dense numpy arrays
#reviewText_list = list(row_generator(reviewText_matrix))

## convert to pandas sparse dataframe

In [None]:
# Wrap the hashed review matrix in a sparse DataFrame with columns r_0, r_1, ...
# The frame reuses X_train's index: X_train comes out of train_test_split, so
# its row labels are a shuffled subset of the original ones. With a default
# 0..n-1 index, combining this frame with X_train along axis=1 would align on
# labels, pair up the wrong rows and pad the rest with NaN.
df_reviewText = pd.DataFrame.sparse.from_spmatrix(
    reviewText_matrix,
    index=X_train.index,
    columns=[f'r_{i}' for i in range(reviewText_matrix.shape[1])],
)

## Do the same for summary

In [None]:
# The summary has less text, so use fewer hashed features.
# NOTE: this rebinds h_vectorizer; the 30000-feature vectorizer used for
# reviewText is no longer reachable under that name after this cell.
h_vectorizer = HashingVectorizer(n_features=5000)

# The hashing vectorizer is stateless, so there is nothing to fit; the text is
# transformed directly. This will take 30s to 2min.
summary_matrix = h_vectorizer.transform(X_train['summary'])

# Sparse DataFrame with columns s_0, s_1, ... It reuses X_train's index so that
# a later label-aligned combination with X_train (pd.concat along axis=1)
# pairs up the right rows instead of misaligning them and padding with NaN.
df_summary = pd.DataFrame.sparse.from_spmatrix(
    summary_matrix,
    index=X_train.index,
    columns=[f's_{i}' for i in range(summary_matrix.shape[1])],
)

## Combine the two sparse dataframes

In [None]:
from scipy import sparse

# Stack the review and summary features side by side (same rows, columns
# appended). scipy.sparse.hstack works on scipy sparse matrices; pandas
# DataFrames are not scipy sparse objects and would be converted to dense
# arrays first, which does not fit in memory here. Stack the underlying
# vectorizer outputs instead.
df_combined = sparse.hstack([reviewText_matrix, summary_matrix])

In [None]:
# WARNING: this materializes every column of the sparse frame as dense data.
# The notebook measured about 146 GB for the dense form of reviewText_matrix,
# so expect this cell to run out of memory on the full training set.
df_reviewText_dense = df_reviewText.sparse.to_dense()

## Put them back into X_train

In [None]:
# Replace the raw text columns with their hashed feature columns.
# NOTE (from the original experiment): concat was observed to drop the
# sparseness of the feature frames, which makes this step very slow.
X_train = X_train.drop(columns=['reviewText', 'summary'])

# pd.concat(axis=1) aligns rows by index label. X_train keeps the shuffled
# labels produced by train_test_split, so the feature frames are relabelled
# with X_train's index first; otherwise rows would be paired incorrectly and
# the result padded with NaN. set_axis raises if the row counts differ.
X_train = pd.concat(
    [
        X_train,
        df_reviewText.set_axis(X_train.index, axis=0),
        df_summary.set_axis(X_train.index, axis=0),
    ],
    axis=1,
)

In [None]:
# Sparse DataFrame view of the hashed review matrix, with default integer
# row and column labels.
reviewText_df = pd.DataFrame.sparse.from_spmatrix(reviewText_matrix)

In [None]:
# Display the sparse review feature frame.
reviewText_df

## convert to dense matrix with limited memory
We cannot do it in one go, so we have to do it in batches.

In [None]:
# Copy the sparse review matrix into a dense array a few thousand rows at a
# time, so that only one dense batch is created as a temporary at any moment.
# NOTE: the destination array itself is still fully dense, so it needs as much
# memory as a one-shot conversion; batching only limits the temporaries.
batch_size = 5000

# The destination matches the sparse matrix's own shape and dtype. The hashed
# features are floats, so an integer dtype would truncate them.
reviewText_dense_matrix = np.empty(reviewText_matrix.shape, dtype=reviewText_matrix.dtype)

# Walk the rows from 0 in steps of batch_size; the final slice may be shorter,
# which slicing handles on both sides of the assignment.
for i in range(0, reviewText_matrix.shape[0], batch_size):
    reviewText_dense_matrix[i:i + batch_size] = reviewText_matrix[i:i + batch_size].toarray()

In [None]:
# Display the sparse review matrix (its repr, not its contents).
reviewText_matrix

## Train the linear regression model

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression of the 'awesomeness' target (Y_train) on the
# training features (X_train).
# NOTE(review): assumes every column left in X_train is numeric at this
# point - confirm before running.
reg_model = LinearRegression().fit(X_train, Y_train)