<a href="https://colab.research.google.com/github/tmdang1101/amazon_product_recommender_system/blob/main/Model_1_Collaborative_Filtering_Item_Based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preliminaries #

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import scipy.stats as ss
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error

import gzip
import json

In [None]:
# Connects this notebook to Google Drive
import os
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

pathname = "/content/drive/My Drive/Recommender System/"
os.chdir(pathname)

Mounted at /content/drive


# Data Pre-Processing #

## Ratings Dataset ##

In [None]:
def parse(path):
    """
    Lazily reads a gzip-compressed file that holds one JSON document per line.

    Args:
        path: path to the .json.gz file

    Yields:
        the decoded JSON value of each line, in file order
    """
    # The context manager closes the file handle once the generator is
    # exhausted, closed early, or garbage-collected, instead of leaking it.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield json.loads(l)

def getDF(path):
    """
    Loads the review file at `path` into a ratings dataframe.

    Args:
        path: path to a gzip-compressed JSON-lines file readable by parse()

    Returns:
        pandas dataframe with columns 'overall', 'reviewerID' and 'asin',
        one row per record, indexed 0..n-1 in file order
    """
    # Keep only the three fields needed downstream, keyed by running row number
    rows = {
        row_no: [record['overall'], record['reviewerID'], record['asin']]
        for row_no, record in enumerate(parse(path))
    }
    return pd.DataFrame.from_dict(rows, orient='index', columns=['overall', 'reviewerID', 'asin'])

df = getDF('/content/drive/My Drive/Recommender System/Data/Movies_and_TV_5.json.gz')

In [None]:
df.head()

Unnamed: 0,overall,reviewerID,asin
0,5.0,A2M1CU2IRZG0K9,0005089549
1,5.0,AFTUJYISOFHY6,0005089549
2,5.0,A3JVF9Y53BEOGC,000503860X
3,5.0,A12VPEOEZS1KTC,000503860X
4,5.0,ATLZNVLYKP9AZ,000503860X


In [None]:
df.shape

(3410019, 3)

In [None]:
ss.describe(df['overall'])

DescribeResult(nobs=3410019, minmax=(1.0, 5.0), mean=4.221320174462371, variance=1.360619309461769, skewness=-1.4843835182448823, kurtosis=1.2016060314839265)

In [None]:
# Main Training Set
# Select and rename in a single expression so `ratings` is a fresh dataframe.
# Renaming in place on a column slice of `df` raises SettingWithCopyWarning
# and makes the cell non-idempotent.
ratings = df[['reviewerID', 'asin', 'overall']].rename(
    columns={'reviewerID': 'user_id', 'asin': 'product_id', 'overall': 'rating'}
)

In [None]:
ratings.head()

Unnamed: 0,user_id,product_id,rating
0,A2M1CU2IRZG0K9,0005089549,5.0
1,AFTUJYISOFHY6,0005089549,5.0
2,A3JVF9Y53BEOGC,000503860X,5.0
3,A12VPEOEZS1KTC,000503860X,5.0
4,ATLZNVLYKP9AZ,000503860X,5.0


## Metadata Dataset ##

In [None]:
def getMetadataDF(path):
    """
    Loads the product metadata file at `path` into a dataframe.

    Args:
        path: path to a gzip-compressed JSON-lines file readable by parse()

    Returns:
        pandas dataframe with columns 'title', 'also_buy', 'also_view',
        'similar_item' and 'asin', one row per record, indexed 0..n-1
    """
    fields = ['title', 'also_buy', 'also_view', 'similar_item', 'asin']
    # One list of field values per record, keyed by running row number
    rows = {
        row_no: [record[field] for field in fields]
        for row_no, record in enumerate(parse(path))
    }
    return pd.DataFrame.from_dict(rows, orient='index', columns=fields)

metadata = getMetadataDF('/content/drive/My Drive/Recommender System/Data/meta_Movies_and_TV.json.gz')

In [None]:
metadata.head()

Unnamed: 0,title,also_buy,also_view,similar_item,asin
0,Understanding Seizures and Epilepsy,[],[],,695009
1,Spirit Led&mdash;Moving By Grace In The Holy S...,[],[],,791156
2,My Fair Pastry (Good Eats Vol. 9),[],[],,143529
3,"Barefoot Contessa (with Ina Garten), Entertain...","[B002I5GNW4, B005WXPVMM, B009UY3W8O, B00N27ID1...","[B002I5GNW4, 0804187045, B009UY3W8O, 060960219...",,143588
4,Rise and Swine (Good Eats Vol. 7),"[B000P1CKES, B000NR4CRM]",[B0015SVNXY],,143502


In [None]:
metadata.shape

(203766, 5)

## Sparse Matrix ##

In [None]:
# Creates Sparse Matrix as the Input
from scipy.sparse import csr_matrix

def create_X(df):
    """
    Generates a sparse item-by-user matrix from a ratings dataframe.

    Args:
        df: pandas dataframe with 'user_id', 'product_id' and 'rating' columns

    Returns:
        X: sparse CSR matrix of shape (n_products, n_users); rows are
            products, columns are users, values are ratings. Rows of df that
            repeat the same (product, user) pair are summed by the sparse
            constructor.
        user_mapper: dict that maps user id's to user indices
        movie_mapper: dict that maps movie id's to movie indices
        user_inv_mapper: dict that maps user indices to user id's
        movie_inv_mapper: dict that maps movie indices to movie id's
    """
    # One np.unique call per column: the sorted unique ids define the index
    # space, and return_inverse gives every row's index in that space without
    # a per-row Python dictionary lookup.
    user_ids, user_index = np.unique(df["user_id"].to_numpy(), return_inverse=True)
    movie_ids, movie_index = np.unique(df["product_id"].to_numpy(), return_inverse=True)

    N = len(user_ids)
    M = len(movie_ids)

    user_mapper = dict(zip(user_ids, range(N)))
    movie_mapper = dict(zip(movie_ids, range(M)))

    user_inv_mapper = dict(zip(range(N), user_ids))
    movie_inv_mapper = dict(zip(range(M), movie_ids))

    X = csr_matrix((df["rating"].to_numpy(), (movie_index, user_index)), shape=(M, N))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

In [None]:
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(ratings)

In [None]:
X

<60175x297529 sparse matrix of type '<class 'numpy.float64'>'
	with 3282379 stored elements in Compressed Sparse Row format>

In [None]:
# Fraction of cells in X that hold a non-zero rating.
# NOTE(review): this ratio is the matrix *density*; sparsity in the usual
# sense would be 1 - this value. The variable name and label are left as-is.
sparsity = X.count_nonzero()/(X.shape[0]*X.shape[1])

print(f"Matrix sparsity: {round(sparsity*100,2)}%")

Matrix sparsity: 0.02%


# Similarity Model #

In [None]:
from sklearn.neighbors import NearestNeighbors

def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=True):
    """
    Finds k-nearest neighbours for a given movie id.

    Reads the notebook-level `movie_mapper` and `movie_inv_mapper` dicts to
    translate between movie id's and row indices of X.

    Args:
        movie_id: id of the movie of interest
        X: utility matrix with one row per movie
        k: number of similar movies to retrieve
        metric: distance metric for kNN calculations
        show_distance: if True, also return the distances computed by kNN

    Returns:
        neighbour_ids: list of up to k similar movie ID's, nearest first,
            never containing movie_id itself
        distances: when show_distance is True, the raw distance array returned
            by kneighbors (shape (1, n_queried), which still includes the entry
            for the query movie); otherwise None

    Raises:
        KeyError: if movie_id is not present in movie_mapper
    """
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]
    if isinstance(movie_vec, (np.ndarray)):
        movie_vec = movie_vec.reshape(1,-1)

    # The query movie is itself a row of X, so ask for one extra neighbour.
    # Never ask for more rows than X has, which NearestNeighbors rejects.
    n_queried = min(k + 1, X.shape[0])
    kNN = NearestNeighbors(n_neighbors=n_queried, algorithm="brute", metric=metric)
    kNN.fit(X)

    # Always fetch distances so the result can be unpacked uniformly; the
    # show_distance flag only decides whether they are handed back.
    distances, indices = kNN.kneighbors(movie_vec, return_distance=True)

    # Drop the query movie by index rather than by position: with ties at
    # distance 0 it is not guaranteed to be ranked first.
    neighbour_ids = [
        movie_inv_mapper[n] for n in indices[0].tolist() if n != movie_ind
    ][:k]

    return neighbour_ids, (distances if show_distance else None)

# Recommendation #

In [None]:
# Other ids to try:
#   'B00IT70F3S'    Avengers: Age of Ultron
#   'B00LYYUTEI'    The Dark Knight
movie_id = 'B00005JMAH'     # Harry Potter and the Prisoner of Azkaban

similar_ids, similarity_distances = find_similar_movies(movie_id, X, k=10)

print(f"Because you watched {movie_id}:\n")
for similar_id in similar_ids:      # avoid shadowing the builtin `id`
    # metadata can list the same asin more than once, so collapse repeated
    # titles and print plain text instead of the pandas Series repr.
    titles = metadata.loc[metadata['asin'] == similar_id, 'title'].drop_duplicates()
    title = ', '.join(titles.astype(str)) if not titles.empty else '(title not found)'
    print(f"{similar_id}\n{title}\n")

Because you watched B00005JMAH:

B000E6EK3S
79415    Harry Potter and the Goblet of Fire
Name: title, dtype: object

B00005JPI2
27164    Harry Potter and the Order of the Phoenix
49091    Harry Potter and the Order of the Phoenix
Name: title, dtype: object

B000ZECQ08
105857    Harry Potter and the Half-Blood Prince
Name: title, dtype: object

B001UV4XI8
127394     Harry Potter and the Deathly Hallows, Part 1 ...
Name: title, dtype: object

B001UV4XIS
127372    HP7: Deathly Hallows, P2 (DVD)
Name: title, dtype: object

B00005JMQW
26936    Spider-Man 2
48863    Spider-Man 2
Name: title, dtype: object

B00005JMQZ
26926    Shrek 2
48853    Shrek 2
Name: title, dtype: object

B00005JKZY
26792    The Lord of the Rings: The Return of the King
48719    The Lord of the Rings: The Return of the King
Name: title, dtype: object

B00005JNJV
26992    Batman Begins
48919    Batman Begins
Name: title, dtype: object

7799146915
19337    Star Wars: Episode III - Revenge of the Sith (...
Name: title, dt