# Setup

In [1]:
import numpy as np
import math
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

from bs4 import BeautifulSoup

In [2]:
np.random.seed(42)

In [3]:
REVIEWS_PATH  = "./data/Video_Games_5.json.gz"
METADATA_PATH = "./data/meta_Video_Games.json.gz"

In [4]:
REVIEWS_FEATS = ['asin', 'reviewerID', 'overall']
METADATA_FEATS = ['asin', 'title', 'category']

In [5]:
NUM_PRODUCTS = 10000
K = 5

# Load

## Reviews

In [6]:
reviews = pd.read_json(REVIEWS_PATH, lines=True, compression='gzip', encoding = 'utf-8')
reviews = reviews[REVIEWS_FEATS]
reviews.head(5)

Unnamed: 0,asin,reviewerID,overall
0,700026657,A1HP7NVNPFMA4N,5
1,700026657,A1JGAP0185YJI6,4
2,700026657,A1YJWEXHQBWK2B,3
3,700026657,A2204E1TH211HT,2
4,700026657,A2RF5B5H74JLPE,5


## Metadata

In [7]:
metadata = pd.read_json(METADATA_PATH, lines=True, compression='gzip', encoding = 'utf-8')
metadata = metadata[METADATA_FEATS]
metadata.head(5)

Unnamed: 0,asin,title,category
0,42000742,Reversi Sensory Challenger,"[Video Games, PC, Games]"
1,78764343,Medal of Honor: Warfighter - Includes Battlefi...,"[Video Games, Xbox 360, Games, </span></span><..."
2,276425316,street fighter 2 II turbo super nintendo snes ...,"[Video Games, Retro Gaming & Microconsoles, Su..."
3,324411812,Xbox 360 MAS STICK,"[Video Games, Xbox 360, Accessories, Controlle..."
4,439335310,Phonics Alive! 3: The Speller,"[Video Games, PC, Games, </span></span></span>..."


### Clean category data

In [8]:
clean_HTML     = lambda vals: [BeautifulSoup(val, 'html.parser').get_text() for val in vals]
clean_empties  = lambda vals: [val for val in vals if val]

clean_pipeline = lambda vals: clean_empties(clean_HTML(vals))

In [9]:
metadata['category'] = metadata['category'].apply(clean_pipeline)
metadata.head(5)



Unnamed: 0,asin,title,category
0,42000742,Reversi Sensory Challenger,"[Video Games, PC, Games]"
1,78764343,Medal of Honor: Warfighter - Includes Battlefi...,"[Video Games, Xbox 360, Games]"
2,276425316,street fighter 2 II turbo super nintendo snes ...,"[Video Games, Retro Gaming & Microconsoles, Su..."
3,324411812,Xbox 360 MAS STICK,"[Video Games, Xbox 360, Accessories, Controlle..."
4,439335310,Phonics Alive! 3: The Speller,"[Video Games, PC, Games, Grades 2-12, Spelling..."


### Select category features

In [10]:
# Explode the 'cleaned_categories' column to create a row for each category
# Use value_counts to count the occurrences of each category
unique_cats = metadata['category'].explode().reset_index(drop=True).value_counts().reset_index()
unique_cats.columns = ['category', 'frequency']

In [11]:
select_cats = unique_cats[
                (unique_cats.frequency > 25) &
                # filter long descriptions
                (unique_cats.category.str.len() < 40) &
                # filter stray characters
                (unique_cats.category.str.len() > 1) &
                # custom stopwords
                (~unique_cats.category.isin({'none', 'Video Games'}))
            ]['category']
select_cats = set(select_cats.values)
select_cats

{'1 Player!',
 '10 Different levels',
 '10 Fun Levels',
 '100% satisfaction guaranteed',
 '160 Objects to Find',
 '3DO',
 '4 Different levels',
 '40 Objects Per Level',
 '400 Hidden Objects to Find',
 '400 Objects to Find',
 'Accessories',
 'Accessory Kits',
 'Adapters',
 'Anything else is just a sticker!!',
 'Atari 2600',
 'Atari 7800',
 'Atari Jaguar',
 'Atari Lynx',
 'Batteries',
 'Batteries & Chargers',
 'Brand new and high quality',
 'Cables',
 'Cables & Adapters',
 'Cartridge only.',
 'Cases & Storage',
 'Chargers',
 'Commodore 64',
 'Complete housing replacement set.',
 'Consoles',
 'Controllers',
 'Cooling Systems',
 'Currency & Subscription Cards',
 'Currency Cards',
 'Dance Pads',
 'Decorate and beautify your console',
 'Designed and sold by Demon Decal',
 'Digital Games',
 'Digital Games & DLC',
 'Downloadable Content',
 'Drums',
 'Easy to apply, clean, and remove',
 'Easy to use, stick, clean and remove',
 'Faceplates',
 'Faceplates, Protectors & Skins',
 'Features -',
 'Fi

In [12]:
select_pipeline = lambda vals: [val for val in vals if val in select_cats]

In [13]:
metadata['category'] = metadata['category'].apply(select_pipeline)
metadata.head(5)

Unnamed: 0,asin,title,category
0,42000742,Reversi Sensory Challenger,"[PC, Games]"
1,78764343,Medal of Honor: Warfighter - Includes Battlefi...,"[Xbox 360, Games]"
2,276425316,street fighter 2 II turbo super nintendo snes ...,"[Retro Gaming & Microconsoles, Super Nintendo,..."
3,324411812,Xbox 360 MAS STICK,"[Xbox 360, Accessories, Controllers, Joysticks]"
4,439335310,Phonics Alive! 3: The Speller,"[PC, Games]"


## Join datasets

In [14]:
data = reviews.merge(metadata, how='inner', on='asin')
data

Unnamed: 0,asin,reviewerID,overall,title,category
0,0700026657,A1HP7NVNPFMA4N,5,Anno 2070,"[PC, Games]"
1,0700026657,A1JGAP0185YJI6,4,Anno 2070,"[PC, Games]"
2,0700026657,A1YJWEXHQBWK2B,3,Anno 2070,"[PC, Games]"
3,0700026657,A2204E1TH211HT,2,Anno 2070,"[PC, Games]"
4,0700026657,A2RF5B5H74JLPE,5,Anno 2070,"[PC, Games]"
...,...,...,...,...,...
568981,B01H7VI5TC,A2Q5FXGX0VOWNV,4,Two Pack N64 Nintendo 64 Extension Cables,"[Retro Gaming & Microconsoles, Nintendo 64, Ac..."
568982,B01H7VI5TC,A2972RZ8R4SBSZ,5,Two Pack N64 Nintendo 64 Extension Cables,"[Retro Gaming & Microconsoles, Nintendo 64, Ac..."
568983,B01H7VI5TC,A1NBY361391RVJ,5,Two Pack N64 Nintendo 64 Extension Cables,"[Retro Gaming & Microconsoles, Nintendo 64, Ac..."
568984,B01H7VI5TC,A2TIZCOP1KN2YA,5,Two Pack N64 Nintendo 64 Extension Cables,"[Retro Gaming & Microconsoles, Nintendo 64, Ac..."


### Subset

In [15]:
# Subset the data
sample_asins = np.random.choice(data['asin'].unique(), size=NUM_PRODUCTS, replace=False)
data = data[data['asin'].isin(sample_asins)]
data.shape

(319672, 5)

## Helpers

In [16]:
def get_metadata(asin):
    return metadata[metadata['asin'] == asin].iloc[0]

## Compute features

### User rating features

In [17]:
user_item_matrix = data.pivot_table(index='reviewerID', columns='asin', values='overall').fillna(0)
item_user_matrix = user_item_matrix.T
item_user_matrix.shape

(10000, 54710)

In [18]:
rating_similarity = cosine_similarity(item_user_matrix)
rating_similarity.shape

(10000, 10000)

### Category features

In [19]:
categories = data[['asin', 'category']]
categories = categories.set_index('asin')
categories = categories[~categories.index.duplicated()]

# concatenate the categories into single strings
SEP = ";"
categories['category_str'] = categories['category'].apply(lambda x: SEP.join(x))

print(categories.shape)
categories.head(5)

(10000, 2)


Unnamed: 0_level_0,category,category_str
asin,Unnamed: 1_level_1,Unnamed: 2_level_1
700026657,"[PC, Games]",PC;Games
804161380,[],
3828770193,"[Kids & Family, Nintendo DS, Games]",Kids & Family;Nintendo DS;Games
6050036071,"[PlayStation 3, Accessories, Controllers, Musi...",PlayStation 3;Accessories;Controllers;Music Co...
8176503290,"[PC, Games]",PC;Games


In [20]:
# TF-IDF using a custom tokenizer
tokenizer = lambda s: [val for val in s.split(SEP) if val]
vectorizer = TfidfVectorizer(tokenizer=tokenizer, use_idf=False)
tfidf_matrix = vectorizer.fit_transform(categories['category_str'])
tfidf_matrix = pd.DataFrame(tfidf_matrix.toarray(), index=categories.index, columns=vectorizer.get_feature_names_out())
tfidf_matrix.head(5)

Unnamed: 0_level_0,1 player!,10 different levels,10 fun levels,3do,40 objects per level,400 hidden objects to find,400 objects to find,accessories,accessory kits,adapters,...,stylus pens,super nintendo,third party product,thumb grips,turbografx 16,wii,wii u,xbox,xbox 360,xbox one
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
700026657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
804161380,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3828770193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6050036071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8176503290,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
cat_similarity = linear_kernel(tfidf_matrix)
cat_similarity.shape

(10000, 10000)

### Combine into one similarity matrix

In [22]:
cat_mean_sim = cat_similarity.mean()
cat_mean_sim

0.28370662669114266

In [23]:
rating_mean_sim = rating_similarity.mean()
rating_mean_sim

0.0010358775382226122

In [24]:
# weight ratings to based on mean similarities
rating_sim_wt = rating_mean_sim / (rating_mean_sim + cat_mean_sim)
similarity_matrix = rating_sim_wt*rating_similarity + (1-rating_sim_wt)*cat_similarity
# similarity_matrix = rating_similarity
similarity_matrix = pd.DataFrame(similarity_matrix, index=user_item_matrix.columns, columns=user_item_matrix.columns)
similarity_matrix.head(5)

asin,0700026657,0804161380,3828770193,6050036071,8176503290,907843905X,9629971372,9882100848,9882106463,9882155456,...,B01H3NU4OS,B01H3Z4MYE,B01H482N6E,B01H4RS5AI,B01H5BJNXG,B01H6DHITE,B01H6GUCCQ,B01H6SM5CY,B01HDJFJKG,B01HDJFJLK
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
700026657,1.0,0.0,0.406763,0.0,0.996362,0.996362,0.000128,0.0,0.0,0.406763,...,0.0,0.996362,0.0,0.498181,0.406763,0.996362,0.498181,0.406763,0.0,0.0
804161380,0.0,0.003638,0.0,0.0,0.0,0.0,0.0,0.0,0.000123,0.0,...,0.0,0.0,9.1e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3828770193,0.406763,0.0,1.0,0.0,0.406763,0.406763,0.0,0.0,0.0,0.332121,...,0.0,0.406763,0.0,0.406763,0.664241,0.406763,0.406763,0.332121,0.0,0.0
6050036071,0.0,0.0,0.0,1.0,0.0,0.0,0.25726,0.0,0.25726,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25726,0.0
8176503290,0.996362,0.0,0.406763,0.0,1.0,0.996362,0.0,0.0,0.0,0.406763,...,0.0,0.996362,0.0,0.498181,0.406763,0.996362,0.498181,0.406763,0.0,0.0


In [25]:
# cosine similarity is in range (0, 1), so compute distance as 1 - similarity
# use clip to handle floating point precision errors
dist_matrix = np.clip((1 - similarity_matrix.values), 0, 1).round(5)
dist_matrix = pd.DataFrame(dist_matrix, index=similarity_matrix.index, columns=similarity_matrix.columns)
dist_matrix.head(5)

asin,0700026657,0804161380,3828770193,6050036071,8176503290,907843905X,9629971372,9882100848,9882106463,9882155456,...,B01H3NU4OS,B01H3Z4MYE,B01H482N6E,B01H4RS5AI,B01H5BJNXG,B01H6DHITE,B01H6GUCCQ,B01H6SM5CY,B01HDJFJKG,B01HDJFJLK
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
700026657,0.0,1.0,0.59324,1.0,0.00364,0.00364,0.99987,1.0,1.0,0.59324,...,1.0,0.00364,1.0,0.50182,0.59324,0.00364,0.50182,0.59324,1.0,1.0
804161380,1.0,0.99636,1.0,1.0,1.0,1.0,1.0,1.0,0.99988,1.0,...,1.0,1.0,0.99991,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3828770193,0.59324,1.0,0.0,1.0,0.59324,0.59324,1.0,1.0,1.0,0.66788,...,1.0,0.59324,1.0,0.59324,0.33576,0.59324,0.59324,0.66788,1.0,1.0
6050036071,1.0,1.0,1.0,0.0,1.0,1.0,0.74274,1.0,0.74274,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.74274,1.0
8176503290,0.00364,1.0,0.59324,1.0,0.0,0.00364,1.0,1.0,1.0,0.59324,...,1.0,0.00364,1.0,0.50182,0.59324,0.00364,0.50182,0.59324,1.0,1.0


# Item-based collaborative filtering - Using k-NN
Similarity and ranking using learned k-NN model

In [26]:
knn = NearestNeighbors(metric='precomputed')
knn.fit(dist_matrix.values)

NearestNeighbors(metric='precomputed')

## Find recommendations based on user reviews

In [27]:
query_users = np.random.choice(user_item_matrix.index, size=25, replace=False)

for query_user in query_users:
    print(f'Querying user: {query_user}')

    print('\nPositive reviews: ')
    query_user_reviews = user_item_matrix.loc[query_user].T
    query_user_reviews = query_user_reviews.loc[query_user_reviews > 3]
    print(query_user_reviews)

    if not query_user_reviews.any():
        print('No positive reviews to reference.')
    else:
        recommendations = dict()

        print('\nReference titles:')
        for query_asin in query_user_reviews.index:
            print(f'  - {get_metadata(query_asin).title} (item {query_asin})')

        query_matrix = dist_matrix.loc[query_user_reviews.index]
        res_dists, res_idx = knn.kneighbors(query_matrix.values, n_neighbors=K)  # rows=queries, cols=k-NN
        res_dists = res_dists.flatten()
        res_idx = res_idx.flatten()

        recommendations_attr = dict()
        for i in range(len(res_idx)):
            item = item_user_matrix.iloc[res_idx[i]].name
            if item not in query_user_reviews.index:
                recommendations[item] = res_dists[i]
                recommendations_attr[item] = query_user_reviews.index[i // K]

        recommendations_sorted = sorted(recommendations, key=recommendations.get)[:K]
        print(f'\nTop {K} recommendations')
        print('-'*100)
        for i, rec in enumerate(recommendations_sorted):
            query_asin = recommendations_attr[rec]
            print(f"  {i+1}) {get_metadata(rec).title} (item {rec})")
            print(f"    -- Based on {get_metadata(query_asin).title} (item {query_asin})")
            print(f"       Similarity: {similarity_matrix[query_asin].loc[rec]}")
        print('-'*100)
        
    print('~'*100)
    print()

Querying user: A1LP04HH1DYZLB

Positive reviews: 
asin
B000B9RI14    5.0
Name: A1LP04HH1DYZLB, dtype: float64

Reference titles:
  - Xbox 360 LIVE 1600 Points (item B000B9RI14)

Top 5 recommendations
----------------------------------------------------------------------------------------------------
  1) SpongeBob SquarePants: Revenge of the Flying Dutchman (item B00006ZCCB)
    -- Based on Xbox 360 LIVE 1600 Points (item B000B9RI14)
       Similarity: 0.9964694481123779
  2) Tenchu: Return From Darkness (item B00009KO3J)
    -- Based on Xbox 360 LIVE 1600 Points (item B000B9RI14)
       Similarity: 0.9964572068874413
  3) Halo Wars - Xbox 360 (item B0017HW5LM)
    -- Based on Xbox 360 LIVE 1600 Points (item B000B9RI14)
       Similarity: 0.9964567767838036
  4) BOB - Screen Time Manager - Manage Your TV Time &amp; Video Game Time (item B000GU78UY)
    -- Based on Xbox 360 LIVE 1600 Points (item B000B9RI14)
       Similarity: 0.9964356867371739
-----------------------------------------


Top 5 recommendations
----------------------------------------------------------------------------------------------------
  1) Blades of Steel (item B000KJF7YA)
    -- Based on Kinect Sesame Street TV - Xbox 360 (item B0050SW9OC)
       Similarity: 0.9966406633290645
  2) Mad Catz Rock Band 3 Guitar Bundle - Red Hot Chili Peppers Bonus Tracks, Full Game, and Fender Stratocaster Guitar Controller for Xbox 360 (item B006603V1I)
    -- Based on Lego Star Wars: The Complete Saga - Xbox 360 (item B000R0SRNU)
       Similarity: 0.9966224338553793
  3) NameStar , Personalized Stainless Steel Kid's Water Bottle, Silver, 12.5 oz (item B00I7IUZ3E)
    -- Based on Lego Star Wars: The Complete Saga - Xbox 360 (item B000R0SRNU)
       Similarity: 0.9966200116748697
  4) Table Tennis - Xbox 360 (item B000F0UT38)
    -- Based on Kinect Sesame Street TV - Xbox 360 (item B0050SW9OC)
       Similarity: 0.9965930287547262
  5) Nintendo Wii Remote Plus - Blue (item B0094X20IO)
    -- Based on Kinect Ses

  1) Batman: Arkham Origins (item B00C7107DU)
    -- Based on Nintendo Wii U Console - 32GB Black Deluxe Set (item B009AGXH64)
       Similarity: 0.9966749054513786
  2) Wheel of Fortune - Nintendo Wii U (item B0090PX8AU)
    -- Based on Nintendo Wii U Console - 32GB Black Deluxe Set (item B009AGXH64)
       Similarity: 0.9965473693637292
  3) Nintendo 3DS XL - Yellow Pikachu (item B00BUSLSAC)
    -- Based on Nintendo Wii U Console - 32GB Black Deluxe Set (item B009AGXH64)
       Similarity: 0.9965375903082134
  4) Nintendo 3DS Handheld Console with Nintendogs Cats | Pink (item B006FQQMZA)
    -- Based on Nintendo Wii U Console - 32GB Black Deluxe Set (item B009AGXH64)
       Similarity: 0.9965357898996129
  5) Nintendo Wii Remote Plus, Yoshi (item B00K73DP5W)
    -- Based on Mario Kart 8 - Nintendo Wii U (item B00DC7G2W8)
       Similarity: 0.9964785174440411
----------------------------------------------------------------------------------------------------
~~~~~~~~~~~~~~~~~~~~~~~~~~


Top 5 recommendations
----------------------------------------------------------------------------------------------------
  1) NCAA Football 08 - Playstation 3 (item B000P0XA34)
    -- Based on Brink (item B002DC8GKE)
       Similarity: 0.9964087790217347
  2) Rare Replay - Xbox One (item B00ZMBMO06)
    -- Based on Dishonored 2 - PlayStation 4 (item B00ZM5OXD8)
       Similarity: 0.9963680618004193
  3) The Sims Castaway Stories - PC (item B000WQWQ36)
    -- Based on Brink (item B002DC8GKE)
       Similarity: 0.9963620551100154
  4) PlayStation Portable Limited Edition Madden NFL 09 Entertainment Pack- Metallic Blue (item B001A3DKGC)
    -- Based on Brink (item B002DC8GKE)
       Similarity: 0.9963620551100154
  5) Imagine: Fashion Designer (item B000SQ5LMI)
    -- Based on Brink (item B002DC8GKE)
       Similarity: 0.9963620551100154
----------------------------------------------------------------------------------------------------
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  3) Escape from Monkey Island - PC (item B00004U8H1)
    -- Based on Shogun 2: Fall of the Samurai, Limited Edition - PC (item B006JSXVA8)
       Similarity: 0.9967897231898296
  4) Metal Fatigue - PC (item B0000296ZN)
    -- Based on Shogun 2: Fall of the Samurai, Limited Edition - PC (item B006JSXVA8)
       Similarity: 0.996758104385252
----------------------------------------------------------------------------------------------------
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Querying user: A1U82PE26WH8JR

Positive reviews: 
asin
B0009O7HVM    5.0
B0050SXI6K    5.0
Name: A1U82PE26WH8JR, dtype: float64

Reference titles:
  - Star Wars Battlefront II - PlayStation 2 (item B0009O7HVM)
  - Ratchet &amp; Clank Collection (item B0050SXI6K)

Top 5 recommendations
----------------------------------------------------------------------------------------------------
  1) SOCOM 3 U.S. Navy Seals - PlayStation 2 (item B0009EHQVI)
    