In [None]:
!pip install scikit-surprise
import numpy as np
import pandas as pd
from surprise import Dataset
from surprise import Reader



In [None]:
# Load the Amazon "Digital Software" review dump (gzip-compressed TSV) straight
# from its URL into a DataFrame.
# `on_bad_lines='skip'` silently drops malformed rows; it replaces the
# `error_bad_lines=False, warn_bad_lines=False` pair, which newer pandas
# releases no longer accept.
data_main = pd.read_csv(
    'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz',
    sep='\t',
    compression='gzip',
    on_bad_lines='skip',
)

In [None]:
# Wrap the (user, item, rating) columns of the review table in a Surprise
# Dataset; star ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    data_main.loc[:, ["customer_id", "product_id", "star_rating"]],
    reader,
)

In [None]:
# Build the training set from `data` via build_full_trainset(); no separate
# hold-out split is created in this notebook.
trainingSet = data.build_full_trainset()

In [None]:
from surprise import KNNWithMeans

# Item-based collaborative filtering: similarities are computed between
# items (user_based=False) using the cosine measure.
sim_options = dict(name="cosine", user_based=False)
algo = KNNWithMeans(sim_options=sim_options, min_k=5, k=30)
# Kept as the last expression so the cell displays the fitted estimator.
algo.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f5fb2c872d0>

In [None]:
# Predict on a test set built from the training set itself, so these are
# in-sample predictions (not a measure of generalisation).
train_as_testset = trainingSet.build_testset()
predictions = algo.test(train_as_testset)

In [None]:
from surprise.dump import *

In [None]:
# Persist the fitted algorithm together with its predictions to 'knn.model'.
dump(
    'knn.model',
    algo=algo,
    predictions=predictions,
    verbose=1,
)

The dump has been saved as file knn.model


In [None]:
# Read the dump back from disk. The result is indexed below as model[1] to
# get the algorithm. NOTE(review): Surprise's dump/load is documented as a
# pickle wrapper -- only load files you produced yourself.
model = load('knn.model')

In [None]:
# Compute the similarity matrix from the reloaded algorithm (second element
# of the loaded dump).
# NOTE(review): fit() presumably already stored this matrix on the
# algorithm -- confirm whether recomputing it here is necessary.
loaded_algo = model[1]
similarity_matrix = loaded_algo.compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [None]:
# Alternative subject kept for reference:
# test_subject = 11635690
# Raw customer_id of the user to generate recommendations for.
test_subject = 46098046
# Number of the subject's highest-rated items used as seeds for candidates.
k = 15

In [None]:
import heapq
from collections import defaultdict

# Map the raw customer id to Surprise's inner user id (despite the `_iid`
# suffix this is a *user* id), then fetch that user's (inner item id, rating)
# pairs.
test_subject_iid = trainingSet.to_inner_uid(test_subject)
test_subject_ratings = trainingSet.ur[test_subject_iid]

# Keep the k pairs with the highest rating.
k_neighbors = heapq.nlargest(
    k,
    test_subject_ratings,
    key=lambda item_and_rating: item_and_rating[1],
)

In [None]:
# Score every item by its similarity to the subject's top-rated items.
# Each seed item contributes its whole similarity row, weighted by the
# subject's rating of that seed normalised by the 5-star maximum.
candidates = defaultdict(float)

for itemID, rating in k_neighbors:
    try:
        similarities = similarity_matrix[itemID]
    except (IndexError, KeyError):
        # No similarity row for this seed item: skip it, but let any other
        # error (or a keyboard interrupt) surface instead of being swallowed.
        continue
    weight = rating / 5.0
    for innerID, score in enumerate(similarities):
        candidates[innerID] += score * weight

In [None]:
# One row per distinct (product_id, product_title) pair, used for looking up
# titles by product id.
products = data_main.loc[:, ['product_id', 'product_title']].drop_duplicates()

In [None]:
def getProductName(product_id):
    """Return the title of the first row in ``products`` matching ``product_id``.

    Reads the module-level ``products`` frame. Raises ``IndexError`` when no
    row has the given ``product_id``.
    """
    is_match = products['product_id'] == product_id
    matching_titles = products.loc[is_match, 'product_title']
    return matching_titles.iloc[0]

In [None]:
from operator import itemgetter

# Maximum number of recommendations to produce.
TOP_N = 10

# Inner item ids the subject has already rated; these are never recommended.
watched = {itemID for itemID, _rating in trainingSet.ur[test_subject_iid]}

recommendations = []

# Walk the candidates from highest to lowest accumulated score and collect
# the titles of the first TOP_N items the subject has not rated yet.
position = 0
for itemID, rating_sum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
    if itemID in watched:
        continue
    recommendations.append(getProductName(trainingSet.to_raw_iid(itemID)))
    position += 1
    # `>=` rather than `>`: stopping only once the count exceeded the limit
    # let one extra item through.
    if position >= TOP_N:
        break

for recommendation in recommendations:
    print(recommendation)

Product:  TurboTax Deluxe Federal + E-File + State 2012
Product:  TurboTax Deluxe Fed, Efile and State 2013
Product:  TurboTax Deluxe Federal + State + eFile 2008
Product:  CCleaner Free [Download]
Product:  ResumeMaker Professional Deluxe 18
Product:  Amazon Drive Desktop [PC]
Product:  Norton Internet Security 1 User 3 Licenses
Product:  SecureAnywhere Intermet Security Complete 5 Device
Product:  Pc Matic Performance Utility Suite (5 User Edition)
Product:  Microsoft OneNote 2013 (1PC/1User)
Product:  Intuit Quicken Rental Property Manager 2015
