In [15]:
!pip install scikit-surprise
import numpy as np
import pandas as pd
from surprise import Dataset
from surprise import Reader



In [16]:
data_main = pd.read_csv('https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz',
                 sep='\t',
                 compression='gzip',
                 error_bad_lines=False,
                 warn_bad_lines=False)

In [17]:
def process_dataframe(data_main):
  # Drop Null Values
    combine_product_rating = data_main.dropna(axis=0, subset=['product_title'])

    # Get Rating Count Per Product
    product_ratingCount = (combine_product_rating.
        groupby(by=['product_title'])['star_rating'].
        count().
        reset_index().
        rename(columns={'star_rating': 'totalRatingCount'})
    [['product_title', 'totalRatingCount']]
        )

    # Combine Rating count to get record
    rating_with_totalRatingCount = combine_product_rating.merge(product_ratingCount, left_on='product_title',
                                                                right_on='product_title', how='left')
    # Set threshold for required review count
    popularity_threshold = 50

    # Get records with product popularity threshold & ignore star_ratings less than 3
    cleaned_data = rating_with_totalRatingCount.query(
        "totalRatingCount >= @popularity_threshold")
    
    return cleaned_data

In [18]:
data_main = process_dataframe(data_main)

In [19]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data_main[["customer_id", "product_id", "star_rating"]], reader)

In [20]:
trainingSet = data.build_full_trainset()

In [21]:
from surprise import KNNWithMeans

# To use item-based cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}
algo = KNNWithMeans(k=30, min_k=5, sim_options=sim_options)
algo.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f47aca19250>

In [22]:
predictions = algo.test(trainingSet.build_testset())

In [23]:
from surprise.dump import *

In [24]:
dump(file_name='knn.model', predictions=predictions, algo= algo, verbose=1)

The dump has been saved as file knn.model


In [25]:
model = load(file_name='knn.model')

In [26]:
similarity_matrix = model[1].compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [27]:
test_subject = 164400
k = 15

In [28]:
import heapq
from collections import defaultdict

test_subject_iid = trainingSet.to_inner_uid(test_subject)
test_subject_ratings = trainingSet.ur[test_subject_iid]
k_neighbors = heapq.nlargest(k, test_subject_ratings, key=lambda t: t[1])

In [29]:
candidates = defaultdict(float)

for itemID, rating in k_neighbors:
    try:
      similaritities = similarity_matrix[itemID]
      for innerID, score in enumerate(similaritities):
          candidates[innerID] += score * (rating / 5.0)
    except:
      continue

In [30]:
products = data_main[['product_id', 'product_title']].drop_duplicates()

In [31]:
def getProductName(product_id):
      return products.loc[products['product_id'] == product_id, 'product_title'].iloc[0]

In [32]:
from operator import itemgetter

watched = {}
for itemID, rating in trainingSet.ur[test_subject_iid]:
  watched[itemID] = 1

recommendations = []

position = 0
for itemID, rating_sum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
  if not itemID in watched:
    recommendations.append(getProductName(trainingSet.to_raw_iid(itemID)))
    position += 1
    if (position > 10): break # We only want top 10

for recommendation in recommendations:
  print(recommendation)

Avast Free Antivirus 2015 [Download]
Kaspersky Anti-Virus 2015 3 User, 1 Year
Block Financial H&R Block Tax Software 14 Deluxe + State
Amazon Drive Desktop [Mac]
Intuit Quicken Home & Business 2015
Dragon Premium 13.0
TurboTax Deluxe Federal + E-File + State 2012
Microsoft Office Home and Student 2013 (1PC/1User) [Download]
Malwarebytes Anti-Malware Premium LIFETIME Activation Key! (GENUINE & AUTHORIZED, Immediate Key Issued, No Waiting for CD) -- previously Professional [Download]
TurboTax Deluxe Fed + Efile + State
Quicken Premier 2
