## Install Surpriselib/ Environment Preparation

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.2/spark-2.4.2-bin-hadoop2.7.tgz
!tar xf spark-2.4.2-bin-hadoop2.7.tgz
!pip install -q findspark

!pip install scikit-surprise

!pip install lightfm

!pip install PyDrive
!pip install msgpack



In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.2-bin-hadoop2.7"

In [0]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import urllib.request
import zipfile

from collections import defaultdict

import msgpack

import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split as sk_tts

import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan
from pyspark.sql import functions as F
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark import SparkContext
from pyspark.sql.functions import lit
from pyspark.ml.recommendation import ALS

from surprise import Reader, Dataset
from surprise import SVD, SVDpp, NMF, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly
from surprise import accuracy
from surprise.model_selection import train_test_split as surprise_tts
from surprise.model_selection import GridSearchCV

from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score
from scipy.sparse.coo import coo_matrix

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## Datasets download

In [0]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
file_list = drive.ListFile({'q': "'root' in parents and trashed=false"}).GetList()
for file1 in file_list:
  print('title: %s, id: %s' % (file1['title'], file1['id']))

title: LU08.ipynb, id: 1DQoWWBwFeaXI6CumEPKT9CpLQ7pCdLIz
title: SPAAI, id: 15xHXty3TvHwIS6hSAgt1lq-fXg6H-XaO
title: BookCrossing, id: 1UBntfrt6niZGad3p_BbysJGpmKVg7fkq
title: Yelp2, id: 1t7_Z9fEp207WlAPIcw8iLChcW71DtINT
title: Colab Notebooks, id: 1D1Kjnr4ZJkGlIzTqHV3A7swMZCZdUvn8
title: Yelp, id: 1G0-MKdJMIzHCcEnP_8_6Gk3aQKyl_435
title: DLSD_Project, id: 1_aiIGyPkP5ojOrZLhMWb2eP-J_ayI_dB
title: Getting started, id: 0BzLIx_JXOsMsc3RhcnRlcl9maWxl


In [0]:
file_list = drive.ListFile({'q': "'1G0-MKdJMIzHCcEnP_8_6Gk3aQKyl_435' in parents and trashed=false"}).GetList()
for file1 in file_list:
  print('title: %s, id: %s' % (file1['title'], file1['id']))

title: user_mapping.pickle, id: 1ZVPjsW6wK0qXg3Wh5e2SnQYstk6tFSfp
title: business_mapping.pickle, id: 1Oj46ti8KLY4QFH3x8_PORvwOmsXBRPME
title: review_enc_30K.csv, id: 1mhSipRAxg3_Nt2bDBqUB1irHbdiqXG9F
title: review_enc_20K.csv, id: 1mTdoiOiV4zr8M7PDnOzW42JHtvPy5klF
title: review_enc_10K.csv, id: 1XfKApQYeMl4Gl1OOQ-8qfyaC2L9PyXnQ
title: review_enc_3M.csv, id: 1_YubYVgKsbpAVhsVEfyORzGWSf_th0EC
title: review_enc_1M.csv, id: 1JSzg4xaooCd14JzOWuwybSDXl8fqsRct
title: cftest_enc.csv, id: 1aGVcnTfEjdSwIH9ZU5MBveiSjKwrn2BK
title: review_enc_100K.csv, id: 17snwxn2nlMba2P6MGoQg5my5yIw9WB-B
title: user_enc.csv, id: 1nEtiOwwp_ym08duyybaDrEK9DM4SEem1
title: business_enc.csv, id: 1BAHxH3p5NI6PWqI8NGI_GieXfM5fc02F
title: review_enc.csv, id: 112uBfXYPtgkS3dIIOkVwxWGj_m6V-UZL
title: review_50000.csv, id: 11k1Xl9ijq2qqM-xP-y1qvC-fVMOslZF9
title: review_15000.csv, id: 1j_urXqXqw2FA_WlPx-6P55-zlufmzEjg
title: review_10000.csv, id: 1g6h0a0LMV-Jkc93z7YnqMF6Id8CShnyG
title: review_25000.csv, id: 1TnUcOitaUX_f

In [0]:
for file1 in file_list:
  if('_enc') in file1['title']:
    id = file1['id']
    fileId = drive.CreateFile({'id': id})

    filename = file1['title']
    print(filename)
    fileId.GetContentFile(filename)  # Save Drive file as a local file

review_enc_30K.csv
review_enc_20K.csv
review_enc_10K.csv
review_enc_3M.csv
review_enc_1M.csv
cftest_enc.csv
review_enc_100K.csv
user_enc.csv
business_enc.csv
review_enc.csv


## Loading and parsing datasets

In [0]:
ratings_raw_data = pd.read_csv('review_enc_10K.csv')
ratings_raw_data_header = ratings_raw_data.columns

ratings_data = ratings_raw_data[['user_id', 'business_id', 'stars']]
ratings_data.columns = ['user_id', 'business_id', 'stars']

ratings_data.head(3)

Unnamed: 0,user_id,business_id,stars
0,1158189,176750,1.0
1,1599679,74082,5.0
2,1307908,100927,5.0


In [0]:
ratings_data.shape

(10000, 3)

In [0]:
business_raw_data = pd.read_csv('business_enc.csv', encoding = "latin-1")
business_raw_data_header = business_raw_data.columns

business_data = business_raw_data.replace(to_replace='None', value=np.nan)

business_raw_data.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,address,attributes,attributes_AcceptsInsurance,attributes_AgesAllowed,attributes_Alcohol,attributes_Ambience,attributes_BYOB,attributes_BYOBCorkage,attributes_BestNights,attributes_BikeParking,attributes_BusinessAcceptsBitcoin,attributes_BusinessAcceptsCreditCards,attributes_BusinessParking,attributes_ByAppointmentOnly,attributes_Caters,attributes_CoatCheck,attributes_Corkage,attributes_DietaryRestrictions,attributes_DogsAllowed,attributes_DriveThru,attributes_GoodForDancing,attributes_GoodForKids,attributes_GoodForMeal,attributes_HairSpecializesIn,attributes_HappyHour,attributes_HasTV,attributes_Music,attributes_NoiseLevel,attributes_Open24Hours,attributes_OutdoorSeating,attributes_RestaurantsAttire,attributes_RestaurantsCounterService,attributes_RestaurantsDelivery,attributes_RestaurantsGoodForGroups,attributes_RestaurantsPriceRange2,attributes_RestaurantsReservations,attributes_RestaurantsTableService,attributes_RestaurantsTakeOut,attributes_Smoking,attributes_WheelchairAccessible,attributes_WiFi,categories,city,hours,hours_Friday,hours_Monday,hours_Saturday,hours_Sunday,hours_Thursday,hours_Tuesday,hours_Wednesday,is_open,latitude,longitude,name,postal_code,review_count,stars,state,business_id
0,2818 E Camino Acequia Drive,,,,,,,,,,,,,,,,,,,,,False,,,,,,,,,,,,,,,,,,,,"Golf, Active Life",Phoenix,,,,,,,,,0,33.522143,-112.018481,Arizona Biltmore Golf Club,85016,5,3.0,AZ,7340
1,30 Eglinton Avenue W,,,,u'full_bar',"{'romantic': False, 'intimate': False, 'classy...",,,,False,,,"{'garage': False, 'street': False, 'validated'...",,True,,,,,,,True,"{'dessert': False, 'latenight': False, 'lunch'...",,,False,,u'loud',,False,u'casual',,False,True,2.0,True,True,True,,,u'no',"Specialty Food, Restaurants, Dim Sum, Imported...",Mississauga,,9:0-1:0,9:0-0:0,9:0-1:0,9:0-0:0,9:0-0:0,9:0-0:0,9:0-0:0,1,43.605499,-79.652289,Emerald Chinese Restaurant,L5R 3E7,128,2.5,ON,82973
2,"10110 Johnston Rd, Ste 15",,,,u'beer_and_wine',"{'romantic': False, 'intimate': False, 'touris...",,,,True,,True,"{'garage': False, 'street': False, 'validated'...",,False,,,,,,,True,"{'dessert': False, 'latenight': False, 'lunch'...",,,True,,u'average',,False,'casual',,False,True,2.0,True,True,True,,,u'no',"Sushi Bars, Restaurants, Japanese",Charlotte,,17:30-22:0,17:30-21:30,17:30-22:0,17:30-21:0,17:30-21:30,,17:30-21:30,1,35.092564,-80.859132,Musashi Japanese Restaurant,28210,170,4.0,NC,134808
3,"15655 W Roosevelt St, Ste 237",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Insurance, Financial Services",Goodyear,,8:0-17:0,8:0-17:0,,,8:0-17:0,8:0-17:0,8:0-17:0,1,33.455613,-112.395596,Farmers Insurance - Paul Lorenz,85338,3,5.0,AZ,186365
4,"4209 Stuart Andrew Blvd, Ste F",,,,,,,,,,False,True,,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Plumbing, Shopping, Local Services, Home Servi...",Charlotte,,7:0-23:0,7:0-23:0,7:0-23:0,7:0-23:0,7:0-23:0,7:0-23:0,7:0-23:0,1,35.190012,-80.887223,Queen City Plumbing,28217,4,4.0,NC,56458


In [0]:
business_data.shape

(192609, 60)

In [0]:
user_raw_data = pd.read_csv('user_enc.csv')
user_raw_data_header = user_raw_data.columns

user_data = user_raw_data[['user_id', 'name']]

user_raw_data.head()

Unnamed: 0,user_id,name,review_count,yelping_since,useful,fans,average_stars
0,1256813,Rashmi,95,2013-10-08 23:11:33,84,5,4.03
1,141378,Jenna,33,2013-02-21 22:29:06,48,4,3.63
2,1013852,David,16,2013-10-04 00:16:10,28,0,3.71
3,1054713,Angela,17,2014-05-22 15:57:30,30,5,4.85
4,596783,Nancy,361,2013-10-23 07:02:50,1114,39,4.08


In [0]:
user_data.shape

(1637138, 2)

## Data Cleaning

In [0]:
ratings_data_nodup = ratings_data.drop_duplicates()

print("Before:", ratings_data.shape)
print("After:", ratings_data_nodup.shape)

Before: (10000, 3)
After: (9991, 3)


In [0]:
idx_dup = np.where(ratings_data_nodup.index.duplicated())
print(idx_dup)

(array([], dtype=int64),)


In [0]:
#Filter records for users who have rated at least 5 businesses
user_rating_counts = ratings_data_nodup['user_id'].value_counts()
print(user_rating_counts.shape[0])

9366


In [0]:
#Filter records for businesses who have rated at least 5 ratings
business_rating_counts = ratings_data_nodup['business_id'].value_counts()
print(business_rating_counts.shape[0])

4618


In [0]:
rating_counts = ratings_data_nodup['stars'].value_counts()
rating_counts

5.0    4418
4.0    2177
1.0    1527
3.0    1069
2.0     800
Name: stars, dtype: int64

## Dataset Presentation

In [0]:
ratings_dict = {'itemID': list(ratings_data_nodup.business_id),
                'userID': list(ratings_data_nodup.user_id),
                'rating': list(ratings_data_nodup.stars)}
df = pd.DataFrame(ratings_dict)

## Scikit-SurpriseLib

### Train/Test Split

In [0]:
reader = Reader(rating_scale=(0.5, 5.0))

data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

trainset = data.build_full_trainset()

In [0]:
print("Number of Businesses:", trainset.n_items)
print("Number of Users:", trainset.n_users)

Number of Businesses: 4618
Number of Users: 9366


In [0]:
training, test = surprise_tts(data, test_size=.2)
test_for_predict = test

### Collaborative Filtering

In [0]:
for Algo in ["KNNBasic", 'KNNWithMeans', 'KNNWithZScore']:
  
  for sim_metric in [ 'cosine', 'pearson', 'msd', 'pearson_baseline']:

    sim_options = {'name': sim_metric,
                   'user_based': True  # compute  similarities between users
                   }

    if(Algo == "KNNBasic"):
      model_user = KNNBasic(sim_options=sim_options, verbose=False)
    elif(Algo == "KNNWithMeans"):
      model_user = KNNWithMeans(sim_options=sim_options, verbose=False)
    elif(Algo == "KNNWithZScore"):
      model_user = KNNWithZScore(sim_options=sim_options, verbose=False)
    
    model_user.fit(training)

    predictions_user = model_user.test(test_for_predict)

    # Then compute RMSE
    error_user = accuracy.rmse(predictions_user)

    # best RMSE score
    print('Algo: {}, similarity metric: {}, RMSE: {}'.format(Algo, sim_metric, error_user))

RMSE: 1.4733
Algo: KNNBasic, similarity metric: cosine, RMSE: 1.473314935577257
RMSE: 1.4739
Algo: KNNBasic, similarity metric: pearson, RMSE: 1.4738796557934557
RMSE: 1.4733
Algo: KNNBasic, similarity metric: msd, RMSE: 1.473314935577257
RMSE: 1.4739
Algo: KNNBasic, similarity metric: pearson_baseline, RMSE: 1.4738796557934557
RMSE: 1.4932
Algo: KNNWithMeans, similarity metric: cosine, RMSE: 1.4932031581304874
RMSE: 1.4936
Algo: KNNWithMeans, similarity metric: pearson, RMSE: 1.4935800059696343
RMSE: 1.4932
Algo: KNNWithMeans, similarity metric: msd, RMSE: 1.4932031581304874
RMSE: 1.4936
Algo: KNNWithMeans, similarity metric: pearson_baseline, RMSE: 1.4935800059696343
RMSE: 1.4931
Algo: KNNWithZScore, similarity metric: cosine, RMSE: 1.4930825798422611
RMSE: 1.4936
Algo: KNNWithZScore, similarity metric: pearson, RMSE: 1.4935800059696343
RMSE: 1.4931
Algo: KNNWithZScore, similarity metric: msd, RMSE: 1.4930825798422611
RMSE: 1.4936
Algo: KNNWithZScore, similarity metric: pearson_basel

### Matrix Factorization

In [0]:
for Algo in ["SVD", 'SVDpp', 'NMF']:
  
    if(Algo == "SVD"):
      model_MF = SVD()
    elif(Algo == "SVDpp"):
      model_MF = SVDpp()
    elif(Algo == "NMF"):
      model_MF = NMF()
    elif(Algo == "ALS"):
      bsl_options = {'method': 'als',
                      'n_epochs': 5,
                      'reg_u': 12,
                      'reg_i': 5 }
      model_MF = BaselineOnly(bsl_options=bsl_options)
      
    model = model_MF
    model.fit(training)
    predictions = model.test(test_for_predict)

    # Then compute RMSE
    error = accuracy.rmse(predictions)

    # best RMSE score
    print('Algo: {}, RMSE: {}'.format(Algo, error))

RMSE: 1.4300
Algo: SVD, RMSE: 1.430005490798664
RMSE: 1.4251
Algo: SVDpp, RMSE: 1.4251121726087237
RMSE: 1.4995
Algo: NMF, RMSE: 1.4995359743418488


In [0]:
param_grid = {'n_epochs': [10, 20, 40], 'lr_all': [0.002, 0.005, 0.01],
              'reg_all': [0.1, 0.2, 0.4]}

model = SVDpp
gs = GridSearchCV(model, param_grid, measures=['rmse'], cv=3)

gs.fit(data)

print('The best model was trained with %s' % gs.best_params['rmse'])

model = gs.best_estimator['rmse']
model.fit(training)
predictions = model.test(test_for_predict)

# Then compute RMSE
error = accuracy.rmse(predictions)

# best RMSE score
print('For testing data the RMSE is %s' % (error))

The best model was trained with {'n_epochs': 40, 'lr_all': 0.01, 'reg_all': 0.2}
RMSE: 1.4182
For testing data the RMSE is 1.4181911761433839


## Spark ML

### Train/Test split

In [0]:
from pyspark.sql import SparkSession

spark = SparkSession \
     .builder \
     .master("local[*]") \
     .config("spark.executor.memory", "4g") \
     .config("spark.driver.memory", "4g") \
     .config("spark.memory.offHeap.enabled",True) \
     .config("spark.memory.offHeap.size","4g") \
     .appName("rec") \
     .getOrCreate()

In [0]:
ratings_raw_data = spark.read.csv('review_enc_10K.csv',inferSchema=True,header=True)
ratings_raw_data_header = ratings_raw_data.columns

ratings_data = ratings_raw_data[['user_id', 'business_id', 'stars']]

In [0]:
training, validation, test = ratings_data.randomSplit([0.6, 0.2, 0.2], seed=0)

### Matrix Factorization

In [0]:
seed = 5
iterations = 25
regularization_parameter = 5.0
ranks = [4, 8, 10, 12]
errors = [0, 0, 0, 0]
err = 0
tolerance = 0.02

min_error = float('inf')
best_rank = -1
best_iteration = -1
for rank in ranks:
    als = ALS(maxIter=iterations, rank=rank, regParam=regularization_parameter, seed=seed, userCol="user_id", 
          itemCol="business_id", ratingCol="stars", coldStartStrategy="drop")
    model = als.fit(training)
    predictions = model.transform(validation)
    evaluator = RegressionEvaluator(metricName="rmse", 
                                labelCol="stars",
                                predictionCol="prediction")
    error = evaluator.evaluate(predictions)
    errors[err] = error
    err += 1
    print('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank

print('The best model was trained with rank %s' % best_rank)

For rank 4 the RMSE is 4.21255973560481
For rank 8 the RMSE is 4.212133766200419
For rank 10 the RMSE is 4.211716778894937
For rank 12 the RMSE is 4.2117295975393905
The best model was trained with rank 10


In [0]:
predictions.head(3)

[Row(user_id=48898, business_id=173382, stars=5.0, prediction=7.059702757279684e-13),
 Row(user_id=1352785, business_id=186574, stars=4.0, prediction=0.0003806327877100557),
 Row(user_id=1354006, business_id=3834, stars=5.0, prediction=-3.2748345111270975e-41)]

In [0]:
als = ALS(maxIter=iterations, rank=best_rank, regParam=regularization_parameter, seed=seed, userCol="user_id", 
          itemCol="business_id", ratingCol="stars", coldStartStrategy="drop")
model = als.fit(training)
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", 
                                labelCol="stars",
                                predictionCol="prediction")
error = evaluator.evaluate(predictions)
    
print('For testing data the RMSE is %s' % (error))

For testing data the RMSE is 3.932013566624354


## Recommendation

In [0]:
testSubject = 956232

print("\nBuilding recommendation model...")

trainSet = data.build_full_trainset()

model = SVDpp(n_epochs=40, lr_all=0.01, reg_all=0.2)

model.fit(trainSet)
            
print("Computing recommendations...")
trainset = data.build_full_trainset()
fill = trainset.global_mean
anti_testset = []
u = trainset.to_inner_uid(testSubject)
user_items = set([j for (j, _) in trainset.ur[u]])

anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill) for
                                 i in trainset.all_items() if
                                 i not in user_items]
testSet = anti_testset
        
predictions = model.test(testSet)
            
recommendations = []

print ("\nWe recommend:")
for user_id, business_id, stars, estimatedRating, _ in predictions:
  recommendations.append((business_id, estimatedRating))

recommendations.sort(key=lambda x: x[1], reverse=True)

for ratings in recommendations[:10]:
  print(business_data[business_data.business_id == ratings[0]]["name"].values[0], ratings[1])


Building recommendation model...
Computing recommendations...

We recommend:
Animal Kindness Veterinary Hospital 4.542446337259187
Gio Rana's Really Really Nice Restaurant 4.528699723489563
Biaggio's Pizzeria 4.51662373084823
Nikki's Akropolis Pizza 4.481412511521263
Clear View Home Inspections 4.459661964887028
Fountains of Bellagio 4.4584252409045915
PokÃ© Catcher 4.454446491709405
Las Enchiladas Demama 4.454090502414389
Gun Garage 4.445375169969016
Pink Cherry Wax 4.4436391083879005
