# Model-Based Collaborative Filtering Recommender System

## Import Libraries, Check Region

In [1]:
import boto3
from sagemaker import get_execution_role
import numpy as np
import pandas as pd

# Resolve the IAM execution role for this notebook via the SageMaker SDK.
role = get_execution_role()

# Project prefix string (presumably an S3 key prefix; it is not referenced
# by any other visible cell - confirm before removing).
prefix = 'sagemaker/test-recommender'

# Region name reported by a default boto3 session.
boto_session = boto3.session.Session()
my_region = boto_session.region_name

success_message = "Success - the MySageMakerInstance is in the " + my_region + " region."
print(success_message)

Success - the MySageMakerInstance is in the us-east-1 region.


## Preprocessing

In [2]:
# Load the restaurant listing and clean it in a single chain:
#   1. drop every row that contains at least one missing value;
#   2. discard rows whose 'cuisine' text contains "Beverages".
df = (
    pd.read_csv('SG_Restaurants.csv')
    .dropna()
    .loc[lambda frame: ~frame['cuisine'].str.contains("Beverages")]
)

# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,id_source,name,address,country,cuisine,currency,delivery_cost,lat,lon,opening_hours,image_url,radius,rating,reviews_nr,delivery_options,promo,loc_type,delivery_by,delivery_time
1,4-CYTDLPUJEP53N6,Subway,Subway - Vivocity,Singapore,"[""Healthy"", ""Sandwiches"", ""Fast Food"", ""Halal""...",SGD,570.0,1.264741,103.822072,"{""open"": true, ""displayedHours"": ""12:00-19:00""...",https://d1sag4ddilekf6.cloudfront.net/compress...,3000,4.2,197.0,ONLY_DELIVERY,Use 'TEATIME' for FREE delivery between 3PM to...,FOOD,GRAB,42.0
3,SGDD07548,Wingstop,Wingstop - VivoCity,Singapore,"[""Halal"", ""Chicken"", ""Fast Food"", ""Burger"", ""W...",SGD,560.0,1.263527,103.821712,"{""open"": true, ""displayedHours"": ""11:00-21:00""...",https://d1sag4ddilekf6.cloudfront.net/compress...,5000,4.4,1391.0,ONLY_DELIVERY,Use 'TEATIME' for FREE delivery between 3PM to...,FOOD,GRAB,45.0
4,4-CY2ZLZLTT2EDJ2,Le Shrimp Ramen,Le Shrimp Ramen - Vivocity,Singapore,"[""Japanese"", ""Ramen"", ""Noodles"", ""Asian""]",SGD,570.0,1.26481,103.822554,"{""open"": true, ""displayedHours"": ""10:30-21:00""...",https://d1sag4ddilekf6.cloudfront.net/compress...,5000,4.6,309.0,DELIVERY_TAKEAWAY,Use 'TEATIME' for FREE delivery between 3PM to...,FOOD,GRAB,44.0
6,4-CY3TEKXEVCN1J6,Burger King,Burger King - VivoCity,Singapore,"[""American"", ""Burger"", ""Halal""]",SGD,560.0,1.263741,103.821354,"{""open"": true, ""displayedHours"": ""09:00-21:45""...",https://d1sag4ddilekf6.cloudfront.net/compress...,5000,4.1,389.0,ONLY_DELIVERY,Use 'TEATIME' for FREE delivery between 3PM to...,FOOD,GRAB,41.0
7,4-C2NXGNCCHBXCE2,Egg Stop,Egg Stop - VivoCity,Singapore,"[""Breakfast & Brunch"", ""Korean"", ""Sandwiches"",...",SGD,409.0,1.264835,103.821785,"{""open"": true, ""displayedHours"": ""10:30-21:00""...",https://d1sag4ddilekf6.cloudfront.net/compress...,3000,3.5,25.0,DELIVERY_TAKEAWAY,$0 Delivery Fee (min. $25 spend)!,FOOD,GRAB,39.0


In [4]:
# Create restaurant name to ID and ID to restaurant name dictionaries
#
# A reproducible sample (random_state=2) of 69 restaurants is drawn from the
# cleaned frame; each sampled restaurant's position in that sample becomes
# its integer ID ("rid").

df_rand = df.sample(n=69, random_state=2)
df_rnames = df_rand['name'].tolist()

# rid -> restaurant name, in sample order.
rid2restaurant = dict(enumerate(df_rnames))

# restaurant name -> rid.
# Fix: key by the actual restaurant name. The previous loop assigned to the
# literal key 'name', so the dict only ever held one entry ({'name': 68}).
# If a name occurs more than once in the sample, the last rid wins (the same
# overwrite order a sequential loop would produce).
restaurant2rid = {name: idx for idx, name in rid2restaurant.items()}

In [5]:
# Mock ratings

rating_df = pd.read_csv('MOCK_DATA.csv')
rating_df = rating_df[['user_id', 'restaurant_id', 'rating']].drop_duplicates()
rating_df.loc[rating_df.rating == 1, 'rating'] = 0

In [6]:
# Build the dense user x restaurant rating matrix M.
#
# Row index    = user_id - 1
# Column index = restaurant_id - RESTAURANT_ID_OFFSET
# Cells with no rating stay at 0.
NUM_USERS = 11
NUM_RESTAURANTS = 69
RESTAURANT_ID_OFFSET = 10001  # restaurant_id that maps to column 0

M = np.zeros((NUM_USERS, NUM_RESTAURANTS))

# itertuples keeps each column's own dtype, whereas iterrows coerces a whole
# row to one dtype (a float column would turn the ids into floats and make
# the indexing below raise IndexError). The explicit int() casts guard the
# indices regardless of how the CSV was parsed. Rows are visited in frame
# order, so if a (user, restaurant) pair repeats, the last rating wins.
for rating_row in rating_df.itertuples(index=False):
    user_idx = int(rating_row.user_id) - 1
    restaurant_idx = int(rating_row.restaurant_id) - RESTAURANT_ID_OFFSET
    M[user_idx, restaurant_idx] = rating_row.rating

In [7]:
# Save unvisited restaurants
unvisited = {}
for userid, user in enumerate(M):
    unvisited[userid] = [rid for rid, i in enumerate(M[userid]) if i == 0]

## Model Training

In [8]:
# Define non-negative matrix factorization class

class NMF:
    """Low-rank factorization of a rating matrix trained by gradient descent.

    Approximates ``M`` (users x items) by ``W @ H`` where ``W`` has shape
    (num_users, k) and ``H`` has shape (k, num_items). Entries of ``M`` equal
    to 0 are treated as missing: they contribute neither to the loss nor to
    the data term of the gradients.

    Note: despite the class name, the updates are plain gradient steps and
    nothing constrains ``W`` or ``H`` to stay non-negative.

    Attributes:
        M: the rating matrix passed to the constructor.
        k: number of latent factors.
        Z: (row, col) index pairs of the non-zero entries of ``M``.
        W: user-factor matrix, initialised uniformly in [0, 1).
        H: factor-item matrix, initialised uniformly in [0, 1).
    """

    def __init__(self, M, k=100):
        """Store ``M`` and draw random initial factors.

        Args:
            M: 2-D numpy array of ratings; 0 marks a missing rating.
            k: number of latent factors.

        The factors are drawn from numpy's global RNG (W first, then H), so
        call ``np.random.seed`` beforehand for reproducible results.
        """
        self.M, self.k = M, k

        num_users, num_items = M.shape

        self.Z = np.argwhere(M != 0)
        self.W = np.random.rand(num_users, k)
        self.H = np.random.rand(k, num_items)

    def calc_loss(self):
        """Return the sum of squared errors over the non-zero entries of M.

        The regularization term used during training is not included.
        """
        return np.sum(np.square((self.M - np.dot(self.W, self.H)))[self.M != 0])

    def fit(self, learning_rate=0.0001, lambda_reg=0.1, num_iter=3500, verbose=False):
        """Run ``num_iter`` full-batch gradient-descent steps on W and H.

        Args:
            learning_rate: step size applied to both gradients.
            lambda_reg: L2 regularization weight on W and H.
            num_iter: number of update steps.
            verbose: when truthy, print the loss roughly every 10% of the
                iterations and once more after the final step.
        """
        observed = self.M != 0  # M is not modified here, so compute the mask once
        last_decile = -1

        for it in range(num_iter):
            # Residual on rated cells only; unrated cells contribute 0.
            residual = np.where(observed, self.M - np.dot(self.W, self.H), 0.0)

            # Both gradients are computed from the current W and H before
            # either factor is updated.
            grad_w = -2 * np.dot(residual, self.H.T) + 2 * lambda_reg * self.W
            grad_h = -2 * np.dot(residual.T, self.W).T + 2 * lambda_reg * self.H

            self.W -= learning_rate * grad_w
            self.H -= learning_rate * grad_h

            # Print loss every 10% of the iterations. Integer decile tracking
            # replaces the float test `it % (num_iter/10) == 0`, which fired
            # erratically whenever num_iter was not a multiple of 10.
            if verbose:
                decile = (10 * it) // num_iter
                if decile != last_decile:
                    last_decile = decile
                    print('Loss: {:.5f} \t {:.0f}%'.format(self.calc_loss(), (it / (num_iter/100))))

        # Print final loss
        if verbose:
            print('Loss: {:.5f} \t 100%'.format(self.calc_loss()))

    def predict(self):
        """Return the dense reconstruction ``W @ H`` (users x items)."""
        return np.dot(self.W, self.H)

In [9]:
# Get initial loss
#
# Seed numpy's global RNG first: NMF.__init__ draws W and H with
# np.random.rand, so the seed fixes the starting factors.
# The reported loss is the squared error over the non-zero entries of M
# for those untrained random factors.

np.random.seed(0)

nmf = NMF(M)

loss = nmf.calc_loss()

print('Initial loss: {:.1f}'.format(loss))

Initial loss: 223437.2


In [10]:
# Train model
#
# Re-seed and rebuild the model so training starts from freshly drawn random
# factors under seed 0. fit() is called with its defaults
# (learning_rate=0.0001, lambda_reg=0.1, num_iter=3500); verbose=True prints
# the loss as training progresses.

np.random.seed(0)

nmf = NMF(M)

nmf.fit(verbose=True)

Loss: 122668.66669 	 0%
Loss: 107.27122 	 10%
Loss: 14.73573 	 20%
Loss: 2.50975 	 30%
Loss: 0.56479 	 40%
Loss: 0.18596 	 50%
Loss: 0.09759 	 60%
Loss: 0.07361 	 70%
Loss: 0.06637 	 80%
Loss: 0.06416 	 90%
Loss: 0.06364 	 100%


In [11]:
# Show the stored ratings in row 9 of M (user_id 10, since rows are filled at
# index user_id - 1); 0 means no rating is recorded for that restaurant.
print(M[9])

[0. 2. 0. 2. 0. 5. 2. 3. 4. 2. 0. 3. 5. 5. 0. 3. 0. 3. 2. 3. 0. 5. 4. 4.
 3. 4. 2. 0. 0. 0. 0. 4. 0. 2. 5. 5. 0. 0. 0. 2. 3. 5. 0. 0. 5. 3. 0. 5.
 0. 4. 5. 5. 4. 2. 0. 0. 4. 0. 3. 2. 5. 0. 2. 2. 0. 5. 3. 0. 3.]


In [12]:
# Obtain predictions
#
# P is the dense reconstruction W @ H from the trained model: one predicted
# score for every (user row, restaurant column) pair.
P = nmf.predict()

# Display the scores rounded to two decimals.
print(P.round(2))

[[4.99 3.   2.02 4.99 3.9  4.99 2.01 4.98 3.01 3.12 4.99 2.01 4.72 4.3
  3.   4.98 2.01 2.01 3.99 3.01 3.27 3.99 4.32 3.99 3.   4.99 3.28 2.72
  3.13 3.99 2.01 4.35 3.01 4.98 4.99 3.01 4.77 2.01 2.01 3.11 3.99 2.02
  3.   3.99 3.01 4.98 3.99 3.89 3.17 3.99 3.87 3.99 2.01 4.56 3.   4.
  3.65 2.02 3.99 3.   2.54 3.   2.01 2.02 3.99 3.99 3.99 4.02 3.87]
 [2.01 4.   2.01 4.98 1.9  3.   2.01 3.99 4.23 3.   3.   4.25 3.01 3.37
  3.99 2.01 4.98 3.62 2.02 4.   3.64 3.   4.9  3.   4.99 4.99 3.01 2.01
  2.01 3.01 4.99 3.79 3.75 3.99 4.99 2.01 4.98 4.98 4.99 4.54 3.99 4.36
  3.   2.84 3.99 3.49 5.17 2.01 4.99 2.81 3.   3.26 3.3  2.   3.99 3.01
  3.54 4.   4.54 4.98 2.01 4.31 4.98 2.01 4.99 2.02 2.01 3.58 2.02]
 [3.   3.99 4.   4.   3.01 2.02 4.3  4.   4.99 2.02 4.99 3.65 5.06 4.99
  4.98 3.01 3.01 3.   4.99 4.99 4.39 4.98 5.33 4.99 3.01 4.05 4.98 3.
  3.   4.   4.71 3.   2.02 4.98 5.35 3.99 4.98 4.87 4.99 3.   3.58 3.57
  3.   3.5  2.01 2.01 3.86 3.96 4.14 3.   4.   2.02 4.23 2.81 4.   4.34
  4.9

In [13]:
# Get predicted ratings of users for their respective unvisited restaurants
#
# For each user, build one score per restaurant column: the model's
# prediction when the restaurant is in that user's unvisited list, and the
# placeholder 0 otherwise.
pred_ratings = {
    userid: [
        P[userid][col] if col in candidate_cols else 0
        for col in range(M.shape[1])
    ]
    for userid, candidate_cols in unvisited.items()
}

In [14]:
# Get Top 5 predictions for each user
#
# Restaurants are ranked by predicted score, highest first, and only
# restaurants in the user's unvisited list are eligible. The previous version
# relied on the placeholder score 0 to push visited restaurants down, so they
# could still enter the top 5 when a user had fewer than five unvisited
# restaurants or when predicted scores were not positive.
#
# Each list is padded with None up to `top_n` entries so that every user has
# the same number of slots (a later cell builds a DataFrame from this dict,
# which requires equal-length lists).
top_n = 5

top5_preds = {}
for userid, scores in pred_ratings.items():
    ranked_rids = np.argsort(scores)[::-1]  # best score first
    candidates = set(unvisited[userid])
    restaurants = [
        rid2restaurant[rid] for rid in ranked_rids if rid in candidates
    ][:top_n]
    restaurants += [None] * (top_n - len(restaurants))
    top5_preds[userid] = restaurants

In [17]:
# Show the recommended restaurant names stored under key 9 of top5_preds
# (the same user row that was printed from M earlier).
print(top5_preds[9])

['Tim Ho Wan', 'The Providore', 'Teppanyaki Omu Curry Rice & Donburi', 'Kebabs Faktory', 'Megumi Japanese Restaurant']


In [16]:
# Lay the recommendations out with one row per user and one column per rank,
# then write them to a local CSV. index=False means the user keys are not
# written; a row's position in the file is the only link back to the user.
res = pd.DataFrame(top5_preds).T
res.to_csv(path_or_buf='predictions.csv', index=False)

In [17]:
# Store results in s3 bucket
#
# Upload the local predictions file to the 'recommenderpredictions' bucket
# under the same object key, using the low-level client behind the boto3
# S3 resource.
s3 = boto3.resource('s3')
s3_client = s3.meta.client
s3_client.upload_file(
    Filename='predictions.csv',
    Bucket='recommenderpredictions',
    Key='predictions.csv',
)

In [None]:
""" 
# To create the bucket, remove the surrounding triple quotes and run this cell

bucket_name = 'recommenderpredictions'
s3 = boto3.resource('s3')
try:
    if  my_region == 'us-east-1':
      s3.create_bucket(Bucket=bucket_name)
    else: 
      s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={ 'LocationConstraint': my_region })
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ',e)
"""