<a href="https://colab.research.google.com/github/teyang-lau/coffee-joint-rec-sys/blob/main/MF_LOL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import pandas as pd
import io
username = 'tituslhy'
token = 'xxx'
github_session = requests.Session()
github_session.auth = (username,token)
train_url = 'https://raw.githubusercontent.com/teyang-lau/coffee-joint-rec-sys/main/data/processed/train_lol.csv'
val_url = 'https://raw.githubusercontent.com/teyang-lau/coffee-joint-rec-sys/main/data/processed/val_lol.csv'
test_url = 'https://raw.githubusercontent.com/teyang-lau/coffee-joint-rec-sys/main/data/processed/test_lol.csv'

## Get data

In [2]:
def get_data(url,username=username,token=token):
  download = github_session.get(url).content
  df = pd.read_csv(io.StringIO(download.decode('utf-8')))
  df = df[['userid','shop','rating']]
  data = list(df.itertuples(index=False,name=None))
  return data

In [3]:
train = get_data(train_url)
train[0:5]

[('-5YMIME_WEin_by41Bj-3Q', 'liho-tea-singapore-117', 3.0),
 ('-5YMIME_WEin_by41Bj-3Q', 'old-hen-coffee-bar-singapore-2', 4.0),
 ('-5YMIME_WEin_by41Bj-3Q', 'two-men-bagel-house-singapore', 4.0),
 ('-5YMIME_WEin_by41Bj-3Q', 'old-airport-road-food-centre-singapore', 5.0),
 ('-G1YjYxjDpxOBzFgo36ORA', 'the-book-cafe-singapore', 3.0)]

In [4]:
val = get_data(val_url)
val[0:5]

[('-5YMIME_WEin_by41Bj-3Q', 'dutch-colony-coffee-singapore', 4.0),
 ('-G1YjYxjDpxOBzFgo36ORA', 'the-providore-singapore', 1.0),
 ('-WShM_YFbtG4OcE0vrFVyw', 'nylon-coffee-roasters-singapore', 4.0),
 ('-XaIf12ricWc5z5BRt9nnQ', 'tiong-bahru-bakery-singapore-4', 4.0),
 ('-fUWq6sOIEe1uTUhNKS9sQ', 'drips-singapore', 4.0)]

In [5]:
test = get_data(test_url)
test[0:5]

[('-5YMIME_WEin_by41Bj-3Q', 'luna-singapore', 3.0),
 ('-G1YjYxjDpxOBzFgo36ORA', 'common-man-coffee-roasters-singapore', 2.0),
 ('-WShM_YFbtG4OcE0vrFVyw', 'chye-seng-huat-hardware-singapore', 3.0),
 ('-XaIf12ricWc5z5BRt9nnQ', 'chye-seng-huat-hardware-singapore', 3.0),
 ('-fUWq6sOIEe1uTUhNKS9sQ', 'meng-kitchen-singapore', 5.0)]

## Get packages

In [6]:
!pip install --quiet cornac==1.14.2 adjustText

[K     |████████████████████████████████| 12.4 MB 753 kB/s 
[?25h  Building wheel for adjustText (setup.py) ... [?25l[?25hdone


In [7]:
import os
import sys
import itertools
import json

import scipy.sparse as sp
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from adjustText import adjust_text
%matplotlib inline

import cornac
from cornac.eval_methods import BaseMethod, CrossValidation, RatioSplit
from cornac.models import MF
from cornac.hyperopt import Discrete
from cornac.hyperopt import GridSearch

print(f"System version: {sys.version}")
print(f"Cornac version: {cornac.__version__}")

SEED = 42
VERBOSE = True

System version: 3.7.13 (default, Apr 24 2022, 01:04:09) 
[GCC 7.5.0]
Cornac version: 1.14.2


In [8]:
from scipy.stats import hmean

base = BaseMethod.from_splits(train_data = train, 
                              test_data = val,
                              rating_threshold=3.5,
                              seed = SEED,
                              fmt='UIR')

eval_metrics = [
  cornac.metrics.NDCG(k=5),
  cornac.metrics.NCRR(k=5),
  cornac.metrics.Recall(k=5)
]

## Vanilla runs

In [9]:
K=50
lr = 0.001
iter = 1000

mf = MF(k=K, use_bias=True,
        verbose=VERBOSE, seed=SEED, name=f"Vanilla MF(K={K})")
mf2 = MF(k=K, learning_rate = lr, use_bias=True,
        verbose=VERBOSE, seed=SEED, name=f"MF(K={K}, lr = {lr})")
mf3 = MF(k=K, max_iter = iter, use_bias=True,
        verbose=VERBOSE, seed=SEED, name=f"MF(K={K}, num_iter = {iter})")
mf4 = MF(k=K, max_iter=iter, learning_rate = lr, use_bias=True,
        verbose=VERBOSE, seed=SEED, name=f"MF(K={K}, num_iter = {iter},lr = {lr})")

exp = cornac.Experiment(eval_method=base, models=[mf,mf2,mf3,mf4], metrics=eval_metrics)

exp.run()

  0%|          | 0/20 [00:00<?, ?it/s]

Optimization finished!


  0%|          | 0/20 [00:00<?, ?it/s]

Optimization finished!


  0%|          | 0/1000 [00:00<?, ?it/s]

Optimization finished!


  0%|          | 0/1000 [00:00<?, ?it/s]

Optimization finished!

TEST:
...
                                     | NCRR@5 | NDCG@5 | Recall@5 | Train (s) | Test (s)
------------------------------------ + ------ + ------ + -------- + --------- + --------
Vanilla MF(K=50)                     | 0.0169 | 0.0205 |   0.0310 |    0.1180 |   0.5132
MF(K=50, lr = 0.001)                 | 0.0285 | 0.0395 |   0.0732 |    0.0984 |   0.3288
MF(K=50, num_iter = 1000)            | 0.0000 | 0.0000 |   0.0000 |    2.2980 |   2.0913
MF(K=50, num_iter = 1000,lr = 0.001) | 0.0009 | 0.0014 |   0.0028 |    3.3956 |   1.7207



MF(K=50,lr=0.001) is the best vanilla run setting. To optimize this model better.

In [10]:
from scipy.stats import hmean

hmeans = []

for i in range(len(exp.result)):
  hmeans.append(float(hmean([exp.result[i].metric_avg_results['NCRR@5'],
                       exp.result[i].metric_avg_results['NDCG@5'],
                       exp.result[i].metric_avg_results['Recall@5']])))
  
print('Vanilla harmonic mean score: {:.4f}'.format(max(hmeans)))

Vanilla harmonic mean score: 0.0405


## Hyperparameter tuning

In [11]:
!pip install --quiet hyperopt

In [12]:
from hyperopt import tpe,hp,fmin,Trials, STATUS_OK

In [None]:
bestparams=[]
bestharmonic=0

NDCG=cornac.metrics.NDCG(5)
NCRR=cornac.metrics.NCRR(5)
Recall=cornac.metrics.Recall(5)

In [None]:
def ObjectiveF(params):
  global NDCG
  global NCRR
  global Recall
  global bestparams
  global bestharmonic

  latentk=params['latentk']
  learningrate = params['learningrate']
  lambdareg=params['lambdareg']

  mf=MF(k=latentk, 
        max_iter=100, 
        learning_rate=learningrate,
        use_bias = True, 
        lambda_reg=lambdareg,
        seed=SEED)

  test_result, val_result = base.evaluate(model=mf, 
                                          metrics=[NDCG,NCRR,Recall], 
                                          user_based=False, 
                                          show_validation=False)
  
  Sample_NCRR=test_result.metric_avg_results['NCRR@5']
  Sample_NDCG=test_result.metric_avg_results['NDCG@5']
  Sample_Recall=test_result.metric_avg_results['Recall@5']
  Sample_HM=hmean([Sample_NCRR,Sample_NDCG,Sample_Recall])
  loss=-Sample_HM
  reportstring=f"The Harmonic Mean for K={latentk}, reg={lambdareg}, learningrate={learningrate} is {loss}"
  print(reportstring)

  if Sample_HM>bestharmonic:
    bestparams=(latentk, lambdareg,learningrate)
    bestharmonic=Sample_HM
  return loss

In [None]:
%%timeit
trials = Trials()

space = {
    'latentk': hp.choice('latentk', np.arange(50, 100, dtype=int)),
    'learningrate':hp.loguniform('learning_rate',-20,-10),
    'lambdareg': hp.loguniform('lambdareg', -25, -10)
}

best=fmin(
    fn=ObjectiveF,
    space=space,
    algo=tpe.suggest,
    trials=trials,
    max_evals = 1
)

The Harmonic Mean for K=59, reg=4.2588886702944083e-07, learningrate=2.3397950055212107e-06 is -0.039866976998979585
100%|██████████| 1/1 [00:00<00:00,  2.98it/s, best loss: -0.039866976998979585]
The Harmonic Mean for K=83, reg=1.0556754545881819e-05, learningrate=1.71109214464399e-05 is -0.04477442588009661
100%|██████████| 1/1 [00:00<00:00,  2.93it/s, best loss: -0.04477442588009661]
The Harmonic Mean for K=51, reg=5.836248423593029e-09, learningrate=1.620322669566987e-07 is -0.0046854040061089005
100%|██████████| 1/1 [00:00<00:00,  3.41it/s, best loss: -0.0046854040061089005]
The Harmonic Mean for K=78, reg=3.236094012830954e-10, learningrate=9.880405073921438e-07 is -0.018879140763139502
100%|██████████| 1/1 [00:00<00:00,  3.53it/s, best loss: -0.018879140763139502]
The Harmonic Mean for K=90, reg=2.2032120017200963e-06, learningrate=6.632438253532254e-06 is -0.04406472449521756
100%|██████████| 1/1 [00:00<00:00,  3.10it/s, best loss: -0.04406472449521756]
The Harmonic Mean for K=

In [None]:
print(f'The best hyperparameters are: K={bestparams[0]},lr = {bestparams[1]},lambda = {bestparams[2]}')
print(f'The harmonic mean from the best parameters is: {bestharmonic}')

The best hyperparameters are: K=83,lr = 1.0556754545881819e-05,lambda = 1.71109214464399e-05
The harmonic mean from the best parameters is: 0.04477442588009661


Testing model against test set to ensure that the model is generalizable.

In [14]:
base2 = BaseMethod.from_splits(train_data = train, 
                              test_data = test,
                              rating_threshold=3.5,
                              seed = SEED,
                              fmt='UIR')

mf5 = MF(k=83, learning_rate = 1.0556754545881819e-05, 
         lambda_reg = 1.71109214464399e-05, use_bias=True,
         verbose=VERBOSE, seed=SEED)
        #  name="MF(K={}, lr = {:.04g}, lambdareg = {:.04g})".format(bestparams[0],bestparams[1],bestparams[2]))

exp = cornac.Experiment(eval_method=base, models=[mf5], metrics=eval_metrics)
exp.run()

  0%|          | 0/20 [00:00<?, ?it/s]

Optimization finished!

TEST:
...
   | NCRR@5 | NDCG@5 | Recall@5 | Train (s) | Test (s)
-- + ------ + ------ + -------- + --------- + --------
MF | 0.0345 | 0.0428 |   0.0676 |    0.1397 |   0.5457



In [None]:
base2 = BaseMethod.from_splits(train_data = train, 
                              test_data = test,
                              rating_threshold=3.5,
                              seed = SEED,
                              fmt='UIR')

mf5 = MF(k=bestparams[0], learning_rate = bestparams[1], 
         lambda_reg = bestparams[2], use_bias=True,
         verbose=VERBOSE, seed=SEED, 
         name="MF(K={}, lr = {:.04g}, lambdareg = {:.04g})".format(bestparams[0],bestparams[1],bestparams[2]))

exp = cornac.Experiment(eval_method=base, models=[mf5], metrics=eval_metrics)
exp.run()

  0%|          | 0/20 [00:00<?, ?it/s]

Optimization finished!

TEST:
...
                                                | NCRR@5 | NDCG@5 | Recall@5 | Train (s) | Test (s)
----------------------------------------------- + ------ + ------ + -------- + --------- + --------
MF(K=83, lr = 1.056e-05, lambdareg = 1.711e-05) | 0.0345 | 0.0428 |   0.0676 |    0.2751 |   1.5019



In [15]:
print('Harmonic mean score of tuned model against test data: {:.4f}'.format(float(hmean([exp.result[0].metric_avg_results['NCRR@5'],
                                                                exp.result[0].metric_avg_results['NDCG@5'],
                                                                exp.result[0].metric_avg_results['Recall@5']]))))

Harmonic mean score of tuned model against test data: 0.0447


The harmonic mean score is sufficiently close to the training score. Model is sufficiently generalizable.

Generate recommendations

In [16]:
userids = mf5.train_set.uid_map
itemids = mf5.train_set.iid_map
item_map = dict()
for key,value in enumerate(itemids):
  item_map[key]=value

In [17]:
from collections import defaultdict

TOP = 5
recs = defaultdict(list)

for UIDX in list(userids.keys()):
  userid=userids[UIDX]
  recommendations,scores = mf5.rank(userid)
  rec = recommendations[:TOP]
  for item in rec:
    recs[UIDX].append(item_map[item])  

In [18]:
df =pd.DataFrame(recs).T.reset_index()
columns = ['userid']
for i in range(5):
  columns.append('recommendation_'+str(i+1))
df.columns = columns
df.head()

Unnamed: 0,userid,recommendation_1,recommendation_2,recommendation_3,recommendation_4,recommendation_5
0,-5YMIME_WEin_by41Bj-3Q,nylon-coffee-roasters-singapore,maxwell-food-centre-singapore-3,toms-palette-singapore,two-men-bagel-house-singapore,hainanese-village-centre-singapore
1,-G1YjYxjDpxOBzFgo36ORA,two-men-bagel-house-singapore,toms-palette-singapore,nylon-coffee-roasters-singapore,amoy-street-food-centre-singapore,maxwell-food-centre-singapore-3
2,-WShM_YFbtG4OcE0vrFVyw,maxwell-food-centre-singapore-3,nylon-coffee-roasters-singapore,tong-heng-singapore-3,toms-palette-singapore,old-airport-road-food-centre-singapore
3,-XaIf12ricWc5z5BRt9nnQ,nylon-coffee-roasters-singapore,toms-palette-singapore,maxwell-food-centre-singapore-3,two-men-bagel-house-singapore,geisha-specialty-coffee-singapore
4,-fUWq6sOIEe1uTUhNKS9sQ,toms-palette-singapore,maxwell-food-centre-singapore-3,nylon-coffee-roasters-singapore,two-men-bagel-house-singapore,ya-kun-kaya-toast-singapore-22


In [19]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab Notebooks

mf_TGS_recs = df.to_csv('mf_lol_recs.csv')

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks
