<a href="https://colab.research.google.com/github/teyang-lau/coffee-joint-rec-sys/blob/main/MF_LOL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import pandas as pd
import io
username = 'tituslhy'
token = 'xxx'
github_session = requests.Session()
github_session.auth = (username,token)
train_url = 'https://raw.githubusercontent.com/teyang-lau/coffee-joint-rec-sys/main/data/processed/train_lol.csv'
val_url = 'https://raw.githubusercontent.com/teyang-lau/coffee-joint-rec-sys/main/data/processed/val_lol.csv'
test_url = 'https://raw.githubusercontent.com/teyang-lau/coffee-joint-rec-sys/main/data/processed/test_lol.csv'

## Get data

In [2]:
def get_data(url,username=username,token=token):
  download = github_session.get(url).content
  df = pd.read_csv(io.StringIO(download.decode('utf-8')))
  df = df[['shop','userid','rating']]
  data = list(df.itertuples(index=False,name=None))
  return data

In [3]:
train = get_data(train_url)
train[0:5]

[('liho-tea-singapore-117', '-5YMIME_WEin_by41Bj-3Q', 3.0),
 ('old-hen-coffee-bar-singapore-2', '-5YMIME_WEin_by41Bj-3Q', 4.0),
 ('two-men-bagel-house-singapore', '-5YMIME_WEin_by41Bj-3Q', 4.0),
 ('old-airport-road-food-centre-singapore', '-5YMIME_WEin_by41Bj-3Q', 5.0),
 ('the-book-cafe-singapore', '-G1YjYxjDpxOBzFgo36ORA', 3.0)]

In [4]:
val = get_data(val_url)
val[0:5]

[('dutch-colony-coffee-singapore', '-5YMIME_WEin_by41Bj-3Q', 4.0),
 ('the-providore-singapore', '-G1YjYxjDpxOBzFgo36ORA', 1.0),
 ('nylon-coffee-roasters-singapore', '-WShM_YFbtG4OcE0vrFVyw', 4.0),
 ('tiong-bahru-bakery-singapore-4', '-XaIf12ricWc5z5BRt9nnQ', 4.0),
 ('drips-singapore', '-fUWq6sOIEe1uTUhNKS9sQ', 4.0)]

In [5]:
test = get_data(test_url)
test[0:5]

[('luna-singapore', '-5YMIME_WEin_by41Bj-3Q', 3.0),
 ('common-man-coffee-roasters-singapore', '-G1YjYxjDpxOBzFgo36ORA', 2.0),
 ('chye-seng-huat-hardware-singapore', '-WShM_YFbtG4OcE0vrFVyw', 3.0),
 ('chye-seng-huat-hardware-singapore', '-XaIf12ricWc5z5BRt9nnQ', 3.0),
 ('meng-kitchen-singapore', '-fUWq6sOIEe1uTUhNKS9sQ', 5.0)]

## Get packages

In [6]:
!pip install --quiet cornac==1.14.2 adjustText

In [7]:
import os
import sys
import itertools
import json

import scipy.sparse as sp
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from adjustText import adjust_text
%matplotlib inline

import cornac
from cornac.eval_methods import BaseMethod, CrossValidation
from cornac.models import MF

print(f"System version: {sys.version}")
print(f"Cornac version: {cornac.__version__}")

SEED = 42
VERBOSE = True

System version: 3.7.13 (default, Apr 24 2022, 01:04:09) 
[GCC 7.5.0]
Cornac version: 1.14.2


In [8]:
from scipy.stats import hmean

base = BaseMethod.from_splits(train_data = train, 
                              test_data = val,
                              rating_threshold=3.5,
                              seed = SEED,
                              fmt='UIR')

eval_metrics = [
  cornac.metrics.NDCG(k=5),
  cornac.metrics.NCRR(k=5),
  cornac.metrics.Recall(k=5)
]

## Vanilla runs

In [9]:
K=50
lr = 0.001
iter = 1000

mf = MF(k=K, use_bias=True,
        verbose=VERBOSE, seed=SEED, name=f"Vanilla MF(K={K})")
mf2 = MF(k=K, learning_rate = lr, use_bias=True,
        verbose=VERBOSE, seed=SEED, name=f"MF(K={K}, lr = {lr})")
mf3 = MF(k=K, max_iter = iter, use_bias=True,
        verbose=VERBOSE, seed=SEED, name=f"MF(K={K}, num_iter = {iter})")
mf4 = MF(k=K, max_iter=iter, learning_rate = lr, use_bias=True,
        verbose=VERBOSE, seed=SEED, name=f"MF(K={K}, num_iter = {iter},lr = {lr})")

exp = cornac.Experiment(eval_method=base, models=[mf,mf2,mf3,mf4], metrics=eval_metrics).run()

  0%|          | 0/20 [00:00<?, ?it/s]

Optimization finished!


  0%|          | 0/20 [00:00<?, ?it/s]

Optimization finished!


  0%|          | 0/1000 [00:00<?, ?it/s]

Optimization finished!


  0%|          | 0/1000 [00:00<?, ?it/s]

Optimization finished!

TEST:
...
                                     | NCRR@5 | NDCG@5 | Recall@5 | Train (s) | Test (s)
------------------------------------ + ------ + ------ + -------- + --------- + --------
Vanilla MF(K=50)                     | 0.0081 | 0.0093 |   0.0121 |    0.0949 |   0.5433
MF(K=50, lr = 0.001)                 | 0.0101 | 0.0130 |   0.0220 |    0.1016 |   0.1235
MF(K=50, num_iter = 1000)            | 0.0021 | 0.0027 |   0.0029 |    1.3799 |   1.3528
MF(K=50, num_iter = 1000,lr = 0.001) | 0.0020 | 0.0020 |   0.0018 |    1.4225 |   1.0083



MF(K=50,lr=0.001) is the best vanilla run setting. To optimize this model better.

In [19]:
print('Vanilla harmonic mean score: {:.4f}'.format(float(hmean([0.0101,0.0130,0.0220]))))

Vanilla harmonic mean score: 0.0136


## Hyperparameter tuning

In [11]:
!pip install hyperopt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
from hyperopt import tpe,hp,fmin,Trials, STATUS_OK
from scipy.stats import hmean

In [13]:
bestparams=[]
bestharmonic=0

NDCG=cornac.metrics.NDCG(5)
NCRR=cornac.metrics.NCRR(5)
Recall=cornac.metrics.Recall(5)

In [14]:
def ObjectiveF(params):
  global NDCG
  global NCRR
  global Recall
  global bestparams
  global bestharmonic

  latentk=params['latentk']
  learningrate = params['learningrate']
  lambdareg=params['lambdareg']

  mf=MF(k=latentk, 
        max_iter=100, 
        learning_rate=learningrate,
        use_bias = True, 
        lambda_reg=lambdareg,
        seed=SEED)

  test_result, val_result = base.evaluate(model=mf, 
                                          metrics=[NDCG,NCRR,Recall], 
                                          user_based=False, 
                                          show_validation=False)
  
  Sample_NCRR=test_result.metric_avg_results['NCRR@5']
  Sample_NDCG=test_result.metric_avg_results['NDCG@5']
  Sample_Recall=test_result.metric_avg_results['Recall@5']
  Sample_HM=hmean([Sample_NCRR,Sample_NDCG,Sample_Recall])
  loss=-Sample_HM
  reportstring=f"The Harmonic Mean for K={latentk}, reg={lambdareg}, learningrate={learningrate} is {loss}"
  print(reportstring)

  if Sample_HM>bestharmonic:
    bestparams=(latentk, lambdareg,learningrate)
    bestharmonic=Sample_HM
  return loss

In [15]:
%%timeit
trials = Trials()

space = {
    'latentk': hp.choice('latentk', np.arange(50, 100, dtype=int)),
    'learningrate':hp.loguniform('learning_rate',-20,-10),
    'lambdareg': hp.loguniform('lambdareg', -25, -10)
}

best=fmin(
    fn=ObjectiveF,
    space=space,
    algo=tpe.suggest,
    trials=trials,
    max_evals = 1
)

The Harmonic Mean for K=99, reg=3.4255700736465434e-10, learningrate=2.4882126866430996e-07 is -0.011625767174437232
100%|██████████| 1/1 [00:00<00:00,  6.77it/s, best loss: -0.011625767174437232]
The Harmonic Mean for K=92, reg=1.3973160483973113e-08, learningrate=2.3165487893725914e-05 is -0.015354591300341112
100%|██████████| 1/1 [00:00<00:00,  4.09it/s, best loss: -0.015354591300341112]
The Harmonic Mean for K=55, reg=5.967794429618498e-07, learningrate=8.021119489355173e-06 is -0.013508798504387387
100%|██████████| 1/1 [00:00<00:00,  7.62it/s, best loss: -0.013508798504387387]
The Harmonic Mean for K=75, reg=5.273493661804108e-06, learningrate=7.044612826827908e-07 is -0.012370809425137743
100%|██████████| 1/1 [00:00<00:00,  7.98it/s, best loss: -0.012370809425137743]
The Harmonic Mean for K=69, reg=8.789216100732317e-11, learningrate=4.143050026597341e-06 is -0.013508798504387387
100%|██████████| 1/1 [00:00<00:00,  6.74it/s, best loss: -0.013508798504387387]
The Harmonic Mean for

In [16]:
print(f'The best hyperparameters are: K={bestparams[0]},lr = {bestparams[1]},lambda = {bestparams[2]}')
print(f'The harmonic mean from the best parameters is: {bestharmonic}')

The best hyperparameters are: K=60,lr = 3.790032679906342e-06,lambda = 1.9725119838753178e-08
The harmonic mean from the best parameters is: 0.02957886229756361


Testing model against test set to ensure that the model is generalizable.

In [17]:
base2 = BaseMethod.from_splits(train_data = train, 
                              test_data = test,
                              rating_threshold=3.5,
                              seed = SEED,
                              fmt='UIR')

mf5 = MF(k=bestparams[0], learning_rate = bestparams[1], 
         lambda_reg = bestparams[2], use_bias=True,
         verbose=VERBOSE, seed=SEED, 
         name="MF(K={}, lr = {:.04g}, lambdareg = {:.04g})".format(bestparams[0],bestparams[1],bestparams[2]))

exp = cornac.Experiment(eval_method=base, models=[mf5], metrics=eval_metrics).run()

  0%|          | 0/20 [00:00<?, ?it/s]

Optimization finished!

TEST:
...
                                               | NCRR@5 | NDCG@5 | Recall@5 | Train (s) | Test (s)
---------------------------------------------- + ------ + ------ + -------- + --------- + --------
MF(K=60, lr = 3.79e-06, lambdareg = 1.973e-08) | 0.0225 | 0.0267 |   0.0396 |    0.0674 |   0.2832



In [20]:
print('Test harmonic mean score: {:.4f}'.format(float(hmean([0.0225,0.0267,0.0396]))))

Test harmonic mean score: 0.0280


The harmonic mean score is sufficiently close to the training score. Model is sufficiently generalizable.

Testing to see if the model is generalizable by using CV as the evaluation method.

In [21]:
joint = train+val
cv = CrossValidation(joint,
                     n_folds=5,
                     rating_threshold = 3.5,
                     fmt='UIR',
                     seed=SEED
                     )

In [22]:
mf5 = MF(k=bestparams[0], learning_rate = bestparams[1], 
         lambda_reg = bestparams[2], use_bias=True,
         verbose=VERBOSE, seed=SEED, 
         name="MF(K={}, lr = {:.04g}, lambdareg = {:.04g})".format(bestparams[0],bestparams[1],bestparams[2]))

cornac.Experiment(eval_method=cv, models=[mf5], metrics=eval_metrics).run()

  0%|          | 0/20 [00:00<?, ?it/s]

Optimization finished!


  0%|          | 0/20 [00:00<?, ?it/s]

Optimization finished!


  0%|          | 0/20 [00:00<?, ?it/s]

Optimization finished!


  0%|          | 0/20 [00:00<?, ?it/s]

Optimization finished!


  0%|          | 0/20 [00:00<?, ?it/s]

Optimization finished!

TEST:
...
[MF(K=60, lr = 3.79e-06, lambdareg = 1.973e-08)]
       | NCRR@5 | NDCG@5 | Recall@5 | Train (s) | Test (s)
------ + ------ + ------ + -------- + --------- + --------
Fold 0 | 0.0069 | 0.0065 |   0.0056 |    0.0792 |   0.1660
Fold 1 | 0.0033 | 0.0041 |   0.0051 |    0.0737 |   0.2138
Fold 2 | 0.0053 | 0.0076 |   0.0135 |    0.0793 |   0.1613
Fold 3 | 0.0100 | 0.0118 |   0.0148 |    0.0833 |   0.1795
Fold 4 | 0.0124 | 0.0137 |   0.0194 |    0.0758 |   0.1713
------ + ------ + ------ + -------- + --------- + --------
Mean   | 0.0076 | 0.0087 |   0.0117 |    0.0783 |   0.1784
Std    | 0.0033 | 0.0035 |   0.0056 |    0.0033 |   0.0187



In [23]:
avg_NCRR = (0.0069+0.0033+0.0053+0.01+0.0124)/5
avg_NDCG = (0.0065+0.0041+0.0076+0.0118+0.0137)/5
avg_recall = (0.0056+0.0051+0.0135+0.0148+0.0194)/5
print('Average CV harmonic mean score: {:.4f}'.format(float(hmean([avg_NCRR,avg_NDCG,avg_recall]))))

Average CV harmonic mean score: 0.0090
