<a href="https://colab.research.google.com/github/teyang-lau/coffee-joint-rec-sys/blob/main/WMF_TGS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import pandas as pd
import io
username = 'tituslhy'
token = 'xxx'
github_session = requests.Session()
github_session.auth = (username,token)
train_url = 'https://raw.githubusercontent.com/teyang-lau/coffee-joint-rec-sys/main/data/processed/train_tgs.csv'
val_url = 'https://raw.githubusercontent.com/teyang-lau/coffee-joint-rec-sys/main/data/processed/val_tgs.csv'
test_url = 'https://raw.githubusercontent.com/teyang-lau/coffee-joint-rec-sys/main/data/processed/test_tgs.csv'

## Get data

In [2]:
def get_data(url,username=username,token=token):
  download = github_session.get(url).content
  df = pd.read_csv(io.StringIO(download.decode('utf-8')))
  df = df[['userid','shop','rating']]
  data = list(df.itertuples(index=False,name=None))
  return data

In [3]:
train = get_data(train_url)
train[0:5]

[('BLCH29-jOurbrj1fsaQ3Hw', 'tiffin-singapore', 5.0),
 ('BLCH29-jOurbrj1fsaQ3Hw', 'ntuc-fairprice-singapore-60', 3.0),
 ('BLCH29-jOurbrj1fsaQ3Hw', 'pacific-coffee-singapore-4', 4.0),
 ('BLCH29-jOurbrj1fsaQ3Hw', 'crossroads-café-singapore-2', 5.0),
 ('BLCH29-jOurbrj1fsaQ3Hw', 'ten-rens-tea-singapore', 4.0)]

In [4]:
val = get_data(val_url)
val[0:5]

[('6YUB481VjkkWDkP3XLy-XA',
  'da-paolo-gastronomia-holland-village-singapore-2',
  5.0),
 ('eipO40vDyfHFXug2ElJ7Yg', 'maxwell-food-centre-singapore-3', 5.0),
 ('eipO40vDyfHFXug2ElJ7Yg', 'tolidos-espresso-nook-singapore-3', 5.0),
 ('eipO40vDyfHFXug2ElJ7Yg',
  'chinatown-complex-market-and-food-centre-singapore',
  5.0),
 ('9gZ4pQHdK6v8xMLig6EEFA', 'mahota-commune-singapore', 5.0)]

In [5]:
test = get_data(test_url)
test[0:5]

[('8-16ryk-5pdzUs_6cI_5aw', 'old-airport-road-food-centre-singapore', 4.0),
 ('ZmZk86ubu7Kt7HZ5Gn_8xw', 'tolidos-espresso-nook-singapore-3', 5.0),
 ('H8mXfh5XgGCqmMLwVH7k5A', 'kumoya-singapore', 4.0),
 ('e3fX7_qkoSm-6-yTdlwcXw', 'starbucks-singapore-106', 3.0),
 ('ZNyfGsIwsedPlpjmoaq3Eg', 'killiney-kopitiam-singapore-4', 4.0)]

In [6]:
print(len(train))
print(len(val))
print(len(test))

3052
382
382


## Get packages

In [7]:
!pip install --quiet cornac==1.14.2 adjustText

[K     |████████████████████████████████| 12.4 MB 14.8 MB/s 
[?25h  Building wheel for adjustText (setup.py) ... [?25l[?25hdone


In [8]:
import os
import sys
import itertools
import json

import scipy.sparse as sp
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from adjustText import adjust_text
%matplotlib inline

import cornac
from cornac.eval_methods import BaseMethod, CrossValidation
from cornac.models import WMF

%tensorflow_version 1.x
import tensorflow as tf

print(f"System version: {sys.version}")
print(f"Cornac version: {cornac.__version__}")
print(f"Tensorflow version: {tf.__version__}")

SEED = 42
VERBOSE = True

TensorFlow 1.x selected.
System version: 3.7.13 (default, Apr 24 2022, 01:04:09) 
[GCC 7.5.0]
Cornac version: 1.14.2
Tensorflow version: 1.15.2


In [9]:
base = BaseMethod.from_splits(train_data = train, 
                              test_data = val,
                              rating_threshold=3.5,
                              seed = SEED,
                              fmt='UIR')

eval_metrics = [
  cornac.metrics.NDCG(k=5),
  cornac.metrics.NCRR(k=5),
  cornac.metrics.Recall(k=5)
]

## Vanilla runs

In [12]:
K=50
lr = 0.001
iter = 1000
K2 = 100

#Default params:
# k=200, lambda_u=0.01, lambda_v=0.01, a=1, b=0.01, learning_rate=0.001, batch_size=128, max_iter=100

wmf = WMF(k=K, verbose=VERBOSE, seed=SEED, name=f"Vanilla WMF(K={K})")
wmf2 = WMF(k=K, learning_rate = lr, verbose=VERBOSE, seed=SEED, name=f"WMF(K={K}, lr = {lr})")
wmf3 = WMF(k=K, max_iter = iter, verbose=VERBOSE, seed=SEED, name=f"WMF(K={K}, num_iter = {iter})")
wmf4 = WMF(k=K, max_iter=iter, learning_rate = lr, verbose=VERBOSE, seed=SEED, name=f"WMF(K={K}, num_iter = {iter},lr = {lr})")
wmf5 = WMF(k=K2, verbose=VERBOSE, seed=SEED, name=f"Vanilla WMF(K={K2})")
wmf6 = WMF(k=K2, max_iter=iter, learning_rate = lr, verbose=VERBOSE, seed=SEED, name=f"WMF(K={K2}, num_iter = {iter},lr = {lr})")

exp = cornac.Experiment(eval_method=base, models=[wmf,wmf2,wmf3,wmf4,wmf5,wmf6], metrics=eval_metrics)
exp.run()

  0%|          | 0/100 [00:00<?, ?it/s]

Learning completed!


  0%|          | 0/100 [00:00<?, ?it/s]

Learning completed!


  0%|          | 0/1000 [00:00<?, ?it/s]

Learning completed!


  0%|          | 0/1000 [00:00<?, ?it/s]

Learning completed!


  0%|          | 0/100 [00:00<?, ?it/s]

Learning completed!


  0%|          | 0/1000 [00:00<?, ?it/s]

Learning completed!

TEST:
...
                                       | NCRR@5 | NDCG@5 | Recall@5 | Train (s) | Test (s)
-------------------------------------- + ------ + ------ + -------- + --------- + --------
Vanilla WMF(K=50)                      | 0.0070 | 0.0070 |   0.0067 |    3.1905 |   0.1162
WMF(K=50, lr = 0.001)                  | 0.0070 | 0.0070 |   0.0067 |    2.7456 |   0.0717
WMF(K=50, num_iter = 1000)             | 0.0089 | 0.0099 |   0.0134 |   23.1766 |   0.0751
WMF(K=50, num_iter = 1000,lr = 0.001)  | 0.0089 | 0.0099 |   0.0134 |   23.6208 |   0.0739
Vanilla WMF(K=100)                     | 0.0060 | 0.0055 |   0.0045 |    3.0757 |   0.0797
WMF(K=100, num_iter = 1000,lr = 0.001) | 0.0060 | 0.0055 |   0.0045 |   26.1197 |   0.0713



Increasing the number of iterations improves the score but not the learning rate. Increasing K might have resulted in some overfitting. 

In [13]:
from scipy.stats import hmean

hmeans = []

for i in range(len(exp.result)):
  hmeans.append(float(hmean([exp.result[i].metric_avg_results['NCRR@5'],
                       exp.result[i].metric_avg_results['NDCG@5'],
                       exp.result[i].metric_avg_results['Recall@5']])))
  
print('Vanilla harmonic mean score: {:.4f}'.format(max(hmeans)))

Vanilla harmonic mean score: 0.0104


## Hyperparameter tuning

In [14]:
!pip install hyperopt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
from hyperopt import tpe,hp,fmin,Trials, STATUS_OK

In [59]:
bestparams=[]
bestharmonic=0

NDCG=cornac.metrics.NDCG(5)
NCRR=cornac.metrics.NCRR(5)
Recall=cornac.metrics.Recall(5)

In [51]:
def ObjectiveF(params):
  global NDCG
  global NCRR
  global Recall
  global bestparams
  global bestharmonic

  latentk=params['latentk']
  learningrate = params['learningrate']
  lambda_u=params['lambda_u']
  lambda_v=params['lambda_v']
  b = params['b']
  iter = params['iter']

  wmf=WMF(k=latentk, 
          max_iter=iter, 
          a=1,
          b=b,
          learning_rate=learningrate,
          lambda_u=lambda_u, 
          lambda_v=lambda_v,
          seed=SEED)

  test_result, val_result = base.evaluate(model=wmf, 
                                          metrics=[NDCG,NCRR,Recall], 
                                          user_based=False, 
                                          show_validation=False)
  
  Sample_NCRR=test_result.metric_avg_results['NCRR@5']
  Sample_NDCG=test_result.metric_avg_results['NDCG@5']
  Sample_Recall=test_result.metric_avg_results['Recall@5']
  Sample_HM=hmean([Sample_NCRR,Sample_NDCG,Sample_Recall])
  loss=-Sample_HM
  reportstring=f"The Harmonic Mean for K={latentk}, lambda_u={lambda_u}, lambda_v = {lambda_v}, learningrate={learningrate}, b={b}, max_iter ={iter} is {loss}"
  print(reportstring)

  if Sample_HM>bestharmonic:
    bestparams=(latentk, learningrate, lambda_u, lambda_v,iter,b)
    bestharmonic=Sample_HM
  return loss

In [60]:
%%timeit
trials = Trials()

space = {
    'latentk': hp.choice('latentk', np.arange(50, 100, dtype=int)),
    'learningrate':hp.loguniform('learning_rate',-20,-10),
    'lambda_u': hp.loguniform('lambda_u', -10, -5),
    'lambda_v': hp.loguniform('lambda_v', -10, -5),
    'iter': hp.choice('iter', np.arange(100, 300, dtype=int)),
    'b' : hp.loguniform('b', -10, -5)
}

best=fmin(
    fn=ObjectiveF,
    space=space,
    algo=tpe.suggest,
    trials=trials,
    max_evals = 1
)

  0%|          | 0/1 [00:00<?, ?it/s, best loss: ?]

  0%|          | 0/264 [00:00<?, ?it/s]

Learning completed!
The Harmonic Mean for K=73, lambda_u=0.0001616495457588715, lambda_v = 0.0005792049087065849, learningrate=1.2366538582548875e-06, b=0.00028306932281680746, max_iter =264 is -0.009535806446624205
100%|██████████| 1/1 [00:06<00:00,  6.70s/it, best loss: -0.009535806446624205]
  0%|          | 0/1 [00:00<?, ?it/s, best loss: ?]

  0%|          | 0/164 [00:00<?, ?it/s]

Learning completed!
The Harmonic Mean for K=98, lambda_u=0.002014073267493561, lambda_v = 0.0031411882227429513, learningrate=3.6106244645822966e-06, b=0.0005235310755457856, max_iter =164 is -0.011507777708716154
100%|██████████| 1/1 [00:04<00:00,  4.52s/it, best loss: -0.011507777708716154]
  0%|          | 0/1 [00:00<?, ?it/s, best loss: ?]

  0%|          | 0/151 [00:00<?, ?it/s]

Learning completed!
The Harmonic Mean for K=67, lambda_u=0.00632715702037659, lambda_v = 0.002922678925723872, learningrate=5.851302987627745e-07, b=0.0002794260461294957, max_iter =151 is -0.009496753246753246
100%|██████████| 1/1 [00:03<00:00,  3.82s/it, best loss: -0.009496753246753246]
  0%|          | 0/1 [00:00<?, ?it/s, best loss: ?]

  0%|          | 0/262 [00:00<?, ?it/s]

Learning completed!
The Harmonic Mean for K=61, lambda_u=0.00016808789554199492, lambda_v = 0.004390400834242589, learningrate=6.629562445839125e-09, b=7.892902848810115e-05, max_iter =262 is -0.009496753246753246
100%|██████████| 1/1 [00:06<00:00,  6.32s/it, best loss: -0.009496753246753246]
  0%|          | 0/1 [00:00<?, ?it/s, best loss: ?]

  0%|          | 0/113 [00:00<?, ?it/s]

Learning completed!
The Harmonic Mean for K=86, lambda_u=0.0007416154852006646, lambda_v = 0.0021647864857253425, learningrate=3.0313305340272617e-05, b=8.097854541530788e-05, max_iter =113 is -0.007989949247778835
100%|██████████| 1/1 [00:03<00:00,  3.09s/it, best loss: -0.007989949247778835]
  0%|          | 0/1 [00:00<?, ?it/s, best loss: ?]

  0%|          | 0/150 [00:00<?, ?it/s]

Learning completed!
The Harmonic Mean for K=57, lambda_u=0.00012777087900837076, lambda_v = 0.00026696208243417937, learningrate=2.3396428667629397e-09, b=0.0030162740707384175, max_iter =150 is -0.01595075655307176
100%|██████████| 1/1 [00:04<00:00,  4.38s/it, best loss: -0.01595075655307176]
1 loop, best of 5: 3.15 s per loop


In [61]:
print(f'The best hyperparameters are: K={bestparams[0]},lr = {bestparams[1]},lambda_u = {bestparams[2]}, lambda_v = {bestparams[3]}, max_iter = {bestparams[4]}, b = {bestparams[5]}')
print(f'The harmonic mean from the best parameters is: {bestharmonic}')

The best hyperparameters are: K=57,lr = 2.3396428667629397e-09,lambda_u = 0.00012777087900837076, lambda_v = 0.00026696208243417937, max_iter = 150, b = 0.0030162740707384175
The harmonic mean from the best parameters is: 0.01595075655307176


Testing model against test set to ensure that the model is generalizable.

In [62]:
base2 = BaseMethod.from_splits(train_data = train, 
                              test_data = test,
                              rating_threshold=3.5,
                              seed = SEED,
                              fmt='UIR')

wmf_f = WMF(k=bestparams[0], 
            learning_rate = bestparams[1], 
            lambda_u = bestparams[2], 
            lambda_v = bestparams[3],
            max_iter = bestparams[4],
            b = bestparams[5],
            verbose=VERBOSE, seed=SEED, 
            name="WMF(K={}, lr = {:.04g}, lambda_u = {:.04g},lambda_v ={:.04g}, max_iter = {}, b = {:.04g})".format(bestparams[0],bestparams[1],bestparams[2],bestparams[3],bestparams[4],bestparams[5]))

exp = cornac.Experiment(eval_method=base, models=[wmf_f], metrics=eval_metrics)
exp.run()

  0%|          | 0/150 [00:00<?, ?it/s]

Learning completed!

TEST:
...
                                                                                                | NCRR@5 | NDCG@5 | Recall@5 | Train (s) | Test (s)
----------------------------------------------------------------------------------------------- + ------ + ------ + -------- + --------- + --------
WMF(K=57, lr = 2.34e-09, lambda_u = 0.0001278,lambda_v =0.000267, max_iter = 150, b = 0.003016) | 0.0149 | 0.0154 |   0.0179 |    3.7118 |   0.0705



In [63]:
print('Test harmonic mean score: {:.4f}'.format(float(hmean([exp.result[0].metric_avg_results['NCRR@5'],
                                                             exp.result[0].metric_avg_results['NDCG@5'],
                                                             exp.result[0].metric_avg_results['Recall@5']]))))

Test harmonic mean score: 0.0160


The harmonic mean score is sufficiently close to the training score. Model is sufficiently generalizable.