<a href="https://colab.research.google.com/github/teyang-lau/coffee-joint-rec-sys/blob/main/CDL_TGS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import numpy as np
import pandas as pd
import io
username = 'shaunaloh'
token = 'xxx'
github_session = requests.Session()
github_session.auth = (username, token)
train_url = 'https://raw.githubusercontent.com/teyang-lau/coffee-joint-rec-sys/main/data/processed/train_tgs.csv'
val_url = 'https://raw.githubusercontent.com/teyang-lau/coffee-joint-rec-sys/main/data/processed/val_tgs.csv'
test_url = 'https://raw.githubusercontent.com/teyang-lau/coffee-joint-rec-sys/main/data/processed/test_tgs.csv'

In [2]:
%tensorflow_version 1.x
import tensorflow as tf

TensorFlow 1.x selected.


In [3]:
!pip install --quiet cornac==1.14.2

Get Data (all columns)

In [4]:
def get_data(url,username=username,token=token):
  download = github_session.get(url).content
  df = pd.read_csv(io.StringIO(download.decode('utf-8')))
  df = df[['shop', 'userid', 'rating', 'text']]
  return df

In [5]:
train = get_data(train_url)
train.head()

Unnamed: 0,shop,userid,rating,text
0,tiffin-singapore,BLCH29-jOurbrj1fsaQ3Hw,5.0,Situated on the corner of Jiak Chuan and Teck ...
1,ntuc-fairprice-singapore-60,BLCH29-jOurbrj1fsaQ3Hw,3.0,Reasonably priced and not too large food and h...
2,pacific-coffee-singapore-4,BLCH29-jOurbrj1fsaQ3Hw,4.0,"The coffee is good, the service extremely frie..."
3,crossroads-café-singapore-2,BLCH29-jOurbrj1fsaQ3Hw,5.0,"Excellent coffee and light lunch place, but th..."
4,ten-rens-tea-singapore,BLCH29-jOurbrj1fsaQ3Hw,4.0,"When tea is your thing, this may be your place..."


In [6]:
val = get_data(val_url)

In [7]:
test= get_data(test_url)

In [8]:
'''
https://github.com/PreferredAI/cornac/blob/master/examples/cdl_example.py
'''
import cornac
from cornac.data import Reader
from cornac.eval_methods import RatioSplit, BaseMethod
from cornac.data import TextModality
from cornac.data.text import BaseTokenizer

In [9]:
#docs: corpus
#shop: item id
docs = train['text'].values.tolist()
item_ids = train['shop'].values.tolist()



In [10]:
merged_ratings = train[['userid','shop','rating']]
merged_records = merged_ratings.to_records(index=False)
merged_result = list(merged_records)

In [11]:
val_merged_ratings = val[['userid','shop','rating']]
val_merged_records = val_merged_ratings.to_records(index=False)
val_merged_result = list(val_merged_records)

In [12]:
item_text_modality = TextModality(
    corpus=docs,
    ids=item_ids,
    tokenizer=BaseTokenizer(stop_words="english"),
    max_vocab=8000,
    max_doc_freq=0.5)

In [13]:
merged_result[:5]

[('BLCH29-jOurbrj1fsaQ3Hw', 'tiffin-singapore', 5.),
 ('BLCH29-jOurbrj1fsaQ3Hw', 'ntuc-fairprice-singapore-60', 3.),
 ('BLCH29-jOurbrj1fsaQ3Hw', 'pacific-coffee-singapore-4', 4.),
 ('BLCH29-jOurbrj1fsaQ3Hw', 'crossroads-café-singapore-2', 5.),
 ('BLCH29-jOurbrj1fsaQ3Hw', 'ten-rens-tea-singapore', 4.)]

In [14]:
item_ids[:5]

['tiffin-singapore',
 'ntuc-fairprice-singapore-60',
 'pacific-coffee-singapore-4',
 'crossroads-café-singapore-2',
 'ten-rens-tea-singapore']

In [15]:
docs[:5]

['Situated on the corner of Jiak Chuan and Teck Lim (off Neil Rd) in a quiet old fashioned part of hinatown, this charming little bar offers a delicious selection of coffee and pastries in the morning. Seating outside available.',
 'Reasonably priced and not too large food and household items supermarket steps away from the Tanjong Pagar MRT station. Good quality and also a small range of organic food and European food - due to the many expats who shop here - next to the Chinese who are certainly by large the majority here, which you can also tell from e.g. the vegetables on offer.',
 "The coffee is good, the service extremely friendly, and the Danish acceptable. Only setback: don't expect here crispy croissants! I still must find them in Singapore anyway. Other than that: this is a cozy coffee place with a large variety of milkshakes, where the inviting seats and magazines urge you to take it easy. Good to start the day, but I see the place is even crowded in the evening - mainly pack

CDL Vanilla Run

In [16]:
# Define a base evaluation method for train and validation sets
SEED=42
VERBOSE=True

base = BaseMethod.from_splits(
    train_data = merged_result, 
    test_data = val_merged_result,
    rating_threshold=3.5,
    item_text = item_text_modality,
    verbose=VERBOSE,
    seed = SEED,
    fmt='UIR')

# Instantiate CDL model
cdl = cornac.models.CDL(
    k=50,
    autoencoder_structure=[200],
    max_iter=100,
    lambda_u=0.1,
    lambda_v=1,
    lambda_w=0.1,
    lambda_n=1000,
    verbose=VERBOSE,
    seed=SEED
)

eval_metrics = [
  cornac.metrics.NDCG(k=5),
  cornac.metrics.NCRR(k=5),
  cornac.metrics.Recall(k=5)
]


rating_threshold = 3.5
exclude_unknowns = False
---
Training data:
Number of users = 426
Number of items = 699
Number of ratings = 3052
Max rating = 5.0
Min rating = 1.0
Global mean = 3.7
---
Test data:
Number of users = 123
Number of items = 242
Number of ratings = 382
Number of unknown users = 40
Number of unknown items = 76
---
Total users = 466
Total items = 775


In [17]:
exp = cornac.Experiment(eval_method=base, models=[cdl], metrics=eval_metrics).run()


[CDL] Training started!


  0%|          | 0/100 [00:00<?, ?it/s]

Learning completed!

[CDL] Evaluation started!


Ranking:   0%|          | 0/123 [00:00<?, ?it/s]


TEST:
...
    | NCRR@5 | NDCG@5 | Recall@5 | Train (s) | Test (s)
--- + ------ + ------ + -------- + --------- + --------
CDL | 0.0088 | 0.0104 |   0.0156 |   62.8410 |   0.1370



In [18]:
from scipy.stats import hmean
print('Vanilla harmonic mean score: {:.4f}'.format(float(hmean([0.0088,0.0104,0.0156]))))

Vanilla harmonic mean score: 0.0110


Hyperparameter Tuning

In [19]:
!pip install hyperopt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [20]:
from hyperopt import tpe,hp,fmin,Trials, STATUS_OK

In [21]:
bestparams=[]
bestharmonic=0

NDCG=cornac.metrics.NDCG(5)
NCRR=cornac.metrics.NCRR(5)
Recall=cornac.metrics.Recall(5)

In [22]:
def ObjectiveF(params):
  global NDCG
  global NCRR
  global Recall
  global bestparams
  global bestharmonic

  latentk=params['latentk']
  autoencoderstructure = params['autoencoderstructure']
  learningrate=params['learningrate']

  cdl_hp= cornac.models.CDL(
    k=latentk,
    autoencoder_structure=[autoencoderstructure],
    max_iter=100,
    lambda_u=0.1,
    lambda_v=1,
    lambda_w=0.1,
    lambda_n=1000,
    learning_rate = learningrate,
    verbose=VERBOSE,
    seed=SEED
)
  

  test_result, val_result = base.evaluate(model=cdl_hp, 
                                          metrics=[NDCG,NCRR,Recall], 
                                          user_based=False, 
                                          show_validation=False)
  
  Sample_NCRR=test_result.metric_avg_results['NCRR@5']
  Sample_NDCG=test_result.metric_avg_results['NDCG@5']
  Sample_Recall=test_result.metric_avg_results['Recall@5']
  Sample_HM=hmean([Sample_NCRR,Sample_NDCG,Sample_Recall])
  loss=-Sample_HM
  reportstring=f"The Harmonic Mean for K={latentk}, autoencoderstructure={autoencoderstructure}, learningrate={learningrate} is {loss}"
  print(reportstring)

  if Sample_HM>bestharmonic:
    bestparams=(latentk, autoencoderstructure, learningrate)
    bestharmonic=Sample_HM
  return loss

In [23]:
%%timeit
trials = Trials()

space = {
    'latentk': hp.choice('latentk', np.arange(50, 100, 10, dtype=int)),
    'autoencoderstructure': hp.choice('autoencoderstructure', np.arange(200, 300, 50, dtype=int)),
    'learningrate': hp.uniform('learningrate', 0.001, 0.01)
}

best=fmin(
    fn=ObjectiveF,
    space=space,
    algo=tpe.suggest,
    trials=trials,
    max_evals = 1
)


[CDL] Training started!
  0%|          | 0/1 [00:00<?, ?it/s, best loss: ?]

  0%|          | 0/100 [00:00<?, ?it/s]

Learning completed!

[CDL] Evaluation started!
  0%|          | 0/1 [00:55<?, ?it/s, best loss: ?]

Ranking:   0%|          | 0/123 [00:00<?, ?it/s]

The Harmonic Mean for K=60, autoencoderstructure=200, learningrate=0.003378394319462715 is -0.009496753246753246
100%|██████████| 1/1 [00:55<00:00, 55.33s/it, best loss: -0.009496753246753246]

[CDL] Training started!
  0%|          | 0/1 [00:00<?, ?it/s, best loss: ?]

  0%|          | 0/100 [00:00<?, ?it/s]

Learning completed!

[CDL] Evaluation started!
  0%|          | 0/1 [00:47<?, ?it/s, best loss: ?]

Ranking:   0%|          | 0/123 [00:00<?, ?it/s]

The Harmonic Mean for K=50, autoencoderstructure=200, learningrate=0.006589398609450872 is -0.014097744360902255
100%|██████████| 1/1 [00:47<00:00, 47.55s/it, best loss: -0.014097744360902255]

[CDL] Training started!
  0%|          | 0/1 [00:00<?, ?it/s, best loss: ?]

  0%|          | 0/100 [00:00<?, ?it/s]

Learning completed!

[CDL] Evaluation started!
  0%|          | 0/1 [00:54<?, ?it/s, best loss: ?]

Ranking:   0%|          | 0/123 [00:00<?, ?it/s]

The Harmonic Mean for K=60, autoencoderstructure=250, learningrate=0.00105286513809381 is -0.005220440655432809
100%|██████████| 1/1 [00:54<00:00, 54.24s/it, best loss: -0.005220440655432809]

[CDL] Training started!
  0%|          | 0/1 [00:00<?, ?it/s, best loss: ?]

  0%|          | 0/100 [00:00<?, ?it/s]

Learning completed!

[CDL] Evaluation started!
  0%|          | 0/1 [00:52<?, ?it/s, best loss: ?]

Ranking:   0%|          | 0/123 [00:00<?, ?it/s]

The Harmonic Mean for K=50, autoencoderstructure=250, learningrate=0.0037924710247855976 is -0.014496455371280081
100%|██████████| 1/1 [00:52<00:00, 52.35s/it, best loss: -0.014496455371280081]

[CDL] Training started!
  0%|          | 0/1 [00:00<?, ?it/s, best loss: ?]

  0%|          | 0/100 [00:00<?, ?it/s]

Learning completed!

[CDL] Evaluation started!
  0%|          | 0/1 [00:52<?, ?it/s, best loss: ?]

Ranking:   0%|          | 0/123 [00:00<?, ?it/s]

The Harmonic Mean for K=80, autoencoderstructure=250, learningrate=0.002941647021442294 is -0.005220440655432809
100%|██████████| 1/1 [00:52<00:00, 52.88s/it, best loss: -0.005220440655432809]

[CDL] Training started!
  0%|          | 0/1 [00:00<?, ?it/s, best loss: ?]

  0%|          | 0/100 [00:00<?, ?it/s]

Learning completed!

[CDL] Evaluation started!
  0%|          | 0/1 [00:54<?, ?it/s, best loss: ?]

Ranking:   0%|          | 0/123 [00:00<?, ?it/s]

The Harmonic Mean for K=80, autoencoderstructure=250, learningrate=0.0039653561078833765 is -0.005220440655432809
100%|██████████| 1/1 [00:54<00:00, 54.66s/it, best loss: -0.005220440655432809]
1 loop, best of 5: 47.6 s per loop


In [25]:
print(f'The best hyperparameters are: K={bestparams[0]},autoencoderstructure = {bestparams[1]},lr = {bestparams[2]}')
print(f'The harmonic mean from the best parameters is: {bestharmonic}')

The best hyperparameters are: K=50,autoencoderstructure = 250,lr = 0.0037924710247855976
The harmonic mean from the best parameters is: 0.014496455371280081


In [26]:
'''
Testing best params on CDL model
'''

cdl_bestparams = cornac.models.CDL(
    k=bestparams[0],
    autoencoder_structure=[bestparams[1]],
    learning_rate=bestparams[2],
    max_iter=100,
    lambda_u=0.1,
    lambda_v=1,
    lambda_w=0.1,
    lambda_n=1000,
    verbose=VERBOSE,
    seed=SEED
)

exp_bestparams = cornac.Experiment(eval_method=base, models=[cdl_bestparams], metrics=eval_metrics).run()


[CDL] Training started!


  0%|          | 0/100 [00:00<?, ?it/s]

Learning completed!

[CDL] Evaluation started!


Ranking:   0%|          | 0/123 [00:00<?, ?it/s]


TEST:
...
    | NCRR@5 | NDCG@5 | Recall@5 | Train (s) | Test (s)
--- + ------ + ------ + -------- + --------- + --------
CDL | 0.0112 | 0.0138 |   0.0223 |   55.9680 |   0.1257



In [27]:
print('Hyperopt harmonic mean score: {:.4f}'.format(float(hmean([0.0112,0.0138,0.0223]))))

Hyperopt harmonic mean score: 0.0145
