In [None]:
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
from tqdm import tqdm

import cornac
from cornac.data import Reader
from cornac.eval_methods import BaseMethod, RatioSplit
from cornac.models import Recommender, BPR, WMF

%load_ext autoreload
%autoreload 2
%matplotlib inline
# %tensorflow_version 1.x
import tensorflow as tf

# Print the library versions so the captured output records the environment
# this notebook was run with.
print(f"Cornac version: {cornac.__version__}")
print(f"Tensorflow version: {tf.__version__}")

# Notebook-wide configuration.
# SEED is passed as `seed=` to the Cornac evaluation methods and models below.
SEED = 2020
# NOTE(review): presumably a verbosity switch for the models — confirm it is wired in.
VERBOSE = True
# Same directory string that the cells below use for CSV files and saved models.
MODEL_DIR="D:/CS608 Project/"

Cornac version: 1.7.1
Tensorflow version: 1.15.0


In [None]:
# Load the sampled triplets from the project directory. The split cell below
# reads its user_id, song_id and play_count columns.
# MODEL_DIR (config cell) already ends with "/", so plain concatenation
# yields the same path as the previous hard-coded string.
df = pd.read_csv(MODEL_DIR + 'triplets_sample.csv')

### Stratified split of df into training and testing sets:

In [None]:
from sklearn.model_selection import train_test_split
# Features are the (user, song) pairs; the target is the play count.
X = df[['user_id', 'song_id']].values
Y = df[['play_count']].values

# 80/20 split stratified on play_count so both parts share the same
# play-count distribution. The notebook-wide SEED replaces the duplicated
# literal 2020 so every stochastic step is driven by one constant.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=SEED
)

In [None]:
def _as_triplets(pairs, counts):
    """Glue (user, song) pairs and their play counts into one labelled frame."""
    frame = pd.concat([pd.DataFrame(pairs), pd.DataFrame(counts)], axis=1)
    frame.columns = ['user_id', 'song_id', 'play_count']
    return frame


# Rebuild DataFrames from the split arrays and report their shapes.
triplets_train = _as_triplets(x_train, y_train)
triplets_test = _as_triplets(x_test, y_test)
print(triplets_train.shape, triplets_test.shape)

(655968, 3) (163993, 3)


In [None]:
# Persist both splits next to the source data so later cells can reload them.
# index=False is the documented boolean way to omit the row index
# (index=None only worked because None is falsy).
triplets_train.to_csv(MODEL_DIR + 'triplets_train.csv', index=False)
triplets_test.to_csv(MODEL_DIR + 'triplets_test.csv', index=False)

## Implicit Feedback Model

In [None]:
# Reload the persisted train/test splits. Paths are derived from MODEL_DIR
# instead of repeating the absolute directory.
train_data = pd.read_csv(MODEL_DIR + 'triplets_train.csv')
test_data = pd.read_csv(MODEL_DIR + 'triplets_test.csv')

In [None]:
# base_method = BaseMethod.from_splits(train_data, test_data, exclude_unknowns=True, seed=SEED, verbose=True)

# Evaluation method used for hyper-parameter search: 10% test, 10% validation,
# and the remainder for training.
# seed=SEED pins the random partition. Without it every execution of this cell
# produces a different split, so the BPR and WMF searches were not compared on
# the same data.
# NOTE(review): `early_stop` is not among RatioSplit's documented parameters —
# confirm it has any effect here.
# NOTE(review): the split is drawn from the full `df`, which also contains the
# rows written to triplets_test.csv. Consider tuning on `train_data` only so the
# final test set stays unseen.
rs = RatioSplit(
    data=df.values,
    test_size=0.1,
    val_size=0.1,
    seed=SEED,
    early_stop=True,
    verbose=True,
)

# Ranking metrics reported by every experiment below (cut-off 20 where applicable).
eval_metrics = [
  cornac.metrics.AUC(),
  cornac.metrics.Precision(k=20),
  cornac.metrics.Recall(k=20),
  cornac.metrics.FMeasure(k=20),
  cornac.metrics.NDCG(k=[20]),
  cornac.metrics.NCRR(k=[20]),
  cornac.metrics.MRR(),
  cornac.metrics.MAP()
]

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 41894
Number of items = 21840
Number of ratings = 655967
Max rating = 150.0
Min rating = 5.0
Global mean = 11.0
---
Test data:
Number of users = 33885
Number of items = 17851
Number of ratings = 81991
Number of unknown users = 0
Number of unknown items = 0
---
Validation data:
Number of users = 33931
Number of items = 17891
Number of ratings = 81987
---
Total users = 41894
Total items = 21840


### 1. BPR

In [None]:
from cornac.hyperopt import Discrete, Continuous
from cornac.hyperopt import GridSearch, RandomSearch

# Base BPR model for the search. It is seeded with the notebook-wide SEED
# (previously the literal 2020) to stay consistent with the final model below.
bpr = BPR(max_iter=50, verbose=True, seed=SEED)
auc = cornac.metrics.AUC()

# Wrap BPR model inside GridSearch along with the searching space:
# 3 latent sizes x 2 learning rates x 2 regularisation strengths = 12 settings,
# scored with AUC on the `rs` evaluation method.
gs_bpr = GridSearch(
    model=bpr,
    space=[
        Discrete("k", [20, 40, 60]),
        Discrete('learning_rate', [1e-4, 1e-3]),
        Discrete("lambda_reg", [1e-2, 1e-3]),
    ],
    metric=auc,
    eval_method=rs,
)


In [None]:
# Bundle the BPR grid search, the ratio split and the metric list into one
# experiment, then execute it.
bpr_search_experiment = cornac.Experiment(
    models=[gs_bpr],
    eval_method=rs,
    metrics=eval_metrics,
    user_based=False,
)
bpr_search_experiment.run()


[GridSearch_BPR] Training started!
Evaluating: {'k': 20, 'lambda_reg': 0.001, 'learning_rate': 0.0001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Optimization finished!
Evaluating: {'k': 20, 'lambda_reg': 0.001, 'learning_rate': 0.001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Optimization finished!
Evaluating: {'k': 20, 'lambda_reg': 0.01, 'learning_rate': 0.0001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Optimization finished!
Evaluating: {'k': 20, 'lambda_reg': 0.01, 'learning_rate': 0.001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Optimization finished!
Evaluating: {'k': 40, 'lambda_reg': 0.001, 'learning_rate': 0.0001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Optimization finished!
Evaluating: {'k': 40, 'lambda_reg': 0.001, 'learning_rate': 0.001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Optimization finished!
Evaluating: {'k': 40, 'lambda_reg': 0.01, 'learning_rate': 0.0001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Optimization finished!
Evaluating: {'k': 40, 'lambda_reg': 0.01, 'learning_rate': 0.001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Optimization finished!
Evaluating: {'k': 60, 'lambda_reg': 0.001, 'learning_rate': 0.0001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Optimization finished!
Evaluating: {'k': 60, 'lambda_reg': 0.001, 'learning_rate': 0.001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Optimization finished!
Evaluating: {'k': 60, 'lambda_reg': 0.01, 'learning_rate': 0.0001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Optimization finished!
Evaluating: {'k': 60, 'lambda_reg': 0.01, 'learning_rate': 0.001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Optimization finished!
Best parameter settings: {'k': 20, 'lambda_reg': 0.01, 'learning_rate': 0.0001}
AUC = 0.7717

[GridSearch_BPR] Evaluation started!


HBox(children=(IntProgress(value=0, description='Ranking', max=33885, style=ProgressStyle(description_width='i…




HBox(children=(IntProgress(value=0, description='Ranking', max=33931, style=ProgressStyle(description_width='i…



VALIDATION:
...
               |    AUC |  F1@20 |    MAP |    MRR | NCRR@20 | NDCG@20 | Precision@20 | Recall@20 | Time (s)
-------------- + ------ + ------ + ------ + ------ + ------- + ------- + ------------ + --------- + --------
GridSearch_BPR | 0.7717 | 0.0114 | 0.0186 | 0.0342 |  0.0199 |  0.0293 |       0.0066 |    0.0570 | 550.1325

TEST:
...
               |    AUC |  F1@20 |    MAP |    MRR | NCRR@20 | NDCG@20 | Precision@20 | Recall@20 | Train (s) | Test (s)
-------------- + ------ + ------ + ------ + ------ + ------- + ------- + ------------ + --------- + --------- + --------
GridSearch_BPR | 0.7725 | 0.0112 | 0.0180 | 0.0330 |  0.0191 |  0.0285 |       0.0065 |    0.0559 | 3740.2769 | 630.0454



In [None]:
# Build the evaluation method for the final model from the persisted
# train/test CSV splits (no validation set is passed here).
# NOTE(review): exclude_unknowns=True presumably drops test users/items that
# never appear in training — confirm against the Cornac documentation.
base_method = BaseMethod.from_splits(train_data.values, test_data.values, exclude_unknowns=True, seed=SEED, verbose=True)

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 41892
Number of items = 21840
Number of ratings = 655968
Max rating = 150.0
Min rating = 5.0
Global mean = 11.0
---
Test data:
Number of users = 39733
Number of items = 20775
Number of ratings = 163966
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 41892
Total items = 21840


In [None]:
# Final BPR model using the k / learning_rate / lambda_reg values reported as
# best by the grid search. It trains for 100 iterations (the search used 50).
# The name is a plain string: the previous f-string had no placeholders.
bpr = BPR(
    k=20,
    max_iter=100,
    learning_rate=0.0001,
    lambda_reg=0.01,
    verbose=True,
    seed=SEED,
    name="song_BPR",
)
# Train on the stratified training split and score on the held-out test split.
cornac.Experiment(eval_method=base_method, models=[bpr], metrics=eval_metrics).run()


[song_BPR] Training started!


HBox(children=(IntProgress(value=0), HTML(value='')))


Optimization finished!

[song_BPR] Evaluation started!


HBox(children=(IntProgress(value=0, description='Ranking', max=39733, style=ProgressStyle(description_width='i…



TEST:
...
         |    AUC |  F1@20 |    MAP |    MRR | NCRR@20 | NDCG@20 | Precision@20 | Recall@20 | Train (s) | Test (s)
-------- + ------ + ------ + ------ + ------ + ------- + ------- + ------------ + --------- + --------- + --------
song_BPR | 0.7695 | 0.0181 | 0.0209 | 0.0537 |  0.0267 |  0.0359 |       0.0114 |    0.0586 |   43.5408 | 765.8838



In [None]:
# Save the trained BPR model under the project directory. save() returns the
# path of the written file, which the notebook displays as the cell output.
bpr.save(MODEL_DIR)

song_BPR model is saved to D:/CS608 Project/song_BPR\2020-07-24_20-38-51-267935.pkl


'D:/CS608 Project/song_BPR\\2020-07-24_20-38-51-267935.pkl'

In [None]:
# Get items' latent vectors matrix and export it as CSV.
# index=False is the documented boolean way to omit the row index
# (index=None only worked because None is falsy).
pd.DataFrame(bpr.i_factors).to_csv(MODEL_DIR + 'bpr_item_lf.csv', index=False)

In [None]:
# import pickle

# # save the model to disk
# filename = 'finalized_bpr_model'
# pickle.dump(bpr, open(filename, 'wb'))

# # # load the model from disk
# # loaded_model = pickle.load(open('finalized_bpr_model', 'rb'))

In [None]:
import pickle

# load the model from disk
# The context manager closes the file handle even if unpickling fails; the
# previous bare open() left it open until garbage collection.
# SECURITY: pickle.load can execute arbitrary code — only load files you created.
# NOTE(review): in the visible cells the only code that writes
# 'finalized_bpr_model' is commented out, so this file must already exist in
# the working directory. This also replaces the `bpr` trained above.
with open('finalized_bpr_model', 'rb') as model_file:
    bpr = pickle.load(model_file)

In [None]:
# Pull the id/index lookups off the training set stored on the loaded BPR model.
# Presumably the *_map attributes go from raw id to internal index and the
# *_ids sequences go the other way — verify against Cornac's Dataset API.
bpr_train_set = bpr.train_set
user_id2idx = bpr_train_set.uid_map
user_idx2id = list(bpr_train_set.user_ids)
item_id2idx = bpr_train_set.iid_map
item_idx2id = list(bpr_train_set.item_ids)

In [None]:
# import pandas as pd
# pd.DataFrame(user_idx2id).to_csv('user_idx2id_bpr.csv', index=None)
# pd.DataFrame(item_idx2id).to_csv('item_idx2id_bpr.csv', index=None)

### 2. WMF

In [None]:
from cornac.hyperopt import Discrete, Continuous
from cornac.hyperopt import GridSearch, RandomSearch

# Base WMF model for the search. It is seeded with the notebook-wide SEED
# (previously the literal 2020) to stay consistent with the final model below.
wmf = WMF(a=1.0, lambda_u=0.001, lambda_v=0.001, max_iter=50, verbose=True, seed=SEED)
auc = cornac.metrics.AUC()

# Wrap WMF model inside GridSearch along with the searching space:
# 3 latent sizes x 2 learning rates x 2 values of `b` = 12 settings,
# scored with AUC on the `rs` evaluation method.
gs_wmf = GridSearch(
    model=wmf,
    space=[
        Discrete("k", [20, 40, 60]),
        Discrete('learning_rate', [1e-4, 1e-3]),
        Discrete("b", [1e-2, 1e-3]),
    ],
    metric=auc,
    eval_method=rs,
)

In [None]:
# Bundle the WMF grid search, the ratio split and the metric list into one
# experiment, then execute it.
wmf_search_experiment = cornac.Experiment(
    models=[gs_wmf],
    eval_method=rs,
    metrics=eval_metrics,
    user_based=False,
)
wmf_search_experiment.run()


[GridSearch_WMF] Training started!
Evaluating: {'b': 0.001, 'k': 20, 'learning_rate': 0.0001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Learning completed!
Evaluating: {'b': 0.001, 'k': 20, 'learning_rate': 0.001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Learning completed!
Evaluating: {'b': 0.001, 'k': 40, 'learning_rate': 0.0001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Learning completed!
Evaluating: {'b': 0.001, 'k': 40, 'learning_rate': 0.001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Learning completed!
Evaluating: {'b': 0.001, 'k': 60, 'learning_rate': 0.0001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Learning completed!
Evaluating: {'b': 0.001, 'k': 60, 'learning_rate': 0.001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Learning completed!
Evaluating: {'b': 0.01, 'k': 20, 'learning_rate': 0.0001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Learning completed!
Evaluating: {'b': 0.01, 'k': 20, 'learning_rate': 0.001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Learning completed!
Evaluating: {'b': 0.01, 'k': 40, 'learning_rate': 0.0001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Learning completed!
Evaluating: {'b': 0.01, 'k': 40, 'learning_rate': 0.001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Learning completed!
Evaluating: {'b': 0.01, 'k': 60, 'learning_rate': 0.0001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Learning completed!
Evaluating: {'b': 0.01, 'k': 60, 'learning_rate': 0.001}


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Learning completed!
Best parameter settings: {'b': 0.01, 'k': 40, 'learning_rate': 0.001}
AUC = 0.8656

[GridSearch_WMF] Evaluation started!


HBox(children=(IntProgress(value=0, description='Ranking', max=33858, style=ProgressStyle(description_width='i…




HBox(children=(IntProgress(value=0, description='Ranking', max=33948, style=ProgressStyle(description_width='i…



VALIDATION:
...
               |    AUC |  F1@20 |    MAP |    MRR | NCRR@20 | NDCG@20 | Precision@20 | Recall@20 | Time (s)
-------------- + ------ + ------ + ------ + ------ + ------- + ------- + ------------ + --------- + --------
GridSearch_WMF | 0.8656 | 0.0279 | 0.0370 | 0.0638 |  0.0391 |  0.0639 |       0.0162 |    0.1345 | 443.8617

TEST:
...
               |    AUC |  F1@20 |    MAP |    MRR | NCRR@20 | NDCG@20 | Precision@20 | Recall@20 |  Train (s) | Test (s)
-------------- + ------ + ------ + ------ + ------ + ------- + ------- + ------------ + --------- + ---------- + --------
GridSearch_WMF | 0.8656 | 0.0278 | 0.0364 | 0.0635 |  0.0385 |  0.0630 |       0.0162 |    0.1324 | 35936.0461 | 447.0626



In [None]:
# Rebuild the train/test evaluation method for the final WMF model from the
# persisted CSV splits. The call is identical to the one in the BPR section.
# NOTE(review): exclude_unknowns=True presumably drops test users/items that
# never appear in training — confirm against the Cornac documentation.
base_method = BaseMethod.from_splits(train_data.values, test_data.values, exclude_unknowns=True, seed=SEED, verbose=True)

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 41892
Number of items = 21840
Number of ratings = 655968
Max rating = 150.0
Min rating = 5.0
Global mean = 11.0
---
Test data:
Number of users = 39733
Number of items = 20775
Number of ratings = 163966
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 41892
Total items = 21840


In [None]:
# Final WMF model using the k / b / learning_rate values reported as best by
# the grid search. It trains for 100 iterations (the search used 50).
# The name is a plain string: the previous f-string had no placeholders.
wmf = WMF(
    k=40,
    max_iter=100,
    a=1.0,
    b=0.01,
    learning_rate=0.001,
    lambda_u=0.001,
    lambda_v=0.001,
    verbose=True,
    seed=SEED,
    name="song_WMF",
)

# Train on the stratified training split and score on the held-out test split.
cornac.Experiment(eval_method=base_method, models=[wmf], metrics=eval_metrics).run()


[song_WMF] Training started!


HBox(children=(IntProgress(value=0), HTML(value='')))


Learning completed!

[song_WMF] Evaluation started!


HBox(children=(IntProgress(value=0, description='Ranking', max=39733, style=ProgressStyle(description_width='i…



TEST:
...
         |    AUC |  F1@20 |    MAP |    MRR | NCRR@20 | NDCG@20 | Precision@20 | Recall@20 | Train (s) | Test (s)
-------- + ------ + ------ + ------ + ------ + ------- + ------- + ------------ + --------- + --------- + --------
song_WMF | 0.8674 | 0.0456 | 0.0441 | 0.0991 |  0.0535 |  0.0790 |       0.0290 |    0.1386 | 6173.3920 | 637.8924



In [None]:
# Save the trained WMF model under the project directory. save() returns the
# path of the written file, which the notebook displays as the cell output.
wmf.save(MODEL_DIR)

song_WMF model is saved to D:/CS608 Project/song_WMF\2020-07-24_20-24-37-564571.pkl


'D:/CS608 Project/song_WMF\\2020-07-24_20-24-37-564571.pkl'

In [None]:
# Get items' latent vectors matrix and export it as CSV.
# index=False is the documented boolean way to omit the row index
# (index=None only worked because None is falsy).
pd.DataFrame(wmf.V).to_csv(MODEL_DIR + 'wmf_item_lf.csv', index=False)

In [None]:
# import pickle

# # save the model to disk
# filename = 'finalized_wmf_model'
# pickle.dump(wmf, open(filename, 'wb'))

# # # load the model from disk
# # loaded_model = pickle.load(open('finalized_wmf_model', 'rb'))

In [None]:
import pickle

# load the model from disk
# The context manager closes the file handle even if unpickling fails; the
# previous bare open() left it open until garbage collection.
# SECURITY: pickle.load can execute arbitrary code — only load files you created.
# NOTE(review): in the visible cells the only code that writes
# 'finalized_wmf_model' is commented out, so this file must already exist in
# the working directory. This also replaces the `wmf` trained above.
with open('finalized_wmf_model', 'rb') as model_file:
    wmf = pickle.load(model_file)

In [None]:
# Pull the id/index lookups off the training set stored on the loaded WMF model.
# These reuse the names assigned in the BPR section and so overwrite them.
# Presumably the *_map attributes go from raw id to internal index and the
# *_ids sequences go the other way — verify against Cornac's Dataset API.
wmf_train_set = wmf.train_set
user_id2idx = wmf_train_set.uid_map
user_idx2id = list(wmf_train_set.user_ids)
item_id2idx = wmf_train_set.iid_map
item_idx2id = list(wmf_train_set.item_ids)

In [None]:
# import pandas as pd
# pd.DataFrame(user_idx2id).to_csv('user_idx2id_wmf.csv', index=None)
# pd.DataFrame(item_idx2id).to_csv('item_idx2id_wmf.csv', index=None)