In [8]:
import pandas as pd
import numpy as np
from scipy import sparse

import pprint

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs

import warnings
warnings.filterwarnings("ignore")

In [26]:
positive_samples = pd.read_csv('adj_matrx_v2/01_adjacency_continent_lang_study_subject_weighted_suppliers_average_profit.csv')
suppliers = positive_samples.suppliers_info.values.tolist()
positive_samples = positive_samples.set_index('suppliers_info')
positive_samples = positive_samples.sort_index()
positive_samples = positive_samples.stack().reset_index()

positive_samples.columns = ['supplier_id', 'subjects', 'score']

positive_samples['supplier_id'] = positive_samples['supplier_id'].astype('str')
positive_samples['score'] = positive_samples['score'].astype('float32')

positive_samples = positive_samples[positive_samples.score != 0.0]

positive_samples.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2801 entries, 23 to 33304
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   supplier_id  2801 non-null   object 
 1   subjects     2801 non-null   object 
 2   score        2801 non-null   float32
dtypes: float32(1), object(2)
memory usage: 76.6+ KB


## 1. SURPRISE Library

In [27]:
from surprise import Dataset
from surprise import Reader
from surprise import SVD, SVDpp, NormalPredictor, KNNBasic, BaselineOnly, CoClustering
from surprise import SlopeOne, KNNBaseline, KNNWithMeans, KNNWithZScore
from surprise.model_selection import cross_validate

reader = Reader(rating_scale=(0, 1))

data = Dataset.load_from_df(positive_samples[['supplier_id', 'subjects', 'score']], reader)

In [28]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


In [29]:
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNNBaseline,13.287508,0.011642,0.069972
BaselineOnly,13.291603,0.001,0.002005
SVD,13.297514,0.070772,0.004
KNNBasic,13.300853,0.009185,0.044332
SlopeOne,13.31248,0.003656,0.005011
SVDpp,13.328593,0.228628,0.010751
KNNWithMeans,13.351541,0.015396,0.065041
CoClustering,13.366697,0.074164,0.003345
KNNWithZScore,13.373244,0.02339,0.069141
NormalPredictor,13.441,0.000998,0.003665


## 2. LTR

In [30]:
positive_samples.columns = ['suppliers__ref', 'projects__study_types_subject_ids', 'positive_score']

training_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(positive_samples['suppliers__ref'].values, tf.string),
            tf.cast(positive_samples['projects__study_types_subject_ids'].values, tf.string),
            tf.cast(positive_samples['positive_score'].values, tf.float32)
        )
    )
)

ratings = training_dataset.map(lambda x,y,z: {
    "movie_title": y,
    "user_id": x,
    "user_rating": z,
})
movies = ratings.map(lambda x:x['movie_title'])

unique_movie_titles = positive_samples['projects__study_types_subject_ids'].unique()
unique_user_ids = positive_samples['suppliers__ref'].unique()

In [31]:
tf.random.set_seed(42)

# Split between train and tests sets, as before.
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(100_000)
test = shuffled.skip(80_000).take(20_000)

# We sample 50 lists for each user for the training data. For each list we
# sample 5 movies from the movies the user rated.
train = tfrs.examples.movielens.sample_listwise(
    train,
    num_list_per_user=50,
    num_examples_per_list=5,
    seed=42
)

test = tfrs.examples.movielens.sample_listwise(
    test,
    num_list_per_user=1,
    num_examples_per_list=5,
    seed=42
)

In [32]:
for example in train.take(1):
  pprint.pprint(example)

{'movie_title': <tf.Tensor: shape=(5,), dtype=string, numpy=
array([b'b2b@technology', b'consumer_study@print_social_media',
       b'consumer_study@other', b'consumer_study@entertainment',
       b'b2b@security'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'south-america@por@271'>,
 'user_rating': <tf.Tensor: shape=(5,), dtype=float32, numpy=
array([0.18429855, 0.900848  , 0.49826503, 1.1475626 , 0.2804881 ],
      dtype=float32)>}


## Build Model

In [33]:
class RankingModel(tfrs.Model):

  def __init__(self, loss):
    super().__init__()
    embedding_dimension = 32

    # Compute embeddings for users.
    self.user_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=tf.convert_to_tensor(unique_user_ids)),
      tf.keras.layers.Embedding(len(unique_user_ids) + 2, embedding_dimension)
    ])

    # Compute embeddings for movies.
    self.movie_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=tf.convert_to_tensor(unique_movie_titles)),
      tf.keras.layers.Embedding(len(unique_movie_titles) + 2, embedding_dimension)
    ])

    # Compute predictions.
    self.score_model = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
    ])

    self.task = tfrs.tasks.Ranking(
      loss=loss,
      metrics=[
        tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
        tf.keras.metrics.RootMeanSquaredError()
      ]
    )

  def call(self, features):
    # We first convert the id features into embeddings.
    # User embeddings are a [batch_size, embedding_dim] tensor.
    user_embeddings = self.user_embeddings(features["user_id"])

    # Movie embeddings are a [batch_size, num_movies_in_list, embedding_dim]
    # tensor.
    movie_embeddings = self.movie_embeddings(features["movie_title"])

    # We want to concatenate user embeddings with movie emebeddings to pass
    # them into the ranking model. To do so, we need to reshape the user
    # embeddings to match the shape of movie embeddings.
    list_length = features["movie_title"].shape[1]
    user_embedding_repeated = tf.repeat(
        tf.expand_dims(user_embeddings, 1), [list_length], axis=1)

    # Once reshaped, we concatenate and pass into the dense layers to generate
    # predictions.
    concatenated_embeddings = tf.concat(
        [user_embedding_repeated, movie_embeddings], 2)

    return self.score_model(concatenated_embeddings)

  def compute_loss(self, features, training=False):
    labels = features.pop("user_rating")

    scores = self(features)

    return self.task(
        labels=labels,
        predictions=tf.squeeze(scores, axis=-1),
    )

In [34]:
epochs = 3

cached_train = train.shuffle(100_000).batch(512).cache()
cached_test = test.batch(64).cache()

## MSE Model

In [35]:
mse_model = RankingModel(tf.keras.losses.MeanSquaredError())
mse_model.compile(optimizer=tf.keras.optimizers.Adam(0.01))
callback = tf.keras.callbacks.EarlyStopping(monitor='root_mean_squared_error', patience=3, restore_best_weights=True, verbose=0)

mse_model.fit(cached_train, epochs=epochs, verbose=True, callbacks=[callback])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1fee58a4820>

## Pairwise hinge loss model

In [36]:
hinge_model = RankingModel(tfr.keras.losses.PairwiseHingeLoss())
hinge_model.compile(optimizer=tf.keras.optimizers.Adam(0.01))
callback = tf.keras.callbacks.EarlyStopping(monitor='root_mean_squared_error', patience=3, restore_best_weights=True, verbose=0)

hinge_model.fit(cached_train, epochs=epochs, verbose=True, callbacks=[callback])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1fed7dd5fa0>

## ListWise Model

In [37]:
listwise_model = RankingModel(tfr.keras.losses.ListMLELoss())
listwise_model.compile(optimizer=tf.keras.optimizers.Adam(0.01))

listwise_model.fit(cached_train, epochs=epochs, verbose=True)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1fee59a9ee0>

In [None]:
listwise_model = RankingModel(tfr.keras.losses.ApproxMRRLoss())
listwise_model.compile(optimizer=tf.keras.optimizers.Adam(0.01))

listwise_model.fit(cached_train, epochs=epochs, verbose=True)