In [1]:
import pandas as pd
import numpy as np
from scipy import sparse

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('sub_supplier_ratings.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2480 entries, 0 to 2479
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   supplier_id  2480 non-null   int64  
 1   subjects     2480 non-null   object 
 2   score        2480 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 58.2+ KB


## 1. SURPRISE Library

In [3]:
from surprise import Dataset
from surprise import Reader
from surprise import SVD, SVDpp, NormalPredictor, KNNBasic, BaselineOnly, CoClustering
from surprise import SlopeOne, KNNBaseline, KNNWithMeans, KNNWithZScore
from surprise.model_selection import cross_validate

reader = Reader(rating_scale=(0, 1))

data = Dataset.load_from_df(df[['supplier_id', 'subjects', 'score']], reader)

In [4]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
   

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


In [5]:
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNNBaseline,0.137136,0.003,0.042908
KNNWithZScore,0.137319,0.004309,0.034171
BaselineOnly,0.138157,0.001233,0.001638
KNNWithMeans,0.139151,0.002666,0.030563
SlopeOne,0.14142,0.001,0.008429
SVDpp,0.141491,0.438943,0.018212
KNNBasic,0.147297,0.002355,0.026962
SVD,0.154877,0.061298,0.004401
CoClustering,0.158023,0.018635,0.002
NormalPredictor,0.188043,0.00104,0.002871


## 2. Light FM Library

In [6]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score
from scipy import sparse

In [7]:
adj_matrix = sparse.csr_matrix(pd.pivot(df, index=['supplier_id'], columns='subjects', values='score').values)
supplier_features = sparse.csr_matrix(pd.read_csv('supplier_features.csv').values)

FileNotFoundError: [Errno 2] No such file or directory: 'supplier_features.csv'

In [None]:
positive_samples = pd.read_csv('positive_known_ratings.csv')
positive_samples['suppliers__ref'] = positive_samples['suppliers__ref'].astype('int32')
test = pd.pivot(positive_samples, index=['suppliers__ref'], columns='projects__study_types_subject_ids', values='positive_score')

In [None]:
positive_samples[['suppliers__ref', 'projects__study_types_subject_ids']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 417 entries, 0 to 416
Data columns (total 2 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   suppliers__ref                     417 non-null    int32 
 1   projects__study_types_subject_ids  417 non-null    object
dtypes: int32(1), object(1)
memory usage: 5.0+ KB


In [None]:
model1 = LightFM(loss='warp',
                random_state=2016,
                learning_rate=0.01,
                no_components=3)
model1.fit(supplier_features, epochs=10)

# preds = []
# for i in positive_samples[['suppliers__ref', 'projects__study_types_subject_ids']].itertuples():
#     preds.append(model1.predict(int(i[1]), i[2]))

<lightfm.lightfm.LightFM at 0x24d9cba80a0>

## 3. Tensorflow Recommenders

In [None]:
positive_samples['suppliers__ref'] = positive_samples['suppliers__ref'].astype('str')

#### 1. List Wise Ranking

In [None]:
import pprint

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs

In [None]:
features = ['suppliers__ref', 'projects__study_types_subject_ids']
target = []

training_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(positive_samples['suppliers__ref'].values, tf.string),
            tf.cast(positive_samples['projects__study_types_subject_ids'].values, tf.string),
            tf.cast(positive_samples['positive_score'].values, tf.float32)
        )
    )
)

ratings = training_dataset.map(lambda x,y,z: {
    "movie_title": y,
    "user_id": x,
    "user_rating": z,
})
movies = ratings.map(lambda x:x['movie_title'])

unique_movie_titles = positive_samples['projects__study_types_subject_ids'].unique()
unique_user_ids = positive_samples['suppliers__ref'].unique()

In [None]:
tf.random.set_seed(42)

# Split between train and tests sets, as before.
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

# We sample 50 lists for each user for the training data. For each list we
# sample 5 movies from the movies the user rated.
train = tfrs.examples.movielens.sample_listwise(
    train,
    num_list_per_user=50,
    num_examples_per_list=5,
    seed=42
)

test = tfrs.examples.movielens.sample_listwise(
    test,
    num_list_per_user=1,
    num_examples_per_list=5,
    seed=42
)

In [None]:
for example in train.take(1):
  pprint.pprint(example)

{'movie_title': <tf.Tensor: shape=(5,), dtype=string, numpy=
array([b'other', b'grooming_cosmetics', b'marketing_advertising',
       b'finance_legal_insurance', b'entertainment'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'458'>,
 'user_rating': <tf.Tensor: shape=(5,), dtype=float32, numpy=array([0.04, 0.  , 0.02, 0.32, 0.  ], dtype=float32)>}


## Build Model

In [None]:
class RankingModel(tfrs.Model):

  def __init__(self, loss):
    super().__init__()
    embedding_dimension = 32

    # Compute embeddings for users.
    self.user_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=tf.convert_to_tensor(unique_user_ids)),
      tf.keras.layers.Embedding(len(unique_user_ids) + 2, embedding_dimension)
    ])

    # Compute embeddings for movies.
    self.movie_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=tf.convert_to_tensor(unique_movie_titles)),
      tf.keras.layers.Embedding(len(unique_movie_titles) + 2, embedding_dimension)
    ])

    # Compute predictions.
    self.score_model = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
    ])

    self.task = tfrs.tasks.Ranking(
      loss=loss,
      metrics=[
        tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
        tf.keras.metrics.RootMeanSquaredError()
      ]
    )

  def call(self, features):
    # We first convert the id features into embeddings.
    # User embeddings are a [batch_size, embedding_dim] tensor.
    user_embeddings = self.user_embeddings(features["user_id"])

    # Movie embeddings are a [batch_size, num_movies_in_list, embedding_dim]
    # tensor.
    movie_embeddings = self.movie_embeddings(features["movie_title"])

    # We want to concatenate user embeddings with movie emebeddings to pass
    # them into the ranking model. To do so, we need to reshape the user
    # embeddings to match the shape of movie embeddings.
    list_length = features["movie_title"].shape[1]
    user_embedding_repeated = tf.repeat(
        tf.expand_dims(user_embeddings, 1), [list_length], axis=1)

    # Once reshaped, we concatenate and pass into the dense layers to generate
    # predictions.
    concatenated_embeddings = tf.concat(
        [user_embedding_repeated, movie_embeddings], 2)

    return self.score_model(concatenated_embeddings)

  def compute_loss(self, features, training=False):
    labels = features.pop("user_rating")

    scores = self(features)

    return self.task(
        labels=labels,
        predictions=tf.squeeze(scores, axis=-1),
    )

In [None]:
epochs = 3

cached_train = train.shuffle(100_000).batch(2).cache()
cached_test = test.batch(2).cache()

## Mean squared error model

In [None]:
mse_model = RankingModel(tf.keras.losses.MeanSquaredError())
mse_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [None]:
mse_model.fit(cached_train, epochs=epochs, verbose=True)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x29e9eb35c70>

## Pairwise hinge loss model

In [None]:
hinge_model = RankingModel(tfr.keras.losses.PairwiseHingeLoss())
hinge_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [None]:
hinge_model.fit(cached_train, epochs=epochs, verbose=True)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x29ea22dc700>

## Listwise model

In [None]:
listwise_model = RankingModel(tfr.keras.losses.ListMLELoss())
listwise_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [None]:
listwise_model.fit(cached_train, epochs=epochs, verbose=True)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x29ea0211f10>

#### Basic Ranking based Recommenders

In [None]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

##### 1. Preparing the dataset

In [None]:
features = ['suppliers__ref', 'projects__study_types_subject_ids']
target = []

training_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(positive_samples['suppliers__ref'].values, tf.string),
            tf.cast(positive_samples['projects__study_types_subject_ids'].values, tf.string),
            tf.cast(positive_samples['positive_score'].values, tf.float32)
        )
    )
)

ratings = training_dataset.map(lambda x,y,z: {
    "subject_id": y,
    "supplier_id": x,
    "score": z,
})

subjects = ratings.map(lambda x:x['subject_id'])

unique_subjects = positive_samples['projects__study_types_subject_ids'].unique()
unique_suppliers = positive_samples['suppliers__ref'].unique()

In [None]:
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

##### 2. Implementing a model

In [None]:
class RankingModel(tf.keras.Model):

  def __init__(self):
    super().__init__()
    embedding_dimension = 32

    # Compute embeddings for users.
    self.supplier_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_suppliers, mask_token=None),
      tf.keras.layers.Embedding(len(unique_suppliers) + 1, embedding_dimension)
    ])

    # Compute embeddings for movies.
    self.subject_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_subjects, mask_token=None),
      tf.keras.layers.Embedding(len(unique_subjects) + 1, embedding_dimension)
    ])

    # Compute predictions.
    self.ratings = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
  ])

  def call(self, inputs):

    supplier_id, subject_id = inputs

    supplier_embedding = self.supplier_embeddings(supplier_id)
    subject_embedding = self.subject_embeddings(subject_id)

    return self.ratings(tf.concat([supplier_embedding, subject_embedding], axis=1))

In [None]:
class SupplierRecommender(tfrs.models.Model):

  def __init__(self):
    super().__init__()
    self.ranking_model: tf.keras.Model = RankingModel()
    self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
      loss = tf.keras.losses.MeanSquaredError(),
      metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )

  def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
    return self.ranking_model(
        (features["supplier_id"], features["subject_id"]))

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    labels = features.pop("score")

    rating_predictions = self(features)

    # The task computes the loss and the metrics.
    return self.task(labels=labels, predictions=rating_predictions)

In [None]:
cached_train = train.shuffle(100_000).batch(10).cache()
cached_test = test.batch(2).cache()

In [None]:
model = SupplierRecommender()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
model.fit(cached_train, epochs=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x29ea6b87580>

In [None]:
cached_test

<CacheDataset element_spec={'subject_id': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'supplier_id': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'score': TensorSpec(shape=(None,), dtype=tf.float32, name=None)}>

In [None]:
# model.evaluate(cached_test, return_dict=True)

In [None]:
preds = []

for i in positive_samples.itertuples():
    preds.append(model({
        "supplier_id": np.array([i[1]]),
        "subject_id": np.array([i[2]])
        }).numpy().tolist()[0][0])

In [None]:
positive_samples['preds'] = pd.Series(preds)

In [None]:
print(preds)

[0.186411052942276, 0.17048010230064392, 0.40434226393699646, 0.30209293961524963, 0.24737195670604706, 0.3803718090057373, 0.3860608637332916, 0.32912567257881165, 0.27031415700912476, 0.1074674054980278, 0.1137082576751709, 0.1268240064382553, 0.08942921459674835, 0.13704340159893036, 0.09185008704662323, 0.2037433534860611, 0.1362847536802292, 0.11112768948078156, 0.10015000402927399, 0.11726918071508408, 0.13068561255931854, 0.09657150506973267, 0.11846281588077545, 0.12199914455413818, 0.4581640660762787, 0.1459229737520218, 0.09529384970664978, 0.10233353823423386, 0.0840628445148468, 0.1456412374973297, 0.14586901664733887, 0.12258178740739822, 0.36383944749832153, 0.35718849301338196, 0.26017630100250244, 0.2239917665719986, 0.13559921085834503, 0.14297130703926086, 0.24791786074638367, 0.1892600953578949, 0.19448764622211456, 0.20673449337482452, 0.13167281448841095, 0.1906924843788147, 0.10848350077867508, 0.12293143570423126, 0.11321958154439926, 0.13423378765583038, 0.17748

In [None]:
count = 0

for i in positive_samples[['positive_score', 'preds']].itertuples():
    if i[1]-i[2] >= 0.3:
        print(f"Deviation greater than threshold : {i[1], i[2]}\n")
        count+=1

print(f"The total count where Deviation greater than threshold: {count} of {positive_samples.shape[0]}", )

Deviation greater than threshold : (0.86, 0.40434226393699646)

Deviation greater than threshold : (0.86, 0.3860608637332916)

Deviation greater than threshold : (0.7, 0.2037433534860611)

Deviation greater than threshold : (0.43, 0.11726918071508408)

Deviation greater than threshold : (0.67, 0.26017630100250244)

Deviation greater than threshold : (0.51, 0.17748603224754333)

Deviation greater than threshold : (0.56, 0.22913812100887299)

Deviation greater than threshold : (0.68, 0.18800245225429535)

Deviation greater than threshold : (0.66, 0.27682578563690186)

Deviation greater than threshold : (0.68, 0.37486669421195984)

Deviation greater than threshold : (0.74, 0.33969539403915405)

Deviation greater than threshold : (1.0, 0.39411965012550354)

Deviation greater than threshold : (0.75, 0.3112550973892212)

Deviation greater than threshold : (1.0, 0.4491212069988251)

Deviation greater than threshold : (0.79, 0.3574565649032593)

Deviation greater than threshold : (0.99, 0.2977