In [None]:
import numpy as np
import pandas as pd
import torch

In [3]:
import warnings
from datetime import datetime

# NOTE(review): this silences every warning for the rest of the session, including
# deprecation notices from pandas and torch; consider narrowing it to one category.
warnings.filterwarnings('ignore')

# The RBM draws random initial weights and uses Bernoulli sampling, so seed the random
# number generators to make "Restart & Run All" reproducible.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Timestamp captured when this cell runs.
start_time = datetime.now()

In [4]:
class RestrictedBoltzmannMachine():
    """
    Python implementation of a Restricted Boltzmann Machine (RBM) with 'c_nh' hidden nodes and 'c_nv' visible nodes.

    Throughout the class, visible entries with a negative value are treated as "not observed":
    they are restored after each Gibbs step during training and excluded from the loss.
    Data that encodes missing values as 0 is therefore never masked.
    """
    def __init__(self, c_nv, c_nh):
        """
        RBM initialization module where three tensors are defined (all drawn from a standard normal):
        W - Weight tensor, shape (c_nh, c_nv)
        a - Hidden node bias tensor, shape (1, c_nh)
        b - Visible node bias tensor, shape (1, c_nv)
        a and b are created as two-dimensional tensors to accommodate batches of observations over training.
        """
        self.W = torch.randn(c_nh, c_nv)
        self.a = torch.randn(1, c_nh)
        self.b = torch.randn(1, c_nv)


    def sample_h(self, c_vx):
        """
        Method devoted to Gibbs sampling probabilities of hidden nodes given visible nodes - p (h|v)
        c_vx - Input visible node tensor, one row per observation
        Returns a tuple (probabilities p(h|v), Bernoulli sample drawn from those probabilities).
        """
        c_w_vx = torch.mm(c_vx, self.W.t())
        c_activation = c_w_vx + self.a.expand_as(c_w_vx)
        c_p_h_given_v = torch.sigmoid(c_activation)
        return c_p_h_given_v, torch.bernoulli(c_p_h_given_v)


    def sample_v(self, c_hx):
        """
        Method devoted to Gibbs sampling probabilities of visible nodes given hidden nodes - p (v|h)
        c_hx - Input hidden node tensor, one row per observation
        Returns a tuple (probabilities p(v|h), Bernoulli sample drawn from those probabilities).
        """
        c_w_hx = torch.mm(c_hx, self.W)
        c_activation = c_w_hx + self.b.expand_as(c_w_hx)
        c_p_v_given_h = torch.sigmoid(c_activation)
        return c_p_v_given_h, torch.bernoulli(c_p_v_given_h)


    @staticmethod
    def _check_metric(c_metric):
        """Rejects metric names other than 'MAbsE' and 'RMSE' instead of silently reporting a loss of 0."""
        if c_metric not in ('MAbsE', 'RMSE'):
            raise ValueError(f"Unknown metric {c_metric!r}; expected 'MAbsE' or 'RMSE'")


    @staticmethod
    def _batch_loss(c_metric, c_target, c_reconstruction, c_mask):
        """Returns the chosen metric between target and reconstruction over the masked entries, as a Python float."""
        c_error = c_target[c_mask] - c_reconstruction[c_mask]
        if c_metric == 'MAbsE':
            return torch.mean(torch.abs(c_error)).item()
        # 'RMSE' (the metric name has been validated by the caller)
        return torch.sqrt(torch.mean(c_error ** 2)).item()


    def train(self, c_nr_observations, c_nr_epoch, c_batch_size, c_train_tensor, c_metric, c_nr_gibbs_steps=10):
        """
        Method through which contrastive divergence-based training is performed.
        c_nr_observations - Number of observations used for training
        c_nr_epoch - Number of training epochs
        c_batch_size - Batch size
        c_train_tensor - Tensor containing training observations
        c_metric - Training performance metric of choice ('MAbsE' for Mean Absolute Error, 'RMSE' for Root Mean Square Error)
        c_nr_gibbs_steps - Number of Gibbs sampling steps per update (default 10)
        Raises ValueError if c_metric is not one of the supported names.
        The parameters are updated in place; the per-epoch mean batch loss is printed.
        """
        self._check_metric(c_metric)
        print('Training...')
        for c_epoch in range(1, c_nr_epoch + 1):
            c_start_time = datetime.now()
            print(f'Epoch {str(c_epoch)} of {str(c_nr_epoch)} ', end='')
            c_train_loss = 0.
            c_s = 0.
            # Walk over every observation; the final batch may be smaller than c_batch_size.
            for c_id_user in range(0, c_nr_observations, c_batch_size):
                c_batch_end = min(c_id_user + c_batch_size, c_nr_observations)
                c_v0 = c_train_tensor[c_id_user:c_batch_end]
                c_vk = c_v0
                c_ph0,_ = self.sample_h(c_v0)
                for c_k in range(c_nr_gibbs_steps):
                    _,c_hk = self.sample_h(c_vk)
                    # c_vk is rebound to a freshly sampled tensor, so the in-place write
                    # below never touches the training data.
                    _,c_vk = self.sample_v(c_hk)
                    # Keep unobserved (negative) entries frozen at their original value.
                    c_vk[c_v0<0] = c_v0[c_v0<0]
                c_phk,_ = self.sample_h(c_vk)
                # Contrastive-divergence update (implicit learning rate of 1).
                self.W += (torch.mm(c_v0.t(), c_ph0) - torch.mm(c_vk.t(), c_phk)).t()
                self.b += torch.sum((c_v0 - c_vk), 0)
                self.a += torch.sum((c_ph0 - c_phk), 0)
                c_train_loss += self._batch_loss(c_metric, c_v0, c_vk, c_v0>=0)
                c_s += 1.
            c_end_time = datetime.now()
            c_time_elapsed = c_end_time - c_start_time
            c_time_elapsed = c_time_elapsed.total_seconds()
            # With no batches at all (c_nr_observations <= 0) report nan rather than dividing by zero.
            c_mean_loss = c_train_loss/c_s if c_s else float('nan')
            print(f'- Loss ({c_metric}): {c_mean_loss:.8f} ({c_time_elapsed:.2f} seconds)')


    def test(self, c_nr_observations, c_train_tensor, c_test_tensor, c_metric):
        """
        Method through which testing is performed.
        c_nr_observations - Number of observations used for testing
        c_train_tensor - Tensor containing training observations (used as the network input)
        c_test_tensor - Tensor containing testing observations (used as the target)
        c_metric - Testing performance metric of choice ('MAbsE' for Mean Absolute Error, 'RMSE' for Root Mean Square Error)
        Raises ValueError if c_metric is not one of the supported names.
        Prints the mean loss over the observations that have at least one non-negative target entry
        ('nan' when there is none).
        """
        self._check_metric(c_metric)
        print('Testing...')
        c_test_loss = 0.
        c_s = 0.
        for c_id_user in range(c_nr_observations):
            c_v = c_train_tensor[c_id_user:c_id_user+1]
            c_vt = c_test_tensor[c_id_user:c_id_user+1]
            c_observed = c_vt>=0
            if c_observed.any():
                # One Gibbs step: visible -> sampled hidden -> sampled visible.
                _,c_h = self.sample_h(c_v)
                _,c_v = self.sample_v(c_h)
                c_test_loss += self._batch_loss(c_metric, c_vt, c_v, c_observed)
                c_s += 1.
        c_mean_loss = c_test_loss/c_s if c_s else float('nan')
        print(f'Test loss ({c_metric}): {c_mean_loss:.8f}')


    def predict(self, c_visible_nodes):
        """
        Method through which predictions for one specific observation are derived.
        c_visible_nodes - Tensor containing one particular observation (set of values for each visible node)
        Returns p(v|h) computed from the hidden probabilities p(h|v); no sampling is involved.
        """
        c_h_v,_ = self.sample_h(c_visible_nodes)
        c_v_h,_ = self.sample_v(c_h_v)
        return c_v_h

In [5]:
def convert(f_data, f_nr_observations, f_nr_entities):
    """
    Builds a dense user-by-entity table (a list of lists) from a long-format numpy array.

    f_data - Input table (numpy array) whose columns are: user id, entity id, number of hits
    f_nr_observations - Number of observations (users); user ids 1..f_nr_observations are emitted in order
    f_nr_entities - Number of entities, i.e. the length of each per-user list

    Returns one list per user. Position (entity id - 1) holds that user's hits for the entity;
    entities the user never hit stay at 0.
    """
    f_user_column = f_data[:, 0]
    f_rows = []
    for f_user in range(1, f_nr_observations + 1):
        # Rows of the input table that belong to the current user.
        f_mask = f_user_column == f_user
        f_row = np.zeros(f_nr_entities)
        # Entity ids are 1-based, list positions are 0-based.
        f_row[f_data[f_mask, 1].astype(int) - 1] = f_data[f_mask, 2]
        f_rows.append(list(f_row))
    return f_rows

In [6]:
def preferred_recommended(f_artist_list, f_train_set, f_test_set, f_model, f_user_id, f_top=10,
                          f_first_test_user_id=1515):
    """
    Generates music artist recommendations for a particular platform user and prints them.
    f_artist_list - DataFrame of artists; needs an 'artist_id' column and the artist name in the second column
    f_train_set - Tensor containing training observations
    f_test_set - Tensor containing testing observations
    f_model - A RBM machine learning model previously instantiated (only its 'predict' method is used)
    f_user_id - The user (1-based id) for which preferred artists will be assessed and recommendations will be provided
    f_top - Number of most preferred and most recommended music artists for user 'f_user_id'
    f_first_test_user_id - First user id whose observations are read from 'f_test_set';
                           lower ids are read from 'f_train_set' (default 1515)

    If fewer than 'f_top' artists remain once the user's favourites are excluded,
    all remaining artists are printed.
    """
    def _artist_name(f_column_index):
        # Tensor columns are 0-based while artist ids are 1-based.
        f_match = f_artist_list[f_artist_list.artist_id == f_column_index + 1]
        # Explicit positional access: first matching row, second column.
        return f_match.iloc[0, 1]

    if f_user_id < f_first_test_user_id:
        f_source = f_train_set
    else:
        f_source = f_test_set
    f_user_sample = f_source[f_user_id - 1:f_user_id]
    f_prediction = f_model.predict(f_user_sample).numpy()

    # Column indices of the user's f_top highest values, best first.
    f_user_scores = pd.Series(f_user_sample.numpy()[0])
    f_user_scores = f_user_scores.sort_values(ascending=False)
    f_fan_list = f_user_scores.iloc[:f_top].index.values.tolist()
    print(f'\nUser {f_user_id} is a fan of...\n')
    for f_artist_id in f_fan_list:
        print(_artist_name(f_artist_id))

    # Column indices ranked by predicted score, best first.
    f_prediction = pd.Series(f_prediction[0])
    f_prediction = f_prediction.sort_values(ascending=False)
    f_prediction_list = f_prediction.index.values.tolist()
    print(f'\nUser {f_user_id} may be interested in...\n')
    f_fans = set(f_fan_list)
    f_nb_recommendations = 0
    for f_pred_artist in f_prediction_list:
        if f_nb_recommendations >= f_top:
            break
        if f_pred_artist in f_fans:
            continue
        print(_artist_name(f_pred_artist))
        f_nb_recommendations += 1

In [7]:
from google.colab import drive
drive.mount('/content/drive')
!pip install -q kaggle
!pip install -q kaggle-cli
!mkdir -p ~/.kaggle
!cp "/content/drive/MyDrive/kaggle.json" ~/.kaggle/
!cat ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d pcbreviglieri/lastfm-music-artist-scrobbles
!unzip /content/lastfm-music-artist-scrobbles

Mounted at /content/drive
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.4/74.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.4/90.4 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.5/107.5 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.7/49.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.2/147.2 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for kaggle-cli (setup.py) ... [?25l[?25hdone
  [1;31m

In [8]:
# Load the per-user, per-artist scrobble counts; the first CSV row holds the column names.
# NOTE(review): the path is an absolute Colab location (/content); adjust it when running elsewhere.
scrobbles = pd.read_csv('/content/lastfm_user_scrobbles.csv', header = 0)
# Preview the first rows.
scrobbles.head()

Unnamed: 0,user_id,artist_id,scrobbles
0,1,4562,13883
1,1,10191,11690
2,1,494,11351
3,1,6673,10300
4,1,8402,8983


In [9]:
# Min-max normalise the scrobble counts separately for every user so that each user's
# values lie in [0, 1]. `transform` returns a Series aligned with the original row index,
# so the assignment cannot be misaligned by group keys.
scrobbles['scrobbles'] = (
    scrobbles.groupby('user_id')['scrobbles']
    .transform(lambda x: (x - x.min()) / (x.max() - x.min()))
)

# A user whose counts are all identical (max == min) yields 0/0 = NaN above;
# fill those missing values with 0.5.
scrobbles['scrobbles'] = scrobbles['scrobbles'].fillna(0.5)
scrobbles.head()

Unnamed: 0,user_id,artist_id,scrobbles
0,1,4562,1.0
1,1,10191,0.825509
2,1,494,0.798536
3,1,6673,0.714911
4,1,8402,0.610121


In [10]:
# Row-based split of the scrobble table. Per the original author's notes, the first
# `training_size` rows run until userID = 1514 and the remainder starts at userID = 1515.
training_size = 74254

# Slice the frame and keep only the underlying numpy arrays.
training_set = scrobbles.iloc[:training_size, :].values
test_set = scrobbles.iloc[training_size:, :].values

training_set.shape, test_set.shape

((74254, 3), (18538, 3))

In [11]:
# Highest user ID across both splits (column 0), using NumPy's vectorised reduction
# instead of iterating over the arrays element by element.
max_user_id = max(training_set[:, 0].max(), test_set[:, 0].max())

# Highest artist ID across both splits (column 1).
max_artist_id = max(training_set[:, 1].max(), test_set[:, 1].max())

# The arrays hold floats, so convert the maximum user and artist IDs to integers.
nr_users = int(max_user_id)
nr_artists = int(max_artist_id)

In [12]:
# Display the highest user ID and artist ID, which are used as the matrix dimensions below.
nr_users, nr_artists

(1892, 17493)

In [13]:
# Reshape both long-format tables into dense per-user rows (nr_users rows of nr_artists
# values each) with the `convert` helper; artists a user never played stay at 0.
training_set = convert(
    f_data=training_set,
    f_nr_observations=nr_users,
    f_nr_entities=nr_artists,
)
test_set = convert(
    f_data=test_set,
    f_nr_observations=nr_users,
    f_nr_entities=nr_artists,
)

In [14]:
# `convert` returned plain Python lists of rows. Pack each one into a NumPy array first
# (much faster than letting torch walk nested lists) and build a float32 tensor from it.
# `torch.tensor(..., dtype=torch.float32)` replaces the legacy `torch.FloatTensor` constructor.
training_set = torch.tensor(np.asarray(training_set), dtype=torch.float32)

# Same conversion for the test set.
test_set = torch.tensor(np.asarray(test_set), dtype=torch.float32)

In [15]:
# Training hyper-parameters.
nh = 100            # number of hidden units in the RBM
batch_size = 1      # observations per contrastive-divergence update
epoch = 50          # number of passes over the training tensor
metric = 'MAbsE'    # reported loss: 'MAbsE' (mean absolute error) or 'RMSE'

# Number of visible units: one per column (artist) of the training tensor.
nv = training_set.shape[1]

# Instantiate the RBM with nv visible and nh hidden units.
model = RestrictedBoltzmannMachine(c_nv=nv, c_nh=nh)

In [16]:
# Fit the RBM on the training tensor, printing the chosen metric after every epoch.
model.train(
    c_nr_observations=nr_users,
    c_nr_epoch=epoch,
    c_batch_size=batch_size,
    c_train_tensor=training_set,
    c_metric=metric,
)

# Evaluate: training rows are the input, test rows the target.
model.test(
    c_nr_observations=nr_users,
    c_train_tensor=training_set,
    c_test_tensor=test_set,
    c_metric=metric,
)

Training...
Epoch 1 of 50 - Loss (MAbsE): 0.00214087 (47.37 seconds)
Epoch 2 of 50 - Loss (MAbsE): 0.00069013 (48.22 seconds)
Epoch 3 of 50 - Loss (MAbsE): 0.00072597 (47.00 seconds)
Epoch 4 of 50 - Loss (MAbsE): 0.00060555 (48.68 seconds)
Epoch 5 of 50 - Loss (MAbsE): 0.00060204 (48.68 seconds)
Epoch 6 of 50 - Loss (MAbsE): 0.00060220 (47.86 seconds)
Epoch 7 of 50 - Loss (MAbsE): 0.00057565 (48.75 seconds)
Epoch 8 of 50 - Loss (MAbsE): 0.00059644 (48.69 seconds)
Epoch 9 of 50 - Loss (MAbsE): 0.00062342 (47.89 seconds)
Epoch 10 of 50 - Loss (MAbsE): 0.00057898 (48.66 seconds)
Epoch 11 of 50 - Loss (MAbsE): 0.00055928 (48.12 seconds)
Epoch 12 of 50 - Loss (MAbsE): 0.00055221 (48.94 seconds)
Epoch 13 of 50 - Loss (MAbsE): 0.00057799 (48.91 seconds)
Epoch 14 of 50 - Loss (MAbsE): 0.00062145 (48.38 seconds)
Epoch 15 of 50 - Loss (MAbsE): 0.00061552 (48.33 seconds)
Epoch 16 of 50 - Loss (MAbsE): 0.00058836 (48.70 seconds)
Epoch 17 of 50 - Loss (MAbsE): 0.00057588 (48.01 seconds)
Epoch 18 of

In [18]:
# Load the artist lookup table used to print artist names; the first CSV row holds the column names.
# NOTE(review): the path is an absolute Colab location (/content); adjust it when running elsewhere.
artist_list = pd.read_csv('/content/lastfm_artist_list.csv', header = 0)

In [19]:
# Favourites and recommendations for user 1515.
preferred_recommended(
    f_artist_list=artist_list,
    f_train_set=training_set,
    f_test_set=test_set,
    f_model=model,
    f_user_id=1515,
    f_top=10,
)


User 1515 is a fan of...

Glee Cast
Britney Spears
Lady Gaga
Christina Aguilera
Fresno
Beyonce
Nx Zero
Avril Lavigne
Katy Perry
Rihanna

User 1515 may be interested in...

Paramore
Shakira
Miley Cyrus
The Pretty Reckless
Hannah Montana
Cascada
Jordin Sparks
Cheryl Cole
The Saturdays
Reik


In [20]:
# Favourites and recommendations for user 1789.
preferred_recommended(
    f_artist_list=artist_list,
    f_train_set=training_set,
    f_test_set=test_set,
    f_model=model,
    f_user_id=1789,
    f_top=10,
)


User 1789 is a fan of...

Iron Maiden
Megadeth
Tuatha De Danann
Slayer
Korpiklaani
Led Zeppelin
Ac/Dc
Ozzy Osbourne
Matanza
Avenged Sevenfold

User 1789 may be interested in...

Marco Masini
Red Hot Chili Peppers
Black Sabbath
John Lennon
Opeth
Death
Primus
Lamb Of God
Paradise Lost
Metallica


In [22]:
# Favourites and recommendations for user 111.
preferred_recommended(
    f_artist_list=artist_list,
    f_train_set=training_set,
    f_test_set=test_set,
    f_model=model,
    f_user_id=111,
    f_top=10,
)


User 111 is a fan of...

Dudley Taft
Kirk Fletcher
Curtis Salgado
Andy Egert Blues Band
Anders Osborne
Stevie Ray Vaughan And Double Trouble
Lance Lopez
Stevie Ray Vaughan
Joe Bonamassa
Eric Clapton

User 111 may be interested in...

Red Hot Chili Peppers
2pac
Paramore
John Frusciante
Nine Inch Nails
Portishead
The Velvet Underground
Beck
Porcupine Tree
Marilyn Manson
