In [5]:
import numpy as np
import pandas as pd
import torch
import seaborn as sns
from sklearn.metrics import mean_squared_error

In [6]:
import warnings
from datetime import datetime

# NOTE(review): this silences every warning category for the whole kernel session,
# including pandas/sklearn deprecation and chained-assignment warnings.
warnings.simplefilter('ignore')
# Wall-clock reference taken when the notebook starts running
start_time = datetime.now()

In [7]:
class RestrictedBoltzmannMachine():
    """
    Python implementation of a Restricted Boltzmann Machine (RBM) with 'c_nh' hidden nodes and 'c_nv' visible nodes.
    Parameters are plain tensors that 'train' updates in place; values below 0 in an observation are treated as missing.
    """
    def __init__(self, c_nv, c_nh):
        """
        RBM initialization module where three tensors are defined:
        W - Weight tensor, shape (c_nh, c_nv)
        a - Hidden node bias tensor, shape (1, c_nh)
        b - Visible node bias tensor, shape (1, c_nv)
        a and b are created as two-dimensional tensors to accommodate batches of observations over training.
        """
        self.W = torch.randn(c_nh, c_nv)
        self.a = torch.randn(1, c_nh)
        self.b = torch.randn(1, c_nv)


    def sample_h(self, c_vx):
        """
        Method devoted to Gibbs sampling probabilities of hidden nodes given visible nodes - p (h|v)
        c_vx - Input visible node tensor, one observation per row
        Returns a tuple (activation probabilities, Bernoulli samples drawn from those probabilities).
        """
        c_w_vx = torch.mm(c_vx, self.W.t())
        c_activation = c_w_vx + self.a.expand_as(c_w_vx)
        c_p_h_given_v = torch.sigmoid(c_activation)
        return c_p_h_given_v, torch.bernoulli(c_p_h_given_v)


    def sample_v(self, c_hx):
        """
        Method devoted to Gibbs sampling probabilities of visible nodes given hidden nodes - p (v|h)
        c_hx - Input hidden node tensor, one observation per row
        Returns a tuple (activation probabilities, Bernoulli samples drawn from those probabilities).
        """
        c_w_hx = torch.mm(c_hx, self.W)
        c_activation = c_w_hx + self.b.expand_as(c_w_hx)
        c_p_v_given_h = torch.sigmoid(c_activation)
        return c_p_v_given_h, torch.bernoulli(c_p_v_given_h)


    @staticmethod
    def _metric_value(c_target, c_output, c_metric):
        """
        Error between two equally shaped tensors, returned as a Python float.
        c_metric - 'MAbsE' (Mean Absolute Error) or 'RMSE' (Root Mean Square Error).
        Any other metric name yields 0.0; no error is raised.
        """
        if c_metric == 'MAbsE':
            return torch.mean(torch.abs(c_target - c_output)).item()
        if c_metric == 'RMSE':
            return torch.mean((c_target - c_output) ** 2).item() ** 0.5
        return 0.0


    def train(self, c_nr_observations, c_nr_epoch, c_batch_size, c_train_tensor, c_metric, c_nr_gibbs_steps=10):
        """
        Method through which contrastive divergence-based training is performed.
        c_nr_observations - Number of observations used for training
        c_nr_epoch - Number of training epochs
        c_batch_size - Batch size
        c_train_tensor - Tensor containing training observations
        c_metric - Training performance metric of choice ('MAbsE' for Mean Absolute Error, 'RMSE' for Root Mean Square Error)
        c_nr_gibbs_steps - Number of Gibbs sampling steps per batch (defaults to 10)
        Raises ValueError when c_batch_size or c_nr_gibbs_steps is smaller than 1.
        """
        if c_batch_size < 1 or c_nr_gibbs_steps < 1:
            raise ValueError('c_batch_size and c_nr_gibbs_steps must both be at least 1')
        print('Training...')
        for c_epoch in range(1, c_nr_epoch + 1):
            c_start_time = datetime.now()
            print(f'Epoch {str(c_epoch)} of {str(c_nr_epoch)} ', end='')
            c_train_loss = 0.
            c_s = 0.
            # Visit every batch start so the trailing observations are trained on too;
            # the last batch is simply shorter when the sizes do not divide evenly.
            for c_id_user in range(0, c_nr_observations, c_batch_size):
                c_batch_end = min(c_id_user + c_batch_size, c_nr_observations)
                c_v0 = c_train_tensor[c_id_user:c_batch_end]
                c_vk = c_v0
                c_missing = c_v0 < 0
                c_rated = c_v0 >= 0
                c_ph0,_ = self.sample_h(c_v0)
                for c_k in range(c_nr_gibbs_steps):
                    _,c_hk = self.sample_h(c_vk)
                    _,c_vk = self.sample_v(c_hk)
                    # Keep missing entries frozen at their original value during the chain
                    c_vk[c_missing] = c_v0[c_missing]
                c_phk,_ = self.sample_h(c_vk)
                # Update from the difference between data-driven and chain-driven statistics
                self.W += (torch.mm(c_v0.t(), c_ph0) - torch.mm(c_vk.t(), c_phk)).t()
                self.b += torch.sum((c_v0 - c_vk), 0)
                self.a += torch.sum((c_ph0 - c_phk), 0)
                c_train_loss += self._metric_value(c_v0[c_rated], c_vk[c_rated], c_metric)
                c_s += 1.
            c_end_time = datetime.now()
            c_time_elapsed = c_end_time - c_start_time
            c_time_elapsed = c_time_elapsed.total_seconds()
            # No batches (e.g. zero observations) leaves nothing to average
            c_mean_loss = c_train_loss / c_s if c_s else float('nan')
            print(f'- Loss ({c_metric}): {c_mean_loss:.8f} ({c_time_elapsed:.2f} seconds)')


    def test(self, c_nr_observations, c_train_tensor, c_test_tensor, c_metric):
        """
        Method through which testing is performed.
        c_nr_observations - Number of observations used for testing
        c_train_tensor - Tensor containing training observations (used as the model input)
        c_test_tensor - Tensor containing testing observations (used as the target)
        c_metric - Training performance metric of choice ('MAbsE' for Mean Absolute Error, 'RMSE' for Root Mean Square Error)
        Observations whose test row has no value >= 0 are skipped; if all are skipped the printed loss is nan.
        """
        print('Testing...')
        c_test_loss = 0.
        c_s = 0.
        for c_id_user in range(c_nr_observations):
            c_v = c_train_tensor[c_id_user:c_id_user+1]
            c_vt = c_test_tensor[c_id_user:c_id_user+1]
            c_rated = c_vt >= 0
            if bool(c_rated.any()):
                # One hidden/visible pass starting from the training row
                _,c_h = self.sample_h(c_v)
                _,c_v = self.sample_v(c_h)
                c_test_loss += self._metric_value(c_vt[c_rated], c_v[c_rated], c_metric)
                c_s += 1.
        c_mean_loss = c_test_loss / c_s if c_s else float('nan')
        print(f'Test loss ({c_metric}): {c_mean_loss:.8f}')


    def predict(self, c_visible_nodes):
        """
        Method through which predictions for one or more observations are derived.
        c_visible_nodes - Tensor containing the observations (set of values for each visible node), one per row
        Returns the visible-node probabilities obtained from the hidden-node probabilities (no sampling involved).
        """
        c_h_v,_ = self.sample_h(c_visible_nodes)
        c_v_h,_ = self.sample_v(c_h_v)
        return c_v_h

In [8]:
def convert(f_data, f_nr_observations, f_nr_entities, f_index_base=1):
    """
    Generates (from a numpy array) a list of lists containing the number of hits per user (rows), per entity (columns).
    Each of the constituent lists will correspond to an observation / user (row).
    Each observation list will contain the number of hits (columns), one for each hit entity
    f_data - Input table (numpy array) with columns [user ID, entity ID, hits]
    f_nr_observations - Number of observations
    f_nr_entities - Number of entities hit in each observation
    f_index_base - Smallest user / entity ID (defaults to 1, i.e. IDs run from 1 upwards)

    Row r of the result holds user ID r + f_index_base; entity ID e is written to column e - f_index_base.
    User IDs outside f_index_base .. f_index_base + f_nr_observations - 1 are left out, and an entity ID
    below f_index_base produces a negative column index, which numpy resolves from the end of the row.
    """
    f_user_ids = f_data[:, 0]
    f_converted_data = []
    for f_id_user in range(f_index_base, f_nr_observations + f_index_base):
        # One boolean mask per user, shared by the entity and hit look-ups
        f_user_rows = f_user_ids == f_id_user
        f_id_entity = f_data[:, 1][f_user_rows].astype(int)
        f_hits = np.zeros(f_nr_entities)
        f_hits[f_id_entity - f_index_base] = f_data[:, 2][f_user_rows]
        f_converted_data.append(list(f_hits))
    return f_converted_data

In [9]:
# Size tag shared by the four pickle file names
k = 1000
_pkl_suffix = str(k) + '.pkl'
# NOTE(review): read_pickle unpickles arbitrary Python objects — only load files from a trusted source
train_df = pd.read_pickle('r_movie_train_df_' + _pkl_suffix)
test_df = pd.read_pickle('r_movie_test_df_' + _pkl_suffix)
users = pd.read_pickle('r_movie_user_data_' + _pkl_suffix)
contents = pd.read_pickle('r_movie_content_data_' + _pkl_suffix)
# Row counts of the four tables
tuple(len(frame) for frame in (train_df, test_df, users, contents))

(4532, 1005, 329, 1229)

In [10]:
# Preview of the loaded training interactions (one row per user/title pair with its interaction level)
train_df

Unnamed: 0,SUB_ID,A_TITLE_ID,SUB_ID_index,A_TITLE_ID_index,interaction_level
0,s_7HbK,at_8kP,0,0,5
1,s_22RD,at_1lg,1,1,5
2,s_6COn,at_7cZ,2,2,5
3,s_6Hbf,at_bKD,3,3,1
4,s_6M3j,at_7y,4,4,1
...,...,...,...,...,...
4527,s_6iZg,at_8kP,229,0,3
4528,s_JAx,at_aht,258,157,5
4529,s_OQq,at_4Nh,52,43,3
4530,s_r1h,at_12d,249,910,1


In [11]:
# Keep only the model inputs. .copy() gives an independent frame, so the column assignments below
# never write into a slice of the frame that was loaded.
train_df = train_df[['SUB_ID_index','A_TITLE_ID_index','interaction_level']].copy()
# Per-user min-max scaling to [0, 1]; transform() returns values aligned with train_df's own index
_user_levels = train_df.groupby('SUB_ID_index')['interaction_level']
_user_min = _user_levels.transform('min')
_user_max = _user_levels.transform('max')
train_df['interaction_level'] = (train_df['interaction_level'] - _user_min) / (_user_max - _user_min)
# A user whose levels are all equal gives 0/0 = NaN; place those rows at the midpoint
train_df['interaction_level'] = train_df['interaction_level'].fillna(0.5)
train_df.head()

Unnamed: 0,SUB_ID_index,A_TITLE_ID_index,interaction_level
0,0,0,1.0
1,1,1,1.0
2,2,2,1.0
3,3,3,0.0
4,4,4,0.0


In [12]:
# Sanity check: number of missing values left in each column after the fillna above
train_df.isna().sum()

SUB_ID_index         0
A_TITLE_ID_index     0
interaction_level    0
dtype: int64

In [13]:
# Keep only the model inputs. .copy() gives an independent frame, so the column assignments below
# never write into a slice of the frame that was loaded.
test_df = test_df[['SUB_ID_index','A_TITLE_ID_index','interaction_level']].copy()
# NOTE(review): each user's test levels are scaled with the min/max of that user's *test* rows,
# not the training ones, so the two splits are not guaranteed to share a scale — confirm this is intended.
_user_levels = test_df.groupby('SUB_ID_index')['interaction_level']
_user_min = _user_levels.transform('min')
_user_max = _user_levels.transform('max')
test_df['interaction_level'] = (test_df['interaction_level'] - _user_min) / (_user_max - _user_min)
# A user whose levels are all equal gives 0/0 = NaN; place those rows at the midpoint
test_df['interaction_level'] = test_df['interaction_level'].fillna(0.5)
test_df.head()

Unnamed: 0,SUB_ID_index,A_TITLE_ID_index,interaction_level
0,17,222,0.0
1,0,222,0.0
2,255,222,0.0
3,191,222,1.0
4,17,356,0.5


In [14]:
# Working names for the two splits; later cells reassign these names step by step
training_set, test_set = train_df, test_df

In [15]:
# Read from the DataFrames rather than from the names being reassigned, so re-running this cell
# does not fail on an ndarray that has no .values attribute.
training_set = train_df.values
test_set = test_df.values

training_set.shape, test_set.shape

((4532, 3), (1005, 3))

In [16]:
# Column 0 holds user indices and column 1 title indices; sizes are the largest index in either split plus one
nr_users = int(max(training_set[:,0].max(), test_set[:,0].max())) + 1
nr_artists = int(max(training_set[:,1].max(), test_set[:,1].max())) + 1
nr_users, nr_artists

(329, 1229)

In [17]:
# NOTE(review): convert() walks user IDs 1..nr_users and stores entity e in column e - 1, while the
# *_index columns in these frames start at 0. As called here, rows for user index 0 are never selected
# and title index 0 lands in the last column. Later cells appear to compensate for the shift — verify
# before changing either side.
# Built from the DataFrames (not from the reassigned names) so the cell can be re-run safely.
training_set = convert(train_df.values, nr_users, nr_artists)
test_set = convert(test_df.values, nr_users, nr_artists)
len(test_set),len(test_set[0])

(329, 1229)

In [18]:
# Turn both user-by-title tables into float tensors for the RBM
training_set, test_set = map(torch.FloatTensor, (training_set, test_set))

In [19]:
# Visible layer size = number of columns in one training row
nv = len(training_set[0])
nh = 100
batch_size = 1
epoch = 50
metric = 'RMSE'
seed = 42

# Weight initialisation (torch.randn) and Gibbs sampling (torch.bernoulli) both draw from PyTorch's
# global RNG; seeding it makes Restart & Run All reproduce the same model.
torch.manual_seed(seed)
model = RestrictedBoltzmannMachine(nv, nh)

In [20]:
# Fit the RBM on every row of the training tensor, then report the chosen metric on the test tensor.
# test() feeds each user's training row through the model and scores the result against that user's test row.
model.train(nr_users, epoch, batch_size, training_set, metric)
model.test(nr_users, training_set, test_set, metric)

Training...
Epoch 1 of 50 - Loss (RMSE): 0.13731763 (0.65 seconds)
Epoch 2 of 50 - Loss (RMSE): 0.11183197 (0.54 seconds)
Epoch 3 of 50 - Loss (RMSE): 0.10865120 (0.51 seconds)
Epoch 4 of 50 - Loss (RMSE): 0.10681102 (0.53 seconds)
Epoch 5 of 50 - Loss (RMSE): 0.10562892 (0.50 seconds)
Epoch 6 of 50 - Loss (RMSE): 0.10689188 (0.51 seconds)
Epoch 7 of 50 - Loss (RMSE): 0.10511919 (0.50 seconds)
Epoch 8 of 50 - Loss (RMSE): 0.10477584 (0.52 seconds)
Epoch 9 of 50 - Loss (RMSE): 0.10489936 (0.50 seconds)
Epoch 10 of 50 - Loss (RMSE): 0.10330936 (0.51 seconds)
Epoch 11 of 50 - Loss (RMSE): 0.10155431 (0.50 seconds)
Epoch 12 of 50 - Loss (RMSE): 0.10120176 (0.51 seconds)
Epoch 13 of 50 - Loss (RMSE): 0.10018340 (0.51 seconds)
Epoch 14 of 50 - Loss (RMSE): 0.09967983 (0.52 seconds)
Epoch 15 of 50 - Loss (RMSE): 0.09975517 (0.50 seconds)
Epoch 16 of 50 - Loss (RMSE): 0.09784777 (0.50 seconds)
Epoch 17 of 50 - Loss (RMSE): 0.09849181 (0.50 seconds)
Epoch 18 of 50 - Loss (RMSE): 0.09612572 (0.5

In [21]:
def preferred_recommended(f_content,f_user, f_train_set, f_test_set, f_model, f_users, f_top=10, f_exclude_watched=False):
    """
    Scores every entity for the requested users with a trained RBM and keeps the best-scored ones per user.
    f_content - DataFrame with columns 'A_TITLE_ID_index' and 'A_TITLE_ID'
    f_user - DataFrame with columns 'SUB_ID_index' and 'SUB_ID'
    f_train_set - Tensor containing training observations; the row for user index u is read from position u - 1
    f_test_set - Tensor containing testing observations (kept for interface compatibility, not used here)
    f_model - Model exposing predict(tensor) that returns one score per entity for each input row
    f_users - Iterable of user indices ('SUB_ID_index' values) to score
    f_top - Number of best-scored entities kept per user in the recommendation table
    f_exclude_watched - When True, entities with a positive training interaction are left out of the recommendations

    Returns a tuple (predictions, out, watched_contents):
    predictions - every (SUB_ID_index, A_TITLE_ID_index, preds) triple that was scored
    out - the top 'f_top' rows per user, sorted by score, with 'A_TITLE_ID' and 'SUB_ID' attached
    watched_contents - training interactions greater than 0 for the requested users

    Column c of the tensors is reported as entity index (c + 1) modulo the number of columns.
    """
    # Position of each requested user inside f_train_set
    f_row_ids = [x - 1 for x in f_users]
    f_user_sample = f_train_set[f_row_ids]

    f_prediction = f_model.predict(f_user_sample).numpy()
    f_user_sample = f_user_sample.numpy()
    # Taken from the data itself rather than from a notebook-level global
    f_nr_entities = f_user_sample.shape[1]

    # Maps a row position in the sample back to the user index that was requested
    f_row_to_user = pd.DataFrame({'level_0': range(len(f_row_ids)),
                                  'SUB_ID_index': [x + 1 for x in f_row_ids]})

    def _to_long(f_matrix, f_value_name):
        # Reshape a (user x entity) matrix into one row per user/entity pair
        f_long = pd.DataFrame(f_matrix).stack().reset_index()
        f_long = f_long.merge(f_row_to_user, on='level_0', how='left').drop('level_0', axis=1)
        f_long = f_long.rename(columns={'level_1': 'A_TITLE_ID_index', 0: f_value_name})
        f_long['A_TITLE_ID_index'] = (f_long['A_TITLE_ID_index'] + 1) % f_nr_entities
        return f_long

    watched_contents = _to_long(f_user_sample, 'interaction_level')
    watched_contents['watched'] = 1
    watched_contents = watched_contents[watched_contents.interaction_level>0]

    predictions = _to_long(f_prediction, 'preds')
    # Left join: 'watched' is NaN for pairs without a positive training interaction
    predictions = predictions.merge(watched_contents, on=['SUB_ID_index','A_TITLE_ID_index'], how='left')

    f_candidates = predictions[predictions['watched'].isna()] if f_exclude_watched else predictions
    out = f_candidates.sort_values(by='preds', ascending=False).groupby('SUB_ID_index').head(f_top)
    out = out.merge(f_content, on='A_TITLE_ID_index', how='left')
    out = out.merge(f_user, on='SUB_ID_index', how='left')
    out = out[['SUB_ID_index','A_TITLE_ID_index','preds','A_TITLE_ID','SUB_ID']]

    predictions = predictions[['SUB_ID_index','A_TITLE_ID_index','preds']]
    return predictions,out, watched_contents

In [22]:
# Distinct user indices present in the test split
test_users = test_df['SUB_ID_index'].unique()
test_users

array([ 17,   0, 255, 191, 201, 204, 325, 228, 147, 262, 126,  88,  10,
         4,  74,  75, 244, 101, 104,   1, 251,  93,  24, 155,  50,  39,
        19, 142,   5,  58, 302, 310,  53,  66, 295, 227,  30,  69, 196,
       222, 281, 148,  60, 113,  22, 266, 149, 246, 205,  35, 254, 183,
        23, 257,  33, 185, 195, 324, 153, 238, 265, 158, 264,  97, 305,
        67,  80,  63, 314, 143, 309,  92, 318, 263, 114, 296,  89, 270,
       119, 326,  32,  41, 130, 199, 283,   9, 219, 163,   7, 107,  57,
       267, 150, 110, 172, 284, 252, 300, 308, 229, 212,  42, 225, 174,
        73,  28, 157, 274, 292, 272, 182, 223,  47, 250, 243,  34, 190,
       224, 117, 301,   3, 312,  18,  81, 151,  40, 193, 112,  14, 100,
        59, 231, 122, 120,  20, 253, 211,  56, 166, 313,  16, 132,  48,
       319, 322, 200, 288, 216, 279, 129,  26, 307, 109, 237, 278, 105,
       181, 180, 206, 146, 294, 213, 236, 268, 221, 173,   6, 103, 102,
        70, 178, 145, 210,  71, 164,  44, 159, 177, 108, 275, 27

In [23]:
# Score the test-split users with the trained model; the trailing 5 is passed as f_top.
# Returns the full score table, the per-user recommendation table and the users' positive training interactions.
future_predictions, future_recommendations, watched_contents = preferred_recommended(contents, users,training_set, test_set, model, test_users, 5)
future_recommendations

Unnamed: 0,SUB_ID_index,A_TITLE_ID_index,preds,A_TITLE_ID,SUB_ID
0,210,80,1.000000,at_7vF,s_5sVP
1,182,887,1.000000,at_aiP,s_1gXQ
2,45,921,0.999999,at_1kB,s_7BkO
3,169,155,0.999999,at_4nh,s_1MQr
4,160,520,0.999997,at_7yu,s_1KzQ
...,...,...,...,...,...
1465,237,647,0.031456,at_7b0,s_2kgr
1466,85,444,0.031452,at_8u,s_5rlb
1467,55,262,0.027050,at_10n,s_BGc
1468,90,98,0.022615,at_4JT,s_I6q


In [24]:
# Displays the test user indices again (same array as when test_users was created)
test_users

array([ 17,   0, 255, 191, 201, 204, 325, 228, 147, 262, 126,  88,  10,
         4,  74,  75, 244, 101, 104,   1, 251,  93,  24, 155,  50,  39,
        19, 142,   5,  58, 302, 310,  53,  66, 295, 227,  30,  69, 196,
       222, 281, 148,  60, 113,  22, 266, 149, 246, 205,  35, 254, 183,
        23, 257,  33, 185, 195, 324, 153, 238, 265, 158, 264,  97, 305,
        67,  80,  63, 314, 143, 309,  92, 318, 263, 114, 296,  89, 270,
       119, 326,  32,  41, 130, 199, 283,   9, 219, 163,   7, 107,  57,
       267, 150, 110, 172, 284, 252, 300, 308, 229, 212,  42, 225, 174,
        73,  28, 157, 274, 292, 272, 182, 223,  47, 250, 243,  34, 190,
       224, 117, 301,   3, 312,  18,  81, 151,  40, 193, 112,  14, 100,
        59, 231, 122, 120,  20, 253, 211,  56, 166, 313,  16, 132,  48,
       319, 322, 200, 288, 216, 279, 129,  26, 307, 109, 237, 278, 105,
       181, 180, 206, 146, 294, 213, 236, 268, 221, 173,   6, 103, 102,
        70, 178, 145, 210,  71, 164,  44, 159, 177, 108, 275, 27

In [25]:
def evaluation(future_recommendations, train_df,test_df, k=5, predictions=pd.DataFrame(), algo=''):
    """
    Precision / recall / F1 at k of a recommendation table against held-out interactions.
    future_recommendations - DataFrame with 'SUB_ID_index', 'A_TITLE_ID_index' and 'preds' (the recommended pairs)
    train_df - Training interactions; pairs present here are removed from test_df before scoring
    test_df - Held-out interactions with 'SUB_ID_index', 'A_TITLE_ID_index' and 'interaction_level'
    k - Number of recommendations assumed per user when computing precision
    predictions - Optional DataFrame of scores for all pairs; when non-empty the RMSE against
                  'interaction_level' is printed as well (None is treated like an empty frame)
    algo - Label stored in the 'algo' column of the result

    A hit is a remaining test pair that also appears in future_recommendations.
    precision = hits / (distinct test users * k), recall = hits / remaining test pairs.
    Each ratio is reported as 0 when its denominator is 0.
    Returns a one-row DataFrame with columns 'algo', 'precision', 'recall:' and 'F1_score:' (percentages).
    """
    pair_cols = ['SUB_ID_index','A_TITLE_ID_index']

    # Remove pairs the user already interacted with during training
    watched = train_df[pair_cols].drop_duplicates().assign(watched=1)
    test_df = test_df.merge(watched, on=pair_cols, how='left')
    test_df = test_df[test_df.watched.isna()]

    # Left join: 'preds' stays NaN for test pairs that were not recommended
    matched = test_df.merge(future_recommendations[pair_cols + ['preds']], on=pair_cols, how='left')
    nr_hits = len(matched[~matched.preds.isna()])
    nr_slots = len(test_df.SUB_ID_index.unique()) * k
    nr_relevant = len(test_df)

    precision = nr_hits / nr_slots if nr_slots else 0.0
    recall = nr_hits / nr_relevant if nr_relevant else 0.0
    f1_score = 2*recall*precision/(recall + precision) if (recall + precision) else 0.0

    if predictions is not None and len(predictions)>0:
        future_ratings = test_df.merge(predictions[pair_cols + ['preds']], on=pair_cols, how='left')
        # Square root taken explicitly: the 'squared' keyword is not accepted by every scikit-learn release
        rmse = np.sqrt(mean_squared_error(future_ratings.interaction_level, future_ratings.preds))
        print('rmse:',rmse)

    print('Tp:', nr_hits )
    print('k: ',k,' precision:', round(precision*100,3),'%')
    print('k: ',k,' recall:', round(recall*100,3),'%')
    print('k: ',k,' f1_score:', round(f1_score*100,3),'%')

    # Key names (including the trailing colons) are kept as-is because they become the result's column names
    out = {'algo':[algo],'precision':[round(precision*100,3)],
           'recall:':[round(recall*100,3)],
           'F1_score:':[round(f1_score*100,3)]}
    return pd.DataFrame(out)

In [26]:
# Precision / recall / F1 at k=5 of the RBM recommendations; no score table is passed, so no RMSE is printed
result_rbm = evaluation(future_recommendations,train_df,test_df, k=5, algo='RBM')
result_rbm

Tp: 20
k:  5  precision: 1.361 %
k:  5  recall: 1.99 %
k:  5  f1_score: 1.616 %


Unnamed: 0,algo,precision,recall:,F1_score:
0,RBM,1.361,1.99,1.616


In [27]:
# Spot check: positive training interactions recorded for user index 89
watched_contents.loc[watched_contents['SUB_ID_index'] == 89]

Unnamed: 0,A_TITLE_ID_index,interaction_level,SUB_ID_index,watched
93410,7,1.0,89,1
93444,41,0.5,89,1
93445,42,1.0,89,1
93457,54,1.0,89,1
93460,57,1.0,89,1
93463,60,1.0,89,1
93467,64,1.0,89,1
93575,172,1.0,89,1
93600,197,1.0,89,1
93635,232,1.0,89,1


In [28]:
# Spot check: every score for user index 89, best first
future_predictions.loc[future_predictions['SUB_ID_index'] == 89].sort_values('preds', ascending=False)

Unnamed: 0,SUB_ID_index,A_TITLE_ID_index,preds
93446,89,43,0.630748
93430,89,27,0.335896
93442,89,39,0.333362
93558,89,155,0.293794
93502,89,99,0.272871
...,...,...,...
93831,89,428,0.000002
93497,89,94,0.000001
94127,89,724,0.000001
93560,89,157,0.000001


In [29]:
# NOTE(review): this pickles the whole model object, so loading the file later presumably needs the
# RestrictedBoltzmannMachine class to be defined under the same module path — TODO confirm with consumers.
# Saving only the W, a and b tensors would be more portable.
torch.save(model, 'rbm_model.pkl')