<a href="https://colab.research.google.com/github/surajghuwalewala/CE888_Data_Science_and_Decision_Making/blob/master/Lab_5/my_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import numpy as np
import pandas as pd

In [29]:
df = pd.read_csv("https://raw.githubusercontent.com/albanda/CE888/master/lab5-recommender/jester-data-1.csv")
df.shape

(24982, 101)

In [30]:
def replace(orig, percentage=0.1):
  """
  Replaces 'percentage'% of the original values in 'orig' with 99's
  :param orig: original data array
  :param percentage: percentage of values to replace (0<percentage<1)
  """
  new_data = orig.copy()
  rated = np.where(orig!=99)
  n_rated = len(rated[0])
  idx = np.random.choice(n_rated, size=int(percentage*n_rated), replace=False)
  new_data[rated[0][idx], rated[1][idx]] = 99
  return new_data, (rated[0][idx], rated[1][idx])

In [31]:
val_set, idx = replace(df.values, 0.1)

In [32]:
def latentFactorModelling(data, n_latent_factors = 2, alpha = 0.0001, iterations=300):

  user_ratings = data.values
  # Initialise as random values
  latent_user_preferences = np.random.random((user_ratings.shape[0], n_latent_factors))
  latent_item_features = np.random.random((user_ratings.shape[1], n_latent_factors))
  
  
  def predict_rating(user_id, item_id):
    """ Predict a rating given a user_id and an item_id.
    """
    user_preference = latent_user_preferences[user_id]
    item_preference = latent_item_features[item_id]
    return user_preference.dot(item_preference)


  def train(user_id, item_id, rating, alpha = alpha):
      
    #print item_id
    prediction_rating = predict_rating(user_id, item_id)
    err =  prediction_rating - rating
    #print err
    user_pref_values = latent_user_preferences[user_id][:]
    latent_user_preferences[user_id] -= alpha * err * latent_item_features[item_id]
    latent_item_features[item_id] -= alpha * err * user_pref_values
    return err
      

  def sgd(iterations=iterations):
    """ Iterate over all users and all items and train for 
        a certain number of iterations
    """
    for iteration in range(iterations):
        error = []
        for user_id in range(latent_user_preferences.shape[0]):
            for item_id in range(latent_item_features.shape[0]):
                rating = user_ratings[user_id][item_id]
                if (rating != 99.0):
                    err = train(user_id, item_id, rating)
                    error.append(err)
        mse = (np.array(error) ** 2).mean()   
        if (iteration % 10) == 0:
            print(mse)

  sgd(iterations)
    
  np.save("{}_latentFactors_{}_iters.npy".format(n_latent_factors, iterations), [latent_user_preferences, latent_item_features ])
  return latent_user_preferences, latent_item_features

In [33]:
# latent_user_preferences, latent_item_features = latentFactorModelling(df, iterations=50,  n_latent_factors = 3)
latent_user_preferences, latent_item_features = np.load("3_latentFactors_50_iters.npy", allow_pickle=True)

In [34]:
latent_user_preferences, latent_item_features

(array([[ 0.9966265 ,  0.79772757,  0.3976957 ],
        [ 0.93894214,  0.98346082, -0.66478696],
        [ 0.1875422 ,  0.71957662,  0.24075053],
        ...,
        [ 0.12617664,  0.21504443,  0.52301555],
        [ 0.2840791 ,  0.18813836,  0.37909168],
        [ 0.45729045,  0.91813345,  0.34218359]]),
 array([[ 4.50287036e+01,  3.91678737e+01,  4.72322628e+01],
        [ 9.58546685e-01,  4.04962612e+00, -3.15922494e+00],
        [ 2.43935256e-01,  4.50750678e+00, -3.83111835e+00],
        [ 1.19163943e+00,  4.09585220e+00, -4.36313636e+00],
        [ 4.94022849e-01,  3.76141790e+00, -5.99534822e+00],
        [ 4.18020259e-01,  3.31445447e+00, -2.82621952e+00],
        [ 4.95802147e-01,  5.72641383e+00, -3.46788928e+00],
        [ 2.29555855e-01,  2.57145801e+00, -3.24395668e+00],
        [ 8.34511413e-02,  1.33557372e+00, -2.24289178e+00],
        [ 6.18734743e-01,  4.07530928e+00, -5.26994597e+00],
        [ 1.26633272e+00,  4.16087433e+00, -3.16145080e+00],
        [ 1.21799740

In [35]:
## Train predictions
predictions = latent_user_preferences.dot(latent_item_features.T)

In [36]:
def calculate_mse(orig, pred):
  error = []
  for user_id in range(orig.shape[0]):
    for item_id in range(orig.shape[1]):
      user_rating = orig[user_id][item_id]
      if(user_rating!=99.0):
        err = pred[user_id][item_id] - user_rating
        error.append(err)

  mse = (np.array(error) ** 2).mean() 

  return mse

In [37]:
## Mean Square Error

mse = calculate_mse(df.values, predictions)

mse

17.79954773195226

In [38]:
df.values.shape

(24982, 101)

In [39]:
## Validation predictions

val_pred = val_set.copy()
val_error = []
for user_id in range(latent_user_preferences.shape[0]):
    for item_id in range(latent_item_features.shape[0]):
        rating = val_set[user_id][item_id]
        if (rating == 99.0):
            prediction_rating = predictions[user_id][item_id]
            val_pred[user_id][item_id] = prediction_rating

val_mse = calculate_mse(df.values, val_pred)

val_mse


1.7821700729109426

In [40]:
## Test Pred
test_pred = df.values.copy()
for user_id in range(latent_user_preferences.shape[0]):
    for item_id in range(latent_item_features.shape[0]):
        rating = val_set[user_id][item_id]
        if (rating == 99.0):
            prediction_rating = predictions[user_id][item_id]
            test_pred[user_id][item_id] = prediction_rating

test_mse = calculate_mse(df.values, test_pred)

test_mse

1.7821700729109426

In [41]:
# values = [zip(df.values[i], predictions[i], val_pred[i], test_pred[i]) for i in range(predictions.shape[0])]
# comparison_data = pd.DataFrame(values)
# comparison_data.columns = df.columns

