# Jester data exercise

This file is created as part of requirements in CE888.<br>
**Author** : Tomoko Ayakawa<br>
**Created on**: 4 February 2019<br>
**Last modified**: 13 February 2019

## (1) Import libraries

In [4]:
import pandas as pd
import numpy as np
from IPython.display import Image
np.set_printoptions(precision = 3)

## (2) Load data

In [5]:
joke_rating_df = pd.read_csv("jester-data-1.csv", header=None, index_col=0)
joke_rating_df.head(5)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
74,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,-4.76,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
100,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,9.22,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
49,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
48,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,99.0,1.84,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
91,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,5.73,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


#### Prepare trainig, validation and test datasets

In [6]:
num_users = joke_rating_df.shape[0]
num_jokes = joke_rating_df.shape[1]

In [7]:
# obtain index of cells which are not 99
trg_idx=[]
tst_idx=[]
for i in range (num_users):
    for j in range (num_jokes):
        if joke_rating_df.values[i][j] != 99:
            trg_idx.append ([i,j])
        else:
            tst_idx.append ([i,j])

In [8]:
# shuffule the list of target index
np.random.shuffle(trg_idx)

num_trg_idx = len (trg_idx)
num_trg_idx

1810455

#### Split the data into training and validation datasets

In [9]:
split = num_trg_idx//10
vld_idx = trg_idx[:split]
trg_idx = trg_idx[split:]

print ("Training: %d, Validation: %d, Test: %d" % (len(trg_idx), len(vld_idx), len(tst_idx)))

Training: 1629410, Validation: 181045, Test: 687845


## (3) Latent factor modelling
#### Set random latent factors for users and jokes

In [10]:
n_factors = 2

latent_user_factors = np.random.random((num_users, n_factors))
latent_joke_factors = np.random.random((num_jokes,n_factors))

#### Predict rating
Compute the rating by computing dot product of `latent user factor` and `latent joke factor`.

In [11]:
def predict_rating(user_id,joke_id):
    user_factor = latent_user_factors[user_id]
    joke_factor = latent_joke_factors[joke_id]

    return user_factor.dot(joke_factor)

#### Train the model
1. Compute the rating with current latent factors
2. Update the latent factors proportionally to the error

Argument: `alpha` = learning model (default 0.0001)

In [12]:
def train(user_id, joke_id, rating, mode, alpha = 0.0001):
    pred_rating = predict_rating(user_id, joke_id)
    err = pred_rating - rating
    
    # update latent factors when mode = 0 (training)
    if mode == 0:
        user_factor_values = latent_user_factors[user_id][:]
        latent_user_factors[user_id] -= alpha * err * latent_joke_factors[joke_id]
        latent_joke_factors[joke_id] -= alpha * err * user_factor_values
    
    return err

#### Store the errors

In [13]:
def compute_error(indices, mode):
    errors = []
    for i in indices:
        user_id, joke_id = i[0], i[1]
        rating = joke_rating_np[user_id][joke_id]
        if (not np.isnan (rating)):
            err = train (user_id, joke_id, rating, mode)
            errors.append (err)
    
    return errors

#### Training iteration
Iterate training for `iterations` times.

In [14]:
joke_rating_np = joke_rating_df.values

def sgd(iterations = 300000):
    for i in range(iterations):
        trg_error = compute_error (trg_idx, 0)
        trg_mse = (np.array(trg_error) ** 2).mean()   
        
        if(i%100 == 0 ):
            print ("Training MSE @iteration %6d = %.5f" % (i, trg_mse))

In [None]:
sgd (1000)

Training MSE @iteration      0 = 26.24116
Training MSE @iteration    100 = 17.07654
Training MSE @iteration    200 = 17.08116
Training MSE @iteration    300 = 17.08784
Training MSE @iteration    400 = 17.09644


#### Validate the model

In [None]:
vld_error = compute_error (vld_idx, 1)
vld_mse = (np.array(vld_error) ** 2).mean()   

print ("Validation MSE = %.5f" % vld_mse)

#### Test the model
*True labels for test dataset is not available...*

In [None]:
#tst_error = compute_error (tst_idx, 1)
#tst_mse = (np.array(vld_error) ** 2).mean()   

#print ("Validation MSE = %.5f" % tst_mse)

## (4) The best and the worst rated jokes

In [None]:
joke_rating_df.replace(99, -1, inplace=True)

# best score of each joke
bests = joke_rating_df.max(axis=1)
best_joke = bests.idxmax()

# best score of each joke
worsts = joke_rating_df.min(axis=1)
worst_joke = worsts.idxmin()

print ("Best rated joke is index %d" % best_joke)
print ("Best rated joke is index %d" % worst_joke)