# Libraries

In [1]:
import sys
sys.executable

'/home/avgupta/anaconda2/bin/python2'

In [2]:
#import snap
import collections
from collections import namedtuple
import numpy as np
import pdb
import pandas as pd
from sklearn.metrics import mean_squared_error
import copy
import matplotlib.pylab as plt
%matplotlib inline

In [3]:
np.random.seed(3)

# Defining the Ratings Graph Class

In [4]:
Rating = namedtuple('Rating', ['Rating', 'User', 'Object'])

def isfloat(value):
    try:
        float(value)
        return True
    except ValueError:
        return False

def get_data_set():
    movie_critic_ratings = {}
    
    movie_header_indeces = {}
    for line in open('data/grouplens/movies.dat'):
        line = line.strip()
        if not movie_header_indeces:
            for i, column in enumerate(line.split('\t')):
                movie_header_indeces[column] = i
            continue
        NUM_TOP_CRITICS_REVIEWS_IDX = movie_header_indeces['rtTopCriticsNumReviews']
        TOP_CRITICS_RATING_IDX = movie_header_indeces['rtTopCriticsRating']
        
        movie_id, critic_rating, num_critic_ratings = line.split()[0], line.split()[TOP_CRITICS_RATING_IDX], line.split()[NUM_TOP_CRITICS_REVIEWS_IDX]
        
        if isfloat(num_critic_ratings) and (num_critic_ratings > 10) and isfloat(critic_rating):
            critic_rating = float(critic_rating) / 10.0
            movie_critic_ratings[movie_id] = critic_rating
        
    user_rating_header_indeces = {}
    ratings = []
    for line in open('data/grouplens/user_ratedmovies.dat'):
        line = line.strip()
        if not user_rating_header_indeces:
            for i, column in enumerate(line.split('\t')):
                user_rating_header_indeces[column] = i
            continue
        
        user_id, movie_id, rating = line.split()[0], line.split()[1], line.split()[2]
        rating = float(rating) / 5.0
        ratings.append(Rating(Rating=rating, User=user_id, Object=movie_id))
    
    return ratings, movie_critic_ratings

ratings, movie_critic_ratings = get_data_set()

In [5]:
Rating = namedtuple('Rating', ['Rating', 'User', 'Object'])

class RatingsGraph:
    def __init__(self, ratings, gold_ratings, alpha):
        self.alpha = alpha
        self.ratings = ratings 
        self.gold_ratings_dict = gold_ratings #(object, gold_rating)
        
        self.user_set = set([rating.User for rating in self.ratings]) 
        self.users_lookup = {user: i for i, user in enumerate(self.user_set)}
        self.object_set = set([rating.Object for rating in self.ratings])
        self.objects_lookup = {obj: i for i, obj in enumerate(self.object_set)}

        num_users, num_objects = len(self.user_set), len(self.object_set)
        self.bias_diffs = []
        self.adjacency = np.zeros([num_users, num_objects]) - 1 #set them to -1 bc rating can be 0
        
        for rating in self.ratings:
            idx_user = self.users_lookup[rating.User]
            idx_obj = self.objects_lookup[rating.Object]
            self.adjacency[idx_user, idx_obj] = rating.Rating
        
        self.true_ratings = np.random.uniform(low = 0, high = 1, size = (num_objects, 1))
        self.biases = np.random.uniform(low = -1, high = 1, size = (num_users,1))
        
        self.num_ratings_per_obj = np.sum(self.adjacency >= 0, axis = 0)
        self.num_ratings_per_user = np.sum(self.adjacency >= 0, axis = 1)

        self.gold_ratings = np.zeros([num_objects, 1]) - 1
        for obj, gold_rating in self.gold_ratings_dict.iteritems():
            try:
                obj_idx = self.objects_lookup[obj]
                self.gold_ratings[obj_idx] = gold_rating
            except: continue
    
    def perform_single_iteration(self):
        converged = True
        alpha = self.alpha
        
        original_ratings = self.true_ratings.copy()
        # subtract user biases from all ratings, but only where rating existed in first place!
        updated_ratings = np.multiply((self.adjacency >= 0), self.adjacency - alpha*self.biases)
        self.true_ratings = (1.0/self.num_ratings_per_obj)*\
            np.sum(np.maximum(np.zeros(self.adjacency.shape),
                       np.minimum(np.ones(self.adjacency.shape), updated_ratings))
            , axis = 0)
            
        max_diff = np.max(abs(original_ratings - np.expand_dims(self.true_ratings,1)))
        if max_diff > 0.00001:  converged = False

            
        original_bias = self.biases.copy()
        # subtract true ratings from given ratings - only where a rating was originally given!
        updated_users = np.multiply((self.adjacency >= 0), self.adjacency - self.true_ratings)
        self.biases = (1.0/self.num_ratings_per_user) * np.sum(updated_users, axis=1)
            
        self.true_ratings = np.expand_dims(self.true_ratings, axis=1) # to make the shapes work
        self.biases = np.expand_dims(self.biases, axis=1) # to make the shapes work
            
        bias_diff = abs(original_bias - self.biases)
        if np.max(bias_diff) > 0.00001:
            converged = False
        self.bias_diffs.append(np.sum(bias_diff))
        
        return converged
    
    def get_test_error(self):
        num_gold_ratings = np.sum(self.gold_ratings >= 0)
        pred_ratings = (self.gold_ratings >= 0)*self.true_ratings # if not in gold ratings, set to 0
        gold_ratings_given = (self.gold_ratings >= 0) * self.gold_ratings
        return 1.0/num_gold_ratings * np.sum(np.square(pred_ratings - gold_ratings_given))
            
    def iterate_until_convergence(self):
        errors = []
        converged = False
        counter = 0
        max_iter = 100
        while not converged and counter < max_iter:
            error = self.get_test_error()
            print (error)
            errors.append(error)
            converged = self.perform_single_iteration()
            counter += 1

        return errors

In [None]:
ratings_graph = RatingsGraph(ratings, movie_critic_ratings, alpha=0.1)
errors = ratings_graph.iterate_until_convergence()
alpha_01_y = ratings_graph.bias_diffs

62.10798029703558
61.5052357622723
61.49367298914109
61.493134930998515
61.49308771285084


In [None]:
ratings_graph = RatingsGraph(ratings, movie_critic_ratings, alpha=0.2)
errors = ratings_graph.iterate_until_convergence()
alpha_02_y = ratings_graph.bias_diffs

62.209676444188126


In [None]:
ratings_graph = RatingsGraph(ratings, movie_critic_ratings, alpha=0.5)
errors = ratings_graph.iterate_until_convergence()
alpha_05_y = ratings_graph.bias_diffs

In [None]:
# alpha1_y = [1060.7027209712728, 8.151491760725987, 0.7153737296671119, 0.06850560075491723, 0.006667511431112653]
# alpha2_y = [1070.5935889341972, 4.564030247992877, 0.612904383225149, 0.11645027071102854, 0.022635037211159174, 0.004417496078544927]
# alpha3_y = [1053.1551336990312, 25.10273404110145, 7.154351826286142, 3.1423294052572497, 1.5036110680676322, 0.7318008354801494, 0.35723892532225704, 0.17449089566980958, 0.08524172566812642, 0.041643162313176836, 0.020344054518165375, 0.009938663773250384]

plt.plot([i+1 for i in range(len(alpha_01_y))], [np.log(y) for y in alpha_01_y], 'orange', label='alpha = 0.1')
plt.plot([i+1 for i in range(len(alpha_02_y))], [np.log(y) for y in alpha_02_y], '-b', label='alpha = 0.2')
plt.plot([i+1 for i in range(len(alpha_05_y))], [np.log(y) for y in alpha_05_y], '-g', label='alpha = 0.5')
plt.legend()
plt.title('Bias Convergence Error by Epoch')
plt.xlabel('Epoch Number')
plt.ylabel('Log Bias Convergence Error')

In [None]:
user_ratings_df = pd.read_table('data/grouplens/user_ratedmovies.dat', sep="\t")
movie_ratings_df = pd.read_table('data/grouplens/movies.dat', sep="\t")