In [None]:
# Useful starting lines
%matplotlib inline

import numpy as np
import scipy
import scipy.io
import scipy.sparse as sp
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [None]:
from helpers import load_data, preprocess_data

# Load the training ratings with the project's `load_data` helper and report
# the matrix dimensions. The previous bare `sp.find(ratings)` call was dropped:
# its result was discarded, so it only cost time.
path_dataset = "data_train.csv"
ratings = load_data(path_dataset)
print(ratings.shape)

In [None]:
from plots import plot_raw_data

# plot_raw_data returns the per-user and per-item rating counts that the
# train/test split below uses for filtering.
num_items_per_user, num_users_per_item = plot_raw_data(ratings)

# Smallest number of ratings held by any single user / any single item.
fewest_items_per_user = min(num_items_per_user)
fewest_users_per_item = min(num_users_per_item)
summary_template = "min # of items per user = {}, min # of users per item = {}."
print(summary_template.format(fewest_items_per_user, fewest_users_per_item))

In [None]:
def split_data(ratings, num_items_per_user, num_users_per_item,
               min_num_ratings, p_test=0.1):
    """Filter out sparse users/items, then randomly split the ratings.

    Args:
        ratings: scipy sparse rating matrix indexed as (item, user).
        num_items_per_user: array with the number of ratings of each user
            (one entry per column of ``ratings``).
        num_users_per_item: array with the number of ratings of each item
            (one entry per row of ``ratings``).
        min_num_ratings: users and items are kept only if they have at
            least this many ratings.
        p_test: fraction of the remaining ratings assigned to the test set.

    Returns:
        Tuple ``(valid_ratings, train, test)``. ``valid_ratings`` is the
        filtered matrix; ``train`` and ``test`` are CSR matrices of the same
        shape that hold disjoint subsets of its nonzero entries.
    """
    # Fixed seed so that the split is reproducible from run to run.
    np.random.seed(988)

    # Keep only the users (columns) and items (rows) with enough ratings.
    valid_users = np.where(num_items_per_user >= min_num_ratings)[0]
    valid_items = np.where(num_users_per_item >= min_num_ratings)[0]
    valid_ratings = ratings[valid_items, :][:, valid_users]

    # sp.find yields (row, column, value) triplets; rows are items and
    # columns are users in this matrix layout.
    item_idx, user_idx, rates = sp.find(valid_ratings)

    shuffled = np.random.permutation(len(rates))
    # Slice bounds must be integers: a float bound raises TypeError on
    # current NumPy versions.
    num_test = int(p_test * len(shuffled))
    test_sel = shuffled[:num_test]
    train_sel = shuffled[num_test:]

    def _subset(selection):
        """Build a CSR matrix holding only the selected ratings."""
        return sp.coo_matrix(
            (rates[selection], (item_idx[selection], user_idx[selection])),
            shape=valid_ratings.shape).tocsr()

    test = _subset(test_sel)
    train = _subset(train_sel)

    print("Total number of nonzero elements in original data:{v}".format(v=ratings.nnz))
    print("Total number of nonzero elements in processed data:{v}".format(v=valid_ratings.nnz))
    print("Total number of nonzero elements in train data:{v}".format(v=train.nnz))
    print("Total number of nonzero elements in test data:{v}".format(v=test.nnz))
    return valid_ratings, train, test

In [None]:
from plots import plot_train_test_data

# Drop users/items with fewer than 10 ratings, hold out 10% of what remains
# as the test set, and visualise the two resulting matrices.
valid_ratings, train, test = split_data(
    ratings,
    num_items_per_user,
    num_users_per_item,
    min_num_ratings=10,
    p_test=0.1,
)
plot_train_test_data(train, test)

## Implementing Baselines 

In [None]:
# (row, column, value) triplets of every stored rating; rows are items and
# columns are users.
values = sp.find(ratings)
items, users, rates = values

# Number of distinct users, items and rating values, in that order.
for column in (users, items, rates):
    print(np.unique(column).shape)

In [None]:
from helpers import calculate_mse

def baseline_global_mean(train, test):
    """Baseline: predict the mean of all training ratings for every test entry.

    Args:
        train: sparse matrix holding the training ratings.
        test: sparse matrix holding the test ratings.

    Returns:
        Tuple ``(error, global_mean)`` where ``error`` is the value returned
        by the project's ``calculate_mse`` helper for the test ratings
        against the constant prediction.
    """
    # Only the stored values matter here; row/column indices are ignored.
    _, _, train_ratings = sp.find(train)
    _, _, test_ratings = sp.find(test)

    global_mean = train_ratings.mean()
    error = calculate_mse(test_ratings, global_mean)
    return error, global_mean

# Evaluate the global-mean baseline on the split; the cell output is the
# (error, mean) pair returned by the function.
baseline_global_mean(train, test)

In [None]:
def baseline_user_mean(train, test):
    """Baseline: predict each test rating with its user's mean training rating.

    Args:
        train: sparse (item, user) matrix holding the training ratings.
        test: sparse (item, user) matrix holding the ratings to predict.

    Returns:
        Tuple ``(error, predictions)``. ``predictions`` holds one value per
        nonzero entry of ``test``, in the order produced by ``sp.find(test)``;
        ``error`` is the value returned by the project's ``calculate_mse``
        helper for those predictions.
    """
    _, train_users, train_rates = sp.find(train)
    _, test_users, test_rates = sp.find(test)

    # Size the lookup table by raw user id (column index), not by the number
    # of users seen in training; otherwise a user without training ratings
    # shifts every later mean to the wrong position.
    num_users = max(train.shape[1], test.shape[1])
    rating_sums = np.bincount(train_users, weights=train_rates,
                              minlength=num_users)
    rating_counts = np.bincount(train_users, minlength=num_users)

    # Users with no training rating fall back to the global training mean.
    fallback = train_rates.mean() if train_rates.size else 0.0
    user_means = np.full(num_users, fallback, dtype=float)
    has_ratings = rating_counts > 0
    user_means[has_ratings] = rating_sums[has_ratings] / rating_counts[has_ratings]

    predictions = user_means[test_users]
    return calculate_mse(test_rates, predictions), predictions

# Evaluate the user-mean baseline on the split; the cell output is the
# (error, predictions) pair returned by the function.
baseline_user_mean(train, test)

In [None]:
# Sanity check: number of stored test ratings (length of the column-index
# array returned by sp.find).
v = sp.find(test)
print(v[1].shape)

The user-mean method looks only at a user's past ratings: it computes that user's mean rating and predicts this value for every movie. The resulting RMSE is 1.06.

In [None]:
def baseline_item_mean(train, test):
    """Baseline: predict each test rating with its item's mean training rating.

    Args:
        train: sparse (item, user) matrix holding the training ratings.
        test: sparse (item, user) matrix holding the ratings to predict.

    Returns:
        Tuple ``(error, predictions)``. ``predictions`` holds one value per
        nonzero entry of ``test``, in the order produced by ``sp.find(test)``;
        ``error`` is the value returned by the project's ``calculate_mse``
        helper for those predictions.
    """
    train_items, _, train_rates = sp.find(train)
    test_items, _, test_rates = sp.find(test)

    # Size the lookup table by raw item id (row index), not by the number of
    # items seen in training; otherwise an item without training ratings
    # shifts every later mean to the wrong position.
    num_items = max(train.shape[0], test.shape[0])
    rating_sums = np.bincount(train_items, weights=train_rates,
                              minlength=num_items)
    rating_counts = np.bincount(train_items, minlength=num_items)

    # Items with no training rating fall back to the global training mean.
    fallback = train_rates.mean() if train_rates.size else 0.0
    item_means = np.full(num_items, fallback, dtype=float)
    has_ratings = rating_counts > 0
    item_means[has_ratings] = rating_sums[has_ratings] / rating_counts[has_ratings]

    predictions = item_means[test_items]
    return calculate_mse(test_rates, predictions), predictions
    
# Evaluate the item-mean baseline on the split; the cell output is the
# (error, predictions) pair returned by the function.
baseline_item_mean(train, test)

The item-mean method looks only at the ratings a movie has received: it computes the movie's mean rating and predicts this value for every user. The resulting RMSE is 1.18, which is worse than the user-mean method. In other words, when asked to predict how user i rates movie j, it is better to look only at the ratings that user gave to other movies (i.e. whether they usually rate high or low) than to look only at the ratings the movie received from other users (i.e. whether it is a good movie or not). This is somewhat surprising: we would expect a rating to depend more on whether the movie is good than on whether the user tends to give high ratings.

In [None]:
def mixed_method(train, test, weights=(0, 3, 1)):
    """Blend the global-mean, user-mean and item-mean baselines.

    Args:
        train: sparse (item, user) matrix holding the training ratings.
        test: sparse (item, user) matrix holding the ratings to predict.
        weights: relative weights ``(global, user, item)`` of the three
            baselines; they are normalised by their sum.

    Returns:
        Tuple ``(error, predictions)``. ``predictions`` holds one blended
        value per nonzero entry of ``test``, in ``sp.find(test)`` order;
        ``error`` is the value returned by the project's ``calculate_mse``
        helper for those predictions.

    Raises:
        ValueError: if the weights sum to zero.
    """
    # Only the predictions of each baseline are needed; their individual
    # errors are discarded.
    _, global_mean = baseline_global_mean(train, test)
    _, user_predictions = baseline_user_mean(train, test)
    _, item_predictions = baseline_item_mean(train, test)

    w_global, w_user, w_item = weights
    total_weight = w_global + w_user + w_item
    if total_weight == 0:
        raise ValueError("weights must not sum to zero")

    blended = (w_global * global_mean
               + w_user * user_predictions
               + w_item * item_predictions) / total_weight

    # Always return the same (error, predictions) shape; this no longer
    # depends on a global `values` variable left behind by another cell.
    return calculate_mse(sp.find(test)[2], blended), blended

# Run the blended baseline on the train/test split and display its result.
mixed_method(train, test)

We observe that by combining the methods, we get better performance than with any single method on its own.

In [None]:
# Load the sample-submission file with the same loader used for the training data.
test_path = "sampleSubmission.csv"
testSet = load_data(test_path)

In [None]:
# (row, column, value) triplets of the entries stored in the submission matrix.
testSetValues = sp.find(testSet)

In [None]:
# mixed_method returns an (error, predictions) pair. The error is measured
# against the values stored in the submission file (presumably placeholders,
# to be confirmed), so only the predictions are kept.
_, predictionsTest = mixed_method(train, testSet)

In [None]:
import findspark
findspark.init('/srv/spark')
import pyspark
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
# Reuse the running SparkContext when this cell is re-executed; constructing
# a second context in the same kernel raises an error.
sc = pyspark.SparkContext.getOrCreate()
sql_sc = pyspark.sql.SQLContext(sc)

In [None]:
import pandas as pd

# Flatten the sparse training matrix into one (user, movie, rating) record
# per stored entry. sp.find returns rows (items) first, then columns (users).
values_train = sp.find(train)
trainItems, trainUsers, trainRates = values_train

trainFrame = pd.DataFrame(
    {'User': trainUsers, 'Movie': trainItems, 'Prediction': trainRates},
    columns=['User', 'Movie', 'Prediction'],
)

trainFrame.head()

In [None]:
# Convert the pandas training frame into a Spark DataFrame; its RDD is what gets passed to ALS.train.
s_df_train = sql_sc.createDataFrame(trainFrame)

In [None]:
# Sanity check: print the first row of the Spark training data.
print(s_df_train.rdd.first())

In [None]:
# Fit a matrix-factorisation recommender with Spark MLlib's Alternating
# Least Squares on the (User, Movie, Prediction) training rows.
rank = 10           # number of latent factors
numIterations = 10  # number of ALS iterations
model = ALS.train(s_df_train.rdd, rank, iterations=numIterations, lambda_=0.1)

In [None]:
# Flatten the sparse test matrix into one (user, movie, rating) record per
# stored entry. sp.find returns rows (items) first, then columns (users).
values_test = sp.find(test)
testItems, testUsers, testRates = values_test

testFrame = pd.DataFrame(
    {'User': testUsers, 'Movie': testItems, 'Prediction': testRates},
    columns=['User', 'Movie', 'Prediction'],
)

testFrame.head()

In [None]:
# Convert the pandas test frame into a Spark DataFrame and print its first row as a sanity check.
s_df_test = sql_sc.createDataFrame(testFrame)
print(s_df_test.rdd.first())

In [None]:
# Evaluate the model on the held-out test data.
# (user, movie) pairs for which a prediction is requested.
testdata = s_df_test.rdd.map(lambda p: (p[0], p[1]))
# `testdata` is already an RDD, so it is passed directly (it has no `.rdd`).
# `predictions` stays a flat RDD of (user, movie, rating) records so that it
# can be turned into a three-column DataFrame afterwards.
predictions = model.predictAll(testdata)
# RDD.join needs (key, value) pairs: key both sides by (user, movie).
predsByPair = predictions.map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = s_df_test.rdd.map(lambda r: ((r[0], r[1]), r[2])).join(predsByPair)
# Each joined record is ((user, movie), (true rating, predicted rating)).
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))

In [None]:
# Turn the `predictions` RDD (expected to come from the evaluation cell) into a Spark DataFrame and peek at it.
df = predictions.toDF()
df.head()

In [None]:
# Confirm which DataFrame class `toDF()` produced before converting it to pandas.
print(type(df))

In [None]:
# Collect the Spark DataFrame into a local pandas DataFrame and preview it.
predictionsDF = df.toPandas()
predictionsDF.head()

In [None]:
# Use the same column names as the train/test frames built earlier.
predictionsDF.columns = ['User', 'Movie', 'Prediction']
predictionsDF.head()