In [1]:
import pandas as pd
import numpy as np
from scipy import spatial
import math
from tqdm.notebook import tqdm
from sklearn.metrics import pairwise as pair

In [2]:
# Model state shared by train(), predict() and get_rmse().
# These are placeholders only; train() overwrites every one of them.

# Mean of all ratings in the training data.
global_mean = 0

# Per-user and per-movie mean ratings, stored as plain lists.
user_avg, movie_avg = [], []

# Movie x user rating matrix (centred and zero-filled by train()).
rm = []

# Column (user) and row (movie) labels of rm, in matrix order.
user_list, movie_list = [], []

# Movie-movie cosine similarity table.
sim_df = []

def train(training):
    """Fit the item-based model and publish it through the module globals.

    Parameters
    ----------
    training : pandas.DataFrame
        Ratings with the columns ``userId``, ``movieId`` and ``rating``.

    Side effects
    ------------
    Rebinds ``global_mean``, ``user_avg``, ``movie_avg``, ``rm``,
    ``user_list``, ``movie_list`` and ``sim_df``. Returns nothing.
    """
    global global_mean, user_avg, movie_avg, rm, user_list, movie_list, sim_df

    # Baseline statistics: overall mean plus one mean per user / per movie.
    # groupby sorts its keys, which is the same order pivot() gives the
    # matrix labels below, so list positions line up with user_list/movie_list.
    global_mean = training['rating'].mean()
    user_avg = training.groupby('userId')['rating'].mean().tolist()
    movie_avg = training.groupby('movieId')['rating'].mean().tolist()

    # Movie x user matrix; cells without a rating are NaN at this point.
    ratings_by_movie = training.pivot(index='movieId', columns='userId', values='rating')

    # Subtract each user's (column) mean, then treat missing ratings as 0.
    per_user_mean = ratings_by_movie.mean()
    rm = ratings_by_movie.sub(per_user_mean, axis='columns').fillna(0)

    user_list = rm.columns.tolist()
    movie_list = rm.index.tolist()

    # Cosine similarity between every pair of movie rows.
    sim_df = pd.DataFrame(
        pair.cosine_similarity(rm.to_numpy()),
        index=movie_list,
        columns=movie_list,
    )

def predict(userId, movieId, k=5):
    """Predict the rating ``userId`` would give ``movieId``.

    Uses the state published by ``train()`` (``global_mean``, ``user_avg``,
    ``movie_avg``, ``rm``, ``sim_df``), so ``train()`` must have run first.

    Parameters
    ----------
    userId, movieId
        Labels as they appear in the training data.
    k : int, default 5
        Maximum number of most-similar movies used as neighbours.

    Returns
    -------
    float
        * ``global_mean`` when the user is unknown;
        * the user's mean rating when the movie is unknown;
        * otherwise ``user mean + movie mean - global_mean`` plus the
          similarity-weighted average of the user's centred ratings on the
          neighbouring movies (the extra term is skipped when all neighbour
          weights are zero).

    Notes
    -----
    ``rm`` stores 0 for ratings that are missing (``train()`` fills NaN with
    0), so neighbours the user never rated add nothing to the numerator but
    still add their weight to the denominator.
    """
    # Hash-based lookups on the pandas indexes replace the linear `in` /
    # `.index()` scans over user_list / movie_list. Positions are identical
    # because train() derives those lists from rm.columns and rm.index.
    if userId not in rm.columns:
        return global_mean
    user_mean = user_avg[rm.columns.get_loc(userId)]

    if movieId not in rm.index:
        return user_mean

    baseline = user_mean + movie_avg[rm.index.get_loc(movieId)] - global_mean

    # Ask for one extra candidate because the movie is normally its own best
    # match; drop it and cap at k so the neighbourhood size never exceeds k.
    candidates = sim_df[movieId].nlargest(k + 1)
    neighbors = [(n, w) for n, w in candidates.items() if n != movieId][:k]

    top, bottom = 0, 0
    for n, weight in neighbors:
        # .at reads one cell without building the whole row first.
        top += weight * rm.at[n, userId]
        # Normalise by |weight|: a signed sum can cancel to (almost) zero when
        # negative similarities are present and blow the prediction up.
        bottom += abs(weight)

    if bottom == 0:
        return baseline
    return top / bottom + baseline

In [3]:
def get_rmse(testing):
    """Return the root-mean-square error of ``predict()`` over ``testing``.

    Parameters
    ----------
    testing : pandas.DataFrame
        Held-out ratings with the columns ``userId``, ``movieId`` and
        ``rating``. Columns are read by name, so their order is irrelevant.

    Returns
    -------
    float
        ``sqrt(mean((prediction - rating) ** 2))``, or NaN when ``testing``
        has no rows (the RMSE of nothing is undefined).
    """
    n_ratings = len(testing)
    if n_ratings == 0:
        # Avoid a ZeroDivisionError on an empty evaluation set.
        return float("nan")

    rows = zip(
        testing['userId'].tolist(),
        testing['movieId'].tolist(),
        testing['rating'].tolist(),
    )

    squared_error = 0
    # zip() has no length, so pass total= to keep the progress bar sized.
    for userId, movieId, rating in tqdm(rows, total=n_ratings):
        # Cast ids to int so lookups in predict() match integer labels even
        # if the id columns were loaded as floats.
        prediction = predict(int(userId), int(movieId))
        squared_error += (prediction - rating) ** 2

    return math.sqrt(squared_error / n_ratings)

# Given 10: each test user's first 10 ratings are added to the training data

In [4]:
# Load the "given 10" split.
training = pd.read_csv("processed/training-10.csv")
testing = pd.read_csv("processed/testing-10.csv")

# Reveal the first 10 rows of every test user to the model: they are appended
# to the training data and removed from the evaluation set.
given = testing.groupby("userId").head(10)
testing = testing.loc[~testing.index.isin(given.index)]
training = pd.concat([training, given])

# .size is rows * columns, not the number of ratings.
print(training.size, testing.size, sep="\n")

# Fit on the augmented training data, then score the held-out ratings.
train(training)
rmse = get_rmse(testing)
print("rmse: ", rmse)

31425645
273168


HBox(children=(FloatProgress(value=0.0, max=91056.0), HTML(value='')))


rmse:  0.9123654598990113


# Given 20: each test user's first 20 ratings are added to the training data

In [5]:
# Load the "given 20" split.
training = pd.read_csv("processed/training-20.csv")
testing = pd.read_csv("processed/testing-20.csv")

# Reveal the first 20 rows of every test user to the model: they are appended
# to the training data and removed from the evaluation set.
given = testing.groupby("userId").head(20)
testing = testing.loc[~testing.index.isin(given.index)]
training = pd.concat([training, given])

# .size is rows * columns, not the number of ratings.
print(training.size, testing.size, sep="\n")

# Fit on the augmented training data, then score the held-out ratings.
train(training)
rmse = get_rmse(testing)
print("rmse: ", rmse)

34006083
303621


HBox(children=(FloatProgress(value=0.0, max=101207.0), HTML(value='')))


rmse:  0.8833538515082674


# Given 30: each test user's first 30 ratings are added to the training data

In [6]:
# Load the "given 30" split.
training = pd.read_csv("processed/training-30.csv")
testing = pd.read_csv("processed/testing-30.csv")

# Reveal the first 30 rows of every test user to the model: they are appended
# to the training data and removed from the evaluation set.
given = testing.groupby("userId").head(30)
testing = testing.loc[~testing.index.isin(given.index)]
training = pd.concat([training, given])

# .size is rows * columns, not the number of ratings.
print(training.size, testing.size, sep="\n")

# Fit on the augmented training data, then score the held-out ratings.
train(training)
rmse = get_rmse(testing)
print("rmse: ", rmse)

34011828
297876


HBox(children=(FloatProgress(value=0.0, max=99292.0), HTML(value='')))


rmse:  0.869975622764132


# Given 40: each test user's first 40 ratings are added to the training data

In [7]:
# Load the "given 40" split.
training = pd.read_csv("processed/training-40.csv")
testing = pd.read_csv("processed/testing-40.csv")

# Reveal the first 40 rows of every test user to the model: they are appended
# to the training data and removed from the evaluation set.
given = testing.groupby("userId").head(40)
testing = testing.loc[~testing.index.isin(given.index)]
training = pd.concat([training, given])

# .size is rows * columns, not the number of ratings.
print(training.size, testing.size, sep="\n")

# Fit on the augmented training data, then score the held-out ratings.
train(training)
rmse = get_rmse(testing)
print("rmse: ", rmse)

34015785
293919


HBox(children=(FloatProgress(value=0.0, max=97973.0), HTML(value='')))


rmse:  0.8598963323782972


# Given 50: each test user's first 50 ratings are added to the training data

In [8]:
# Load the "given 50" split.
training = pd.read_csv("processed/training-50.csv")
testing = pd.read_csv("processed/testing-50.csv")

# Reveal the first 50 rows of every test user to the model: they are appended
# to the training data and removed from the evaluation set.
given = testing.groupby("userId").head(50)
testing = testing.loc[~testing.index.isin(given.index)]
training = pd.concat([training, given])

# .size is rows * columns, not the number of ratings.
print(training.size, testing.size, sep="\n")

# Fit on the augmented training data, then score the held-out ratings.
train(training)
rmse = get_rmse(testing)
print("rmse: ", rmse)

34010397
299307


HBox(children=(FloatProgress(value=0.0, max=99769.0), HTML(value='')))


rmse:  0.8429234354640223


# Given 60: each test user's first 60 ratings are added to the training data

In [9]:
# Load the "given 60" split.
training = pd.read_csv("processed/training-60.csv")
testing = pd.read_csv("processed/testing-60.csv")

# Reveal the first 60 rows of every test user to the model: they are appended
# to the training data and removed from the evaluation set.
given = testing.groupby("userId").head(60)
testing = testing.loc[~testing.index.isin(given.index)]
training = pd.concat([training, given])

# .size is rows * columns, not the number of ratings.
print(training.size, testing.size, sep="\n")

# Fit on the augmented training data, then score the held-out ratings.
train(training)
rmse = get_rmse(testing)
print("rmse: ", rmse)

33977997
331707


HBox(children=(FloatProgress(value=0.0, max=110569.0), HTML(value='')))


rmse:  0.8463533121705777


# Given 70: each test user's first 70 ratings are added to the training data

In [10]:
# Load the "given 70" split.
training = pd.read_csv("processed/training-70.csv")
testing = pd.read_csv("processed/testing-70.csv")

# Reveal the first 70 rows of every test user to the model: they are appended
# to the training data and removed from the evaluation set.
given = testing.groupby("userId").head(70)
testing = testing.loc[~testing.index.isin(given.index)]
training = pd.concat([training, given])

# .size is rows * columns, not the number of ratings.
print(training.size, testing.size, sep="\n")

# Fit on the augmented training data, then score the held-out ratings.
train(training)
rmse = get_rmse(testing)
print("rmse: ", rmse)

34011909
297795


HBox(children=(FloatProgress(value=0.0, max=99265.0), HTML(value='')))


rmse:  0.8671030582847791


# Given 80: each test user's first 80 ratings are added to the training data

In [11]:
# Load the "given 80" split.
training = pd.read_csv("processed/training-80.csv")
testing = pd.read_csv("processed/testing-80.csv")

# Reveal the first 80 rows of every test user to the model: they are appended
# to the training data and removed from the evaluation set.
given = testing.groupby("userId").head(80)
testing = testing.loc[~testing.index.isin(given.index)]
training = pd.concat([training, given])

# .size is rows * columns, not the number of ratings.
print(training.size, testing.size, sep="\n")

# Fit on the augmented training data, then score the held-out ratings.
train(training)
rmse = get_rmse(testing)
print("rmse: ", rmse)

33987759
321945


HBox(children=(FloatProgress(value=0.0, max=107315.0), HTML(value='')))


rmse:  0.8519722145232203


# Given 90: each test user's first 90 ratings are added to the training data

In [12]:
# Load the "given 90" split.
training = pd.read_csv("processed/training-90.csv")
testing = pd.read_csv("processed/testing-90.csv")

# Reveal the first 90 rows of every test user to the model: they are appended
# to the training data and removed from the evaluation set.
given = testing.groupby("userId").head(90)
testing = testing.loc[~testing.index.isin(given.index)]
training = pd.concat([training, given])

# .size is rows * columns, not the number of ratings.
print(training.size, testing.size, sep="\n")

# Fit on the augmented training data, then score the held-out ratings.
train(training)
rmse = get_rmse(testing)
print("rmse: ", rmse)

34010712
298992


HBox(children=(FloatProgress(value=0.0, max=99664.0), HTML(value='')))


rmse:  0.8358649217635017


# Given 100: each test user's first 100 ratings are added to the training data

In [13]:
# Load the "given 100" split.
training = pd.read_csv("processed/training-100.csv")
testing = pd.read_csv("processed/testing-100.csv")

# Reveal the first 100 rows of every test user to the model: they are appended
# to the training data and removed from the evaluation set.
given = testing.groupby("userId").head(100)
testing = testing.loc[~testing.index.isin(given.index)]
training = pd.concat([training, given])

# .size is rows * columns, not the number of ratings.
print(training.size, testing.size, sep="\n")

# Fit on the augmented training data, then score the held-out ratings.
train(training)
rmse = get_rmse(testing)
print("rmse: ", rmse)

33996579
313125


HBox(children=(FloatProgress(value=0.0, max=104375.0), HTML(value='')))


rmse:  0.8579538575329002
