In [226]:
!pip install git+https://github.com/goolig/dsClass.git

Collecting git+https://github.com/goolig/dsClass.git
  Cloning https://github.com/goolig/dsClass.git to /tmp/pip-req-build-y2vdpzul
Building wheels for collected packages: dsClass
  Building wheel for dsClass (setup.py) ... [?25ldone
[?25h  Created wheel for dsClass: filename=dsClass-1.0.27-py3-none-any.whl size=16372735 sha256=fada54a4d12e3735492966b8a3f78ec188dd79daafba0afda270749ec762258f
  Stored in directory: /tmp/pip-ephem-wheel-cache-qvm36ru0/wheels/cc/7d/ca/f5036d591d94a23e6ab49777866b7fc723500ce0dcdb2a9b2e
Successfully built dsClass
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [227]:
import csv
import pandas as pd
import pickle
import numpy as np
import random
from random import shuffle
import math
from math import sqrt
from dsClass.path_helper import *

In [228]:
trainingDFpath = get_file_path('ratings.pickle')
ratingsPath = get_file_path('ratings.csv')


In [229]:
# Collabrative Filtering data set taken from Collective Intelligence book.


dataset={
			'Lisa Rose': {'Lady in the Water': 2.5, 
							'Snakes on a Plane': 3.5,
							'Just My Luck': 3.0, 
							'Superman Returns': 3.5, 
							'You, Me and Dupree': 2.5,
							'The Night Listener': 3.0},
			'Gene Seymour': {'Lady in the Water': 3.0, 
							'Snakes on a Plane': 3.5,
							'Just My Luck': 1.5,
							 'Superman Returns': 5.0, 
							 'The Night Listener': 3.0,
							'You, Me and Dupree': 3.5},

			'Michael Phillips': {'Lady in the Water': 2.5, 
								'Snakes on a Plane': 3.0,
								'Superman Returns': 3.5,
								 'The Night Listener': 4.0},
			'Claudia Puig': {'Snakes on a Plane': 3.5, 
							'Just My Luck': 3.0,
							'The Night Listener': 4.5, 
							'Superman Returns': 4.0,
							'You, Me and Dupree': 2.5},
			'Mick LaSalle': {'Lady in the Water': 3.0, 
							'Snakes on a Plane': 4.0,
							'Just My Luck': 2.0, 
							'Superman Returns': 3.0, 
							'The Night Listener': 3.0,
							'You, Me and Dupree': 2.0},
			'Jack Matthews': {'Lady in the Water': 3.0, 
							'Snakes on a Plane': 4.0,
							'The Night Listener': 3.0, 
							'Superman Returns': 5.0, 
							'You, Me and Dupree': 3.5},
			'Toby': {'Snakes on a Plane':4.5,
					'You, Me and Dupree':1.0,
					'Superman Returns':4.0}}

movies = []
for user in dataset.keys():
    for movie in dataset[user].keys():
        if movie not in movies:
            movies += [movie]
movies

['Lady in the Water',
 'Snakes on a Plane',
 'Just My Luck',
 'Superman Returns',
 'You, Me and Dupree',
 'The Night Listener']

# "under the hood" functions:

In [230]:
def similarity_score(person1,person2):
	
	# Returns ratio Euclidean distance score of person1 and person2 

	both_viewed = {}		# To get both rated items by person1 and person2

	for item in dataset[person1]:
		if item in dataset[person2]:
			both_viewed[item] = 1

		# Conditions to check they both have an common rating items	
		if len(both_viewed) == 0:
			return 0

		# Finding Euclidean distance 
		sum_of_eclidean_distance = []	

		for item in dataset[person1]:
			if item in dataset[person2]:
				sum_of_eclidean_distance.append(pow(dataset[person1][item] - dataset[person2][item],2))
		sum_of_eclidean_distance = sum(sum_of_eclidean_distance)

		return 1/(1+sqrt(sum_of_eclidean_distance))

    
    

def getMoviesBothRated(user, other):
    both_rated = {}
    for item in dataset[user]:
        if item in dataset[other]:
            both_rated[item] = 1
    return both_rated





def getMoviesBothRatedFromDF(ratingsDF, userID, otherID):
    both_rated = {}
    for movieID in ratingsDF.columns:
        if (list(ratingsDF[movieID].loc[[userID]])[0] != '?') and (list(ratingsDF[movieID].loc[[otherID]])[0] != '?'):
            both_rated[movieID] = 1
    return both_rated





def pearson_correlation(person1,person2):

    # To get both rated items
    both_rated = getMoviesBothRated(person1,person2)
    number_of_ratings = len(both_rated)

    # Checking for number of ratings in common
    if number_of_ratings == 0:
        return 0

    # calculate each user's ratings average:
    person1_preferences_avg = np.mean([dataset[person1][item] for item in both_rated])
    person2_preferences_avg = np.mean([dataset[person2][item] for item in both_rated])

    # calculate each user's the sum of the squared normalized ratings:
    person1_norm_square_preferences_sum = sum([pow(dataset[person1][item] - person1_preferences_avg,2) for item in both_rated])
    person2_norm_square_preferences_sum = sum([pow(dataset[person2][item] - person2_preferences_avg,2) for item in both_rated])
    
    # calc the nominator and denominator of the pearson formula:
    numerator_value = sum([(dataset[person1][item] - person1_preferences_avg) * (dataset[person2][item] - person2_preferences_avg) for item in both_rated])
    denominator_value = sqrt(person1_norm_square_preferences_sum * person2_norm_square_preferences_sum)
    
    if denominator_value == 0:
        return 0
    else:
        r = numerator_value/(denominator_value*1.0)
        return r 

    

def most_similar_users(person,number_of_users):
	# returns the number_of_users (similar persons) for a given specific person.
	scores = [(pearson_correlation(person,other_person),other_person) for other_person in dataset if  other_person != person ]
	
	# Sort the similar persons so that highest scores person will appear at the first
	scores.sort()
	scores.reverse()
	return scores[0:number_of_users]




def user_reommendations(person):

	# Gets recommendations for a person by using a weighted average of every other user's rankings
	totals = {}
	simSums = {}
	rankings_list =[]
	for other in dataset:
		# don't compare me to myself
		if other == person:
			continue
		sim = pearson_correlation(person,other)
		print('similarity of ' + person + ' with ' + other + ' is: ' + str(sim))

		# ignore scores of zero or lower
		if sim <=0: 
			continue
		for item in dataset[other]:

			# only score movies i haven't seen yet
			if item not in dataset[person] or dataset[person][item] == 0:

			# Similrity * score
				totals.setdefault(item,0)
				totals[item] += dataset[other][item]* sim
				# sum of similarities
				simSums.setdefault(item,0)
				simSums[item]+= sim

		# Create the normalized list

	rankings = [(total/simSums[item],item) for item,total in totals.items()]
	rankings.sort()
	rankings.reverse()
	# returns the recommended items
	recommendataions_list = [(recommend_item, score) for score,recommend_item in rankings]
	return recommendataions_list
		




def getCoRatingList(user, other):
    # get movies both rated:
    both_rated = getMoviesBothRated(user, other)
    
    # calculate each user's ratings average:
    user_preferences_avg = np.mean([dataset[user][item] for item in both_rated])
    other_preferences_avg = np.mean([dataset[other][item] for item in both_rated])
    
    # for each movie both rated, return its ID and the normalized rating by each user:
    return [(movie, dataset[user][movie]-user_preferences_avg, dataset[other][movie]-other_preferences_avg) for movie in movies if (movie in dataset[user].keys()) and (movie in dataset[other].keys())]



def getCoRatingListFromDF(ratingsDF, userID, otherID):
    # get movies both rated:
    both_rated = getMoviesBothRatedFromDF(ratingsDF, userID, otherID)
    
    # calculate each user's ratings average:
    user_preferences_avg = np.mean([list(ratingsDF[col].loc[[userID]])[0] for col in ratingsDF.columns if (list(ratingsDF[col].loc[[userID]])[0] != '?') and (list(ratingsDF[col].loc[[otherID]])[0] != '?')])
    other_preferences_avg = np.mean([list(ratingsDF[col].loc[[otherID]])[0] for col in ratingsDF.columns if (list(ratingsDF[col].loc[[userID]])[0] != '?') and (list(ratingsDF[col].loc[[otherID]])[0] != '?')])
    
    # for each movie both rated, return its ID and the normalized rating by each user:
    return [(col, list(ratingsDF[col].loc[[userID]])[0]-user_preferences_avg, list(ratingsDF[col].loc[[otherID]])[0]-other_preferences_avg) for col in ratingsDF.columns if (list(ratingsDF[col].loc[[userID]])[0] != '?') and (list(ratingsDF[col].loc[[otherID]])[0] != '?')]



def numOfActualRatings(series):
    length = len(series)
    count = 0
    for idx in range(length):
        if (series[idx] != 0) and (series[idx] != '?'):
            count += 1
    return count


    

def calcAvgRandomMse(origRatingsDict, randomMethod, numOfRandomRatings, numOfIters):
    mselist = [0 for i in range(numOfIters)]

    for i in range(numOfIters):
        mselist[i] = calcRandomMse(origRatingsDict, randomMethod, numOfRandomRatings)
    mseAvg = np.mean(mselist)
    return mseAvg

# functions to complete:

In [231]:
def calcRandomMse(origRatingsDict, randomMethod, numOfRandomRatings):
    mse = 0
    
    randomRatings = list()
    
    keysList = list(origRatingsDict.keys())
    
    if randomMethod == 'uniform':
        # calculating uniformly random ratings:
        for i in range(numOfRandomRatings):
            randomRatings.append(random.randrange(0, 50) / 10)  #creates a list of random scores

    if randomMethod == 'stratified':
        # calculating a permutation of the original ratings:
        # hint - shuffle the indices of the original values.
        shuffledList = list()
        for i in range(len(origRatingsDict.keys())):
            shuffledList.append(i)
        random.shuffle(shuffledList)
        
        for i in range(numOfRandomRatings):
            randomRatings.append(origRatingsDict[keysList[shuffledList[i]]]['original rating'])
        
    for rIdx in range(len(randomRatings)):
        # calculating the square of the difference between the original and ranrom value and add it to mse variable:
        originalValue = origRatingsDict[keysList[rIdx]]['original rating']
        mse += math.pow(originalValue - randomRatings[rIdx], 2)

    # averaging the sum of the square differences:

    return mse / len(origRatingsDict.keys())



In [232]:
def getMovieRecommendation(ratingsDF, userID, movieID, normalizeRatings = False):
    userIDlist = list(ratingsDF.index)
    numOfOtherRated = 0
    maxAbsRate = -np.infty
    
    # calculate user's ratings average:
    user_preferences_avg = np.mean([list(ratingsDF[col].loc[[userID]])[0] for col in ratingsDF.columns if (list(ratingsDF[col].loc[[userID]])[0] != '?')])
    
    
    # Gets recommendations for a person by using a weighted average of every other user's rankings
    total = 0
    rankings_list =[]
    for otherID in userIDlist:
        
        # don't compare me to myself
        if otherID == userID:
            continue
            
        sim = pearson_correlation_from_DF(userID,otherID, ratingsDF)
        print('similarity of user ' + str(userID) + ' with user ' + str(otherID) + ' is: ' + str(sim))
            
        # calculate other user's ratings average:
        other_preferences_avg = np.mean([list(ratingsDF[col].loc[[otherID]])[0] for col in ratingsDF.columns if (list(ratingsDF[col].loc[[userID]])[0] != '?') and (list(ratingsDF[col].loc[[otherID]])[0] != '?')])

        movieRatingByOther = list(ratingsDF[movieID].loc[[otherID]])[0]
        movieRatingByUser = list(ratingsDF[movieID].loc[[userID]])[0]

        # if other user has rated current movie
        if movieRatingByOther != '?':
            numOfOtherRated += 1

            # normalize other's rating of movieID:
            normalizedScore = movieRatingByOther - other_preferences_avg

            # Similarity * score
            total += sim * normalizedScore
            
            
    # calculate recommendation:
    
    if sim == 0:
        recommendataion = 0
    else:
        recommendataion = total / sim
    
    if normalizeRatings:
        recommendataion += user_preferences_avg    
    
    return recommendataion

In [233]:
def pearson_correlation_from_DF(userID, otherID, ratingsDF):
    # To get both rated items
    both_rated = {}
    for movieID in ratingsDF.columns:
        movieRatingByOther = list(ratingsDF[movieID].loc[[otherID]])[0]
        movieRatingByUser = list(ratingsDF[movieID].loc[[userID]])[0]

        if (movieRatingByOther != '?') and (movieRatingByUser != '?'):
            both_rated[movieID] = 1

    number_of_ratings = len(both_rated)

    # Checking for number of ratings in common
    if number_of_ratings == 0:
        return 0

    listUserID = [list(ratingsDF[movieID].loc[[userID]])[0] for movieID in both_rated]
    listOtherID = [list(ratingsDF[movieID].loc[[otherID]])[0] for movieID in both_rated]

    # Add up all the preferences of each user
    sumUser = sum(listUserID)
    sumOther = sum(listOtherID)

    # Sum up the squares of preferences of each user
    sqPrefUser = sum([math.pow(curr, 2) for curr in listUserID])
    sqPrefOther = sum([math.pow(curr, 2) for curr in listOtherID])
    
    # Sum up the product value of both preferences for each item
    product = sum([ listUserID[i] * listOtherID[i] for i in range(len(listUserID))])
    
    
    # Calculate the pearson score

    #using formula from class
    avgUser = sumUser / number_of_ratings
    avgOther = sumOther / number_of_ratings
    
    userNormSquare = sum([math.pow(curr - avgUser ,2) for curr in listUserID])
    otherNormSquare = sum([math.pow(curr - avgOther ,2) for curr in listOtherID])
    
    numerator_value = sum([(listUserID[i] - avgUser) * (listOtherID[i] - avgOther) for i in range(len(listUserID))])
    denominator_value = sqrt(userNormSquare * otherNormSquare)
    
    
    if denominator_value == 0:
        return 0
    else:
        r = numerator_value/denominator_value
        return r 

In [234]:
user = 'Michael Phillips'
print(user_reommendations(user))

similarity of Michael Phillips with Lisa Rose is: 0.40451991747794525
similarity of Michael Phillips with Gene Seymour is: 0.20459830184114206
similarity of Michael Phillips with Claudia Puig is: 1.0
similarity of Michael Phillips with Mick LaSalle is: -0.2581988897471611
similarity of Michael Phillips with Jack Matthews is: 0.13483997249264842
similarity of Michael Phillips with Toby is: -1.0
[('Just My Luck', 2.8092760065251268), ('You, Me and Dupree', 2.694636703980363)]


In [235]:
other = 'Jack Matthews'
coRatingList = getCoRatingList(user, other)
coRatingList

[('Lady in the Water', -0.75, -0.75),
 ('Snakes on a Plane', -0.25, 0.25),
 ('Superman Returns', 0.25, 1.25),
 ('The Night Listener', 0.75, -0.75)]

In [236]:
# choose number of viewers from the folloeing list: [10, 25, 50, 100, 150, 200]
numOfUsers = 50

# Load "movielens" dataset into dataframe:

In [237]:
maxUserNumLineInFileDict = {10:1261,
                            25:4040,
                            50:7424,
                            100:15450,
                            150:22305,
                            200:29270}

ratingsDict = {}
users = []
movies = []

with open(ratingsPath) as ratingsCsvfile:
    ratings = csv.DictReader(ratingsCsvfile)
    
    ratingsCount = 0
    
    # extract list of ratings:
    for row in ratings:
        
        # considering only part of the users:
        if ratingsCount == maxUserNumLineInFileDict[numOfUsers]:
            break
        
        # initiate the users ratings list (if not already done previously):
        ratingsDict.setdefault(row['userId'],{})
        
        # add rating of current film user's rantings list:
        ratingsDict[row['userId']][row['movieId']] = float(row['rating'])
        
        # gather all users ids from the data:
        if row['userId'] not in users:
            users += [row['userId']]
            
        # gather all movies ids from the data:
        if row['movieId'] not in movies:
            movies += [row['movieId']]

        ratingsCount += 1
        
        
# constructing the data DF:
ratingsDF = pd.DataFrame(columns=movies)
i = 0
for user in ratingsDict.keys():
    userRates = []
    for mIdx in range(len(movies)):
        if movies[mIdx] in ratingsDict[user].keys():
            userRates += [ratingsDict[user][movies[mIdx]]]
        else:
            userRates += ['?']
            
    ratingsDF.loc[i] = userRates
    i = i+1

ratingsDF

Unnamed: 0,1,3,6,47,50,70,101,110,151,157,...,175485,175661,175693,175705,175707,175743,175781,179073,188301,190183
0,4,4,4,5,5,3,5,4,5,5,...,?,?,?,?,?,?,?,?,?,?
1,?,?,?,?,?,?,?,?,?,?,...,?,?,?,?,?,?,?,?,?,?
2,?,?,?,?,?,?,?,?,?,?,...,?,?,?,?,?,?,?,?,?,?
3,?,?,?,2,?,?,?,?,?,?,...,?,?,?,?,?,?,?,?,?,?
4,4,?,?,?,4,?,?,4,?,?,...,?,?,?,?,?,?,?,?,?,?
5,?,5,4,4,1,?,?,5,4,?,...,?,?,?,?,?,?,?,?,?,?
6,4.5,?,?,?,4.5,?,?,?,?,?,...,?,?,?,?,?,?,?,?,?,?
7,?,?,?,4,5,?,?,3,?,?,...,?,?,?,?,?,?,?,?,?,?
8,?,?,?,?,?,?,?,?,?,?,...,?,?,?,?,?,?,?,?,?,?
9,?,?,?,?,?,?,?,?,?,?,...,?,?,?,?,?,?,?,?,?,?


# create train+test set

### extracting all users that purchased more than 10 items:

In [238]:
minimalNumberOfRatingAccounts = 10
mupltipleNonZeroPositionsInCol = {}
for colIdx in [c+1 for c in range(len(ratingsDF.columns)-1)]:
    if numOfActualRatings(ratingsDF[ratingsDF.columns[colIdx]]) > minimalNumberOfRatingAccounts:
        mupltipleNonZeroPositionsInCol[ratingsDF.columns[colIdx]] = list(ratingsDF[ratingsDF.columns[colIdx]].to_numpy().nonzero()[0])#BugFix .values was added

#mupltipleNonZeroPositionsInCol

# replacing some of the cells to '?':

In [239]:
numOfCellsToDelete = 10

In [240]:
ratingsDFbackup = ratingsDF.copy(deep=True)

# picking numOfCellsToDelete columns to delete a cell from:
numOfCols = len(list(mupltipleNonZeroPositionsInCol.keys()))
randColsIdx = random.sample(range(numOfCols), k=numOfCellsToDelete)

cellsToTestOn = {}
for colIdx in randColsIdx:
    movie = list(mupltipleNonZeroPositionsInCol.keys())[colIdx]
    
    rowNum = 0
    currCellVal = ratingsDF[movie].iloc[mupltipleNonZeroPositionsInCol[movie][rowNum]]
    while (currCellVal == 0) or (currCellVal == '?'):
        rowNum += 1
        currCellVal = ratingsDF[movie].iloc[mupltipleNonZeroPositionsInCol[movie][rowNum]]
        
    ratingsDF[movie].iloc[mupltipleNonZeroPositionsInCol[movie][rowNum]] = '?'
    cellsToTestOn[movie] = rowNum   
    
# save ratings:
with open(trainingDFpath, 'wb') as handle:
    pickle.dump(ratingsDF, handle, protocol=pickle.HIGHEST_PROTOCOL)

# getting original values of cells to test on:

In [241]:
cellsToCompareOriginalValues = {}
for colIdx in randColsIdx:
    movie = list(mupltipleNonZeroPositionsInCol.keys())[colIdx]
    
    cellToCompare = mupltipleNonZeroPositionsInCol[movie][cellsToTestOn[movie]]
    cellOriginalValue = ratingsDFbackup[movie].iloc[cellToCompare]
    
    cellsToCompareOriginalValues[movie] = {}
    cellsToCompareOriginalValues[movie]['account'] = cellToCompare
    cellsToCompareOriginalValues[movie]['original rating'] =  cellOriginalValue
    
cellsToCompareOriginalValues

{'1036': {'account': 16, 'original rating': 4.0},
 '1213': {'account': 0, 'original rating': 5.0},
 '367': {'account': 0, 'original rating': 4.0},
 '318': {'account': 1, 'original rating': 3.0},
 '4306': {'account': 6, 'original rating': 4.0},
 '231': {'account': 0, 'original rating': 5.0},
 '380': {'account': 4, 'original rating': 2.0},
 '3793': {'account': 0, 'original rating': 5.0},
 '527': {'account': 0, 'original rating': 5.0},
 '5952': {'account': 6, 'original rating': 4.5}}

# calculate the recommendation for each cell we are testing on:

In [242]:
normalizeRatings = True

for movie in cellsToCompareOriginalValues.keys():
    account = cellsToCompareOriginalValues[movie]['account']
    recommendationVal = getMovieRecommendation(ratingsDF, account, movie, normalizeRatings)
    cellsToCompareOriginalValues[movie]['recommendation rating'] =  recommendationVal

similarity of user 16 with user 0 is: 0.03472881740470823
similarity of user 16 with user 1 is: 0.7319250547114
similarity of user 16 with user 2 is: 0
similarity of user 16 with user 3 is: -0.2189276851085556
similarity of user 16 with user 4 is: 0.3108349360801046
similarity of user 16 with user 5 is: 0.02400076803686596
similarity of user 16 with user 6 is: 0.2792892751075078
similarity of user 16 with user 7 is: 0.0
similarity of user 16 with user 8 is: 0
similarity of user 16 with user 9 is: -0.005896037837779141
similarity of user 16 with user 10 is: 0.42363055595779303
similarity of user 16 with user 11 is: -1.0
similarity of user 16 with user 12 is: 0
similarity of user 16 with user 13 is: -0.36283458513517247
similarity of user 16 with user 14 is: 0.5565887153821486
similarity of user 16 with user 15 is: -0.48791979160886184
similarity of user 16 with user 17 is: 0.24289601845186504
similarity of user 16 with user 18 is: 0.012223514059379067
similarity of user 16 with user 19 

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


similarity of user 0 with user 1 is: 0
similarity of user 0 with user 2 is: 0.16609095970747997
similarity of user 0 with user 3 is: 0.20501270550027909
similarity of user 0 with user 4 is: 0.18503014180929322
similarity of user 0 with user 5 is: -0.25104822262671683
similarity of user 0 with user 6 is: -0.1046656042462931
similarity of user 0 with user 7 is: 0.3649892750714124
similarity of user 0 with user 8 is: 0.9185586535436917
similarity of user 0 with user 9 is: -0.037986858819879316
similarity of user 0 with user 10 is: -0.08764340007155369
similarity of user 0 with user 11 is: 0
similarity of user 0 with user 12 is: 1.0
similarity of user 0 with user 13 is: 0.47455475311913276
similarity of user 0 with user 14 is: 0.20874726104649471
similarity of user 0 with user 15 is: 0.18060945639429235
similarity of user 0 with user 16 is: 0.03472881740470823
similarity of user 0 with user 17 is: 0.2168685254583517
similarity of user 0 with user 18 is: 0.3346707255299102
similarity of use

In [243]:
cellsToCompareOriginalValues

{'1036': {'account': 16,
  'original rating': 4.0,
  'recommendation rating': 4.211538461538462},
 '1213': {'account': 0,
  'original rating': 5.0,
  'recommendation rating': 4.356828193832599},
 '367': {'account': 0,
  'original rating': 4.0,
  'recommendation rating': 4.356828193832599},
 '318': {'account': 1,
  'original rating': 3.0,
  'recommendation rating': 3.982142857142857},
 '4306': {'account': 6,
  'original rating': 4.0,
  'recommendation rating': 3.216666666666667},
 '231': {'account': 0,
  'original rating': 5.0,
  'recommendation rating': 4.356828193832599},
 '380': {'account': 4,
  'original rating': 2.0,
  'recommendation rating': 3.6744186046511627},
 '3793': {'account': 0,
  'original rating': 5.0,
  'recommendation rating': 4.356828193832599},
 '527': {'account': 0,
  'original rating': 5.0,
  'recommendation rating': 4.356828193832599},
 '5952': {'account': 6,
  'original rating': 4.5,
  'recommendation rating': 3.216666666666667}}

# Evaluation (MSE):

### calculating the MSE of uniform randomly chosen ratings (averaged over 100 iterations):

In [244]:
mseAvg = calcAvgRandomMse(cellsToCompareOriginalValues, 'uniform', numOfCellsToDelete, 100)
mseAvg

5.970370000000002

### calculating the MSE of stratified randomly chosen ratings (averaged over 100 iterations):

In [268]:
mseAvg = calcAvgRandomMse(cellsToCompareOriginalValues, 'stratified', numOfCellsToDelete, 100)
mseAvg

1.889

### calculating the MSE of the recommended ratings:

In [246]:
normalizeRatings = True

mse = 0
for movie in cellsToCompareOriginalValues.keys():
    originalVal = cellsToCompareOriginalValues[movie]['original rating']
    recommendationVal = cellsToCompareOriginalValues[movie]['recommendation rating']
    squareDiff = math.pow(originalVal - recommendationVal, 2)
    mse += squareDiff
    
numOfCellsToCompare = len(cellsToCompareOriginalValues.keys())
mse = mse/(numOfCellsToCompare * 1.0)
mse

0.7855592580612678