In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine as dist
from scipy.sparse.linalg import svds
# from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mse 
from sklearn.metrics.pairwise import cosine_similarity  
from copy import deepcopy 
from matplotlib import pyplot as plt
import warnings

warnings.filterwarnings('ignore')
np.set_printoptions(suppress = True)

# reference link: https://hendra-herviawan.github.io/Movie-Recommendation-based-on-KNN-K-Nearest-Neighbors.html
# Trainng Samples loaded from Github Repo
ratings_path = 'https://raw.githubusercontent.com/usef-kh/EC503Project/master/Datasets/ml-1m/ratings.dat'
titles_path = 'https://raw.githubusercontent.com/usef-kh/EC503Project/master/Datasets/ml-1m/movies.dat'

ratings = pd.read_csv(ratings_path, sep  = "::", names = ['UserID', 'MovieID', 'Rating', 'Timestamp'])
titles = pd.read_csv(titles_path, sep = "::", names = ['MovieID', 'Title', 'Genres'], encoding ='latin-1')
data = pd.merge(ratings, titles, on='MovieID')

ntotal = data.shape[0]
ntv = int(np.floor(ntotal*0.85))
ntest = ntotal - ntv

print(f"ntotal: {ntotal}\nntrain: {ntv}\nntest: {ntest}")

data.head()

ntotal: 1000209
ntrain: 850177
ntest: 150032


Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


In [2]:
def normalize(x):
  return (x - np.min(x)) / (np.max(x) - np.min(x))

stats = pd.DataFrame()
stats['Average Rating'] = data.groupby('MovieID')['Rating'].mean()
stats['Rating Count'] = data.groupby('MovieID')['Rating'].count()
stats['Normalized Rating Count'] = normalize(stats['Rating Count'])

stats.head()

Unnamed: 0_level_0,Average Rating,Rating Count,Normalized Rating Count
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4.146846,2077,0.605778
2,3.201141,701,0.20426
3,3.016736,478,0.139189
4,2.729412,170,0.049314
5,3.006757,296,0.086081


In [3]:
all_genres = []

for index,row in titles.iterrows():
  #  separate the genres and put into a list
  temp = row.Genres.split('|')
   #  store all the genres into a list
  all_genres =  all_genres + temp
  # unique genres list
genres = list(set(all_genres))
lenGeneres = len(genres)
zeroVec = np.zeros(lenGeneres) 
#  store the genres and corresponding index into a dictionary
genreDict0 = dict(enumerate(genres))
#  store genres as key and index as value
genreDict = {value:key for key, value in genreDict0.items()}
print(genreDict)


{'Fantasy': 0, 'Horror': 1, "Children's": 2, 'Mystery': 3, 'Crime': 4, 'Drama': 5, 'War': 6, 'Documentary': 7, 'Musical': 8, 'Sci-Fi': 9, 'Thriller': 10, 'Film-Noir': 11, 'Comedy': 12, 'Animation': 13, 'Adventure': 14, 'Western': 15, 'Action': 16, 'Romance': 17}


In [0]:
#  create a class to store movie name, genres and rating
class Details():
  def __init__(self, n , g, nc, mr):
    self.name = n
    self.genres = g
    self.mean_rating = mr
    self.normalized_count = nc

  def __str__(self):
    return self.name + " " + str(self.mean_rating) + " " + str(self.normalized_count) + " " + str(self.genres) 
movieDict = {}
genreArray = []
nrc = []
possible_ids = []
for index,row in titles.iterrows():
  try:
      # create a zeros list
      genresVec = np.zeros(lenGeneres)
      # get genres
      genreTmp = row.Genres.split('|')

      for gen in genreTmp: 
         # get genres index
        genID = genreDict[gen]
        # assign the corresponding genres index to 1
        genresVec[genID] = 1

      # a vector storing normalized rating count
      nrc += [stats.loc[row.MovieID]['Normalized Rating Count']]
      
      # put the 0 and 1 vector into a whole matrix representing the genres
      genreArray += [genresVec]
      # a vector storing id of movies for later process, skipping wired data. 
      possible_ids += [row.MovieID]
      movieDict[row.MovieID] = Details(row.Title, genresVec, stats.loc[row.MovieID]['Normalized Rating Count'],  stats.loc[row.MovieID]['Average Rating'])
  except:  
    continue


In [9]:
# computing distance
countrate_dist = []
for rate in nrc:
  # use the current rate minus the whole vector to get difference. 
  temp = rate - nrc
  countrate_dist += [temp]
# compute the absolute value of rate difference. 
countrate_dist = np.absolute(countrate_dist)
# compute the cosine simularity, 1 - similarity will convert to distance. 
dists = 1 - cosine_similarity(genreArray,genreArray)
# the total distance including rating and genres
dist_final = dists + countrate_dist

print(dist_final[1][1])

-2.220446049250313e-16


In [11]:
# get the k nearest neighbors. 
def getNeighbors(movieID, dist_mtx, k):
    movieID -= 1  
    dist2movie = np.vstack((possible_ids, dist_mtx[movieID]))
    dist2movie = np.transpose(dist2movie)
    dist2movie = np.delete(dist2movie, movieID, axis = 0)
    dist2movie = dist2movie[np.argsort(dist2movie[:, 1])]
    return dist2movie[0:k, :]
K = 5
neighbors = getNeighbors(1, dist_final, K)

avg_rating = 0
for neighbor in neighbors:
  avg_rating += stats.loc[neighbor[0]]['Average Rating']/K

print(avg_rating)

3.900331411561667
