In [2]:
from collections import defaultdict
import pandas as pd

In [26]:
# Directory holding the pre-cleaned CSV splits. Change this single constant to
# point the notebook at a different copy of the data.
DATA_DIR = '/content/drive/MyDrive/clean_data'

train_dataset = pd.read_csv(f'{DATA_DIR}/train.csv')
validation_dataset = pd.read_csv(f'{DATA_DIR}/valid.csv')
test_dataset = pd.read_csv(f'{DATA_DIR}/test.csv')

# Merge the train and validation splits into one frame, because we don't need
# a validation set for similarity-based rating prediction.
dataset = pd.concat([train_dataset, validation_dataset])
dataset.shape

(175869, 6)

In [27]:
# Set-valued lookups: which users rated an item / which items a user rated.
usersPerItem, itemsPerUser = defaultdict(set), defaultdict(set)

# List-valued lookups holding one record per review, in insertion order.
reviewsPerUser, reviewsPerItem = defaultdict(list), defaultdict(list)

# Plain mapping keyed by (item, user) tuples; filled by the loop that follows.
ratingDict = dict()

In [28]:
# Populate the lookup tables in one pass over the merged training data.
# The three needed columns are iterated in lockstep instead of using
# DataFrame.iterrows(), which materialises a Series for every row (slow) and
# does not preserve per-column dtypes.
for item, user, rating in zip(dataset['item'], dataset['user'], dataset['rating']):
  usersPerItem[item].add(user)
  itemsPerUser[user].add(item)

  # Keyed by (item, user); a repeated pair keeps the last rating seen.
  ratingDict[(item, user)] = rating

  reviewsPerUser[user].append({'item': item, 'rating': rating})
  reviewsPerItem[item].append({'user': user, 'rating': rating})

In [29]:
# Mean of the 'rating' column over the merged train + validation frame.
# predictRating falls back to this value when an item has no stored average.
ratingMean = dataset['rating'].mean()

In [30]:
# Average rating per user, taken over the distinct items that user rated.
# defaultdict(float) means a missing key reads back as 0.0.
userAverages = defaultdict(float)
for user, ratedItems in itemsPerUser.items():
    userTotal = sum(ratingDict[(item, user)] for item in ratedItems)
    userAverages[user] = userTotal / len(ratedItems)

# Average rating per item, taken over the distinct users who rated it.
itemAverages = defaultdict(float)
for item, raters in usersPerItem.items():
    itemTotal = sum(ratingDict[(item, user)] for user in raters)
    itemAverages[item] = itemTotal / len(raters)

### Implementing similarity functions

TODO:
1. Implement cosine and Pearson similarity functions and check the MSE each one gives.



In [31]:
def jaccard(s1, s2):
  """Jaccard similarity of two sets: |s1 ∩ s2| / |s1 ∪ s2|.

  Returns 0 when the union is empty (i.e. both inputs are empty), which
  avoids dividing by zero.
  """
  shared = s1.intersection(s2)
  combined = s1.union(s2)
  return len(shared) / len(combined) if combined else 0

### Similarity-based Rating Estimation

In [35]:
def MSE(predictions, labels):
    """Mean squared error between paired predictions and labels.

    The two sequences are paired with zip, so pairing stops at the shorter
    one. With no pairs at all the final division raises ZeroDivisionError.
    """
    squaredErrors = []
    for predicted, expected in zip(predictions, labels):
        squaredErrors.append((predicted - expected) ** 2)
    return sum(squaredErrors) / len(squaredErrors)

In [52]:
def _itemBaseline(item):
  """Return the stored average rating for `item`, or `ratingMean` as a fallback.

  A stored average of exactly 0 is treated the same as a missing one. The
  lookup uses .get() so that querying an unknown item does not insert a
  0.0 entry into `itemAverages`.
  """
  average = itemAverages.get(item, 0)
  return average if average != 0 else ratingMean


def predictRating(user, item):
  """Predict the rating `user` would give `item` from item-item similarity.

  For every other item the user has reviewed, the deviation of the user's
  rating from that item's baseline is weighted by the Jaccard similarity
  between the two items' sets of raters. The prediction is

      baseline(item) + sum(sim * deviation) / sum(sim)

  and collapses to `baseline(item)` when the similarities sum to zero
  (including when the user has no other reviews).

  All lookups are read-only: the original indexed the defaultdicts with [],
  which silently inserted empty entries for every unseen user or item and
  left empty sets behind in `usersPerItem`.

  Parameters:
    user: key into `reviewsPerUser`.
    item: key into `usersPerItem` / `itemAverages`.

  Returns:
    The predicted rating as a number.
  """
  targetRaters = usersPerItem.get(item, set())
  deviations = []
  similarities = []

  for review in reviewsPerUser.get(user, ()):
    otherItem = review['item']

    # The target item itself must not contribute to its own prediction.
    if otherItem == item:
      continue

    deviations.append(review['rating'] - _itemBaseline(otherItem))
    similarities.append(jaccard(usersPerItem.get(otherItem, set()), targetRaters))

  baseline = _itemBaseline(item)
  totalSimilarity = sum(similarities)

  if totalSimilarity > 0:
    weightedSum = sum(d * s for d, s in zip(deviations, similarities))
    return baseline + weightedSum / totalSimilarity
  return baseline

In [53]:
# Score every held-out test interaction.
# Columns are read directly instead of looping with DataFrame.iterrows(),
# which builds a Series per row and does not preserve per-column dtypes.
actual = test_dataset['rating'].tolist()    # ground-truth ratings
alwaysMean = [ratingMean] * len(actual)     # baseline: always predict the global mean
predictions = [
  predictRating(user, item)
  for user, item in zip(test_dataset['user'], test_dataset['item'])
]

In [47]:
# Baseline error: MSE of the constant global-mean predictions against the test ratings.
MSE(alwaysMean, actual)

2.2220352124670564

In [54]:
# Error of the predictRating outputs on the same test ratings, for comparison with the baseline.
MSE(predictions, actual)

1.9498775185780508