In [37]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
from collections import defaultdict
import pandas as pd
import math

In [39]:
train_dataset = pd.read_csv('/content/drive/Shareddrives/CSE258/clean_data/train.csv')
validation_dataset = pd.read_csv('/content/drive/Shareddrives/CSE258/clean_data/valid.csv')
test_dataset = pd.read_csv('/content/drive/Shareddrives/CSE258/clean_data/test.csv')

dataset = pd.concat([train_dataset, validation_dataset]) # merges train and validation dataset into one beacause we don't need a validation set for similarity-based rating prediction
dataset.shape

(175869, 6)

In [45]:
train_dataset.head()

Unnamed: 0,item,user,paid,time,rating,review
0,Fisher_Price_Loving_Family_Sweet_Sounds_Dollhouse,karleigh,79.99,1071878400,4.0,i researched and looked at all the fisher pric...
1,Nokia_E62_Smartphone,mfw1982,199.0,1196294400,5.0,no it doesnt have a camera and yes the keys ar...
2,pr-Dell_DJ_15GB_MP3_Player,davydanger,199.0,1081987200,1.0,i was very excited to buy this product given a...
3,Blue_s_Clues_Bath_Time_Blue,kbmg,gift,990489600,2.0,my two kids ages 2 and 4 are big blues clues f...
4,Spider_Man_Gloves,pluckyduck,9.99,1031443200,3.0,dad was walking through toys r us the other we...


In [15]:
dataset.columns

Index(['item', 'user', 'paid', 'time', 'rating', 'review'], dtype='object')

In [26]:
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)
ratingDict = {}

In [27]:
for index, row in dataset.iterrows():
  item = row['item']
  user = row['user']
  rating = row['rating']

  usersPerItem[item].add(user)
  itemsPerUser[user].add(item)

  ratingDict[(item, user)] = rating

  reviewsPerUser[user].append({'item': item, 'rating': rating})
  reviewsPerItem[item].append({'user': user, 'rating': rating})

In [28]:
ratingMean = dataset['rating'].mean()

In [29]:
userAverages = defaultdict(float)
itemAverages = defaultdict(float)

for u in itemsPerUser:
    rs = [ratingDict[(i,u)] for i in itemsPerUser[u]]
    userAverages[u] = sum(rs) / len(rs)
    
for i in usersPerItem:
    rs = [ratingDict[(i,u)] for u in usersPerItem[i]]
    itemAverages[i] = sum(rs) / len(rs)

### Implementing similarity functions

TODO:
1.   Implement Cosine and Pearson Similarity functions and check the MSE (done)



In [121]:
def Jaccard(s1, s2):
  num = len(s1.intersection(s2))
  den = len(s1.union(s2))

  if den == 0: 
    return 0
  return num/den

In [68]:
def Cosine(i1, i2):
    # Between two items
    inter = usersPerItem[i1].intersection(usersPerItem[i2])
    numer = 0
    denom1 = 0
    denom2 = 0
    for u in inter:
        numer += ratingDict[(i1,u)]*ratingDict[(i2,u)]
    for u in usersPerItem[i1]:
        denom1 += ratingDict[(i1,u)]**2
    for u in usersPerItem[i2]:
        denom2 += ratingDict[(i2,u)]**2
    denom = math.sqrt(denom1) * math.sqrt(denom2)
    if denom == 0: return 0
    return numer / denom

In [111]:
def CosineUsers(u1, u2):
    # Between two items
    inter = itemsPerUser[u1].intersection(itemsPerUser[u2])
    numer = 0
    denom1 = 0
    denom2 = 0
    for i in inter:
        numer += ratingDict[(i,u1)]*ratingDict[(i,u2)]
    for i in itemsPerUser[u1]:
        denom1 += ratingDict[(i,u1)]**2
    for i in itemsPerUser[u2]:
        denom2 += ratingDict[(i,u2)]**2
    denom = math.sqrt(denom1) * math.sqrt(denom2)
    if denom == 0: return 0
    return numer / denom

In [73]:
def Pearson(i1, i2):
    # Between two items
    iBar1 = itemAverages[i1]
    iBar2 = itemAverages[i2]
    inter = usersPerItem[i1].intersection(usersPerItem[i2])
    numer = 0
    denom1 = 0
    denom2 = 0
    for u in inter:
        numer += (ratingDict[(i1,u)] - iBar1)*(ratingDict[(i2,u)] - iBar2)
    for u in inter: #usersPerItem[i1]:
        denom1 += (ratingDict[(i1,u)] - iBar1)**2
    #for u in usersPerItem[i2]:
        denom2 += (ratingDict[(i2,u)] - iBar2)**2
    denom = math.sqrt(denom1) * math.sqrt(denom2)
    if denom == 0: return 0
    return numer / denom

In [112]:
def PearsonUsers(u1, u2):
    # Between two items
    iBar1 = userAverages[u1]
    iBar2 = userAverages[u2]
    inter = itemsPerUser[u1].intersection(itemsPerUser[u2])
    numer = 0
    denom1 = 0
    denom2 = 0
    for i in inter:
        numer += (ratingDict[(i,u1)] - iBar1)*(ratingDict[(i,u2)] - iBar2)
    for i in inter: #itemsPerUser[u1]:
        denom1 += (ratingDict[(i,u1)] - iBar1)**2
    #for i in itemsPerUser[u2]:
        denom2 += (ratingDict[(i,u2)] - iBar2)**2
    denom = math.sqrt(denom1) * math.sqrt(denom2)
    if denom == 0: return 0
    return numer / denom

### Similarity-based Rating Estimation

In [9]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [122]:
def predictRatingJaccard(user, item):
  ratings = []
  similarities = []

  for d in reviewsPerUser[user]:
    item2 = d['item']

    if item2 == item: continue

    if(itemAverages[item2] != 0):
      ratings.append(d['rating'] - itemAverages[item2])
    else:
      ratings.append(d['rating'] - ratingMean)

    similarities.append(Jaccard(usersPerItem[item2], usersPerItem[item]))

  if (sum(similarities) > 0):
    weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
    if(itemAverages[item] != 0):
      return itemAverages[item] + sum(weightedRatings) / sum(similarities)
    else:
      return ratingMean + sum(weightedRatings) / sum(similarities)
  else:
    if(itemAverages[item] != 0):
      return itemAverages[item]
    else:
      return ratingMean

In [123]:
def predictRatingJaccardUsers(user, item):
  ratings = []
  similarities = []

  for d in reviewsPerItem[item]:
    user2 = d['user']

    if user2 == user: continue

    if(userAverages[user2] != 0):
      ratings.append(d['rating'] - userAverages[user2])
    else:
      ratings.append(d['rating'] - ratingMean)

    similarities.append(Jaccard(itemsPerUser[user2], itemsPerUser[user]))

  if (sum(similarities) > 0):
    weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
    if(userAverages[user] != 0):
      return userAverages[user] + sum(weightedRatings) / sum(similarities)
    else:
      return ratingMean + sum(weightedRatings) / sum(similarities)
  else:
    if(userAverages[user] != 0):
      return userAverages[user]
    else:
      return ratingMean

In [109]:
def predictRatingCosine(user, item):
  ratings = []
  similarities = []

  for d in reviewsPerUser[user]:
    item2 = d['item']

    if item2 == item: continue

    if(itemAverages[item2] != 0):
      ratings.append(d['rating'] - itemAverages[item2])
    else:
      ratings.append(d['rating'] - ratingMean)

    similarities.append(Cosine(item2, item))

  if (sum(similarities) > 0):
    weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
    if(itemAverages[item] != 0):
      return itemAverages[item] + sum(weightedRatings) / sum(similarities)
    else:
      return ratingMean + sum(weightedRatings) / sum(similarities)
  else:
    if(itemAverages[item] != 0):
      return itemAverages[item]
    else:
      return ratingMean

In [114]:
def predictRatingCosineUsers(user, item):
  ratings = []
  similarities = []

  for d in reviewsPerItem[item]:
    user2 = d['user']

    if user2 == user: continue

    if(userAverages[user2] != 0):
      ratings.append(d['rating'] - userAverages[user2])
    else:
      ratings.append(d['rating'] - ratingMean)

    similarities.append(CosineUsers(user2, user))

  if (sum(similarities) > 0):
    weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
    if(userAverages[user] != 0):
      return userAverages[user] + sum(weightedRatings) / sum(similarities)
    else:
      return ratingMean + sum(weightedRatings) / sum(similarities)
  else:
    if(userAverages[user] != 0):
      return userAverages[user]
    else:
      return ratingMean

In [100]:
def predictRatingPearson(user, item):
  ratings = []
  similarities = []

  for d in reviewsPerUser[user]:
    item2 = d['item']

    if item2 == item: continue

    if(itemAverages[item2] != 0):
      ratings.append(d['rating'] - itemAverages[item2])
    else:
      ratings.append(d['rating'] - ratingMean)

    similarities.append(Pearson(item2, item))

  if (sum(similarities) > 0):
    weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
    if(itemAverages[item] != 0):
      return itemAverages[item] + sum(weightedRatings) / sum(similarities)
    else:
      return ratingMean + sum(weightedRatings) / sum(similarities)
  else:
    if(itemAverages[item] != 0):
      return itemAverages[item]
    else:
      return ratingMean

In [115]:
def predictRatingPearsonUsers(user, item):
  ratings = []
  similarities = []

  for d in reviewsPerItem[item]:
    user2 = d['user']

    if user2 == user: continue

    if(userAverages[user2] != 0):
      ratings.append(d['rating'] - userAverages[user2])
    else:
      ratings.append(d['rating'] - ratingMean)

    similarities.append(PearsonUsers(user2, user))

  if (sum(similarities) > 0):
    weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
    if(userAverages[user] != 0):
      return userAverages[user] + sum(weightedRatings) / sum(similarities)
    else:
      return ratingMean + sum(weightedRatings) / sum(similarities)
  else:
    if(userAverages[user] != 0):
      return userAverages[user]
    else:
      return ratingMean

In [124]:
alwaysMean = [] # when you always predict mean
predictionsJaccard = []
predictionsJaccardUsers = []
predictionsCosine = []
predictionsCosineUsers = []
predictionsPearson = []
predictionsPearsonUsers = []
actual = []

for index, row in test_dataset.iterrows():
  user = row['user']
  item = row['item']
  actual_rating = row['rating']
  predicted_rating_Jaccard = predictRatingJaccard(user, item)
  predicted_rating_Jaccard_Users = predictRatingJaccardUsers(user, item)
  predicted_rating_Cosine = predictRatingCosine(user, item)
  predicted_rating_Cosine_Users = predictRatingCosineUsers(user, item)
  predicted_rating_Pearson = predictRatingPearson(user, item)
  predicted_rating_Pearson_Users = predictRatingPearsonUsers(user, item)

  actual.append(actual_rating)
  alwaysMean.append(ratingMean)
  predictionsJaccard.append(predicted_rating_Jaccard)
  predictionsJaccardUsers.append(predicted_rating_Jaccard_Users)
  predictionsCosine.append(predicted_rating_Cosine)
  predictionsCosineUsers.append(predicted_rating_Cosine_Users)
  predictionsPearson.append(predicted_rating_Pearson)
  predictionsPearsonUsers.append(predicted_rating_Pearson_Users)

MSE using mean value prediction:

In [125]:
MSE(alwaysMean, actual)

2.2220352124670564

MSE using Jaccard similarity:

In [126]:
MSE(predictionsJaccard, actual)

1.9520315086437812

MSE using Jaccard similarity with users instead of items:

In [127]:
MSE(predictionsJaccardUsers, actual)

2.4439494212685773

MSE using Cosine similarity:

In [128]:
MSE(predictionsCosine, actual)

1.9522715270969415

MSE using Cosine similarity with users instead of items:

In [129]:
MSE(predictionsCosineUsers, actual)

2.445925868958561

MSE using Pearson similarity:

In [130]:
MSE(predictionsPearson, actual)

1.9629140986021547

MSE using Pearson similarity with users instead of items:

In [131]:
MSE(predictionsPearsonUsers, actual)

2.5303757251800105