In [172]:
import datetime
import gzip
import json
import math
import random
from collections import defaultdict

In [2]:
# Name of the raw Goodreads review dump and of its gzip-compressed counterpart.
review_file = 'goodreads_reviews_comics_graphic.json'
compressed_file = f'{review_file}.gz'

In [57]:
# Open the gzip archive as UTF-8 text; `f` is consumed by the loading cell below.
path = compressed_file
f = gzip.open(filename=path, mode='rt', encoding="utf8")

In [58]:
dataset = []

# Open the archive inside this cell: iterating a handle created in another cell
# leaves it exhausted, so re-running the cell would silently produce an empty
# dataset, and the handle was never closed. `with` fixes both.
with gzip.open(path, 'rt', encoding="utf8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of crashing in json.loads
        d = json.loads(line)
        # Coerce the numeric fields to int so they can be used in arithmetic.
        d['rating'] = int(d['rating'])
        d['n_comments'] = int(d['n_comments'])
        d['n_votes'] = int(d['n_votes'])
        dataset.append(d)

In [59]:
# Lookup structures derived from the raw reviews:
#   usersPerItem : book_id -> set of user_ids that reviewed it
#   itemsPerUser : user_id -> set of book_ids the user reviewed
#   itemNames    : book_id -> review text of the last review seen for that book
#                  NOTE(review): despite the name this stores review text, not a title.
#   ratingDict   : (user_id, book_id) -> rating (last review wins on duplicates)
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)
itemNames = {}
ratingDict = {}

for review in dataset:
    user = review['user_id']
    item = review['book_id']
    itemsPerUser[user].add(item)
    usersPerItem[item].add(user)
    itemNames[item] = review['review_text']
    ratingDict[(user, item)] = review['rating']

In [84]:
# Mean rating given by each user and mean rating received by each item,
# computed over every (user, item) pair recorded in ratingDict.
userAverages = {}
itemAverages = {}

for user, rated_items in itemsPerUser.items():
    given = [ratingDict[(user, item)] for item in rated_items]
    userAverages[user] = sum(given) / len(given)

for item, raters in usersPerItem.items():
    received = [ratingDict[(user, item)] for user in raters]
    itemAverages[item] = sum(received) / len(received)

In [8]:
def Jaccard(s1, s2):
    """Return the Jaccard similarity |s1 ∩ s2| / |s1 ∪ s2| of two sets.

    Returns 0 when both sets are empty (the ratio would otherwise be 0/0).
    """
    shared = s1.intersection(s2)
    combined = s1.union(s2)
    if not combined:
        return 0
    return len(shared) / len(combined)

In [104]:
def mostSimilar(i, N, similarityFunc=None):
    """Return the N items most similar to item `i`, best first.

    Parameters
    ----------
    i : item id to compare against; it is excluded from the result.
    N : maximum number of (similarity, item) pairs to return.
    similarityFunc : optional callable taking two item ids and returning a
        score. When omitted, the Jaccard similarity between the sets of users
        who rated each item is used.

    Returns
    -------
    list of (similarity, item_id) tuples sorted by similarity, descending.
    Ties keep the iteration order of `usersPerItem` (the sort is stable).

    The optional third argument keeps this definition call-compatible with the
    three-argument `mostSimilar` defined later in the notebook, so cells work
    regardless of which definition was executed last.
    """
    if similarityFunc is None:
        users = usersPerItem[i]

        def score(i2):
            return Jaccard(users, usersPerItem[i2])
    else:
        def score(i2):
            return similarityFunc(i, i2)

    similarities = [(score(i2), i2) for i2 in usersPerItem if i2 != i]
    # Sort on the score only so that ties are not broken by item id.
    similarities.sort(key=lambda x: x[0], reverse=True)
    return similarities[:N]

## Q1

In [30]:
# Q1: the 10 items with the highest similarity to item '18471619'.
mostSimilar('18471619', 10)

[(0.16666666666666666, '25334626'),
 (0.14285714285714285, '25659811'),
 (0.13793103448275862, '18369278'),
 (0.13157894736842105, '18430205'),
 (0.12903225806451613, '20299669'),
 (0.125, '17995154'),
 (0.12121212121212122, '18853527'),
 (0.12121212121212122, '23093378'),
 (0.12121212121212122, '23241671'),
 (0.11764705882352941, '26778333')]

## Q2

In [150]:
# User for whom the Q2 recommendations are generated; getFavouriteItem reads
# this as a global to exclude the items this user has already rated.
original_user = 'dc3763cdb9b2cae805882878eebb6a32'

### a)

In [119]:
# NOTE(review): getFavouriteItem is only defined in a later cell (Q2 b), so on a
# fresh Restart & Run All this cell raises NameError. The definition visible in
# this notebook also filters out every item in itemsPerUser[original_user], so
# getFavouriteItem(original_user) returns None; the recorded output presumably
# came from an earlier definition — confirm before relying on it.
mostSimilar(getFavouriteItem(original_user), 10)

[(0.16666666666666666, '25334626'),
 (0.14285714285714285, '25659811'),
 (0.13793103448275862, '18369278'),
 (0.13157894736842105, '18430205'),
 (0.12903225806451613, '20299669'),
 (0.125, '17995154'),
 (0.12121212121212122, '18853527'),
 (0.12121212121212122, '23093378'),
 (0.12121212121212122, '23241671'),
 (0.11764705882352941, '26778333')]

### b)

In [148]:
def mostSimilarUsers(base_user, N):
    """Rank every other user by Jaccard similarity of rated-item sets.

    Returns a list of (similarity, user_id) tuples sorted by similarity,
    descending; `base_user` itself is excluded.

    NOTE(review): `N` is accepted but never applied — the full ranking is
    returned. The Q2 b cells in this notebook filter and slice the result
    themselves, so truncating here would change their output; confirm the
    intended contract before adding `[:N]`.
    """
    base_items = itemsPerUser[base_user]
    ranked = [
        (Jaccard(base_items, other_items), other)
        for other, other_items in itemsPerUser.items()
        if other != base_user
    ]
    # Stable sort on the score only: ties keep itemsPerUser iteration order.
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    return ranked

In [152]:
# Keep only the user ids from the (similarity, user) ranking for original_user.
similarUsers = [user for _, user in mostSimilarUsers(original_user, 10)]

In [155]:
def getFavouriteItem(user):
    """Return `user`'s highest-rated item that `original_user` has not rated.

    Reads the global `original_user`. Ties on rating are broken by the larger
    item id (tuple comparison). Returns None when the user has no item outside
    original_user's set — which is always the case for original_user itself.
    """
    own_items = itemsPerUser[user]
    candidates = [
        (ratingDict[(user, item)], item)
        for item in own_items
        if item not in itemsPerUser[original_user]
    ]
    return max(candidates)[1] if candidates else None

In [158]:
# Favourite unseen item of each similar user, in ranking order; users with no
# candidate (getFavouriteItem returned None) are dropped. Show the first 10.
itemList = [fav for fav in map(getFavouriteItem, similarUsers) if fav is not None]
itemList[:10]

['5805',
 '23531233',
 '59715',
 '7736086',
 '22454333',
 '47558',
 '9516',
 '9041662',
 '991197',
 '7805977']

## Q3

In [64]:
def PearsonA(i1, i2):
    """Pearson-style similarity of two items over their shared raters only.

    Deviations are taken from each item's overall average (itemAverages), but
    both the numerator and the two denominator sums run over the users who
    rated both items. Returns 0 when the denominator is 0.
    """
    mean1 = itemAverages[i1]
    mean2 = itemAverages[i2]
    cross_sum = 0
    sq_sum1 = 0
    sq_sum2 = 0
    # One pass over the common raters accumulates all three sums.
    for user in usersPerItem[i1].intersection(usersPerItem[i2]):
        dev1 = ratingDict[(user, i1)] - mean1
        dev2 = ratingDict[(user, i2)] - mean2
        cross_sum += dev1 * dev2
        sq_sum1 += dev1 ** 2
        sq_sum2 += dev2 ** 2
    scale = math.sqrt(sq_sum1) * math.sqrt(sq_sum2)
    return cross_sum / scale if scale != 0 else 0

In [73]:
def PearsonB(i1, i2):
    """Pearson-style similarity with denominators over each item's full rater set.

    The numerator sums deviation products over users who rated both items; each
    denominator term sums squared deviations over all users who rated that
    item. Returns 0 when the denominator is 0.
    """
    mean1 = itemAverages[i1]
    mean2 = itemAverages[i2]
    raters1 = usersPerItem[i1]
    raters2 = usersPerItem[i2]

    cross_sum = 0
    for user in raters1.intersection(raters2):
        cross_sum += (ratingDict[(user, i1)] - mean1) * (ratingDict[(user, i2)] - mean2)

    def squared_deviation_sum(item, raters, mean):
        total = 0
        for user in raters:
            total += (ratingDict[(user, item)] - mean) ** 2
        return total

    scale = (math.sqrt(squared_deviation_sum(i1, raters1, mean1))
             * math.sqrt(squared_deviation_sum(i2, raters2, mean2)))
    if scale == 0:
        return 0
    return cross_sum / scale

In [74]:
def mostSimilar(i, N, similarityFunc=None):
    """Return the N items most similar to item `i` under `similarityFunc`.

    Parameters
    ----------
    i : item id to compare against; it is excluded from the result.
    N : maximum number of (similarity, item) pairs to return.
    similarityFunc : callable taking two item ids (i, other) and returning a
        score, e.g. PearsonA or PearsonB. When omitted, falls back to the
        Jaccard similarity between the sets of users who rated each item.

    Returns
    -------
    list of (similarity, item_id) tuples sorted by similarity, descending.
    Ties keep the iteration order of `usersPerItem` (the sort is stable).

    This definition replaces the earlier two-argument `mostSimilar`; the
    default keeps those two-argument calls working when cells are re-run
    after this one instead of raising TypeError.
    """
    if similarityFunc is None:
        users = usersPerItem[i]

        def score(i2):
            return Jaccard(users, usersPerItem[i2])
    else:
        def score(i2):
            return similarityFunc(i, i2)

    similarities = [(score(i2), i2) for i2 in usersPerItem if i2 != i]
    # Sort on the score only so that ties are not broken by item id.
    similarities.sort(key=lambda x: x[0], reverse=True)
    return similarities[:N]

### a)

In [71]:
# Q3 a: top 10 for item '18471619' using PearsonA (denominator over shared raters).
mostSimilar('18471619', 10, PearsonA)

[(1.0000000000000002, '62953'),
 (1.0000000000000002, '29431094'),
 (1.0000000000000002, '23200006'),
 (1.0000000000000002, '23332879'),
 (1.0000000000000002, '23901106'),
 (1.0000000000000002, '7342071'),
 (1.0000000000000002, '3328828'),
 (1.0000000000000002, '993861'),
 (1.0000000000000002, '26251358'),
 (1.0000000000000002, '23131087')]

### b)

In [75]:
# Q3 b: top 10 for item '18471619' using PearsonB (denominator over all raters of each item).
mostSimilar('18471619', 10, PearsonB)

[(0.31898549007874194, '20300526'),
 (0.18785865431369264, '13280885'),
 (0.17896391275176457, '18208501'),
 (0.16269036695641687, '21521612'),
 (0.16269036695641687, '25430791'),
 (0.1555075595594449, '1341758'),
 (0.1526351566298752, '6314737'),
 (0.15204888048160353, '4009034'),
 (0.1494406444160154, '988744'),
 (0.14632419481281997, '18430205')]

## Q4

In [76]:
# Full review records grouped by user and by item; filled in the next cell.
# NOTE(review): the fill loop appends, so re-running it without re-running this
# cell first duplicates every review.
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

In [78]:
# Index every review record under both its author and its book.
for review in dataset:
    reviewsPerUser[review['user_id']].append(review)
    reviewsPerItem[review['book_id']].append(review)

In [80]:
# Global mean rating over the whole dataset; used as the fallback prediction.
ratingMean = sum(review['rating'] for review in dataset) / len(dataset)

In [85]:
def predictRating(user, item):
    """Predict `user`'s rating of `item` from the user's other reviews.

    The prediction is a baseline for `item` plus a Jaccard-weighted average of
    how far the user's other ratings deviate from those items' averages.

    Baseline:
      * if (user, item) is already in ratingDict, the item's average with that
        rating removed (leave-one-out), so the target does not leak into its
        own prediction;
      * if that leaves no other rating (the user is the item's only rater), the
        global `ratingMean` — previously this case raised ZeroDivisionError;
      * if the pair is not rated at all, the plain item average — previously
        this case raised KeyError.

    Returns `ratingMean` when none of the user's other items has positive
    similarity to `item`.
    """
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['book_id']
        if i2 == item:
            continue  # never use the target item's own review as evidence
        ratings.append(d['rating'] - itemAverages[i2])
        similarities.append(Jaccard(usersPerItem[item], usersPerItem[i2]))

    totalSimilarity = sum(similarities)
    if not (totalSimilarity > 0):
        return ratingMean

    N = len(usersPerItem[item])
    if (user, item) not in ratingDict:
        baseline = itemAverages[item]
    elif N > 1:
        baseline = (itemAverages[item] * N - ratingDict[(user, item)]) / (N - 1)
    else:
        baseline = ratingMean

    weightedRatings = [(x * y) for x, y in zip(ratings, similarities)]
    return baseline + sum(weightedRatings) / totalSimilarity

In [189]:
def MSE(predictions, labels):
    """Mean squared error between paired predictions and labels.

    Pairs are formed with zip, so extra elements of the longer sequence are
    ignored. Raises ZeroDivisionError when there are no pairs.
    """
    squared_errors = []
    for predicted, actual in zip(predictions, labels):
        squared_errors.append((predicted - actual) ** 2)
    return sum(squared_errors) / len(squared_errors)

In [159]:
# Q4: mean squared error of predictRating over the first 10000 (user, item)
# pairs met while walking itemsPerUser.
# NOTE(review): each user's items are a set of strings, so which pairs fall
# inside the first 10000 can vary between interpreter runs (hash randomisation).
samples = 0
val = 0
for user in itemsPerUser:
    for item in itemsPerUser[user]:
        residual = ratingDict[(user, item)] - predictRating(user, item)
        val += residual ** 2
        samples += 1
        if samples >= 10000:
            break
    else:
        continue  # cap not reached for this user: move on to the next one
    break  # inner loop hit the cap: stop sampling altogether

MSE_val = val / samples

In [160]:
# Show the Q4 mean squared error computed in the previous cell.
MSE_val

0.712368110932227

## Q5

In [175]:
# (user_id, book_id) -> POSIX timestamp of the review's `date_updated` field.
# The format string expects e.g. weekday, month name, day, time, UTC offset, year.
date_format = "%a %b %d %H:%M:%S %z %Y"
timingDict = {
    (d['user_id'], d['book_id']): datetime.datetime.strptime(d['date_updated'], date_format).timestamp()
    for d in dataset
}

In [184]:
# All review timestamps, newest first; the head of the list is the latest one.
timingList = sorted(timingDict.values(), reverse=True)
latestTime = timingList[0]

In [185]:
# Show the most recent review timestamp (seconds since the epoch).
latestTime

1509911734.0

In [196]:
import math

In [234]:
def timeFunc(time, l=0.00000001):
    """Exponential decay weight exp(-l * time).

    `time` is the elapsed time being discounted (callers in this notebook pass
    a difference of POSIX timestamps, i.e. seconds) and `l` is the decay rate.
    """
    exponent = -l * time
    return math.exp(exponent)

In [235]:
def predictTimeBasedRating(user, item):
    """Predict `user`'s rating of `item`, down-weighting temporally distant reviews.

    Each of the user's other reviews contributes its deviation from that item's
    average, weighted by Jaccard(item, other) * timeFunc(|t_other - t_item|),
    where the timestamps come from timingDict. The weighted mean deviation is
    added to itemAverages[item]. Returns ratingMean when the total weight is
    not positive.

    NOTE(review): unlike predictRating, the baseline here is the plain item
    average, which includes the (user, item) rating itself when that pair is
    in the data — confirm this is intended before comparing the two MSEs.
    """
    deviations = []
    weights = []
    for review in reviewsPerUser[user]:
        other = review['book_id']
        if other == item:
            continue
        deviations.append(review['rating'] - itemAverages[other])
        elapsed = abs(timingDict[(user, other)] - timingDict[(user, item)])
        weights.append(Jaccard(usersPerItem[item], usersPerItem[other]) * timeFunc(elapsed))
    if not (sum(weights) > 0):
        return ratingMean
    weightedRatings = [(x * y) for x, y in zip(deviations, weights)]
    return itemAverages[item] + sum(weightedRatings) / sum(weights)

In [236]:
# Q5: mean squared error of predictTimeBasedRating over the first 10000
# (user, item) pairs met while walking itemsPerUser (same sampling as Q4).
samples = 0
val = 0
for user in itemsPerUser:
    for item in itemsPerUser[user]:
        residual = ratingDict[(user, item)] - predictTimeBasedRating(user, item)
        val += residual ** 2
        samples += 1
        if samples >= 10000:
            break
    else:
        continue  # cap not reached for this user: move on to the next one
    break  # inner loop hit the cap: stop sampling altogether

MSE_val = val / samples

In [237]:
# Show the Q5 mean squared error computed in the previous cell.
MSE_val

0.7047761113627481