In [8]:
import gzip
import random
import scipy
import math
from collections import defaultdict
from datetime import datetime, timezone, timedelta
import numpy as np
import csv

In [6]:
def readGz(path):
    """Lazily yield one evaluated Python object per line of a gzip text file.

    Args:
        path: path to a gzip-compressed file whose lines are Python literals
            (e.g. dict reprs).

    Yields:
        The result of ``eval`` applied to each line, in file order.
    """
    # SECURITY NOTE: eval() executes whatever expression a line contains, so
    # this must only be pointed at trusted data files. If every line is a plain
    # literal, ast.literal_eval would be the safer drop-in.
    # The `with` block closes the handle when the generator is exhausted,
    # closed, or garbage-collected, instead of leaking it.
    with gzip.open(path, 'rt') as f:
        for l in f:
            yield eval(l)

def readCSV(path):
    """Lazily yield ``(user_id, recipe_id, row)`` for each data row of a gzip CSV.

    The first line of the file is treated as the header; every following row is
    turned into a dict keyed by the header names.

    Args:
        path: path to a gzip-compressed CSV file with at least the columns
            ``user_id`` and ``recipe_id``.

    Yields:
        Tuples ``(row['user_id'], row['recipe_id'], row)`` where ``row`` maps
        column name -> string value.

    Raises:
        KeyError: if the header lacks ``user_id`` or ``recipe_id``.
    """
    # newline='' is what the csv module asks for so that it can handle line
    # endings (and quoted embedded newlines) itself; the `with` block makes
    # sure the handle is closed once iteration ends.
    with gzip.open(path, 'rt', newline='') as f:
        c = csv.reader(f)
        header = next(c, None)
        if header is None:
            # Completely empty file: nothing to yield (a bare next(c) would
            # surface as RuntimeError from inside a generator).
            return
        for l in c:
            d = dict(zip(header,l))
            yield d['user_id'],d['recipe_id'],d

In [130]:
# Read every interaction once, assigning each user / recipe a dense integer
# index in order of first appearance and keeping the raw
# (user_id, recipe_id, rating, date) tuples in file order.
userIDs = {}
itemIDs = {}
interactions = []

for user, recipe, d in readCSV("trainInteractions.csv.gz"):
    u, i = d['user_id'], d['recipe_id']
    r, t = int(d['rating']), d['date']
    # setdefault only stores the next free index the first time a key is seen.
    userIDs.setdefault(u, len(userIDs))
    itemIDs.setdefault(i, len(itemIDs))
    interactions.append((u, i, r, t))

In [131]:
# Hold out the last 10% of the interactions (file order, no shuffling) for
# validation; the first 90% is the training split.
nTrain = int(len(interactions) * 0.9)
interactionsTrain, interactionsTest = interactions[:nTrain], interactions[nTrain:]
nTest = len(interactionsTest)

In [132]:
# Display the held-out split as (user_id, recipe_id, rating, date) tuples.
# This prints the whole list; the saved output below is cut off.
interactionsTest

[('28333459', '29237128', 5, '2006-01-05'),
 ('87291172', '83050681', 3, '2007-04-16'),
 ('88668005', '12299735', 5, '2002-12-16'),
 ('81677876', '91923417', 5, '2008-08-08'),
 ('21686313', '09960567', 5, '2010-06-15'),
 ('32445558', '34933911', 5, '2009-05-04'),
 ('74862759', '86867583', 5, '2009-05-21'),
 ('29401066', '61519864', 5, '2008-10-22'),
 ('41236533', '60334272', 5, '2015-10-15'),
 ('26800226', '82316690', 5, '2013-01-08'),
 ('97535102', '08323423', 5, '2008-09-20'),
 ('34625961', '55171081', 4, '2007-09-29'),
 ('74192512', '37384233', 5, '2009-02-07'),
 ('73530407', '28187835', 5, '2006-03-24'),
 ('54552478', '25668053', 5, '2009-09-15'),
 ('57551345', '28654323', 5, '2009-11-01'),
 ('35732124', '91616615', 0, '2011-12-25'),
 ('52465943', '40017256', 5, '2012-02-09'),
 ('89029297', '17203342', 4, '2009-09-22'),
 ('92958977', '45936199', 5, '2008-03-02'),
 ('59548225', '21385030', 4, '2007-09-30'),
 ('41644631', '05122176', 5, '2006-09-17'),
 ('72485096', '19248315', 5, '20

In [151]:
# Lookup tables built from the training split only.
itemsPerUserTrain = defaultdict(set)   # user id   -> set of recipe ids the user rated
usersPerItemTrain = defaultdict(set)   # recipe id -> set of user ids who rated it
# (user id, recipe id) -> integer rating. A plain dict on purpose: with the
# previous defaultdict(set) a lookup of an unknown pair silently inserted and
# returned an empty set, which only failed later inside the rating arithmetic.
ratingDict = {}
# (user id, recipe id) -> date string. defaultdict() without a factory behaves
# like a plain dict (KeyError on missing keys).
timeDict =  defaultdict()

for u,i,r,t in interactionsTrain:
    itemsPerUserTrain[u].add(i)
    usersPerItemTrain[i].add(u)
    timeDict[(u,i)] = t
    ratingDict[(u,i)] = r

In [180]:
# Global mean rating and per-user mean rating over ALL loaded interactions.
# NOTE(review): `interactions` also contains the rows held out as
# interactionsTest, so any validation score that falls back to userAverage /
# globalAverage has seen its own labels. That is fine for the final submission
# but looks optimistic for the validation MSE; confirm this is intended.
allRatings = []
userRatings = defaultdict(list)

# Reuse the rows already parsed into `interactions` instead of decompressing
# and parsing trainInteractions.csv.gz a second time; they are the same
# (user_id, recipe_id, rating, date) records in the same order.
for row in interactions:
    user, r = row[0], row[2]
    allRatings.append(r)
    userRatings[user].append(r)

globalAverage = sum(allRatings) / len(allRatings)
userAverage = {}
for u in userRatings:
    userAverage[u] = sum(userRatings[u]) / len(userRatings[u])

## Similarity

In [110]:
def MSE(predictions, labels):
    """Mean squared error between two parallel sequences.

    Pairs are formed with zip, so any surplus elements of the longer sequence
    are ignored. Raises ZeroDivisionError when no pairs are formed.
    """
    squaredErrorSum = 0
    pairCount = 0
    for predicted, actual in zip(predictions, labels):
        squaredErrorSum += (predicted - actual)**2
        pairCount += 1
    return squaredErrorSum / pairCount

In [60]:
# Mean rating over the training split; used elsewhere as a fallback prediction.
ratingMean = sum(rating for _, _, rating, _ in interactionsTrain) / nTrain

In [104]:
# Per-user and per-item mean rating over the training split. A key whose set
# is empty gets the overall training mean (ratingMean) instead.
userAverages = {}
itemAverages = {}

for u, ratedItems in itemsPerUserTrain.items():
    rs = [ratingDict[(u, i)] for i in ratedItems]
    userAverages[u] = sum(rs) / len(rs) if rs else ratingMean

for i, raters in usersPerItemTrain.items():
    rs = [ratingDict[(u, i)] for u in raters]
    itemAverages[i] = sum(rs) / len(rs) if rs else ratingMean

In [94]:
# Spot-check: training-split average rating of a single user.
userAverages['88348277']

4.50314465408805

In [95]:
def Pearson_shared(u1, u2):
    """Pearson correlation between two users over the items both have rated.

    Each rating is centred on its user's overall training average
    (``userAverages``); the numerator and both denominators are summed over the
    shared items only.

    Args:
        u1: id of the first user.
        u2: id of the second user.

    Returns:
        The correlation, or 0 when the users share no rated items or when
        either user's shared ratings all equal that user's average.
    """
    # Shared items come from the per-user item sets (the averages are floats
    # and have no .intersection). .get() avoids inserting empty sets for
    # unknown users when the index is a defaultdict.
    inter = itemsPerUserTrain.get(u1, set()).intersection(itemsPerUserTrain.get(u2, set()))
    if not inter:
        return 0
    uBar1 = userAverages[u1]
    uBar2 = userAverages[u2]
    numer = 0
    denom1 = 0
    denom2 = 0
    for i in inter:
        dev1 = ratingDict[(u1,i)] - uBar1
        dev2 = ratingDict[(u2,i)] - uBar2
        numer += dev1*dev2
        denom1 += dev1**2
        denom2 += dev2**2
    denom = math.sqrt(denom1) * math.sqrt(denom2)
    if denom == 0: return 0
    return numer / denom

In [105]:
def Jaccard(s1, s2):
    """Jaccard similarity |s1 ∩ s2| / |s1 ∪ s2| of two sets.

    Returns 0 when both sets are empty (the union has no elements).
    """
    sharedCount = len(s1.intersection(s2))
    unionCount = len(s1.union(s2))
    return sharedCount / unionCount if unionCount != 0 else 0

In [181]:
def predictRating(user,item):
    """Predict a rating with item-to-item collaborative filtering.

    For every other item the user rated in training, the deviation of that
    rating from the item's average is weighted by the Jaccard similarity
    between the two items' rater sets; the weighted mean deviation is added to
    the target item's average.

    Args:
        user: user id.
        item: recipe id to predict a rating for.

    Returns:
        The similarity-weighted prediction when at least one rated item has
        positive similarity; otherwise ``userAverage[user]`` if the user is
        known there; otherwise ``globalAverage``.
    """
    ratings = []
    similarities = []
    # .get() rather than [] so that scoring an unseen user or item does not
    # insert empty sets into the defaultdict indexes; the target item's rater
    # set is the same for every iteration, so it is looked up once.
    targetUsers = usersPerItemTrain.get(item, set())
    for i2 in itemsPerUserTrain.get(user, ()):
        if i2 == item: continue
        ratings.append(ratingDict[(user,i2)] - itemAverages[i2])
        similarities.append(Jaccard(targetUsers,usersPerItemTrain[i2]))
    similaritySum = sum(similarities)
    if (similaritySum > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return itemAverages[item] + sum(weightedRatings) / similaritySum
    elif user in userAverage:
        # User hasn't rated any similar items: fall back to THIS user's average
        # (the parameter `user`, not whatever a global `u` happens to hold).
        return userAverage[user]
    else:
        return globalAverage

In [182]:
# Score the held-out split with predictRating. Pairs whose user or item id was
# never indexed get the training mean instead.
# `u` and `i` are intentionally kept as module-level names: other cells read
# them after this loop has run.
labels = []
predictions = []
for d in interactionsTest:
    u, i, r = d[:3]
    labels.append(r)
    isKnownPair = u in userIDs and i in itemIDs
    pred = predictRating(u,i) if isKnownPair else ratingMean
    predictions.append(pred)

In [184]:
# Mean squared error of `predictions` against the held-out `labels`.
MSE(predictions,labels)

1.0028864300427998

In [185]:
# Write the submission file: the stub's header line is copied through, and
# every "user-item" line becomes "user-item,prediction".
# Both files are managed by `with`, so they are closed even if a prediction
# raises, and the output handle has its own name so the `predictions` list
# from the validation cell is no longer overwritten by a file object.
with open("stub_Rated.txt") as stubFile, open("predictions_Rated.txt", 'w') as predictionsFile:
    for l in stubFile:
        if l.startswith("user_id"):
            # header
            predictionsFile.write(l)
            continue
        if not l.strip():
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing on the tuple unpack below.
            continue
        u,i = l.strip().split('-')
        if u in userIDs and i in itemIDs:
            pred = predictRating(u,i)
        else:
            pred = ratingMean
        predictionsFile.write(u + '-' + i + ',' + str(pred) + '\n')

In [135]:
# Look up the stored date string for one training (user, recipe) pair.
e = timeDict[('88348277', '03969194')]
e

'2004-12-23'

In [146]:
# Parse the date string into a datetime.
# NOTE(review): the recorded output below (`23`) is not the repr of a datetime,
# so it presumably comes from an earlier version of this cell; re-run to refresh.
datetime.strptime(e,"%Y-%m-%d")

23

In [152]:
# Dump the whole (user, recipe) -> date-string mapping (very large output).
timeDict

defaultdict(None,
            {('88348277', '03969194'): '2004-12-23',
             ('86699739', '27096427'): '2002-01-12',
             ('03425965', '44197323'): '2012-10-03',
             ('73973193', '24971400'): '2008-04-09',
             ('15215209', '60170202'): '2010-10-07',
             ('75799794', '39662395'): '2012-05-02',
             ('77745222', '88709727'): '2003-12-07',
             ('80598779', '09359141'): '2007-10-10',
             ('35769308', '83909791'): '2012-02-13',
             ('31763244', '20530585'): '2010-04-30',
             ('83301801', '65313065'): '2007-03-05',
             ('59693006', '27509968'): '2014-04-07',
             ('56695330', '40843667'): '2007-03-19',
             ('88412520', '58831434'): '2012-08-09',
             ('19632272', '39371043'): '2010-03-11',
             ('94291111', '17214164'): '2009-05-20',
             ('73851538', '32120276'): '2006-02-28',
             ('79256098', '44197323'): '2008-03-26',
             ('60936089', '0

In [147]:
def decay(user,item1,item2,k):
    """Exponential time-decay weight between two of a user's interactions.

    Looks up the "%Y-%m-%d" date strings stored in ``timeDict`` for
    (user, item1) and (user, item2) and returns e ** (-k * |days apart|).
    Raises KeyError when either pair has no stored date.
    """
    dateFormat = "%Y-%m-%d"
    first, second = (
        datetime.strptime(timeDict[(user, item)], dateFormat)
        for item in (item1, item2)
    )
    deltaDays = np.abs((first - second).days)
    return math.e**(-k*deltaDays)

In [178]:
def predictTimeRating(user,item,k):
    """Item-to-item rating prediction with an extra time-decay weight.

    Every other item the user rated contributes its deviation from that item's
    average, weighted by Jaccard similarity of the rater sets times a time
    weight: ``decay(user, item, i2, k)`` when both (user, item) and (user, i2)
    have an entry in ``timeDict``, otherwise the constant ``math.e**(1)``.

    Returns ``itemAverages[item]`` plus the weighted mean deviation when the
    total weight is positive, otherwise ``ratingMean``.

    NOTE(review): timeDict appears to be filled from the training split only,
    so for a held-out (user, item) pair every time weight would be the same
    constant and cancel out of the weighted mean; confirm this is intended.
    """
    weightTotal = 0
    weightedDeviationTotal = 0
    for i2 in itemsPerUserTrain[user]:
        if i2 == item: continue
        deviation = ratingDict[(user,i2)] - itemAverages[i2]
        similarity = Jaccard(usersPerItemTrain[item],usersPerItemTrain[i2])
        bothTimed = ((user,item) in timeDict.keys()) and ((user, i2) in timeDict.keys())
        if bothTimed:
            timeWeight = decay(user,item,i2,k)
        else:
            timeWeight = math.e**(1)
        # Accumulate in iteration order, with the same operand grouping as
        # building the lists first and summing them afterwards.
        weightTotal += similarity*timeWeight
        weightedDeviationTotal += deviation*similarity*timeWeight
    if not (weightTotal > 0):
        # User hasn't rated any similar items
        return ratingMean
    return itemAverages[item] + weightedDeviationTotal / weightTotal

In [179]:
# Score the held-out split with the time-weighted predictor (decay rate
# k = 1e-11). Pairs whose user or item id was never indexed get the training
# mean instead.
labels = []
predTime = []
for d in interactionsTest:
    u, i, r = d[:3]
    labels.append(r)
    isKnownPair = u in userIDs and i in itemIDs
    pred = predictTimeRating(u,i,k=1e-11) if isKnownPair else ratingMean
    predTime.append(pred)

In [177]:
# Inspect the time-weighted predictions (prints the whole list).
# NOTE(review): the recorded values are identical to those recorded for the
# plain `predictions` list, i.e. the time weighting had no visible effect here.
predTime

[4.580722222222223,
 4.083333333333333,
 4.575425592611237,
 4.520492621208566,
 5.291630295458213,
 5.459425429739605,
 4.778741260509254,
 5.231162196679438,
 4.765477628837722,
 5.053633677450077,
 5.133333333333334,
 4.527992715709011,
 4.580722222222223,
 4.580722222222223,
 4.192143377724773,
 4.530462353274793,
 4.690502864417642,
 4.580722222222223,
 4.580722222222223,
 5.105703663529307,
 4.580722222222223,
 4.580722222222223,
 4.300202911774543,
 4.732551319648095,
 5.17284378782667,
 5.092060567779065,
 4.580722222222223,
 3.2983561682172233,
 4.702802621370971,
 4.580722222222223,
 4.532558072899178,
 4.580722222222223,
 4.580722222222223,
 4.282854460941044,
 4.580722222222223,
 4.580722222222223,
 3.513801367139982,
 4.580722222222223,
 5.220072673440549,
 4.580722222222223,
 4.580722222222223,
 3.8638330054922787,
 4.580722222222223,
 4.580722222222223,
 5.063987724711965,
 4.3780012890893385,
 4.580722222222223,
 4.723235230179284,
 4.250101110410316,
 4.475820088514842

In [172]:
# Inspect the `predictions` list (prints the whole list).
predictions

[4.580722222222223,
 4.083333333333333,
 4.575425592611237,
 4.520492621208566,
 5.291630295458213,
 5.459425429739605,
 4.778741260509254,
 5.231162196679438,
 4.765477628837722,
 5.053633677450077,
 5.133333333333334,
 4.527992715709011,
 4.580722222222223,
 4.580722222222223,
 4.192143377724773,
 4.530462353274793,
 4.690502864417642,
 4.580722222222223,
 4.580722222222223,
 5.105703663529307,
 4.580722222222223,
 4.580722222222223,
 4.300202911774543,
 4.732551319648095,
 5.17284378782667,
 5.092060567779065,
 4.580722222222223,
 3.2983561682172233,
 4.702802621370971,
 4.580722222222223,
 4.532558072899178,
 4.580722222222223,
 4.580722222222223,
 4.282854460941044,
 4.580722222222223,
 4.580722222222223,
 3.513801367139982,
 4.580722222222223,
 5.220072673440549,
 4.580722222222223,
 4.580722222222223,
 3.8638330054922787,
 4.580722222222223,
 4.580722222222223,
 5.063987724711965,
 4.3780012890893385,
 4.580722222222223,
 4.723235230179284,
 4.250101110410316,
 4.475820088514842

In [171]:
# Mean squared error of `predictions` against `labels`.
# NOTE(review): this cell's execution count (171) is lower than that of the
# cells that build `predictions` above, so the recorded value may come from an
# earlier run; re-run after Restart & Run All to refresh it.
MSE(predictions,labels)

1.0764429358798537