In [1]:
# imports

In [2]:
import gzip
from collections import defaultdict
from sklearn import linear_model
import csv
import random
from sklearn.metrics import mean_squared_error

In [3]:
# Helper functions for reading the gzipped data files

In [4]:
def readGz(path):
    """Yield one evaluated Python object per line of a gzip-compressed text file.

    path: location of the gzip file; it is opened in text mode.
    Yields the result of eval() applied to each line, in file order.
    """
    # The `with` block closes the handle once the generator is exhausted or
    # closed, instead of leaving the file open.
    with gzip.open(path, 'rt') as f:
        for l in f:
            # SECURITY: eval() executes whatever expression the line contains.
            # Only use this reader on trusted files.
            yield eval(l)

def readCSV(path):
    """Yield (user_id, recipe_id, row) for every data row of a gzipped CSV file.

    path: location of the gzip-compressed CSV; the first row is the header.
    Each later row is zipped with the header into a dict of column name to
    string value, and yielded together with its 'user_id' and 'recipe_id'.
    An empty file yields nothing.
    Raises KeyError if a row has no 'user_id' or 'recipe_id' entry.
    """
    # newline='' lets the csv module do its own newline handling, as the csv
    # documentation requires; the `with` block closes the handle when the
    # generator finishes or is closed.
    with gzip.open(path, 'rt', newline='') as f:
        c = csv.reader(f)
        # A bare next(c) on an empty file would raise StopIteration inside the
        # generator, which Python turns into a RuntimeError.
        header = next(c, None)
        if header is None:
            return
        for l in c:
            d = dict(zip(header,l))
            yield d['user_id'],d['recipe_id'],d

In [5]:
# Load the interactions and build the rating data structures

In [6]:

### Rating baseline data: read every interaction, hold out the first 20000 rows,
### and collect the ratings of the remaining rows globally, per user and per recipe

# (user, recipe) pairs and their integer ratings, in file order.
urPairs = []
ratings = []
for user,recipe,d in readCSV("trainInteractions.csv.gz"):
    r = int(d['rating'])
    urPairs.append([user,recipe])
    ratings.append(r)

# The first 20000 rows are held out: 9/10 of them as the "training" slice and
# the remaining 1/10 as the "validation" slice.
trainIndex = 20000*9//10

urPairsTraining, ratingsTraining = urPairs[:trainIndex], ratings[:trainIndex]
urPairsValidation, ratingsValidation = urPairs[trainIndex:20000], ratings[trainIndex:20000]

# Every row after the held-out 20000 feeds the rating collections below.
urPairsPreTraining, ratingsPreTraining = urPairs[20000:], ratings[20000:]

allRatings = list(ratingsPreTraining)
userRatings = defaultdict(list)
recipeRatings = defaultdict(list)
for (user,recipe),r in zip(urPairsPreTraining, ratingsPreTraining):
    userRatings[user].append(r)
    recipeRatings[recipe].append(r)

In [7]:
# Global, per-user and per-recipe average ratings

In [8]:
# Mean of every rating collected in allRatings.
globalAverage = sum(allRatings) / len(allRatings)
# Mean rating per user and per recipe, keyed by their ids.
userAverage = {uid: sum(given) / len(given) for uid, given in userRatings.items()}
recipeAverage = {rid: sum(received) / len(received) for rid, received in recipeRatings.items()}

In [9]:
def weightedAverage(n1, n2, w1):
    """Blend n1 and n2: n1 is weighted by w1 and n2 by (1 - w1)."""
    complement = 1-w1
    blended = n1*w1+n2*complement
    return blended

def MSE(pred, Y):
    """Return sklearn's mean squared error of predictions `pred` against targets `Y`."""
    # Note the argument order swap: sklearn takes the true values first.
    return mean_squared_error(y_true=Y, y_pred=pred)

def predict(urPairs, weight):
    """Return a list with one predicted rating per (user, recipe) pair.

    urPairs: iterable of two-element (user, recipe) pairs.
    weight: share given to the user's average when both averages are known;
        the recipe's average gets the remaining (1 - weight).

    Fallback order: blend of both averages, then the user's average alone,
    then the recipe's average alone, then globalAverage.
    """
    def _predict_one(u, r):
        # Membership is checked against the module-level average tables.
        knows_user = u in userAverage
        knows_recipe = r in recipeAverage
        if knows_user and knows_recipe:
            return weightedAverage(userAverage[u],recipeAverage[r],weight)
        if knows_user:
            return userAverage[u]
        if knows_recipe:
            return recipeAverage[r]
        return globalAverage

    return [_predict_one(u, r) for u,r in urPairs]
            

In [10]:
def train(weight):
    """Print the MSE of predict() on urPairsTraining for the given blend weight.

    Nothing is fitted here: the function only scores `weight` against
    ratingsTraining and prints the result.
    """
    predicted = predict(urPairsTraining,weight)
    score = MSE(predicted,ratingsTraining)
    print(score)

In [11]:
# Manual sweep of the blend weight from .70 to .90 in steps of .01. Each cell
# prints the MSE on urPairsTraining; the smallest value recorded below is the
# one for weight .84.
train(.70)

0.8265626076902222


In [12]:
train(.71)

0.8256401469974267


In [13]:
train(.72)

0.8247885588963234


In [14]:
train(.73)

0.8240078433869121


In [15]:
train(.74)

0.8232980004691928


In [16]:
train(.75)

0.8226590301431655


In [17]:
train(.76)

0.8220909324088306


In [18]:
train(.77)

0.8215937072661876


In [19]:
train(.78)

0.8211673547152369


In [20]:
train(.79)

0.8208118747559782


In [21]:
train(.8)

0.8205272673884115


In [22]:
train(.81)

0.8203135326125369


In [23]:
train(.82)

0.8201706704283545


In [24]:
train(.83)

0.8200986808358641


In [25]:
train(.84)

0.8200975638350657


In [26]:
train(.85)

0.8201673194259594


In [27]:
train(.86)

0.8203079476085453


In [28]:
train(.87)

0.8205194483828234


In [29]:
train(.88)

0.8208018217487933


In [30]:
train(.89)

0.8211550677064555


In [31]:
train(.9)

0.8215791862558097


In [37]:
# Write one prediction per "user-recipe" line of the stub file.
# Both files are managed by `with`, so they are closed (and the output is
# flushed) even if a malformed line raises part-way through. The stub is opened
# first so a missing stub does not truncate an existing predictions file.
with open("stub_Rated.txt") as stub, open("predictions_Rated.txt", 'w') as predictions:
    for l in stub:
        if l.startswith("user_id"):
            # header line is copied through unchanged
            predictions.write(l)
            continue
        u,i = l.strip().split('-')
        # .84 gave the lowest MSE among the weights tried in the sweep cells above
        p = predict([[u,i]],.84)[0]
        predictions.write(u + '-' + i + ',' + str(p) + '\n')

