In [5]:
import gzip
from collections import defaultdict
from sklearn import linear_model
import csv
import json

In [6]:
import numpy as np
import random

In [7]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [8]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    for l in c:
        d = dict(zip(header,l))
        yield d['user_id'],d['recipe_id'],d

In [9]:
def loadCSV(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    dataset = []
    for line in c:
        d = dict(zip(header, line))
        dataset.append(d)
    return dataset

# Q1)

In [6]:
dataset = loadCSV('trainInteractions.csv.gz')

In [7]:
recipeCount = defaultdict(int)
totalCooked = 0

for user,recipe,d in readCSV("trainInteractions.csv.gz"):
    print(d)
    recipeCount[recipe] += 1
    totalCooked += 1
    break

{'user_id': '88348277', 'recipe_id': '03969194', 'date': '2004-12-23', 'rating': '5'}


In [7]:
len(dataset)

500000

In [22]:
train = dataset[:400000]
validation = dataset[400000:]

In [23]:
userPerRecipeValid = defaultdict(set)
recipePerUserValid = defaultdict(set)
recipeListValid = set([])
newValidationDataset = []

for datum in validation:
    user, recipe = datum['user_id'], datum['recipe_id']
    userPerRecipeValid[recipe].add(user)
    recipePerUserValid[user].add(recipe)
    recipeListValid.add(recipe)

recipeListValid = list(recipeListValid)
recipeListSize = len(recipeListValid)
for datum in validation:
    user, recipe = datum['user_id'], datum['recipe_id']
    newValidationDataset.append((user, recipe, 1))
    
    while True:
        index = random.randint(0, recipeListSize - 1)
        if recipeListValid[index] not in recipePerUserValid[user]:
            break
    
    newValidationDataset.append((user, recipeListValid[index], 0))

In [25]:
len(newValidationDataset)

200000

In [26]:
newValidationDataset[:10]

[('90764166', '01768679', 1),
 ('90764166', '44845944', 0),
 ('68112239', '24923981', 1),
 ('68112239', '64197312', 0),
 ('32173358', '57597698', 1),
 ('32173358', '58861769', 0),
 ('30893740', '16266088', 1),
 ('30893740', '78236641', 0),
 ('69780905', '62953151', 1),
 ('69780905', '28584315', 0)]

In [35]:
### Would-cook baseline: just rank which recipes are popular and which are not, and return '1' if a recipe is among the top-ranked

recipeCount = defaultdict(int)
totalCooked = 0

for d in train:
    user, recipe = d['user_id'], d['recipe_id']
    recipeCount[recipe] += 1
    totalCooked += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalCooked/2: break

correct_labels = 0
total_labels = 0
for d in newValidationDataset:
    user, recipe, label = d
    if recipe in return1:
        prediction = 1
    else:
        prediction = 0
    
    if prediction == label:
        correct_labels += 1
    total_labels += 1

accuracy = correct_labels/total_labels

In [36]:
accuracy

0.61539

# Q2)

In [50]:
thresholds = [0.1, 0.2, 0.3, 0.4, 0.43, 0.45, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.55, 0.6, 0.7, 0.8, 0.9]

In [51]:
### Would-cook baseline: just rank which recipes are popular and which are not, and return '1' if a recipe is among the top-ranked

recipeCount = defaultdict(int)
totalCooked = 0
best_accuracy = 0
better_threshold = 0

for d in train:
    user, recipe = d['user_id'], d['recipe_id']
    recipeCount[recipe] += 1
    totalCooked += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()

for threshold in thresholds:
    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > (totalCooked * threshold): 
            break

    correct_labels = 0
    total_labels = 0
    for d in newValidationDataset:
        user, recipe, label = d
        if recipe in return1:
            prediction = 1
        else:
            prediction = 0

        if prediction == label:
            correct_labels += 1
        total_labels += 1

    accuracy = correct_labels/total_labels
    print(threshold, accuracy)
    if best_accuracy < accuracy:
        best_accuracy = accuracy
        better_threshold = threshold

0.1 0.54484
0.2 0.580665
0.3 0.603475
0.4 0.613305
0.43 0.61458
0.45 0.614885
0.48 0.61494
0.49 0.614555
0.5 0.61443
0.51 0.613975
0.52 0.61404
0.53 0.613185
0.55 0.61228
0.6 0.60929
0.7 0.599965
0.8 0.58739
0.9 0.570195


In [52]:
print(best_accuracy)
print(better_threshold)

0.61494
0.48


# Q3)

In [37]:
userPerRecipeTrain = defaultdict(set)
recipePerUserTrain = defaultdict(set)

for datum in train:
    user, recipe = datum['user_id'], datum['recipe_id']
    userPerRecipeTrain[recipe].add(user)
    recipePerUserTrain[user].add(recipe)

In [38]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

In [39]:
def mostSimilar(recipe, N):
    similarities = []
    users = userPerRecipeTrain[recipe]
    for i2 in userPerRecipeTrain:
        if i2 == i: continue
        sim = Jaccard(users, userPerRecipeTrain[i2])

        similarities.append((sim,i2))
    similarities.sort(key=lambda x: x[0], reverse=True)

    return similarities[:N]

In [44]:
sim_thresholds = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]

In [27]:
newValidationDataset[:2]

[('90764166', '01768679', 1), ('90764166', '44845944', 0)]

In [45]:
best_accuracy = 0
best_threshold = 0
for sim_threshold in sim_thresholds:
    correct_labels = 0
    total_labels = 0
    for d in newValidationDataset:
        user, recipe, label = d
        
        prediction = 0
        for userRecipe in recipePerUserTrain[user]:
            if Jaccard(userPerRecipeTrain[recipe], userPerRecipeTrain[userRecipe]) > sim_threshold:
                prediction = 1
                break

        if prediction == label:
            correct_labels += 1
        total_labels += 1

    accuracy = correct_labels/total_labels
    print(sim_threshold, accuracy)
    if best_accuracy < accuracy:
        best_accuracy = accuracy
        best_threshold = sim_threshold

print(best_accuracy, best_threshold)

0.0 0.596115
0.1 0.521945
0.2 0.514155
0.3 0.5076
0.4 0.502215
0.5 0.498775
0.6 0.49875
0.7 0.49865
0.8 0.498645
0.9 0.498645
0.95 0.498645
0.596115 0.0


# Q4)

In [46]:
popularity_threshold = 0.48
sim_threshold = 0.1

In [51]:
### Would-cook baseline: just rank which recipes are popular and which are not, and return '1' if a recipe is among the top-ranked

recipeCount = defaultdict(int)
totalCooked = 0
best_accuracy = 0
better_threshold = 0

for d in train:
    user, recipe = d['user_id'], d['recipe_id']
    recipeCount[recipe] += 1
    totalCooked += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()


return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > (totalCooked * popularity_threshold): 
        break
        
correct_labels = 0
total_labels = 0
for d in newValidationDataset:
    user, recipe, label = d

    prediction = 0
    if recipe in return1:
        prediction = 1
    else:
        for userRecipe in recipePerUserTrain[user]:
            if Jaccard(userPerRecipeTrain[recipe], userPerRecipeTrain[userRecipe]) > sim_threshold:
                prediction = 1
                break

    if prediction == label:
        correct_labels += 1
    total_labels += 1

accuracy = correct_labels/total_labels

In [48]:
print(accuracy)

0.62531


# Q5)

In [50]:
### Would-cook baseline: just rank which recipes are popular and which are not, and return '1' if a recipe is among the top-ranked

recipeCount = defaultdict(int)
totalCooked = 0
best_accuracy = 0
better_threshold = 0

for d in train:
    user, recipe = d['user_id'], d['recipe_id']
    recipeCount[recipe] += 1
    totalCooked += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()


return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > (totalCooked * popularity_threshold): 
        break
        
correct_labels = 0
total_labels = 0

predictions = open("predictions_Made.txt", 'w')
for l in open("stub_Made.txt"):
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    user, recipe = l.strip().split('-')
    
    prediction = 0
    if recipe in return1:
        prediction = 1
    else:
        for userRecipe in recipePerUserTrain[user]:
            if Jaccard(userPerRecipeTrain[recipe], userPerRecipeTrain[userRecipe]) > sim_threshold:
                prediction = 1
                break
    predictions.write(user + '-' + recipe + ',' + str(prediction) + '\n')
    
predictions.close()

Username: vktiwari33

# Q9)

In [34]:
import numpy as np
import pandas as pd

In [18]:
import scipy
import scipy.optimize

In [123]:
from surprise import SVD, Reader, Dataset, accuracy, SVDpp
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV

In [19]:
dataset = []

In [20]:
for user,recipe,d in readCSV("trainInteractions.csv.gz"):
    d['rating'] = int(d['rating'])
    dataset.append(d)

In [21]:
train = dataset[:400000]
valid = dataset[400000:]

In [35]:
df = pd.DataFrame.from_dict(dataset)

In [37]:
temp_df = df
temp_df['user'] = temp_df['user_id']
temp_df['item'] = temp_df['recipe_id'] 
temp_df['rating'] = temp_df['rating']

In [39]:
#reader = Reader(line_format='user item date rating', sep=',')
#data = Dataset.load_from_file("trainInteractions.csv", reader=Reader(sep=','))

data = Dataset.load_from_df(temp_df[['user', 'item', 'rating']], reader=Reader(rating_scale=(1, 5)))

In [40]:
model = SVD()

In [41]:
trainset, validset = train_test_split(data, test_size=0.2)

In [42]:
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x198163bcb50>

In [43]:
predictions = model.test(validset)

In [48]:
sse = 0
for p in predictions:
    sse += (p.r_ui - p.est)**2

print(sse / len(predictions))

0.8311103780243708


In [56]:
accuracy.mse(predictions)

MSE: 0.8311


0.8311103780243715

In [57]:
accuracy.rmse(predictions)

RMSE: 0.9117


0.9116525533471463

In [66]:
predictions = open("predictions_Rated.txt", 'w')
for l in open("stub_Rated.txt"):
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    predictions.write(u + '-' + i + ',' + str(model.predict(u, i)[3]) + '\n')

predictions.close()

### Full Dataset

In [231]:
trainset, testset = train_test_split(data, test_size=0.00001)

In [130]:
model5 = SVD()

In [131]:
model5.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x198657720d0>

In [132]:
predictions = open("predictions_Rated.txt", 'w')
for l in open("stub_Rated.txt"):
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    predictions.write(u + '-' + i + ',' + str(model5.predict(u, i)[3]) + '\n')

predictions.close()

### With cross validation

In [49]:
model2 = SVD()

In [54]:
cross_validate(model2, data, measures=['RMSE'], cv=5, verbose=True,)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9015  0.9113  0.9129  0.9132  0.9082  0.9094  0.0043  
Fit time          27.92   27.59   26.55   26.87   26.68   27.12   0.53    
Test time         1.11    1.73    1.09    0.70    1.05    1.14    0.33    


{'test_rmse': array([0.90147892, 0.91133762, 0.91294679, 0.91315032, 0.90823964]),
 'fit_time': (27.915030002593994,
  27.588972091674805,
  26.55399990081787,
  26.869996547698975,
  26.67699909210205),
 'test_time': (1.1149706840515137,
  1.7300000190734863,
  1.0940017700195312,
  0.6970014572143555,
  1.0540030002593994)}

In [58]:
accuracy.mse(model2.test(validset))

MSE: 0.4951


0.49509256399893464

In [67]:
predictions = open("predictions_Rated.txt", 'w')
for l in open("stub_Rated.txt"):
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    predictions.write(u + '-' + i + ',' + str(model2.predict(u, i)[3]) + '\n')

predictions.close()

### Gridsearch for SVD params

In [213]:
param_grid = {'n_epochs': [14, 15, 16, 17, 18, 19, 20, 22, 25], 'lr_all': [0.002, 0.003, 0.004, 0.005, 0.006],
              'reg_all': [0.01, 0.05, 0.1, 0.15, 0.21]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5, joblib_verbose=5, n_jobs=-1)

In [214]:
gs.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   34.7s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 23.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 39.4min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 59.7min
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed: 85.7min
[Parallel(n_jobs=-1)]: Done 1125 out of 1125 | elapsed: 119.9min finished


In [215]:
print(gs.best_score['rmse'])

0.9034271513610657


In [216]:
print(gs.best_params['rmse'])

{'n_epochs': 25, 'lr_all': 0.003, 'reg_all': 0.21}


In [184]:
param_grid = {'n_factors': [25, 50, 75]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5, joblib_verbose=3, n_jobs=-1)

In [185]:
gs.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed:   32.0s remaining:   48.0s
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed:   57.7s remaining:   14.4s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.1min finished


In [186]:
print(gs.best_score['rmse'])

0.9048373070888835


In [174]:
print(gs.best_score['rmse'])

0.9079017760768003


In [178]:
print(gs.best_score['rmse'])

0.9064062205538119


In [187]:
print(gs.best_params['rmse'])

{'n_factors': 25}


In [162]:
print(gs.best_params['rmse'])

{'n_epochs': 20, 'lr_all': 0.004, 'reg_all': 0.21}


In [154]:
print(gs.best_params['rmse'])

{'n_epochs': 14, 'lr_all': 0.006, 'reg_all': 0.2}


In [146]:
print(gs.best_params['rmse'])

{'n_epochs': 17, 'lr_all': 0.005, 'reg_all': 0.2}


In [138]:
print(gs.best_params['rmse'])

{'n_epochs': 15, 'lr_all': 0.006, 'reg_all': 0.2}


In [111]:
print(gs.best_params['rmse'])

{'n_epochs': 10, 'lr_all': 0.007, 'reg_all': 0.1, 'n_factors': 75}


In [117]:
model3 = SVD(n_epochs=10, lr_all= 0.007, reg_all= 0.1, n_factors= 75) #3

In [139]:
model3 = SVD(n_epochs=15, lr_all= 0.007, reg_all= 0.2) #2

In [232]:
model3 = SVD(n_epochs=17, lr_all= 0.005, reg_all= 0.2) #1 0.82457

In [155]:
model3 = SVD(n_epochs=14, lr_all= 0.006, reg_all= 0.2) #? 0.82624

In [163]:
model3 = SVD(n_epochs=20, lr_all= 0.004, reg_all= 0.21) #? 0.82624 {'n_epochs': 20, 'lr_all': 0.004, 'reg_all': 0.21}

In [192]:
model3 = SVD(n_epochs=20, lr_all= 0.004, reg_all= 0.21, n_factors=25) #0.82526

In [192]:
model3 = SVD(n_epochs=19, lr_all= 0.005, reg_all= 0.2, n_factors=35) #0.82526

In [204]:
model3 = SVD(n_epochs=17, lr_all= 0.004, reg_all= 0.1, n_factors=35) #0.82605

In [205]:
model3 = SVD(n_epochs=17, lr_all= 0.004, reg_all= 0.1, n_factors=100) #0.82838

In [209]:
model3 = SVD(n_epochs=17, lr_all= 0.004, reg_all= 0.1, n_factors=25) #0.82561

In [222]:
model3 = SVD(n_epochs=25, lr_all= 0.003, reg_all= 0.21) #0.82754

In [243]:
model3 = SVD(n_epochs=18, lr_all= 0.004, reg_all= 0.21) #0.82558

In [244]:
model3.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1984890d730>

In [245]:
predictions = open("predictions_Rated.txt", 'w')
for l in open("stub_Rated.txt"):
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    predictions.write(u + '-' + i + ',' + str(model3.predict(u, i)[3]) + '\n')
predictions.close()

In [246]:
accuracy.mse(model3.test(validset))

MSE: 0.6333


0.6333360860690915

In [220]:
accuracy.mse(model3.test(validset))

MSE: 0.6291


0.6290985071368174

In [212]:
accuracy.mse(model3.test(validset))

MSE: 0.6789


0.6789254932153695

In [195]:
accuracy.mse(model3.test(validset))

MSE: 0.6534


0.6534190553028403

In [166]:
accuracy.mse(model3.test(validset))

MSE: 0.6225


0.6224816652080072

### SVD++

In [124]:
model4 = SVDpp()#(n_epochs=10, lr_all= 0.007, reg_all= 0.1, n_factors= 75)

In [126]:
model4.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x19843f92940>

In [127]:
predictions = open("predictions_Rated.txt", 'w')
for l in open("stub_Rated.txt"):
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    predictions.write(u + '-' + i + ',' + str(model4.predict(u, i)[3]) + '\n')
predictions.close()

In [128]:
accuracy.mse(model3.test(validset))

MSE: 0.8208


0.820822345991419

### Tensorflow Basic Regression

In [97]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

In [98]:
tf.__version__

'2.6.0'

In [71]:
import datetime

In [70]:
df

Unnamed: 0,user_id,recipe_id,date,rating,user,item
0,88348277,03969194,2004-12-23,5,88348277,03969194
1,86699739,27096427,2002-01-12,4,86699739,27096427
2,03425965,44197323,2012-10-03,5,03425965,44197323
3,73973193,24971400,2008-04-09,5,73973193,24971400
4,15215209,60170202,2010-10-07,5,15215209,60170202
...,...,...,...,...,...,...
499995,49948579,91427071,2008-08-23,5,49948579,91427071
499996,37419956,93818327,2006-07-03,5,37419956,93818327
499997,35102479,65083672,2012-03-27,5,35102479,65083672
499998,53665875,37059341,2008-12-13,5,53665875,37059341


In [79]:
data_df = df

In [76]:
datetime.datetime.strptime('2004-12-23', "%Y-%m-%d")

datetime.datetime(2004, 12, 23, 0, 0)

In [78]:
convert_date = np.vectorize(datetime.datetime.strptime)

In [81]:
data_df['datetime'] = convert_date(data_df['date'], '%Y-%m-%d')

In [87]:
data_df['day'] = data_df['datetime'].apply(lambda x: x.day)
data_df['month'] = data_df['datetime'].apply(lambda x: x.month)
data_df['year'] = data_df['datetime'].apply(lambda x: x.year)

In [89]:
data_df = data_df[['user_id', 'recipe_id', 'rating', 'day', 'month', 'year']]

In [91]:
train_data_df = data_df.sample(frac=0.8, random_state=0)
test_data_df = data_df.drop(train_data_df.index)

In [92]:
train_features_df = train_data_df.copy()
test_features_df = test_data_df.copy()

train_label_df = train_features_df.pop('rating')
test_label_df = test_features_df.pop('rating')

In [99]:
normalizer = tf.keras.layers.Normalization(axis=-1)

AttributeError: module 'tensorflow.keras.layers' has no attribute 'Normalization'

In [95]:
tf.__version__

'2.6.0'

In [12]:
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

In [13]:
for d in train:
    user,item = d['user_id'], d['recipe_id']
    reviewsPerUser[user].append(d)
    reviewsPerItem[item].append(d)

In [14]:
ratingMean = sum([d['rating'] for d in train]) / len(train)

In [15]:
labels = [d['rating'] for d in train]

In [16]:
N = len(train)
nUsers = len(reviewsPerUser)
nItems = len(reviewsPerItem)
users = list(reviewsPerUser.keys())
items = list(reviewsPerItem.keys())

In [135]:
alpha = ratingMean

In [136]:
userBiases = defaultdict(float)
itemBiases = defaultdict(float)

In [137]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [157]:
def prediction(user, item):
    if user not in userBiases or item not in itemBiases:
        return alpha
    return alpha + userBiases[user] + itemBiases[item]

In [139]:
def unpack(theta):
    global alpha
    global userBiases
    global itemBiases
    alpha = theta[0]
    userBiases = dict(zip(users, theta[1:nUsers+1]))
    itemBiases = dict(zip(items, theta[1+nUsers:]))

In [164]:
def cost(theta, labels, dataset, lamb):
    unpack(theta)
    predictions = [prediction(d['user_id'], d['recipe_id']) for d in dataset]
    cost = MSE(predictions, labels)
    print("MSE = " + str(cost))
    for u in userBiases:
        cost += lamb*userBiases[u]**2
    for i in itemBiases:
        cost += lamb*itemBiases[i]**2
    return cost

In [165]:
def derivative(theta, labels, dataset, lamb):
    unpack(theta)
    N = len(dataset)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    for d in dataset:
        u,i = d['user_id'], d['recipe_id']
        pred = prediction(u, i)
        diff = pred - d['rating']
        dalpha += 2/N*diff
        dUserBiases[u] += 2/N*diff
        dItemBiases[i] += 2/N*diff
    for u in userBiases:
        dUserBiases[u] += 2*lamb*userBiases[u]
    for i in itemBiases:
        dItemBiases[i] += 2*lamb*itemBiases[i]
    dtheta = [dalpha] + [dUserBiases[u] for u in users] + [dItemBiases[i] for i in items]
    return numpy.array(dtheta)

In [184]:
scipy.optimize.fmin_l_bfgs_b(cost, 
                             [alpha] + [0.0]*(nUsers+nItems),
                             derivative, 
                             args = (labels, train, 1))

MSE = 0.898807042703075
MSE = 1.4092942879265038
MSE = 0.8985950192911197
MSE = 0.8985951779117449


(array([ 4.58067353e+00, -8.58146680e-05, -8.06156759e-06, ...,
        -1.45132190e-06,  1.04630213e-06, -1.45114646e-06]),
 0.8986631878445489,
 {'grad': array([-6.61797442e-06,  2.22831645e-07,  6.45420112e-09, ...,
          6.22533785e-10,  9.15110290e-10,  6.22405709e-10]),
  'task': 'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL',
  'funcalls': 4,
  'nit': 2,
  'warnflag': 0})

In [167]:
y = []
y_pred = []
for d in valid:
    y.append(d['rating'])
    y_pred.append(prediction(d['user_id'], d['recipe_id']))

print(MSE(y_pred, y))

0.9094423694728887


# Q10)

In [168]:
largestBu = -100000
largestBuUser = None
smallestBu = 100000
smallestBuUser = None

for u in userBiases:
    if largestBu < userBiases[u]: 
        largestBu = userBiases[u]
        largestBuUser = u
    if smallestBu > userBiases[u]:
        smallestBu = userBiases[u]
        smallestBuUser = u

In [169]:
largestBi = -100000
largestBiItem = None
smallestBi = 100000
smallestBiItem = None

for i in itemBiases:
    if largestBi < itemBiases[i]: 
        largestBi = itemBiases[i]
        largestBiItem = i
    if smallestBi > itemBiases[i]:
        smallestBi = itemBiases[i]
        smallestBiItem = i

In [170]:
print(largestBuUser, smallestBuUser)

32445558 70705426


In [171]:
print(largestBiItem, smallestBiItem)

98124873 29147042


# Q11)

In [172]:
lambdas = [0.1, 0.3, 0.5, 0.7, 0.9, 1.1, 1.2]

In [173]:
min_mse = 100000
opt_lamb = None
for lamb in lambdas:
    _, mse, _ = scipy.optimize.fmin_l_bfgs_b(cost, 
                                 [alpha] + [0.0]*(nUsers+nItems),
                                 derivative, 
                                 args = (labels, valid, lamb))
    if min_mse > mse:
        min_mse = mse
        opt_lamb = lamb

MSE = 0.9041522520298038
MSE = 0.9065746152135636
MSE = 0.9041644373852212
MSE = 0.9041532582922053
MSE = 0.9041524159379389
MSE = 0.9041522845253479
MSE = 0.904152258774527
MSE = 0.9041522534442399
MSE = 0.9041522523276956
MSE = 0.9041522520932328
MSE = 0.904152252043906
MSE = 0.9041522520335278
MSE = 0.9041522520315187
MSE = 0.9041522520309915
MSE = 0.9041522520313834
MSE = 0.9041522520302345
MSE = 0.9041522520298881
MSE = 0.9041522520298035
MSE = 0.9041522520298643
MSE = 0.9041522520298143
MSE = 0.9041522520298074
MSE = 0.904152252029804
MSE = 0.9065746152135092
MSE = 0.9041555532702313
MSE = 0.9041526078893113
MSE = 0.9041523169186456
MSE = 0.9041522652635642
MSE = 0.9041522547942986
MSE = 0.9041522526107004
MSE = 0.9041522521529763
MSE = 0.9041522520565711
MSE = 0.9041522520366437
MSE = 0.9041522520322104
MSE = 0.9041522520310568
MSE = 0.9041522520313223
MSE = 0.9041522520305507
MSE = 0.9041522520301756
MSE = 0.9041522520298223
MSE = 0.9041522520298042
MSE = 0.9041522520298041
MSE

In [174]:
print(min_mse)
print(opt_lamb)

0.904152252029804
1.2


In [182]:
scipy.optimize.fmin_l_bfgs_b(cost, 
                             [alpha] + [0.0]*(nUsers+nItems),
                             derivative, 
                             args = (labels, dataset, 1.2))

MSE = 0.8987313676875054
MSE = 0.8861891996778084
MSE = 0.9367774454503509
MSE = 0.8854927942747061
MSE = 0.8896541050782545
MSE = 0.8888137440324333
MSE = 0.8888688677704336
MSE = 0.8888662342932155


(array([ 4.57210042e+00, -3.89869928e-03, -6.25858145e-04, ...,
        -1.13793643e-04,  7.19411161e-05, -1.12289754e-04]),
 0.8933592826848815,
 {'grad': array([-2.94641129e-06, -1.58627819e-07,  2.83147152e-08, ...,
          5.90888053e-09, -3.85861981e-09,  3.81020327e-09]),
  'task': 'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL',
  'funcalls': 8,
  'nit': 6,
  'warnflag': 0})

In [183]:
predictions = open("predictions_Rated.txt", 'w')
for l in open("stub_Rated.txt"):
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    predictions.write(u + '-' + i + ',' + str(prediction(u, i)) + '\n')

predictions.close()