In [1]:
import gzip
from collections import defaultdict
from sklearn import linear_model
import csv
import json

In [2]:
import numpy as np
import random

In [3]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [55]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    for l in c:
        d = dict(zip(header,l))
        yield d['user_id'],d['recipe_id'],d

In [5]:
def loadCSV(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    dataset = []
    for line in c:
        d = dict(zip(header, line))
        dataset.append(d)
    return dataset

# Q1)

In [6]:
dataset = loadCSV('trainInteractions.csv.gz')

In [7]:
len(dataset)

500000

In [22]:
train = dataset[:400000]
validation = dataset[400000:]

In [23]:
userPerRecipeValid = defaultdict(set)
recipePerUserValid = defaultdict(set)
recipeListValid = set([])
newValidationDataset = []

for datum in validation:
    user, recipe = datum['user_id'], datum['recipe_id']
    userPerRecipeValid[recipe].add(user)
    recipePerUserValid[user].add(recipe)
    recipeListValid.add(recipe)

recipeListValid = list(recipeListValid)
recipeListSize = len(recipeListValid)
for datum in validation:
    user, recipe = datum['user_id'], datum['recipe_id']
    newValidationDataset.append((user, recipe, 1))
    
    while True:
        index = random.randint(0, recipeListSize - 1)
        if recipeListValid[index] not in recipePerUserValid[user]:
            break
    
    newValidationDataset.append((user, recipeListValid[index], 0))

In [25]:
len(newValidationDataset)

200000

In [26]:
newValidationDataset[:10]

[('90764166', '01768679', 1),
 ('90764166', '44845944', 0),
 ('68112239', '24923981', 1),
 ('68112239', '64197312', 0),
 ('32173358', '57597698', 1),
 ('32173358', '58861769', 0),
 ('30893740', '16266088', 1),
 ('30893740', '78236641', 0),
 ('69780905', '62953151', 1),
 ('69780905', '28584315', 0)]

In [35]:
### Would-cook baseline: just rank which recipes are popular and which are not, and return '1' if a recipe is among the top-ranked

recipeCount = defaultdict(int)
totalCooked = 0

for d in train:
    user, recipe = d['user_id'], d['recipe_id']
    recipeCount[recipe] += 1
    totalCooked += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalCooked/2: break

correct_labels = 0
total_labels = 0
for d in newValidationDataset:
    user, recipe, label = d
    if recipe in return1:
        prediction = 1
    else:
        prediction = 0
    
    if prediction == label:
        correct_labels += 1
    total_labels += 1

accuracy = correct_labels/total_labels

In [36]:
accuracy

0.61539

# Q2)

In [50]:
thresholds = [0.1, 0.2, 0.3, 0.4, 0.43, 0.45, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.55, 0.6, 0.7, 0.8, 0.9]

In [51]:
### Would-cook baseline: just rank which recipes are popular and which are not, and return '1' if a recipe is among the top-ranked

recipeCount = defaultdict(int)
totalCooked = 0
best_accuracy = 0
better_threshold = 0

for d in train:
    user, recipe = d['user_id'], d['recipe_id']
    recipeCount[recipe] += 1
    totalCooked += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()

for threshold in thresholds:
    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > (totalCooked * threshold): 
            break

    correct_labels = 0
    total_labels = 0
    for d in newValidationDataset:
        user, recipe, label = d
        if recipe in return1:
            prediction = 1
        else:
            prediction = 0

        if prediction == label:
            correct_labels += 1
        total_labels += 1

    accuracy = correct_labels/total_labels
    print(threshold, accuracy)
    if best_accuracy < accuracy:
        best_accuracy = accuracy
        better_threshold = threshold

0.1 0.54484
0.2 0.580665
0.3 0.603475
0.4 0.613305
0.43 0.61458
0.45 0.614885
0.48 0.61494
0.49 0.614555
0.5 0.61443
0.51 0.613975
0.52 0.61404
0.53 0.613185
0.55 0.61228
0.6 0.60929
0.7 0.599965
0.8 0.58739
0.9 0.570195


In [52]:
print(best_accuracy)
print(better_threshold)

0.61494
0.48


# Q3)

In [37]:
userPerRecipeTrain = defaultdict(set)
recipePerUserTrain = defaultdict(set)

for datum in train:
    user, recipe = datum['user_id'], datum['recipe_id']
    userPerRecipeTrain[recipe].add(user)
    recipePerUserTrain[user].add(recipe)

In [38]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

In [39]:
def mostSimilar(recipe, N):
    similarities = []
    users = userPerRecipeTrain[recipe]
    for i2 in userPerRecipeTrain:
        if i2 == i: continue
        sim = Jaccard(users, userPerRecipeTrain[i2])

        similarities.append((sim,i2))
    similarities.sort(key=lambda x: x[0], reverse=True)

    return similarities[:N]

In [44]:
sim_thresholds = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]

In [45]:
best_accuracy = 0
best_threshold = 0
for sim_threshold in sim_thresholds:
    correct_labels = 0
    total_labels = 0
    for d in newValidationDataset:
        user, recipe, label = d
        
        prediction = 0
        for userRecipe in recipePerUserTrain[user]:
            if Jaccard(userPerRecipeTrain[recipe], userPerRecipeTrain[userRecipe]) > sim_threshold:
                prediction = 1
                break

        if prediction == label:
            correct_labels += 1
        total_labels += 1

    accuracy = correct_labels/total_labels
    print(sim_threshold, accuracy)
    if best_accuracy < accuracy:
        best_accuracy = accuracy
        best_threshold = sim_threshold

print(best_accuracy, best_threshold)

0.0 0.596115
0.1 0.521945
0.2 0.514155
0.3 0.5076
0.4 0.502215
0.5 0.498775
0.6 0.49875
0.7 0.49865
0.8 0.498645
0.9 0.498645
0.95 0.498645
0.596115 0.0


# Q4)

In [46]:
popularity_threshold = 0.48
sim_threshold = 0.1

In [51]:
### Would-cook baseline: just rank which recipes are popular and which are not, and return '1' if a recipe is among the top-ranked

recipeCount = defaultdict(int)
totalCooked = 0
best_accuracy = 0
better_threshold = 0

for d in train:
    user, recipe = d['user_id'], d['recipe_id']
    recipeCount[recipe] += 1
    totalCooked += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()


return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > (totalCooked * popularity_threshold): 
        break
        
correct_labels = 0
total_labels = 0
for d in newValidationDataset:
    user, recipe, label = d

    prediction = 0
    if recipe in return1:
        prediction = 1
    else:
        for userRecipe in recipePerUserTrain[user]:
            if Jaccard(userPerRecipeTrain[recipe], userPerRecipeTrain[userRecipe]) > sim_threshold:
                prediction = 1
                break

    if prediction == label:
        correct_labels += 1
    total_labels += 1

accuracy = correct_labels/total_labels

In [48]:
print(accuracy)

0.62531


# Q5)

In [50]:
### Would-cook baseline: just rank which recipes are popular and which are not, and return '1' if a recipe is among the top-ranked

recipeCount = defaultdict(int)
totalCooked = 0
best_accuracy = 0
better_threshold = 0

for d in train:
    user, recipe = d['user_id'], d['recipe_id']
    recipeCount[recipe] += 1
    totalCooked += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()


return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > (totalCooked * popularity_threshold): 
        break
        
correct_labels = 0
total_labels = 0

predictions = open("predictions_Made.txt", 'w')
for l in open("stub_Made.txt"):
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    user, recipe = l.strip().split('-')
    
    prediction = 0
    if recipe in return1:
        prediction = 1
    else:
        for userRecipe in recipePerUserTrain[user]:
            if Jaccard(userPerRecipeTrain[recipe], userPerRecipeTrain[userRecipe]) > sim_threshold:
                prediction = 1
                break
    predictions.write(user + '-' + recipe + ',' + str(prediction) + '\n')
    
predictions.close()

Username: vktiwari33

# Q9)

In [59]:
from surprise import SVD, Reader, Dataset

In [54]:
dataset = []

In [56]:
allRatings = []
userRatings = defaultdict(list)

for user,recipe,d in readCSV("trainInteractions.csv.gz"):
    dataset.append(d)
    r = int(d['rating'])
    allRatings.append(r)
    userRatings[user].append(r)

In [58]:
train = dataset[:400000]
valid = dataset[400000:]

In [67]:
reader = Reader(line_format='user_id recipe_id date rating', sep=',', skip_line1)
data = Dataset.load_from_file("trainInteractions.csv", reader=reader)

ValueError: line_format parameter is incorrect.

In [5]:
### Rating baseline: compute averages for each user, or return the global average if we've never seen the user before

allRatings = []
userRatings = defaultdict(list)

for user,recipe,d in readCSV("trainInteractions.csv.gz"):
    r = int(d['rating'])
    allRatings.append(r)
    userRatings[user].append(r)

globalAverage = sum(allRatings) / len(allRatings)
userAverage = {}
for u in userRatings:
    userAverage[u] = sum(userRatings[u]) / len(userRatings[u])

predictions = open("predictions_Rated.txt", 'w')
for l in open("stub_Rated.txt"):
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    if u in userAverage:
        predictions.write(u + '-' + i + ',' + str(userAverage[u]) + '\n')
    else:
        predictions.write(u + '-' + i + ',' + str(globalAverage) + '\n')

predictions.close()