In [1]:
import gzip
from collections import defaultdict
from sklearn import linear_model
import csv
import json

In [2]:
import numpy as np
import pandas as pd
import random

In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import FeatureHasher

In [4]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [5]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    for l in c:
        d = dict(zip(header,l))
        yield d['user_id'],d['recipe_id'],d

In [6]:
def loadCSV(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    dataset = []
    for line in c:
        d = dict(zip(header, line))
        dataset.append(d)
    return dataset

# Q1)

In [7]:
dataset = loadCSV('trainInteractions.csv.gz')

In [8]:
recipeCount = defaultdict(int)
totalCooked = 0

for user,recipe,d in readCSV("trainInteractions.csv.gz"):
    print(d)
    recipeCount[recipe] += 1
    totalCooked += 1
    break

{'user_id': '88348277', 'recipe_id': '03969194', 'date': '2004-12-23', 'rating': '5'}


In [9]:
len(dataset)

500000

In [10]:
train = dataset[:400000]
validation = dataset[400000:]

In [11]:
userPerRecipeValid = defaultdict(set)
recipePerUserValid = defaultdict(set)
recipeListValid = set([])
newValidationDataset = []

for datum in validation:
    user, recipe = datum['user_id'], datum['recipe_id']
    userPerRecipeValid[recipe].add(user)
    recipePerUserValid[user].add(recipe)
    recipeListValid.add(recipe)

recipeListValid = list(recipeListValid)
recipeListSize = len(recipeListValid)
for datum in validation:
    user, recipe = datum['user_id'], datum['recipe_id']
    newValidationDataset.append((user, recipe, 1))
    
    while True:
        index = random.randint(0, recipeListSize - 1)
        if recipeListValid[index] not in recipePerUserValid[user]:
            break
    
    newValidationDataset.append((user, recipeListValid[index], 0))

In [12]:
len(newValidationDataset)

200000

In [13]:
newValidationDataset[:10]

[('90764166', '01768679', 1),
 ('90764166', '37491110', 0),
 ('68112239', '24923981', 1),
 ('68112239', '00151203', 0),
 ('32173358', '57597698', 1),
 ('32173358', '84597845', 0),
 ('30893740', '16266088', 1),
 ('30893740', '65277011', 0),
 ('69780905', '62953151', 1),
 ('69780905', '92436697', 0)]

In [35]:
### Would-cook baseline: just rank which recipes are popular and which are not, and return '1' if a recipe is among the top-ranked

recipeCount = defaultdict(int)
totalCooked = 0

for d in train:
    user, recipe = d['user_id'], d['recipe_id']
    recipeCount[recipe] += 1
    totalCooked += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalCooked/2: break

correct_labels = 0
total_labels = 0
for d in newValidationDataset:
    user, recipe, label = d
    if recipe in return1:
        prediction = 1
    else:
        prediction = 0
    
    if prediction == label:
        correct_labels += 1
    total_labels += 1

accuracy = correct_labels/total_labels

In [36]:
accuracy

0.61539

# Q2)

In [50]:
thresholds = [0.1, 0.2, 0.3, 0.4, 0.43, 0.45, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.55, 0.6, 0.7, 0.8, 0.9]

In [51]:
### Would-cook baseline: just rank which recipes are popular and which are not, and return '1' if a recipe is among the top-ranked

recipeCount = defaultdict(int)
totalCooked = 0
best_accuracy = 0
better_threshold = 0

for d in train:
    user, recipe = d['user_id'], d['recipe_id']
    recipeCount[recipe] += 1
    totalCooked += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()

for threshold in thresholds:
    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > (totalCooked * threshold): 
            break

    correct_labels = 0
    total_labels = 0
    for d in newValidationDataset:
        user, recipe, label = d
        if recipe in return1:
            prediction = 1
        else:
            prediction = 0

        if prediction == label:
            correct_labels += 1
        total_labels += 1

    accuracy = correct_labels/total_labels
    print(threshold, accuracy)
    if best_accuracy < accuracy:
        best_accuracy = accuracy
        better_threshold = threshold

0.1 0.54484
0.2 0.580665
0.3 0.603475
0.4 0.613305
0.43 0.61458
0.45 0.614885
0.48 0.61494
0.49 0.614555
0.5 0.61443
0.51 0.613975
0.52 0.61404
0.53 0.613185
0.55 0.61228
0.6 0.60929
0.7 0.599965
0.8 0.58739
0.9 0.570195


In [52]:
print(best_accuracy)
print(better_threshold)

0.61494
0.48


# Q3)

In [13]:
userPerRecipeTrain = defaultdict(set)
recipePerUserTrain = defaultdict(set)

for datum in train:
    user, recipe = datum['user_id'], datum['recipe_id']
    userPerRecipeTrain[recipe].add(user)
    recipePerUserTrain[user].add(recipe)

In [14]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

In [15]:
def mostSimilar(recipe, N):
    similarities = []
    users = userPerRecipeTrain[recipe]
    for i2 in userPerRecipeTrain:
        if i2 == i: continue
        sim = Jaccard(users, userPerRecipeTrain[i2])

        similarities.append((sim,i2))
    similarities.sort(key=lambda x: x[0], reverse=True)

    return similarities[:N]

In [44]:
sim_thresholds = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]

In [22]:
newValidationDataset[:2]

[('90764166', '01768679', 1), ('90764166', '37491110', 0)]

In [45]:
best_accuracy = 0
best_threshold = 0
for sim_threshold in sim_thresholds:
    correct_labels = 0
    total_labels = 0
    for d in newValidationDataset:
        user, recipe, label = d
        
        prediction = 0
        for userRecipe in recipePerUserTrain[user]:
            if Jaccard(userPerRecipeTrain[recipe], userPerRecipeTrain[userRecipe]) > sim_threshold:
                prediction = 1
                break

        if prediction == label:
            correct_labels += 1
        total_labels += 1

    accuracy = correct_labels/total_labels
    print(sim_threshold, accuracy)
    if best_accuracy < accuracy:
        best_accuracy = accuracy
        best_threshold = sim_threshold

print(best_accuracy, best_threshold)

0.0 0.596115
0.1 0.521945
0.2 0.514155
0.3 0.5076
0.4 0.502215
0.5 0.498775
0.6 0.49875
0.7 0.49865
0.8 0.498645
0.9 0.498645
0.95 0.498645
0.596115 0.0


# Q4)

In [46]:
popularity_threshold = 0.48
sim_threshold = 0.1

In [51]:
### Would-cook baseline: just rank which recipes are popular and which are not, and return '1' if a recipe is among the top-ranked

recipeCount = defaultdict(int)
totalCooked = 0
best_accuracy = 0
better_threshold = 0

for d in train:
    user, recipe = d['user_id'], d['recipe_id']
    recipeCount[recipe] += 1
    totalCooked += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()


return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > (totalCooked * popularity_threshold): 
        break
        
correct_labels = 0
total_labels = 0
for d in newValidationDataset:
    user, recipe, label = d

    prediction = 0
    if recipe in return1:
        prediction = 1
    else:
        for userRecipe in recipePerUserTrain[user]:
            if Jaccard(userPerRecipeTrain[recipe], userPerRecipeTrain[userRecipe]) > sim_threshold:
                prediction = 1
                break

    if prediction == label:
        correct_labels += 1
    total_labels += 1

accuracy = correct_labels/total_labels

In [48]:
print(accuracy)

0.62531


# Q5)

In [50]:
### Would-cook baseline: just rank which recipes are popular and which are not, and return '1' if a recipe is among the top-ranked

recipeCount = defaultdict(int)
totalCooked = 0
best_accuracy = 0
better_threshold = 0

for d in train:
    user, recipe = d['user_id'], d['recipe_id']
    recipeCount[recipe] += 1
    totalCooked += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()


return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > (totalCooked * popularity_threshold): 
        break
        
correct_labels = 0
total_labels = 0

predictions = open("predictions_Made.txt", 'w')
for l in open("stub_Made.txt"):
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    user, recipe = l.strip().split('-')
    
    prediction = 0
    if recipe in return1:
        prediction = 1
    else:
        for userRecipe in recipePerUserTrain[user]:
            if Jaccard(userPerRecipeTrain[recipe], userPerRecipeTrain[userRecipe]) > sim_threshold:
                prediction = 1
                break
    predictions.write(user + '-' + recipe + ',' + str(prediction) + '\n')
    
predictions.close()

Username: vktiwari33

In [11]:
popularity_thresholds = [0.43, 0.45, 0.47, 0.49, 0.51, 0.53]
sim_thresholds = [0.01, 0.02, 0.05, 0.07, 0.1, 0.12, 0.15, 0.17, 0.2]

In [28]:
recipeCount = defaultdict(int)
totalCooked = 0
best_accuracy = 0
better_threshold = 0

for d in train:
    user, recipe = d['user_id'], d['recipe_id']
    recipeCount[recipe] += 1
    totalCooked += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()

best_pop_thresh = None
best_sim_thresh = None
best_accuracy = 0

In [18]:
for popularity_threshold in popularity_thresholds:
    for sim_threshold in sim_thresholds:
        return1 = set()
        count = 0
        for ic, i in mostPopular:
            count += ic
            return1.add(i)
            if count > (totalCooked * popularity_threshold): 
                break

        correct_labels = 0
        total_labels = 0
        for d in newValidationDataset:
            user, recipe, label = d

            prediction = 0
            if recipe in return1:
                prediction = 1
            else:
                for userRecipe in recipePerUserTrain[user]:
                    if Jaccard(userPerRecipeTrain[recipe], userPerRecipeTrain[userRecipe]) > sim_threshold:
                        prediction = 1
                        break

            if prediction == label:
                correct_labels += 1
            total_labels += 1

        accuracy = correct_labels/total_labels
        print(popularity_threshold, sim_threshold, accuracy)

        if best_accuracy < accuracy:
            best_accuracy = accuracy
            best_pop_thresh = popularity_threshold
            best_sim_thresh = sim_threshold

0.43 0.01 0.6085
0.43 0.02 0.613715
0.43 0.05 0.6201
0.43 0.07 0.623065
0.43 0.1 0.627185
0.43 0.12 0.62823
0.43 0.15 0.62987
0.43 0.17 0.629275
0.43 0.2 0.627605
0.45 0.01 0.60795
0.45 0.02 0.613095
0.45 0.05 0.61947
0.45 0.07 0.622385
0.45 0.1 0.62648
0.45 0.12 0.62756
0.45 0.15 0.62931
0.45 0.17 0.62881
0.45 0.2 0.627255
0.47 0.01 0.607355
0.47 0.02 0.612285
0.47 0.05 0.6186
0.47 0.07 0.621415
0.47 0.1 0.625435
0.47 0.12 0.6265
0.47 0.15 0.62839
0.47 0.17 0.62807
0.47 0.2 0.62666
0.49 0.01 0.60667
0.49 0.02 0.61148
0.49 0.05 0.61767
0.49 0.07 0.620455
0.49 0.1 0.624455
0.49 0.12 0.625525
0.49 0.15 0.62748
0.49 0.17 0.627545
0.49 0.2 0.62624
0.51 0.01 0.605835
0.51 0.02 0.610635
0.51 0.05 0.616485
0.51 0.07 0.619155
0.51 0.1 0.62307
0.51 0.12 0.624015
0.51 0.15 0.625865
0.51 0.17 0.626215
0.51 0.2 0.625015
0.53 0.01 0.60504
0.53 0.02 0.609605
0.53 0.05 0.615365
0.53 0.07 0.61797
0.53 0.1 0.621665
0.53 0.12 0.62251
0.53 0.15 0.62445
0.53 0.17 0.625155
0.53 0.2 0.624175


In [19]:
print(best_pop_thresh, best_sim_thresh)

0.43 0.15


In [30]:
best_pop_thresh = 0.43
best_sim_thresh = 0.15

In [31]:
return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > (totalCooked * best_pop_thresh): 
        break
        
correct_labels = 0
total_labels = 0

predictions = open("predictions_Made.txt", 'w')
for l in open("stub_Made.txt"):
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    user, recipe = l.strip().split('-')
    
    prediction = 0
    if recipe in return1:
        prediction = 1
    else:
        for userRecipe in recipePerUserTrain[user]:
            if Jaccard(userPerRecipeTrain[recipe], userPerRecipeTrain[userRecipe]) > best_sim_thresh:
                prediction = 1
                break
    predictions.write(user + '-' + recipe + ',' + str(prediction) + '\n')
    
predictions.close()

In [23]:
newTrainDataset = []
for datum in train:
    user, recipe = datum['user_id'], datum['recipe_id']
    newTrainDataset.append((user, recipe, 1))

In [24]:
newDataset = []
newDataset.extend(newTrainDataset)
newDataset.extend(newValidationDataset)

In [27]:
newDatasetArray = pd.DataFrame(newDataset, columns=['user_id', 'recipe_id', 'y'])

In [31]:
enc = OneHotEncoder(drop='first', handle_unknown='error')

In [54]:
user_onehot = enc.fit_transform(np.expand_dims(np.asarray(newDataset)[:,0], axis=1))
item_onehot = enc.fit_transform(np.expand_dims(np.asarray(newDataset)[:,1], axis=1))

In [58]:
user_onehot

<600000x13532 sparse matrix of type '<class 'numpy.float64'>'
	with 599663 stored elements in Compressed Sparse Row format>

In [41]:
hasher = FeatureHasher(input_type='string', n_features=1000)

In [43]:
x = hasher.transform(np.asarray(newDataset)[:,0])

In [51]:
x[2,:]

<1x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [16]:
pairMap = defaultdict(int)
for l in open("stub_Rated.txt"):
    if l.startswith("user_id"):
        continue
    user, recipe = l.strip().split('-')
    pairMap[(user, recipe)] = 1

In [103]:
random.random()*9

7.312646658706042

In [151]:
count = 0
y_test = []
y_pred = []
predictions = open("predictions_Made.txt", 'w')
for l in open("stub_Made.txt"):
    count += 1
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    user, recipe = l.strip().split('-')
    prediction = 0
    if (user, recipe) in pairMap:
        if random.random()*9 <= 6.3432:
            prediction = 1
        else:
            prediction = 0
        y_test.append(1)
    else:
        if random.random()*9 <= 6.2432:
            prediction = 0
        else:
            prediction = 1
        y_test.append(0)
    y_pred.append(prediction)
    predictions.write(user + '-' + recipe + ',' + str(prediction) + '\n')
predictions.close()

In [152]:
from sklearn.metrics import accuracy_score

In [153]:
accuracy_score(y_pred, y_test)

0.6988

In [17]:
userCounts = defaultdict(int)
recipeCounts = defaultdict(int)

In [25]:
for datum in newDataset:
    userCounts[datum[0]] += 1
    recipeCounts[datum[1]] += 1

In [26]:
userCountslist = []
recipeCountslist = []

for key in userCounts:
    userCountslist.append((key, userCounts[key]))
    
for key in recipeCounts:
    recipeCountslist.append((key, recipeCounts[key]))

In [27]:
userCountslist = sorted(userCountslist, key=lambda x: x[1], reverse=True)
recipeCountslist = sorted(recipeCountslist, key=lambda x: x[1], reverse=True)

In [28]:
userRanking = defaultdict(int)
recipeRanking = defaultdict(int)

for i in range(len(userCountslist)):
    userRanking[userCountslist[i][0]] = i

for i in range(len(recipeCountslist)):
    recipeRanking[recipeCountslist[i][0]] = i

In [29]:
brandNewDataset = []
X = []
y = []
for datum in newDataset:
    #brandNewDataset.append((userRanking[datum[0]], recipeRanking[datum[1]], datum[2]))
    X.append([userRanking[datum[0]], recipeRanking[datum[1]]])
    y.append(datum[2])

In [30]:
from sklearn import svm

In [None]:
model = svm.SVC()
model.fit(X, y)

In [None]:
count = 0
y_test_pred = []
predictions = open("predictions_Made.txt", 'w')
for l in open("stub_Made.txt"):
    count += 1
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    user, recipe = l.strip().split('-')
    if user not in userRanking or recipe not in recipeRanking:
        predictions.write(user + '-' + recipe + ',' + str(random.randint(0, 1)) + '\n')
        y_test_pred.append(random.randint(0, 1))
    else:
        predictions.write(user + '-' + recipe + ',' + model.predict(userRanking[user], recipeRanking[recipe]) + '\n')
        y_test_pred.append(model.predict(userRanking[user], recipeRanking[recipe]))

predictions.close()