In [1]:
import gzip
from collections import defaultdict

import random
import numpy as np
import csv
# import tensorflow as tf
# from surprise import SVD, Reader, Dataset

In [2]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

def readCSV(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    for l in c:
        d = dict(zip(header,l))
        yield d['user_id'],d['recipe_id'],d

Split the date into Training and Validation sets

In [3]:
data = []
recipes = []
user_item = defaultdict(list)

for user,recipe,d in readCSV("trainInteractions.csv.gz"):
    data.append(d)
    recipes.append(recipe)
    user_item[user].append(recipe)

In [4]:
trainsize = 400000
train = data[:trainsize]
valid = data[trainsize:]

In [5]:
user_item_train = defaultdict(set)
item_user_train = defaultdict(set)
user_item_valid = defaultdict(list)
for d in train:
    user,item = d['user_id'],d['recipe_id']
    user_item_train[item].add(user)#
    item_user_train[user].add(item)
for d in valid:
    user,item = d['user_id'],d['recipe_id']
    user_item_valid[user].append(item)

In [6]:
valid_new = defaultdict(dict)
recipes = list(set(recipes))
randomed = set()
for d in valid:
    user = d['user_id']
    recipe = d['recipe_id']
    valid_new[user][recipe]=1
    while True:
        random_int = random.randint(0,len(recipes)-1)
        random_recipe = recipes[random_int]
        if random_recipe not in user_item[user] and random_recipe not in randomed:
            valid_new[user][random_recipe] = 0
            randomed.add(random_recipe)
            break

In [7]:
pos = 0
neg = 0
cnt =0
for user in valid_new:
    for recipe in valid_new[user]:
        cnt +=1
        if valid_new[user][recipe] == 1:
            pos+=1
        if valid_new[user][recipe] == 0:
            neg +=1
cnt, pos,neg

(200000, 100000, 100000)

### Q1

In [8]:
### Would-cook baseline: just rank which recipes are popular and which are not, and return '1' if a recipe is among the top-ranked

recipeCount = defaultdict(int)
totalCooked = 0

for user,recipe,_ in readCSV("trainInteractions.csv.gz"):
  recipeCount[recipe] += 1
  totalCooked += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
  count += ic
  return1.add(i)
  if count > totalCooked/2: break #只取前50%的


In [9]:
cnt = 0
correct = 0
for user in valid_new:
    for recipe in valid_new[user]:
        cnt +=1
        if recipe in return1 and valid_new[user][recipe] == 1:
            correct +=1
        if recipe not in return1 and valid_new[user][recipe] == 0:
            correct +=1
accuracy1 = correct/cnt
print("The accuracy of the baseline model is: " + str(accuracy1))

The accuracy of the baseline model is: 0.69481


### Q2

In [11]:
def adjustThreshold(start,end,step):
    res = []

    for t in np.arange(start,end,step):
        return2 = set()
        count = 0
        for ic, i in mostPopular:
            count += ic
            return2.add(i)
            if count > totalCooked*t: break
        
        cnt = 0
        correct = 0
        for user in valid_new:
            for recipe in valid_new[user]:
                cnt +=1
                if recipe in return2 and valid_new[user][recipe] == 1:
                    correct +=1
                if recipe not in return2 and valid_new[user][recipe] == 0:
                    correct +=1
        
        accuracy = correct/cnt
        
        res.append((t,accuracy))           
    return res

In [25]:
res = adjustThreshold(start=0.1,end=0.7,step=0.01)


In [26]:
res = sorted(res,key=lambda x: x[1], reverse=True)
print("The best threshold is: "+str(res[0][0])[:5])
print("The accuracy is: "+ str(res[0][1])[:5])

The best threshold is: 0.639
The accuracy is: 0.713


In [27]:
res

[(0.6399999999999997, 0.71349),
 (0.6499999999999997, 0.71313),
 (0.6599999999999997, 0.712805),
 (0.6299999999999997, 0.712545),
 (0.6699999999999997, 0.712525),
 (0.6199999999999998, 0.71184),
 (0.6799999999999997, 0.71179),
 (0.6899999999999997, 0.711205),
 (0.6099999999999998, 0.71088),
 (0.5999999999999998, 0.71013),
 (0.5899999999999997, 0.709305),
 (0.5799999999999997, 0.708395),
 (0.5699999999999997, 0.707415),
 (0.5599999999999997, 0.705755),
 (0.5499999999999998, 0.70398),
 (0.5399999999999998, 0.70253),
 (0.5299999999999998, 0.700995),
 (0.5199999999999998, 0.69909),
 (0.5099999999999998, 0.696865),
 (0.4999999999999998, 0.69481),
 (0.48999999999999977, 0.692545),
 (0.47999999999999976, 0.69032),
 (0.46999999999999986, 0.68824),
 (0.45999999999999985, 0.68572),
 (0.44999999999999984, 0.68306),
 (0.43999999999999984, 0.68053),
 (0.4299999999999998, 0.6778),
 (0.4199999999999998, 0.67495),
 (0.4099999999999998, 0.672095),
 (0.3999999999999998, 0.6689),
 (0.3899999999999999, 0.

### Q3

In [9]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

In [10]:
user_item_sim = defaultdict(dict) 
for user in valid_new:
    cooked_recipes = item_user_train[user]
    for g in valid_new[user]:
        max_sim = -1
        for g2 in cooked_recipes:
            if g2 == g: continue
            sim = Jaccard(user_item_train[g], user_item_train[g2])
            max_sim = max(max_sim,sim)
        user_item_sim[user][g] = max_sim

In [127]:
# user_item_sim['90764166']

In [34]:
cnt3 = 0
correct3 = 0
for threshold3 in np.arange(0.01,0.02,0.001):
    for user in valid_new:
        for recipe in valid_new[user]:
            cnt3 +=1
            if  user_item_sim[user][recipe] > threshold3 and valid_new[user][recipe] == 1:
                correct3 +=1
            if user_item_sim[user][recipe] <= threshold3 and valid_new[user][recipe] == 0:
                correct3 +=1
    accuracy3 = correct3/cnt3
    print(threshold3,accuracy3)
# print("The accuracy of the baseline model is: " + str(accuracy3))

0.01 0.59448
0.011 0.5946225
0.011999999999999999 0.59463
0.012999999999999998 0.59441875
0.013999999999999997 0.594132
0.014999999999999996 0.5938141666666666
0.015999999999999993 0.5934292857142857
0.016999999999999994 0.593015625
0.017999999999999995 0.5925566666666666
0.018999999999999993 0.5920215


In [123]:
def adjustThreshold3(start,end,step):
    cnt3 = 0
    correct3 = 0
    res = []
    for threshold3 in np.arange(start,end,step):
        for user in valid_new:
            for recipe in valid_new[user]:
                cnt3 +=1
                if  user_item_sim[user][recipe] > threshold3 and valid_new[user][recipe] == 1:
                    correct3 +=1
                if user_item_sim[user][recipe] <= threshold3 and valid_new[user][recipe] == 0:
                    correct3 +=1
        accuracy3 = correct3/cnt3
        res.append((threshold3,accuracy3))
    return res

In [28]:
print(" The performance on my validation set: ")

 The performance on my validation set: 


In [126]:
adjustThreshold3(0.01,0.02,0.001)

[(0.01, 0.59378),
 (0.011, 0.59382),
 (0.011999999999999999, 0.5937766666666666),
 (0.012999999999999998, 0.59354125),
 (0.013999999999999997, 0.593237),
 (0.014999999999999996, 0.5929091666666667),
 (0.015999999999999993, 0.5925142857142857),
 (0.016999999999999994, 0.592118125),
 (0.017999999999999995, 0.5916794444444444),
 (0.018999999999999993, 0.5911385)]

### Q4

In [11]:
def combine(t1,t2):
    return4 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return4.add(i)
        if count > totalCooked*t1: break
            
    cnt4 = 0
    correct4 = 0
    for user in valid_new:
        for recipe in valid_new[user]:
            cnt4 +=1
            if (user_item_sim[user][recipe] > t2 and recipe in return4) and valid_new[user][recipe] == 1:
                correct4 +=1
            if (user_item_sim[user][recipe] <= t2 and recipe not in return4) and valid_new[user][recipe] == 0:
                correct4 +=1
    
    accuracy4 = correct4/cnt4
    return accuracy4        

In [None]:
accuracys = []
for t1 in np.arange(0,1,0.01):
    for t2 in np.arange(0,0.5,0.01):
        res = combine(t1,t2)
        accuracys.append((t1,t2,res))

In [None]:
print("The performance of the combined model on validation set is shown below:")
accuracys

In [29]:
lst = sorted(accuracys,key=lambda x: x[2], reverse=True)


NameError: name 'accuracys' is not defined

### Q5

In [17]:
t1 = 0.73
t2 = 0.012
return5 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return5.add(i)
    if count > totalCooked*t1: break

In [18]:
predictions = open("predictions_Made.txt", 'w')
for l in open("stub_Made.txt"):
    if l.startswith("user_id"):
    #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    cooked_recipes = item_user_train[u]
    max_sim = -1
    for g2 in cooked_recipes:
        if g2 == i: continue
        sim = Jaccard(user_item_train[i], user_item_train[g2])
        max_sim = max(max_sim,sim)

    if i in return5 and max_sim > 0.012:
        predictions.write(u + '-' + i + ",1\n")
    else:
        predictions.write(u + '-' + i + ",0\n")
predictions.close()

In [19]:
print("My Kaggle user name is: Ashley9988")

My Kaggle user name is: Ashley9988
