In [1]:
import gzip
import csv
import random
import numpy
import math
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn import linear_model

In [2]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [3]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    for l in c:
        d = dict(zip(header,l))
        yield d['user_id'],d['recipe_id'],d

In [4]:
def getHeader(path):
    f = gzip.open(path, 'rt', encoding = "utf8")
    header = f.readline()
    header = header.strip().split('\t')
    header = header[0].split(',')
    return header

In [5]:
def parse(f):
    for l in gzip.open(f):
        yield eval(l)

In [6]:
dataset = list(parse("trainRecipes.json.gz"))

In [7]:
len(dataset)

200000

In [8]:
data = []
train = []
valid = []

In [9]:
for user,recipe,d in readCSV("trainInteractions.csv.gz"):
    data.append(d)

In [10]:
header = getHeader("trainInteractions.csv.gz")

In [11]:
train = data[:400000]
valid = data[400000:]

Useful precalculation

In [12]:
recipeCount = defaultdict(int)
totalCooked = 0
userSet = set()
recipeSet = set()
userPerRecipe = defaultdict(set)
recipePerUser = defaultdict(set)
ratingDict = {}

In [13]:
for d in train:
    user,recipe = d['user_id'], d['recipe_id']
    recipeCount[recipe] += 1
    totalCooked += 1
    userSet.add(user)
    recipeSet.add(recipe)
    userPerRecipe[recipe].add(user)
    recipePerUser[user].add(recipe)
    ratingDict[(user,recipe)] = int(d['rating'])

In [14]:
userAverages = {}
recipeAverages = {}

for u in recipePerUser:
    rs = [ratingDict[(u,i)] for i in recipePerUser[u]]
    if len(rs) == 0:
        userAverages[u] = 0
        continue
    userAverages[u] = sum(rs) / len(rs)
    
for i in userPerRecipe:
    rs = [ratingDict[(u,i)] for u in userPerRecipe[i]]
    if len(rs) == 0:
        recipeAverages[i] = 0
        continue
    recipeAverages[i] = sum(rs) / len(rs)

Jaccard

In [15]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

Cosine

In [16]:
def Cosine(i1, i2):
    # Between two items
    inter = userPerRecipe[i1].intersection(userPerRecipe[i2])
    numer = 0
    denom1 = 0
    denom2 = 0
    for u in inter:
        numer += ratingDict[(u,i1)]*ratingDict[(u,i2)]
    for u in userPerRecipe[i1]:
        denom1 += ratingDict[(u,i1)]**2
    for u in userPerRecipe[i2]:
        denom2 += ratingDict[(u,i2)]**2
    denom = math.sqrt(denom1) * math.sqrt(denom2)
    if denom == 0: return 0
    return numer / denom

Pearson Similarity

In [17]:
def Pearson(i1, i2):
    # Between two items
    if i1 in recipeAverages:
        iBar1 = recipeAverages[i1]
    else:
        iBar1 = 0
    if i2 in recipeAverages:
        iBar2 = recipeAverages[i2]
    else:
        iBar2 = 0
    inter = userPerRecipe[i1].intersection(userPerRecipe[i2])
    numer = 0
    denom1 = 0
    denom2 = 0
    for u in inter:
        numer += (ratingDict[(u,i1)] - iBar1)*(ratingDict[(u,i2)] - iBar2)
    for u in inter: #usersPerItem[i1]:
        denom1 += (ratingDict[(u,i1)] - iBar1)**2
    #for u in usersPerItem[i2]:
        denom2 += (ratingDict[(u,i2)] - iBar2)**2
    denom = math.sqrt(denom1) * math.sqrt(denom2)
    if denom == 0: return 0
    return numer / denom

In [None]:
# def Q3():
#     thresholds = numpy.arange(-0.5, 0.6, 0.1)
#     res = []
#     for threshold in thresholds:
#         correct = 0
#         for d in valid:
#             similarities = [0]
#             userCooked = recipePerUser[d['user_id']]
#             for recipe in userCooked:
#                 similarities.append(Pearson(d['recipe_id'], recipe))
                
#             if max(similarities) > threshold:
#                 correct += (d['rating'] != '0')

#         res.append(correct/len(valid))
#         print(correct/len(valid))
#         print(threshold)

#     plt.plot(thresholds, res, 'b-')
#     plt.xlabel('Threshold')
#     plt.ylabel('Accuracy for different thresholds')
#     plt.show()
#     return res

In [None]:
# res = Q3()

In [None]:
# print("Accuracy on Validation set with similarity threshold ", 0 + res.index(max(res)) * 0.1, " is")
# print(max(res))

Sim and pop on valid set

In [None]:
# def Q4():
    
#     pop_threshold = 1.77
#     sim_threshold = 0
    
#     mostPopular = [(recipeCount[x], x) for x in recipeCount]
#     mostPopular.sort()
#     mostPopular.reverse()

#     return1 = set()
#     count = 0
#     for ic, i in mostPopular:
#         count += ic
#         return1.add(i)
#         if count > totalCooked / pop_threshold:
#             break

#     correct = 0
#     for d in valid:
#         similarities = [0]
#         userCooked = recipePerUser[d['user_id']]
#         for recipe in userCooked:
#             similarities.append(Pearson(d['recipe_id'], recipe))

#         if max(similarities) > sim_threshold or d['recipe_id'] in return1:
#             correct += (d['rating'] != '0')
            
# #         if max(similarities) > sim_threshold:
# #             correct += (d['rating'] != '0')
    
#     print("Accuracy on Validation set with \npopularity threshold ", pop_threshold, 
#           " (totalCooked / threshold) \nand similarity threshold ", sim_threshold, " is")
#     print(correct/len(valid))

In [None]:
# Q4()

In [None]:
# Utility data structures
ingsPerItem = defaultdict(set)
itemsPerIng = defaultdict(set)

In [None]:
for d in dataset:
    r = d['recipe_id']
    for i in d['ingredients']:
        ingsPerItem[r].add(i)
        itemsPerIng[i].add(r)

In [None]:
def Q4():
    
    pop_threshold = 1.77
    sim_threshold = 0.012
    
    mostPopular = [(recipeCount[x], x) for x in recipeCount]
    mostPopular.sort()
    mostPopular.reverse()

    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > totalCooked / pop_threshold:
            break

    correct = 0
    for d in valid:
        similarities = [0]
        ings = ingsPerItem[d['recipe_id']]
        for i2 in ingsPerItem:
            if i2 == i: continue
            
            sim = Jaccard(ings, ingsPerItem[i2])
            similarities.append(sim)

        if max(similarities) > sim_threshold or d['recipe_id'] in return1:
            correct += (d['rating'] != '0')
    
    print("Accuracy on Validation set with \npopularity threshold ", pop_threshold, 
          " (totalCooked / threshold) \nand similarity threshold ", sim_threshold, " is")
    print(correct/len(valid))

In [None]:
Q4()

In [None]:
def Q6():
    
    pop_threshold = 1.77
    sim_threshold = 0
    
    recipeCount = defaultdict(int)
    
    for d in data:
        user,recipe = d['user_id'], d['recipe_id']
        recipeCount[recipe] += 1
    
    mostPopular = [(recipeCount[x], x) for x in recipeCount]
    mostPopular.sort()
    mostPopular.reverse()

    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > totalCooked / pop_threshold:
            break

    correct = 0
    for d in valid:
        similarities = [0]
        similaritiesIngre = [0]
        
        if d['user_id'] in recipePerUser:
            userCooked = recipePerUser[d['user_id']]
            for recipe in userCooked:
                similarities.append(Pearson(d['recipe_id'], recipe))
        

        if max(similarities) > sim_threshold or d['recipe_id'] in return1:
            correct += (d['rating'] != '0')
    
    print("Accuracy on Validation set with \npopularity threshold ", pop_threshold, 
          " (totalCooked / threshold) \nand similarity threshold ", sim_threshold, " is")
    
    print(correct/len(valid))   

Kaggle user name: ZiangX\
Score: 0.67190

In [20]:
def Q5():
    
    pop_threshold = 1.77
    sim_threshold = 0
    
    recipeCount = defaultdict(int)
    
    for d in data:
        user,recipe = d['user_id'], d['recipe_id']
        recipeCount[recipe] += 1
    
    mostPopular = [(recipeCount[x], x) for x in recipeCount]
    mostPopular.sort()
    mostPopular.reverse()

    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > totalCooked / pop_threshold:
            break               
                
    predictions = open("predictions_Made1.77.txt", 'w')
    for l in open("stub_Made.txt"):
        if l.startswith("user_id"):
            predictions.write(l)
            continue
        u,i = l.strip().split('-')
        similarities = [0]
        
        if u in recipePerUser:
            for recipe in recipePerUser[u]:
                similarities.append(Pearson(i, recipe))
        
        if max(similarities) > sim_threshold or i in return1:
            predictions.write(u + '-' + i + ",1\n")
        else:
            predictions.write(u + '-' + i + ",0\n")

    predictions.close()

In [21]:
Q5()

In [None]:
def base():
    recipeCount = defaultdict(int)
    totalCooked = 0

    for user,recipe,_ in readCSV("trainInteractions.csv.gz"):
        recipeCount[recipe] += 1
        totalCooked += 1

    mostPopular = [(recipeCount[x], x) for x in recipeCount]
    mostPopular.sort()
    mostPopular.reverse()

    return1 = set()
    count = 0
    for ic, i in mostPopular:
      count += ic
      return1.add(i)
      if count > totalCooked/1.88: break

    predictions = open("predictions_Made25.txt", 'w')
    for l in open("stub_Made.txt"):
      if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
      u,i = l.strip().split('-')
      if i in return1:
        predictions.write(u + '-' + i + ",1\n")
      else:
        predictions.write(u + '-' + i + ",0\n")

    predictions.close()

In [None]:
base()