In [1]:
#Read in Data

In [2]:
import gzip
import csv
import random
import math

def readCSV(path):
    """Stream the rows of a gzip-compressed CSV file.

    The first row is used as the header; each following row is zipped with it
    into a dict of column name -> raw string value.

    Args:
        path: Location of the gzip-compressed CSV file.

    Yields:
        (user_id, recipe_id, row) tuples, where ``row`` is the dict for that
        line and the two ids are its 'user_id' and 'recipe_id' entries.

    Raises:
        KeyError: If a row has no 'user_id' or 'recipe_id' entry.
    """
    # The context manager closes the handle when the generator finishes or is
    # closed; newline='' leaves newline handling to the csv module.
    with gzip.open(path, 'rt', newline='') as f:
        c = csv.reader(f)
        header = next(c, None)
        if header is None:
            # Empty file: yield nothing instead of letting StopIteration
            # escape from inside the generator.
            return
        for l in c:
            d = dict(zip(header, l))
            yield d['user_id'], d['recipe_id'], d

from collections import defaultdict

# Collect data from file
# Every [user, recipe] pair read from the file, in file order.
userRecipeDataFULL = []
# Labels kept parallel to userRecipeDataFULL; 1 is appended for each pair read from the file.
didCookFULL = []

# recipe id -> set of user ids, and user id -> set of recipe ids, over every row read.
usersPerRecipeTotal = defaultdict(set) 
recipesPerUserTotal = defaultdict(set)

# - read from trainInteractions.csv.gz
for u,r,_ in readCSV("trainInteractions.csv.gz"):
    # Every row of the file is recorded here as a positive example (label 1).
    userRecipeDataFULL.append([u,r])
    usersPerRecipeTotal[r].add(u)
    recipesPerUserTotal[u].add(r)
    didCookFULL.append(1)
    

In [3]:
# - randomly sample non-cooked item for each U,I pair and add

In [4]:
# Candidate pool for negative sampling: every recipe id seen in the file.
recipes = list(usersPerRecipeTotal)

# For each positive pair, draw one recipe the same user has NOT interacted with
# and record it as a negative example (label 0).
# NOTE(review): no random seed is set in the visible code, so the sampled
# negatives (and the shuffle below) differ between runs — confirm this is intended.
newUserRecipeData = []
for u,r in userRecipeDataFULL:
    newRecipe = None
    # Re-draw until the recipe is outside the user's own set. This loop cannot
    # terminate for a user whose set already contains every entry of `recipes`.
    while newRecipe == None or newRecipe in recipesPerUserTotal[u]:
        newRecipe = random.choice(recipes)
    newUserRecipeData.append([u,newRecipe])
    # Labels stay aligned: the 0s are appended after all existing 1s, and the
    # negative pairs are appended after all positive pairs just below.
    didCookFULL.append(0)
userRecipeDataFULL.extend(newUserRecipeData)


# Shuffle pairs and labels with one shared permutation so they stay aligned.
randomIndices = list(range(len(userRecipeDataFULL)))
random.shuffle(randomIndices)


userRecipeData_Suffled = []
didCook_Suffled = []

for ri in randomIndices:
    userRecipeData_Suffled.append(userRecipeDataFULL[ri])
    didCook_Suffled.append(didCookFULL[ri])
    
# Rebind the FULL names to the shuffled copies; re-running this cell without
# re-running the loading cell would add a second batch of negatives.
userRecipeDataFULL=userRecipeData_Suffled
didCookFULL=didCook_Suffled


In [5]:
#smaller portion for processor speed

In [6]:
# Number of (shuffled) pairs kept for the experiments below; the trailing
# comment shows the expression that would keep everything.
size = 10000 #len(userRecipeDataFULL)

# First `size` pairs and their labels (slicing yields fewer if less data exists).
userRecipeData = userRecipeDataFULL[:size]
didCook = didCookFULL[:size]

In [7]:
# - split into train and validation 

In [8]:

# 90% of the subset is used for training, the remaining 10% for validation.
TrainEnd = round(len(userRecipeData)*.9)
ValidationEnd = len(userRecipeData)

# Pairs and labels are sliced with the same bounds so they stay aligned.
userRecipeTrain = userRecipeData[:TrainEnd]
didCookTrain = didCook[:TrainEnd]
userRecipeValidation = userRecipeData[TrainEnd:ValidationEnd]
didCookValidation = didCook[TrainEnd:ValidationEnd]

In [9]:
# Create HelperDicts for training

In [10]:


# (user, recipe) -> label; unknown pairs read as 0 (and are inserted on lookup,
# as with any defaultdict).
cookedDict = defaultdict(lambda:0)
# recipe id -> set of user ids / user id -> set of recipe ids, over all pairs below.
usersPerRecipe = defaultdict(set) 
recipesPerUser = defaultdict(set) 


# NOTE(review): these helpers are filled from userRecipeDataFULL / didCookFULL,
# i.e. positives AND sampled negatives, including the pairs (and labels) that
# fall in the validation slice — confirm this is intended, since features
# derived from these dicts can then see validation labels.
for i, (u,r) in enumerate(userRecipeDataFULL):
    usersPerRecipe[r].add(u)
    recipesPerUser[u].add(r)
    cookedDict[u,r] = didCookFULL[i]
    
# Snapshots of the keys present at this point.
recipesList = list(usersPerRecipe) 
userList =list(recipesPerUser)

In [11]:
# Jaccard similarity

In [12]:
def jacardSimilarity(s1,s2):
    """Jaccard similarity |s1 ∩ s2| / |s1 ∪ s2| of two sets.

    Args:
        s1: A set.
        s2: A set (or any iterable accepted by ``set.union``).

    Returns:
        The ratio as a float, or 0.0 when both inputs are empty (the original
        raised ZeroDivisionError in that case).
    """
    unionSize = len(s1.union(s2))
    if unionSize == 0:
        # Two empty sets share nothing; treat them as having no similarity.
        return 0.0
    return len(s1.intersection(s2)) / unionSize

In [13]:
# Intended memo tables for pairwise Jaccard similarities (recipes / users).
# Nothing active in this notebook currently reads or writes them.
jaccardSimilarityR_Cache = {}
jaccardSimilarityU_Cache = {}


def jaccardSimilarityRecipe(r1,r2):
    """Jaccard similarity of two recipes, compared by their user sets.

    The user sets are looked up in ``usersPerRecipe``. The value is recomputed
    on every call; ``jaccardSimilarityR_Cache`` is not consulted.
    """
    firstUsers = usersPerRecipe[r1]
    secondUsers = usersPerRecipe[r2]
    return jacardSimilarity(firstUsers, secondUsers)

def jaccardSimilarityUser(u1,u2):
    """Jaccard similarity of two users, compared by their recipe sets.

    The recipe sets are looked up in ``recipesPerUser``. The value is
    recomputed on every call; ``jaccardSimilarityU_Cache`` is not consulted.
    """
    firstRecipes = recipesPerUser[u1]
    secondRecipes = recipesPerUser[u2]
    return jacardSimilarity(firstRecipes, secondRecipes)


In [14]:
# Cosine similarity

In [15]:
def cosineSimilarityR(r1, r2):
    """Cosine similarity between two recipes using the labels in cookedDict.

    Each recipe is treated as a vector indexed by the users in
    ``usersPerRecipe``; the dot product runs over users present for both.

    Returns:
        numerator / (norm1 * norm2), or 0 when either norm is zero.
    """
    sqNorm1 = sum(cookedDict[(u, r1)] ** 2 for u in usersPerRecipe[r1])
    sqNorm2 = sum(cookedDict[(u, r2)] ** 2 for u in usersPerRecipe[r2])
    scale = math.sqrt(sqNorm1) * math.sqrt(sqNorm2)

    # A recipe whose recorded labels are all 0 has no direction to compare.
    if scale == 0:
        return 0

    dot = 0
    for u in usersPerRecipe[r1] & usersPerRecipe[r2]:
        dot += cookedDict[(u, r1)] * cookedDict[(u, r2)]
    return dot / scale

def cosineSimilarityU(u1, u2):
    """Cosine similarity between two users using the labels in cookedDict.

    Each user is treated as a vector indexed by the recipes in
    ``recipesPerUser``; the dot product runs over recipes present for both.

    Returns:
        numerator / (norm1 * norm2), or 0 when either norm is zero.
    """
    sqNorm1 = sum(cookedDict[(u1, r)] ** 2 for r in recipesPerUser[u1])
    sqNorm2 = sum(cookedDict[(u2, r)] ** 2 for r in recipesPerUser[u2])
    scale = math.sqrt(sqNorm1) * math.sqrt(sqNorm2)

    # A user whose recorded labels are all 0 has no direction to compare.
    if scale == 0:
        return 0

    dot = 0
    for r in recipesPerUser[u1] & recipesPerUser[u2]:
        dot += cookedDict[(u1, r)] * cookedDict[(u2, r)]
    return dot / scale

In [16]:
# Intended memo tables for pairwise cosine similarities (recipes / users).
# Nothing active in this notebook currently reads or writes them.
cosineSimilarityR_Cache = {}
cosineSimilarityU_Cache = {}


def cosineSimilarityRecipe(r1,r2):
    """Cosine similarity of two recipes; thin wrapper over cosineSimilarityR.

    The value is recomputed on every call; ``cosineSimilarityR_Cache`` is not
    consulted.
    """
    similarity = cosineSimilarityR(r1, r2)
    return similarity

def cosineSimilarityUser(u1,u2):
    """Cosine similarity of two users; thin wrapper over cosineSimilarityU.

    The value is recomputed on every call; ``cosineSimilarityU_Cache`` is not
    consulted.
    """
    similarity = cosineSimilarityU(u1, u2)
    return similarity

In [17]:
# Pearson Similarity 

In [18]:
def pearsonSimilarityRecipe(r1, r2):
    """Pearson-style correlation between two recipes.

    Each recipe's mean is taken as (size of its user set) / (number of keys in
    recipesPerUser). Deviations from those means are accumulated only over
    users present in both recipes' user sets.

    Returns:
        covariance / (sqrt(dev1) * sqrt(dev2)), or 0 when that product is zero
        (which includes the case of no shared users).
    """
    mean1 = len(usersPerRecipe[r1]) / len(recipesPerUser)
    mean2 = len(usersPerRecipe[r2]) / len(recipesPerUser)

    covariance = sqDev1 = sqDev2 = 0
    for u in usersPerRecipe[r1].intersection(usersPerRecipe[r2]):
        dev1 = cookedDict[(u, r1)] - mean1
        dev2 = cookedDict[(u, r2)] - mean2
        covariance += dev1 * dev2
        sqDev1 += dev1 ** 2
        sqDev2 += dev2 ** 2

    scale = math.sqrt(sqDev1) * math.sqrt(sqDev2)
    if scale == 0:
        return 0
    return covariance / scale

def pearsonSimilarityUser(u1, u2):
    """Pearson-style correlation between two users.

    Deviations from each user's mean are accumulated only over recipes present
    in both users' recipe sets.

    Fix: the original loops bound ``u`` but the loop bodies read ``r``, so they
    used whatever global ``r`` happened to exist (or raised NameError) instead
    of the shared recipes. The loop variable is now ``r``.

    Returns:
        covariance / (sqrt(dev1) * sqrt(dev2)), or 0 when that product is zero
        (which includes the case of no shared recipes).
    """
    # NOTE(review): the means divide by len(recipesPerUser) (the number of
    # users), exactly as the recipe variant does. For a per-user mean the
    # number of recipes (len(usersPerRecipe)) may have been intended — confirm.
    # Left unchanged here to avoid silently altering the feature values.
    iBar1 = len(recipesPerUser[u1])/len(recipesPerUser)
    iBar2 = len(recipesPerUser[u2])/len(recipesPerUser)
    inter = recipesPerUser[u1].intersection(recipesPerUser[u2])

    numer = denom1 = denom2 = 0
    for r in inter:
        dev1 = cookedDict[(u1, r)] - iBar1
        dev2 = cookedDict[(u2, r)] - iBar2
        numer += dev1 * dev2
        denom1 += dev1 ** 2
        denom2 += dev2 ** 2

    denom = math.sqrt(denom1) * math.sqrt(denom2)
    if denom == 0: return 0
    return numer / denom
   

In [19]:
# popularity 

In [20]:

# recipe id -> number of users recorded for it in usersPerRecipe.
popularityRecipe = {}
for r in usersPerRecipe:
    popularityRecipe[r] = len(usersPerRecipe[r])
    
# Normalise by the largest count so every popularity lies in (0, 1].
# max() raises ValueError here if usersPerRecipe is empty.
maxPop = max(popularityRecipe.values())
popularitiesTotal = 0
for r in popularityRecipe:
    popularityRecipe[r] = popularityRecipe[r]/maxPop
    popularitiesTotal += popularityRecipe[r]

# Mean normalised popularity, used as the fallback for unknown recipes/users.
avgPopularity = popularitiesTotal/len(popularityRecipe)


In [21]:

def extractPopularityFeaturesRecipe(u,r):
    """Popularity feature for recipe ``r``.

    Args:
        u: User id; not used by this feature.
        r: Recipe id.

    Returns:
        The normalised popularity stored for ``r``, or ``avgPopularity`` when
        the recipe has no entry in ``popularityRecipe``.
    """
    return popularityRecipe[r] if r in popularityRecipe else avgPopularity


def extractPopularityFeaturesUser(u,r):
    """Mean popularity of the recipes recorded for user ``u``.

    Args:
        u: User id.
        r: Recipe id; not used by this feature.

    Returns:
        The average of ``popularityRecipe`` over ``recipesPerUser[u]``, or
        ``avgPopularity`` when the user has no recorded recipes.
    """
    total = 0
    count = 0
    for recipeId in recipesPerUser[u]:
        total += popularityRecipe[recipeId]
        count += 1
    if count == 0:
        return avgPopularity
    return total / count

In [22]:
# Extract Features for Similarity funcs

In [23]:
# Extract Feature Functions

def _meanSimilarity(anchor, others, similarityFn):
    """Average ``similarityFn(anchor, other)`` over ``others``.

    Shared implementation for the six extractSimilarityFeatures_* functions,
    which previously repeated this loop verbatim.

    Args:
        anchor: The user or recipe being scored.
        others: Iterable of users/recipes to compare the anchor against.
        similarityFn: Two-argument similarity function.

    Returns:
        The mean similarity, or 0 when ``others`` is empty.
    """
    # NOTE(review): when the scored (user, recipe) pair is itself recorded in
    # the helper dicts, ``others`` contains the anchor, so the anchor is also
    # compared with itself — confirm that is intended.
    scores = [similarityFn(anchor, other) for other in others]
    if len(scores) == 0:
        return 0
    return sum(scores) / len(scores)


def extractSimilarityFeatures_JaccardRecipe(user,recipe):
    """Mean Jaccard similarity between ``recipe`` and each recipe recorded for ``user``."""
    return _meanSimilarity(recipe, recipesPerUser[user], jaccardSimilarityRecipe)


def extractSimilarityFeatures_JaccardUser(user,recipe):
    """Mean Jaccard similarity between ``user`` and each user recorded for ``recipe``."""
    return _meanSimilarity(user, usersPerRecipe[recipe], jaccardSimilarityUser)


def extractSimilarityFeatures_CosineRecipe(user,recipe):
    """Mean cosine similarity between ``recipe`` and each recipe recorded for ``user``."""
    return _meanSimilarity(recipe, recipesPerUser[user], cosineSimilarityRecipe)


def extractSimilarityFeatures_CosineUser(user,recipe):
    """Mean cosine similarity between ``user`` and each user recorded for ``recipe``."""
    return _meanSimilarity(user, usersPerRecipe[recipe], cosineSimilarityUser)


def extractSimilarityFeatures_PearsonRecipe(user,recipe):
    """Mean Pearson similarity between ``recipe`` and each recipe recorded for ``user``."""
    return _meanSimilarity(recipe, recipesPerUser[user], pearsonSimilarityRecipe)


def extractSimilarityFeatures_PearsonUser(user,recipe):
    """Mean Pearson similarity between ``user`` and each user recorded for ``recipe``."""
    return _meanSimilarity(user, usersPerRecipe[recipe], pearsonSimilarityUser)


In [24]:
def accuracy(pred,y):
    """Fraction of predictions that equal their label.

    Args:
        pred: Sequence of predicted labels (anything supporting ``len``).
        y: Sequence of true labels, expected to be the same length as ``pred``.

    Returns:
        matches / len(pred), or 0.0 when ``pred`` is empty (the original
        raised ZeroDivisionError).
    """
    total = len(pred)
    # len() rather than truthiness, so array-like inputs work as well.
    if total == 0:
        return 0.0
    # Pairs are compared position by position; zip stops at the shorter input
    # while the divisor is always len(pred).
    matches = sum(p == truth for p, truth in zip(pred, y))
    return matches / total

In [25]:
from sklearn.linear_model import LogisticRegression

In [26]:
# get features

In [27]:
def extractFeatures(urPairs):
    """Build one list of feature rows per enabled feature function.

    Args:
        urPairs: Sequence of (user, recipe) pairs.

    Returns:
        A list with one entry per enabled feature function; each entry is a
        list of ``[1, value]`` rows (constant term plus the feature value), in
        the same order as ``urPairs``.
    """
    # Enable or disable a feature by (un)commenting its entry.
    enabledFeatures = [
        #extractSimilarityFeatures_JaccardRecipe,
        #extractSimilarityFeatures_JaccardUser,
        extractSimilarityFeatures_CosineRecipe,
        #extractSimilarityFeatures_CosineUser,
        #extractSimilarityFeatures_PearsonRecipe,
        #extractSimilarityFeatures_PearsonUser,
        #extractPopularityFeaturesUser,
        #extractPopularityFeaturesRecipe
    ]

    return [
        [[1, featureFn(u, r)] for (u, r) in urPairs]
        for featureFn in enabledFeatures
    ]

In [28]:
# Per-feature input rows for the training pairs (one list per enabled feature).
featureVectors = extractFeatures(userRecipeTrain)

In [29]:
# Ensemble
def trainEnsemble(featureVectors, Y):
    """Fit one logistic regression per feature, then a second-stage model on their predictions.

    Args:
        featureVectors: List of per-feature row lists, as produced by
            extractFeatures; must contain at least one entry.
        Y: Labels aligned with the rows of every entry.

    Returns:
        (featureModels, ensembleFeatures, ensembleModel, ensemblePredictions):
        the per-feature models, the second-stage input rows
        ``[1, pred_0, pred_1, ...]``, the second-stage model, and its
        predictions on those same training rows.
    """
    featureModels = []
    trainPredictions = []
    for rows in featureVectors:
        baseModel = LogisticRegression()
        baseModel.fit(rows, Y)
        featureModels.append(baseModel)
        trainPredictions.append(baseModel.predict(rows))

    # One second-stage row per sample: a constant 1 followed by each base
    # model's prediction for that sample.
    # NOTE(review): LogisticRegression fits its own intercept by default, so
    # the constant column looks redundant — confirm it is wanted.
    ensembleFeatures = [
        [1] + [preds[i] for preds in trainPredictions]
        for i in range(len(trainPredictions[0]))
    ]

    ensembleModel = LogisticRegression()
    ensembleModel.fit(ensembleFeatures, Y)
    return featureModels, ensembleFeatures, ensembleModel, ensembleModel.predict(ensembleFeatures)
    

In [30]:
def predictWithEnsemble(featureVectors,featureModels,ensembleModel):
    """Score new rows with already-fitted per-feature models and the second-stage model.

    Args:
        featureVectors: List of per-feature row lists; entry ``k`` is fed to
            ``featureModels[k]``. Must contain at least one entry.
        featureModels: Fitted per-feature models exposing ``predict``.
        ensembleModel: Fitted second-stage model exposing ``predict``.

    Returns:
        (featureModels, ensembleFeatures, predictions): the models passed in,
        the second-stage input rows ``[1, pred_0, pred_1, ...]``, and the
        second-stage predictions.
    """
    basePredictions = [
        featureModels[k].predict(rows) for k, rows in enumerate(featureVectors)
    ]

    # One second-stage row per sample: a constant 1 followed by each base
    # model's prediction for that sample.
    ensembleFeatures = [
        [1] + [preds[i] for preds in basePredictions]
        for i in range(len(basePredictions[0]))
    ]

    return featureModels, ensembleFeatures, ensembleModel.predict(ensembleFeatures)

In [31]:
# Train Model

In [32]:
# Sanity check: number of per-feature row lists (one per enabled feature function).
len(featureVectors)

1

In [33]:
# Fit the per-feature models and the second-stage model on the training split,
# then report accuracy on that same training data.
featureModels,ensembleFeatures,ensembleModel,ensemblePredictions = trainEnsemble(featureVectors,didCookTrain)
print("training accuracy:",accuracy(ensemblePredictions,didCookTrain))

training accuracy: 0.8877777777777778


In [34]:
# Test on Validation Set

In [35]:
# Build features for the held-out pairs and score them with the models fitted
# on the training split.
featureVectorsV = extractFeatures(userRecipeValidation)
featureModels_V, ensembleFeatures_V, ensemblePredictions_V = predictWithEnsemble(featureVectorsV,featureModels,ensembleModel)
# This is the validation score; the original label wrongly said "training accuracy".
print("validation accuracy:",accuracy(ensemblePredictions_V,didCookValidation))

training accuracy: 0.902


In [36]:
# Extract Stub User/Item Pairs

# [user, recipe] pairs to predict, in the order they appear in the stub file.
stubUserRecipe = []

# Context manager so the stub file is closed after reading (the original
# iterated over open(...) directly and never closed the handle).
with open("stub_Made.txt") as stubFile:
    for l in stubFile:
        if l.startswith("user_id"):
            # Skip the header line.
            continue
        # Each data line is split into user and recipe on '-'.
        u, r = l.strip().split('-')
        stubUserRecipe.append([u, r])

In [None]:
# Predict the stub pairs: extract their features, then score them with the
# per-feature models and second-stage model fitted earlier.
featureVectorsS = extractFeatures(stubUserRecipe)
featureModels_S, ensembleFeaturesf_S, predictions_S = predictWithEnsemble(featureVectorsS,featureModels,ensembleModel)

In [None]:
# Save Stub Predictions to File
# The context manager closes (and flushes) the file even if a write fails;
# the original only closed it on the success path.
with open("predictions_Made.txt", 'w') as predictionsFile:
    predictionsFile.write("user_id-recipe_id,prediction\n")

    # One line per stub pair, in stub order, with its predicted label.
    for i,(u,r) in enumerate(stubUserRecipe):
        predictionsFile.write(u + '-' + r + ","+str(predictions_S[i])+"\n")