# Section 1 (Regression)

In [12]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
import numpy
import string
import random
from sklearn import linear_model
import dateutil.parser
import numpy as np

In [2]:
def parse(f):
    for l in gzip.open(f):
        yield eval(l)

In [3]:
# Download data from below:
# https://cseweb.ucsd.edu/classes/fa21/cse258-b/files/
dataset = list(parse("trainRecipes.json.gz"))

In [4]:
len(dataset)

200000

In [5]:
train = dataset[:150000]
valid = dataset[150000:175000]
test = dataset[175000:]

In [6]:
dataset[1]

{'name': 'double delicious cookie bars',
 'minutes': 40,
 'contributor_id': '26865936',
 'submitted': '2007-08-27',
 'steps': 'preheat oven to 350f\tin 13x9-inch baking pan , melt butter in oven\tsprinkle crumbs evenly over butter\tpour milk evenly over crumbs\ttop with remaining ingredients\tpress down firmly\tbake 25-30 minutes or until lightly browned\tcool completely , chill if desired , and cut into bars',
 'description': 'from "all time favorite recipes". for fun, try substituting butterscotch or white chocolate chips for the semi-sweet and/or peanut butter chips. make sure you cool it completely or the bottom will crumble!',
 'ingredients': ['butter',
  'graham cracker crumbs',
  'sweetened condensed milk',
  'semi-sweet chocolate chips',
  'peanut butter chips'],
 'recipe_id': '98015212'}

In [7]:
### Question 1

In [9]:
def feat1a(d):
    f = [1]
    f.append(len(d['steps']))
    return f
    

In [13]:
X_train = [feat1a(d) for d in train]
X_test = [feat1a(d) for d in test]

y_train = [d['minutes'] for d in train]
y_test = [d['minutes'] for d in test]

print("The feature vector of the first training sample:")
print(feat1a(train[0]))

model_a = np.linalg.lstsq(X_train, y_train,rcond=None)
test_predictions_a = np.array(X_test).dot(model_a[0])
MSE_test_a = ((y_test - test_predictions_a).dot(y_test - test_predictions_a))/len(y_test)

print("The MSE of the testing model a is " + str (MSE_test_a))

The feature vector of the first training sample:
[1, 743]
The MSE of the testing model a is 6252.530902729382


In [20]:
time = [d['submitted'] for d in dataset]

In [21]:
for d in dataset:
    t = dateutil.parser.parse(d['submitted'])
    d['parsed_date'] = t

In [22]:
year = []
for d in dataset:
    t = dateutil.parser.parse(d['submitted'])
    year.append(t.year)

In [23]:
minYear = min([d['parsed_date'].year for d in dataset])
maxYear = max([d['parsed_date'].year for d in dataset])

In [24]:
minYear, maxYear

(1999, 2018)

In [25]:
yearLength = 2018-1999 + 1 # One-hot encoding length

In [33]:
def feat1b(d):
    f = [1]
    mon = [0]*12
    pd = d['parsed_date']
    mon[pd.month-1] = 1
    year = [0]*yearLength
    year[pd.year - minYear] = 1
    return f + mon[:-1] + year[:-1]    

In [34]:
X_train = [feat1b(d) for d in train]
X_test = [feat1b(d) for d in test]
print("The first feature vector should look like these:")
print(X_train[0])

The first feature vector should look like these:
[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [36]:
model_b = np.linalg.lstsq(X_train, y_train,rcond=None)

In [37]:
test_predictions_b = np.array(X_test).dot(model_b[0])# predict  = theta * X
MSE_test_b = ((y_test - test_predictions_b).dot(y_test - test_predictions_b))/len(y_test)
print("The MSE of the testing model is " + str (MSE_test_b))

The MSE of the testing model is 6396.833687711815


In [83]:
ingredients = {}
for d in dataset:
    for i in d['ingredients']:
        if i not in ingredients:
            ingredients[i] = 0
        ingredients[i] += 1

In [85]:
lst = []
for i in ingredients:
    lst.append([i,ingredients[i]])
lst = sorted(lst, key=lambda x: x[1], reverse=True)
mostpopular = [i[0] for i in lst[:50]]
print(mostpopular)

['salt', 'butter', 'sugar', 'onion', 'water', 'eggs', 'olive oil', 'flour', 'milk', 'garlic cloves', 'pepper', 'brown sugar', 'garlic', 'all-purpose flour', 'baking powder', 'egg', 'salt and pepper', 'parmesan cheese', 'lemon juice', 'baking soda', 'vegetable oil', 'vanilla', 'black pepper', 'cinnamon', 'tomatoes', 'sour cream', 'garlic powder', 'vanilla extract', 'oil', 'honey', 'garlic clove', 'cream cheese', 'onions', 'celery', 'cheddar cheese', 'unsalted butter', 'mayonnaise', 'soy sauce', 'chicken broth', 'paprika', 'extra virgin olive oil', 'worcestershire sauce', 'fresh parsley', 'cornstarch', 'fresh ground black pepper', 'parsley', 'carrots', 'chili powder', 'ground cinnamon', 'bacon']


In [86]:
def feat1c(d):
    v = [0]*50
    for i in d['ingredients']:
        if i in mostpopular:
            v[mostpopular.index(i)] = 1
    return v

print(feat1c(dataset[0]))

[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [87]:
def feat(d, a = True, b = True, c = True):
    # Hint: for Questions 1 and 2, might be useful to set up a function like this
    #       which allows you to "select" which features are included
    X =[]
    y = []
    for d in dataset:
        x = []
        if a:
            x += feat1a(d)
        if b:
            x += feat1b(d)
        if c:
            x += feat1c(d)
        X.append(x)
        y.append(int(d['minutes']))
    return X, y

In [88]:
def MSE(y, ypred):
    return ((y- ypred).dot(y - ypred))/len(y_test)

In [None]:
def experiment(mod,a, b, c):
    X_train, y_train = feat(train, a, b, c)
    X_test, y_test = feat(test, a, b, c)
    mod.fit(X_train, y_train)
    y_pred = mod.predict(X_test)
    return MSE(y_test, y_pred)

In [90]:
mod = linear_model.LinearRegression()

In [91]:
### Question 2

In [None]:
experiment(mod,True,False,False)

In [92]:
MSE_ablations1 = experiment(mod,False,True,True)
print(MSE_ablations1)

46635.61203903802


In [67]:
MSE_ablations2 = experiment(mod,True,False,True)
print(MSE_ablations2)

45841.13846291681


In [68]:
MSE_ablations3 = experiment(mod,True,True,False)
print(MSE_ablations3)

48693.300065570365


In [70]:
MSE_ALL = experiment(mod,True,True,True)
print(MSE_ALL)

45743.02541593012


The most important feature is 50-dimensional binary vector since when excluding it, the MSE is much more bigger than excluding the other two features compared with the all MSE.

In [28]:
### Question 3

In [29]:
def pipeline():
    for lamb in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:
        

In [31]:
### Question 4
#(open ended)

# Section 2 (Classification)

In [103]:
### Question 5

In [104]:
X_train = [feat1c(d) for d in train]
X_test = [feat1c(d) for d in test]
y_train = ['butter' in d['ingredients'] for d in train]
y_test = ['butter' in d['ingredients'] for d in test]

In [98]:
mod = linear_model.LogisticRegression(C=1.0,class_weight='balanced')
mod.fit(X_train,y_train)
pred = mod.predict(X_test)

In [100]:
def BER(predictions, y):
    # Implement following this logic or otherwise
    TP = sum([(p and l) for (p,l) in zip(predictions, y)])
    FP = sum([(p and not l) for (p,l) in zip(predictions, y)])
    TN = sum([(not p and not l) for (p,l) in zip(predictions, y)])
    FN = sum([(not p and l) for (p,l) in zip(predictions, y)])  
    return (TP + TN) / (TP + FP + TN + FN)

In [102]:
BER(pred,y_train)

0.63708

In [34]:
def feat2(d, dict_size, mostPopularInd):
    fIng = [0] * dict_size
    for i in d['ingredients']:
        if i == 'butter':
            continue
        if i in mostPopularInd:
            fIng[mostPopularInd[i]] = 1
    return fIng

In [35]:
def experiment(reg = 1, dict_size = 50):
    # Hint: run an experiment with a particular regularization strength, and a particular one-hot encoding size
    # extract features...
    # (etc.)
    mod = linear_model.LogisticRegression(C=reg, class_weight='balanced', solver = 'lbfgs')
    # (etc.)

In [37]:
### Question 6

In [38]:
def pipeline():
    for C in [0.01, 1, 100]:
        for dsize in [50, 100, 500]:
            # Example values, can pick any others...

In [40]:
### Question 7
#(open ended)

# Section 3 (Recommender Systems)

In [None]:
### Question 8

In [71]:
# Utility data structures
ingsPerItem = defaultdict(set)
itemsPerIng = defaultdict(set)

In [72]:
for d in dataset:
    r = d['recipe_id']
    for i in d['ingredients']:
        ingsPerItem[r].add(i)
        itemsPerIng[i].add(r)

In [75]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom
    

In [105]:
def mostSimilar8(i, N):
    similarities = []
    ings = ingsPerItem[i]
    for i2 in ingsPerItem:
        if i2 == i: continue
        sim = Jaccard(ings, ingsPerItem[i2])
        similarities.append((sim,i2))
    similarities=sorted(similarities,key = lambda x:[-x[0],x[1]]
#                         ,reverse=True
                       )
    return similarities[:N]

In [106]:
item = '06432987'
n = 5
print("The five most similar recipes to the first recipe and their Jaccard similaritie are:")
mostSimilar8(item,n)

The five most similar recipes to the first recipe and their Jaccard similaritie are:


[(0.4166666666666667, '68523854'),
 (0.38461538461538464, '12679596'),
 (0.36363636363636365, '56301588'),
 (0.36363636363636365, '79675099'),
 (0.35714285714285715, '87359281')]

In [None]:
### Question 9

In [107]:
def mostSimilar9(i, N):
    similarities = []
    items = itemsPerIng[i]
    for u2 in itemsPerIng:
        if u2 == i: continue
        sim = Jaccard(items, itemsPerIng[u2])
        if sim == 1: continue
        similarities.append((sim,u2))
    similarities=sorted(similarities,key = lambda x:[-x[0],x[1]]
#                         ,reverse=True
                       )    
    return similarities[:N]

In [108]:
i = 'butter'
N = 5
print("The five most similar ingredients to butter and their Jaccard similaritie are:")
mostSimilar9(i,N)

The five most similar ingredients to butter and their Jaccard similaritie are:


[(0.22315311514274808, 'salt'),
 (0.2056685424969639, 'flour'),
 (0.19100394157199166, 'eggs'),
 (0.17882420717656095, 'sugar'),
 (0.17040052045973944, 'milk')]

In [None]:
### Question 10
#(open ended)