In [17]:
import csv
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

from collections import defaultdict
from datetime import datetime

import numpy
import scipy.optimize
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [2]:
def read_csv(path):
    rows = []
    with open(path, 'r') as file:
        csvreader = csv.reader(file)
        header = next(csvreader)
        for row in csvreader:
            rows.append(row)
    return header, rows

In [3]:
interaction_header, interactions = read_csv('archive/RAW_interactions.csv')
interactions[0]

['38094',
 '40893',
 '2003-02-17',
 '4',
 'Great with a salad. Cooked on top of stove for 15 minutes.Added a shake of cayenne and a pinch of salt.  Used low fat sour cream.  Thanks.']

In [4]:
n_interactions = len(interactions)

interactionsTrain = interactions[:int(n_interactions * 0.95)]
interactionsValid = interactions[int(n_interactions * 0.95):]

interactionsPerUser = defaultdict(list)
interactionsPerRecipe = defaultdict(list)

for user, recipe, date, rating, review in interactionsTrain:
    rating = int(rating)
    interactionsPerUser[user].append((recipe,rating,date,review))
    interactionsPerRecipe[recipe].append((user,rating,date,review))

In [5]:
recipe_header, recipes = read_csv('archive/RAW_recipes.csv')
recipes[0]

['arriba   baked winter squash mexican style',
 '137739',
 '55',
 '47892',
 '2005-09-16',
 "['60-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'side-dishes', 'vegetables', 'mexican', 'easy', 'fall', 'holiday-event', 'vegetarian', 'winter', 'dietary', 'christmas', 'seasonal', 'squash']",
 '[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]',
 '11',
 "['make a choice and proceed with recipe', 'depending on size of squash , cut into half or fourths', 'remove seeds', 'for spicy squash , drizzle olive oil or melted butter over each cut squash piece', 'season with mexican seasoning mix ii', 'for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash piece', 'season with sweet mexican spice mix', 'bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork can easily pierce the skin', 'be careful not to burn the squash especially if you opt to use sugar or butter', 'if yo

In [6]:
################
# User Feature 
################

In [7]:
user_feature_vector = {}

# Rating distribution (6)
# Rating variance (1)
# Number of reviews (1)
# Time of first review (3)
# Time of last review (3)
# Average review length (normalized) (1)
# Review frequency (per day) (1)


In [8]:
max_review_length = max([len(i[4]) for i in interactions])

# User Feature Per User
for user in interactionsPerUser.keys():
    user_interactions = interactionsPerUser[user]
    features = [0] * 16

    for recipe,rating,date,review in user_interactions:
        rating = int(rating)
        features[rating] += 1

    all_ratings = [int(i[1]) for i in user_interactions]
    variance = numpy.var(all_ratings)
    features[6] = variance

    features[7] = len(user_interactions)

    all_dates = [datetime.strptime(i[2], '%Y-%m-%d') for i in user_interactions]
    all_dates.sort()
    # Time of first review
    features[8] = all_dates[0].year
    features[9] = all_dates[0].month
    features[10] = all_dates[0].day
    # Time of last review
    features[11] = all_dates[-1].year
    features[12] = all_dates[-1].month
    features[13] = all_dates[-1].day

    all_reviews = [i[3] for i in user_interactions]
    avg_review_length = sum([len(r) for r in all_reviews]) / len(all_reviews)
    features[14] = avg_review_length / max_review_length

    day_diff = (all_dates[-1] - all_dates[0]).days
    avg_review_per_day = len(all_reviews) / day_diff if day_diff else 0
    features[15] = avg_review_per_day

    user_feature_vector[user] = features

    if None in features:
        print(features)

In [9]:
################
# Recipe Feature 
################

In [10]:
recipe_feature_vector = {}

# minutes (1)
# submitted (3)
# tag count (1)
# n_steps (1) 
# steps length (1)
# description length (1)
# n_ingredients (1)

max_step_length = max([len(r[8]) for r in recipes])
max_desc_lenth = max([len(r[9]) for r in recipes])

for r in recipes:
    recipe_id = r[1]
    features = [0] * 9

    minutes = int(r[2])
    submitted = datetime.strptime(r[4], '%Y-%m-%d')
    n_tag = len(eval(r[5]))
    n_steps = int(r[7])
    step_len = len(r[8])
    desc_len = len(r[9])
    n_ingredients = int(r[11])

    features[0] = minutes
    features[1] = submitted.year
    features[2] = submitted.month
    features[3] = submitted.day
    features[4] = n_tag
    features[5] = n_steps
    features[6] = step_len / max_step_length
    features[7] = desc_len / max_desc_lenth
    features[8] = n_ingredients

    recipe_feature_vector[recipe_id] = features

    if None in features:
            print(features)

In [11]:
def feature(user, recipe):
    return [1] + user_feature_vector[user] + recipe_feature_vector[recipe]

In [49]:
# Multinomial logistic regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, C=2, class_weight='balanced')

X_train = [feature(d[0], d[1]) for d in interactionsTrain]
y_train = [int(d[3]) for d in interactionsTrain]

# Train the model
scaler = StandardScaler()
model.fit(scaler.fit_transform(X_train), y_train)

# X_test = [feature(d[0], d[1]) for d in interactionsValid]
# y_test = [int(d[2]) for d in interactionsValid]
# y_test_pred = model.predict(X_test)  # These are the most probable classes (ratings) for each input


In [50]:
# X_test = [feature(d[0], d[1]) for d in interactionsValid]
y_test = [int(d[3]) for d in interactionsValid]
y_test_pred_lr = []

for user, recipe, date, rating, review in interactionsValid:
    if user not in user_feature_vector or recipe not in recipe_feature_vector:
        pred = 5
    else:
        pred = model.predict(scaler.fit_transform([feature(user, recipe)]))[0]
    y_test_pred_lr.append(pred)

sum(1 for x,y in zip(y_test, y_test_pred_lr) if x == y) / len(y_test)

0.7260460269520832

In [47]:
y_test_pred_base = [5] * len(y_test)
sum(1 for x,y in zip(y_test, y_test_pred_base) if x == y) / len(y_test)

0.7260460269520832

In [48]:
y_test_pred_lr == y_test_pred_base

True

In [51]:
from imblearn.over_sampling import SMOTE
from collections import Counter

print('Original dataset shape:', Counter(y_train))

smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(scaler.fit_transform(X_train), y_train)

print('Resampled dataset shape:', Counter(y_train_resampled))

model.fit(X_train_resampled, y_train_resampled)

y_test = [int(d[3]) for d in interactionsValid]
y_test_pred_lr = []

for user, recipe, date, rating, review in interactionsValid:
    if user not in user_feature_vector or recipe not in recipe_feature_vector:
        pred = 5
    else:
        pred = model.predict(scaler.fit_transform([feature(user, recipe)]))[0]
    y_test_pred_lr.append(pred)

sum(1 for x,y in zip(y_test, y_test_pred_lr) if x == y) / len(y_test)

Original dataset shape: Counter({5: 775256, 4: 178113, 0: 57952, 3: 38768, 2: 13421, 1: 12238})
Resampled dataset shape: Counter({4: 775256, 5: 775256, 0: 775256, 2: 775256, 3: 775256, 1: 775256})


0.7260460269520832