In [1]:
import csv

from collections import defaultdict
from datetime import datetime
import time

import numpy
from numpy import dot
from numpy.linalg import norm

from nltk.sentiment import SentimentIntensityAnalyzer

In [2]:
def read_csv(path):
    rows = []
    with open(path, 'r') as file:
        csvreader = csv.reader(file)
        header = next(csvreader)
        for row in csvreader:
            rows.append(row)
    return header, rows

def cosine_similarity(list1, list2):
    norm1 = norm(list1)
    norm2 = norm(list2)
    if norm1 == 0 or norm2 == 0:
        return 0
    return dot(list1, list2) / (norm1 * norm2)

def analyze_sentiments(reviews):
    sia = SentimentIntensityAnalyzer()
    sentiments = []

    for review in reviews:
        score = sia.polarity_scores(review)
        sentiments.append(score)

    return sentiments

def date_to_epoch_in_days(date_str):
    dt_obj = datetime.strptime(date_str, '%Y-%m-%d')

    return dt_obj.timestamp() / (60 * 60 * 24)

In [3]:
interaction_header, interactions = read_csv('archive/RAW_interactions.csv')
interaction_header

['user_id', 'recipe_id', 'date', 'rating', 'review']

In [4]:
n_interactions = len(interactions)

interactionsTrain = interactions[:int(n_interactions * 0.95)]
interactionsValid = interactions[int(n_interactions * 0.95):]

interactionsPerUser = defaultdict(list)
interactionsPerRecipe = defaultdict(list)

for user, recipe, date, rating, review in interactionsTrain:
    rating = int(rating)
    interactionsPerUser[user].append((recipe,date,rating,review))
    interactionsPerRecipe[recipe].append((user,date,rating,review))

In [5]:
all_users = interactionsPerUser.keys()
all_recipes = interactionsPerRecipe.keys()
all_reviews = [review for _,_,_,_,review in interactionsTrain]
all_sentiments = analyze_sentiments(all_reviews)

In [6]:
user_recipe_to_sentiment = {}
user_recipe_to_interaction = {}

for i in range(len(interactionsTrain)):
    user, recipe = interactionsTrain[i][0], interactionsTrain[i][1]
    sentiment = all_sentiments[i]

    user_recipe_to_sentiment[(user, recipe)] = sentiment['compound']
    user_recipe_to_interaction[(user, recipe)] = interactionsTrain[i]

In [7]:
recipe_header, recipes = read_csv('archive/RAW_recipes.csv')
recipe_header

['name',
 'id',
 'minutes',
 'contributor_id',
 'submitted',
 'tags',
 'nutrition',
 'n_steps',
 'steps',
 'description',
 'ingredients',
 'n_ingredients']

In [8]:
recipe_id_to_recipe = {}
recipes_per_tag = defaultdict(set)

for r in recipes:
    recipe_id = r[1] 
    recipe_id_to_recipe[recipe_id] = r

    tags = eval(r[5])
    for tag in tags:
        recipes_per_tag[tag].add(recipe_id)

In [9]:
################
# Model
################

In [32]:
def predict(user, recipe):
    candidate_users = [i[0] for i in interactionsPerRecipe[recipe]]
    
    recipe_id = recipe
    recipe = recipe_id_to_recipe[recipe_id]

    recipes_share_tags = set()
    for tag in eval(recipe[5]):
        recipes_share_tags = recipes_share_tags.union(recipes_per_tag[tag])
    recipes_share_tags = list(recipes_share_tags)

    avg_cos_sim_list = []
    # pred = None

    for u2 in candidate_users:
        cos_sim_list = []
        for r2 in recipes_share_tags:
            if (user, r2) not in user_recipe_to_interaction or (u2, r2) not in user_recipe_to_interaction:
                cos_sim_list.append(0)
            else:
                user_interaction = user_recipe_to_interaction[(user, r2)]
                u2_interaction = user_recipe_to_interaction[(u2, r2)]

                # [rating, sentiment, days_since_epoch]
                user_vec = [
                    int(user_interaction[3]), 
                    user_recipe_to_sentiment[(user, r2)], 
                    date_to_epoch_in_days(user_interaction[2]) / 20000 # rescale
                ]
                u2_vec = [
                    int(u2_interaction[3]), 
                    user_recipe_to_sentiment[(u2, r2)], 
                    date_to_epoch_in_days(u2_interaction[2]) / 20000 # rescale
                ]
                cos_sim_list.append(cosine_similarity(user_vec, u2_vec))

        avg_cos_sim = sum(cos_sim_list) / len(cos_sim_list)
        avg_cos_sim_list.append((avg_cos_sim, u2))

    top3_sim_users = [user for sim, user in sorted(avg_cos_sim_list, reverse=True)[:3]]
    top3_user_rating = [int(user_recipe_to_interaction[(u, recipe_id)][3]) for u in top3_sim_users]

    pred = max(top3_user_rating, key=top3_user_rating.count)

    return pred

In [33]:
################
# Validate
################

In [34]:
y_valid = [int(d[3]) for d in interactionsValid]
y_valid_pred = []

for i in interactionsValid:
    user, recipe = i[0], i[1]

    if user not in interactionsPerUser or recipe not in interactionsPerRecipe:
        pred = 5
    else:
        pred = predict(user, recipe)

    y_valid_pred.append(pred)

print(y_valid[:10])
print(y_valid_pred[:10])

sum(1 for x,y in zip(y_valid, y_valid_pred) if x == y) / len(y_valid)

# 0.7244917783782829

11
11
11
11
11
11
14
4
1
1
1
1
1
1
4
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1

0.7244917783782829

In [16]:
# Base
y_valid_base = [5] * len(y_valid)
sum(1 for x,y in zip(y_valid, y_valid_base) if x == y) / len(y_valid)

0.7260460269520832

In [6]:
################
# User Feature 
################

In [7]:
user_feature_vector = {}

# Rating distribution (6)
# Rating variance (1)
# Number of reviews (1)
# Time of first review (3)
# Time of last review (3)
# Average review length (normalized) (1)
# Review frequency (per day) (1)


In [8]:
max_review_length = max([len(i[4]) for i in interactions])

# User Feature Per User
for user in interactionsPerUser.keys():
    user_interactions = interactionsPerUser[user]
    features = [0] * 16

    for recipe,rating,date,review in user_interactions:
        rating = int(rating)
        features[rating] += 1

    all_ratings = [int(i[1]) for i in user_interactions]
    variance = numpy.var(all_ratings)
    features[6] = variance

    features[7] = len(user_interactions)

    all_dates = [datetime.strptime(i[2], '%Y-%m-%d') for i in user_interactions]
    all_dates.sort()
    # Time of first review
    features[8] = all_dates[0].year
    features[9] = all_dates[0].month
    features[10] = all_dates[0].day
    # Time of last review
    features[11] = all_dates[-1].year
    features[12] = all_dates[-1].month
    features[13] = all_dates[-1].day

    all_reviews = [i[3] for i in user_interactions]
    avg_review_length = sum([len(r) for r in all_reviews]) / len(all_reviews)
    features[14] = avg_review_length / max_review_length

    day_diff = (all_dates[-1] - all_dates[0]).days
    avg_review_per_day = len(all_reviews) / day_diff if day_diff else 0
    features[15] = avg_review_per_day

    user_feature_vector[user] = features

    if None in features:
        print(features)

In [9]:
################
# Recipe Feature 
################

In [10]:
recipe_feature_vector = {}

# minutes (1)
# submitted (3)
# tag count (1)
# n_steps (1) 
# steps length (1)
# description length (1)
# n_ingredients (1)

max_step_length = max([len(r[8]) for r in recipes])
max_desc_lenth = max([len(r[9]) for r in recipes])

for r in recipes:
    recipe_id = r[1]
    features = [0] * 9

    minutes = int(r[2])
    submitted = datetime.strptime(r[4], '%Y-%m-%d')
    n_tag = len(eval(r[5]))
    n_steps = int(r[7])
    step_len = len(r[8])
    desc_len = len(r[9])
    n_ingredients = int(r[11])

    features[0] = minutes
    features[1] = submitted.year
    features[2] = submitted.month
    features[3] = submitted.day
    features[4] = n_tag
    features[5] = n_steps
    features[6] = step_len / max_step_length
    features[7] = desc_len / max_desc_lenth
    features[8] = n_ingredients

    recipe_feature_vector[recipe_id] = features

    if None in features:
            print(features)

In [11]:
def feature(user, recipe):
    return [1] + user_feature_vector[user] + recipe_feature_vector[recipe]

In [13]:
# Multinomial logistic regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, C=1.0, class_weight='balanced')

X_train = [feature(d[0], d[1]) for d in interactionsTrain]
y_train = [int(d[3]) for d in interactionsTrain]

from imblearn.over_sampling import RandomOverSampler
from collections import Counter

scaler = StandardScaler()
ros = RandomOverSampler(random_state=42)

X_train_ros, y_train_ros = ros.fit_resample(scaler.fit_transform(X_train), y_train)

print('Original dataset shape:', Counter(y_train))
print('Resampled dataset shape:', Counter(y_train_ros))

model.fit(X_train_ros, y_train_ros)

Original dataset shape: Counter({5: 775256, 4: 178113, 0: 57952, 3: 38768, 2: 13421, 1: 12238})
Resampled dataset shape: Counter({4: 775256, 5: 775256, 0: 775256, 2: 775256, 3: 775256, 1: 775256})


In [None]:
# Multinomial logistic regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, C=2, class_weight='balanced')

X_train = [feature(d[0], d[1]) for d in interactionsTrain]
y_train = [int(d[3]) for d in interactionsTrain]

# Train the model
scaler = StandardScaler()
model.fit(scaler.fit_transform(X_train), y_train)

# X_test = [feature(d[0], d[1]) for d in interactionsValid]
# y_test = [int(d[2]) for d in interactionsValid]
# y_test_pred = model.predict(X_test)  # These are the most probable classes (ratings) for each input


In [14]:
# X_test = [feature(d[0], d[1]) for d in interactionsValid]
y_test = [int(d[3]) for d in interactionsValid]
y_test_pred_lr = []

for user, recipe, date, rating, review in interactionsValid:
    if user not in user_feature_vector or recipe not in recipe_feature_vector:
        pred = 5
    else:
        pred = model.predict(scaler.fit_transform([feature(user, recipe)]))[0]
    y_test_pred_lr.append(pred)

sum(1 for x,y in zip(y_test, y_test_pred_lr) if x == y) / len(y_test)

0.7260460269520832

In [47]:
y_test_pred_base = [5] * len(y_test)
sum(1 for x,y in zip(y_test, y_test_pred_base) if x == y) / len(y_test)

0.7260460269520832

In [48]:
y_test_pred_lr == y_test_pred_base

True

In [51]:
from imblearn.over_sampling import SMOTE
from collections import Counter

print('Original dataset shape:', Counter(y_train))

smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(scaler.fit_transform(X_train), y_train)

print('Resampled dataset shape:', Counter(y_train_resampled))

model.fit(X_train_resampled, y_train_resampled)

y_test = [int(d[3]) for d in interactionsValid]
y_test_pred_lr = []

for user, recipe, date, rating, review in interactionsValid:
    if user not in user_feature_vector or recipe not in recipe_feature_vector:
        pred = 5
    else:
        pred = model.predict(scaler.fit_transform([feature(user, recipe)]))[0]
    y_test_pred_lr.append(pred)

sum(1 for x,y in zip(y_test, y_test_pred_lr) if x == y) / len(y_test)

Original dataset shape: Counter({5: 775256, 4: 178113, 0: 57952, 3: 38768, 2: 13421, 1: 12238})
Resampled dataset shape: Counter({4: 775256, 5: 775256, 0: 775256, 2: 775256, 3: 775256, 1: 775256})


0.7260460269520832

In [52]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, C=1, class_weight='balanced')

ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(scaler.fit_transform(X_train), y_train)

print('Original dataset shape:', Counter(y_train))
print('Resampled dataset shape:', Counter(y_train_ros))

# Now use X_train_ros and y_train_ros for training your model
model.fit(X_train_ros, y_train_ros)

y_test = [int(d[3]) for d in interactionsValid]
y_test_pred_lr = []

for user, recipe, date, rating, review in interactionsValid:
    if user not in user_feature_vector or recipe not in recipe_feature_vector:
        pred = 5
    else:
        pred = model.predict(scaler.fit_transform([feature(user, recipe)]))[0]
    y_test_pred_lr.append(pred)

sum(1 for x,y in zip(y_test, y_test_pred_lr) if x == y) / len(y_test)

Original dataset shape: Counter({5: 775256, 4: 178113, 0: 57952, 3: 38768, 2: 13421, 1: 12238})
Resampled dataset shape: Counter({4: 775256, 5: 775256, 0: 775256, 2: 775256, 3: 775256, 1: 775256})


0.7260460269520832