In [230]:
import csv
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

from collections import defaultdict
from datetime import datetime

import numpy
import scipy.optimize
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler


In [105]:
def read_csv(path):
    rows = []
    with open(path, 'r') as file:
        csvreader = csv.reader(file)
        header = next(csvreader)
        for row in csvreader:
            rows.append(row)
    return header, rows

In [106]:
interaction_header, interactions = read_csv('../archive/RAW_interactions.csv')
interactions[0]

['38094',
 '40893',
 '2003-02-17',
 '4',
 'Great with a salad. Cooked on top of stove for 15 minutes.Added a shake of cayenne and a pinch of salt.  Used low fat sour cream.  Thanks.']

In [107]:
n_interactions = len(interactions)

interactionsTrain = interactions[:int(n_interactions * 0.95)]
interactionsValid = interactions[int(n_interactions * 0.95):]

interactionsPerUser = defaultdict(list)
interactionsPerRecipe = defaultdict(list)

for user, recipe, date, rating, review in interactionsTrain:
    rating = int(rating)
    interactionsPerUser[user].append((recipe,rating,date,review))
    interactionsPerRecipe[recipe].append((user,rating,date,review))

In [108]:
recipe_header, recipes = read_csv('../archive/RAW_recipes.csv')
recipes[0]

['arriba   baked winter squash mexican style',
 '137739',
 '55',
 '47892',
 '2005-09-16',
 "['60-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'side-dishes', 'vegetables', 'mexican', 'easy', 'fall', 'holiday-event', 'vegetarian', 'winter', 'dietary', 'christmas', 'seasonal', 'squash']",
 '[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]',
 '11',
 "['make a choice and proceed with recipe', 'depending on size of squash , cut into half or fourths', 'remove seeds', 'for spicy squash , drizzle olive oil or melted butter over each cut squash piece', 'season with mexican seasoning mix ii', 'for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash piece', 'season with sweet mexican spice mix', 'bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork can easily pierce the skin', 'be careful not to burn the squash especially if you opt to use sugar or butter', 'if yo

In [109]:
################
# User Feature 
################

In [110]:
user_feature_vector = {}

# Rating distribution (6)
# Rating variance (1)
# Number of reviews (1)
# Time of first review (3)
# Time of last review (3)
# Average review length (normalized) (1)
# Review frequency (per day) (1)


In [111]:
max_review_length = max([len(i[4]) for i in interactions])

# User Feature Per User
for user in interactionsPerUser.keys():
    user_interactions = interactionsPerUser[user]
    features = [0] * 16

    for recipe,rating,date,review in user_interactions:
        rating = int(rating)
        features[rating] += 1

    all_ratings = [int(i[1]) for i in user_interactions]
    variance = numpy.var(all_ratings)
    features[6] = variance

    features[7] = len(user_interactions)

    all_dates = [datetime.strptime(i[2], '%Y-%m-%d') for i in user_interactions]
    all_dates.sort()
    # Time of first review
    features[8] = all_dates[0].year
    features[9] = all_dates[0].month
    features[10] = all_dates[0].day
    # Time of last review
    features[11] = all_dates[-1].year
    features[12] = all_dates[-1].month
    features[13] = all_dates[-1].day

    all_reviews = [i[3] for i in user_interactions]
    avg_review_length = sum([len(r) for r in all_reviews]) / len(all_reviews)
    features[14] = avg_review_length / max_review_length

    day_diff = (all_dates[-1] - all_dates[0]).days
    avg_review_per_day = len(all_reviews) / day_diff if day_diff else 0
    features[15] = avg_review_per_day

    user_feature_vector[user] = features

    if None in features:
        print(features)

In [112]:
################
# Recipe Feature 
################

In [113]:
recipe_feature_vector = {}

# minutes (1)
# submitted (3)
# tag count (1)
# n_steps (1) 
# steps length (1)
# description length (1)
# n_ingredients (1)

max_step_length = max([len(r[8]) for r in recipes])
max_desc_lenth = max([len(r[9]) for r in recipes])

for r in recipes:
    recipe_id = r[1]
    features = [0] * 9

    minutes = int(r[2])
    submitted = datetime.strptime(r[4], '%Y-%m-%d')
    n_tag = len(eval(r[5]))
    n_steps = int(r[7])
    step_len = len(r[8])
    desc_len = len(r[9])
    n_ingredients = int(r[11])

    features[0] = minutes
    features[1] = submitted.year
    features[2] = submitted.month
    features[3] = submitted.day
    features[4] = n_tag
    features[5] = n_steps
    features[6] = step_len / max_step_length
    features[7] = desc_len / max_desc_lenth
    features[8] = n_ingredients

    recipe_feature_vector[recipe_id] = features

    if None in features:
            print(features)

In [114]:
def feature(user, recipe):
    return [1] + user_feature_vector[user] + recipe_feature_vector[recipe]

In [141]:
# Multinomial logistic regression model

X_train = [feature(d[0], d[1]) for d in interactionsTrain]
y_train = [int(d[3]) for d in interactionsTrain]




In [231]:
# model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, C=2, class_weight='balanced')
model = Ridge()

In [232]:

# Train the model
scaler = StandardScaler()
transformed_train_X = scaler.fit_transform(X_train)
# transformed_train_X[:,0] = numpy.ones(len(transformed_train_X)) 
model.fit(transformed_train_X, y_train)

# X_test = [feature(d[0], d[1]) for d in interactionsValid]
# y_test = [int(d[2]) for d in interactionsValid]
# y_test_pred = model.predict(X_test)  # These are the most probable classes (ratings) for each input

In [233]:
a = transformed_train_X[0:1]
a

array([[ 0.        , -0.39205845, -0.3007464 , -0.40093401, -0.43588856,
        -0.45630743, -0.40763083, -0.56172569, -0.44949165, -1.27394596,
        -0.11631279, -0.6618398 , -2.33329588, -1.06543961, -0.0945539 ,
        -0.55800889, -0.32878167, -0.00416058, -1.05448726,  0.79132227,
         0.61877672, -0.85906063, -0.96720524, -0.79839288, -0.94345562,
         0.01053673]])

In [234]:
b = numpy.array([[ 0.        , -0.39205845, -0.3007464 , -0.40093401, -0.43588856,
        -0.45630743, -0.40763083, -0.56172569, -0.44949165, -1.27394596,
        -0.11631279, -0.6618398 , -2.33329588, -1.06543961, -0.0945539 ,
        -0.55800889, -0.32878167, -0.00416058, -1.05448726,  0.79132227,
         0.61877672, -0.85906063, -0.96720524, -0.79839288, -0.94345562,
         0.01053673]])
model.predict(b)

array([4.79888364])

In [229]:
a==b

array([[ True, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False]])

In [235]:
model.predict(transformed_train_X[0:1])

array([4.79888364])

In [236]:
train_preds = model.predict(transformed_train_X)
(numpy.clip(numpy.round(train_preds), 0, 5) == y_train).mean()

0.4602118711817266

In [204]:
y_train[:10]

[4, 5, 4, 5, 5, 4, 0, 2, 4, 5]

In [205]:
train_preds [:10]

array([4.79878217, 3.97047722, 4.728212  , 4.93987399, 4.80087692,
       4.72313768, 4.16548774, 3.96354634, 4.49810094, 4.52483004])

In [237]:
# X_test = [feature(d[0], d[1]) for d in interactionsValid]
y_test = [int(d[3]) for d in interactionsValid]
y_test_pred_lr = []

for user, recipe, date, rating, review in interactionsValid:
    if user not in user_feature_vector or recipe not in recipe_feature_vector:
        pred = 5
    else:
        transformed = scaler.transform([feature(user, recipe)])
        transformed[0] = 1 # bias term
        pred = model.predict(transformed)[0]
    y_test_pred_lr.append(pred)

# sum(1 for x,y in zip(y_test, y_test_pred_lr) if x == y) / len(y_test)
(numpy.clip(numpy.round(y_test_pred_lr), 0, 5) == y_test).mean()

0.24046698104876454

In [154]:
for user, recipe, date, rating, review in interactionsValid[:3]:
  display(rating)  
  display( (scaler.transform([feature(user, recipe)])))

'5'

array([[ 0.        , -0.12286057, -0.3007464 , -0.13236927,  0.66957265,
         1.18251579,  3.5273818 , -0.59953443,  3.31007899, -0.74052138,
        -0.95344617, -1.58930651,  1.4842169 ,  1.19417764, -0.99410289,
         0.47841405,  1.5404121 , -0.00421243,  2.04561596, -0.66368648,
        -1.20491032, -1.25757757, -0.27780187,  0.02414115,  0.51318233,
        -0.26065834]])

'5'

array([[ 0.        , -0.05556109, -0.3007464 , -0.40093401, -0.09574665,
         0.31758131,  2.57396464, -0.61920167,  2.31513839, -0.20709681,
        -0.11631279,  0.72936028,  1.4842169 ,  1.4766298 , -0.20699752,
        -0.48444463,  1.27492607, -0.00421243,  2.04561596, -0.66368648,
        -1.20491032, -1.25757757, -0.27780187,  0.02414115,  0.51318233,
        -0.26065834]])

'4'

array([[ 0.00000000e+00, -2.57459506e-01, -3.00746403e-01,
        -4.00934015e-01, -1.80782125e-01,  4.61624064e+00,
         1.52347228e+00, -4.76943812e-01,  2.05722858e+00,
         5.96154829e-02, -3.95357252e-01, -5.45906458e-01,
         3.93498964e-01, -5.00535296e-01, -2.06997520e-01,
        -1.43771003e-01,  2.34913049e+00, -4.21243272e-03,
         2.04561596e+00, -6.63686484e-01, -1.20491032e+00,
        -1.25757757e+00, -2.77801874e-01,  2.41411491e-02,
         5.13182329e-01, -2.60658342e-01]])

In [160]:
a = numpy.array([[ 1.        , -0.25745951, -0.3007464 , -0.40093401, -0.43588856,
        -0.47581723, -0.39029597,  0.38529547, -0.43521996,  0.85975235,
         1.55795395, -0.42997312, -0.42453949,  1.4766298 ,  1.02988234,
        -0.2302076 , -0.12338489, -0.00416058, -1.05448726,  0.79132227,
         0.61877672, -0.85906063, -0.96720524, -0.79839288, -0.94345562,
         0.01053673]])
model.predict(a)

array([1.12588168])

In [155]:
for user, recipe, date, rating, review in interactionsTrain[:3]:
  display(rating)  
  display( (scaler.transform([feature(user, recipe)])))

'4'

array([[ 0.        , -0.39205845, -0.3007464 , -0.40093401, -0.43588856,
        -0.45630743, -0.40763083, -0.56172569, -0.44949165, -1.27394596,
        -0.11631279, -0.6618398 , -2.33329588, -1.06543961, -0.0945539 ,
        -0.55800889, -0.32878167, -0.00416058, -1.05448726,  0.79132227,
         0.61877672, -0.85906063, -0.96720524, -0.79839288, -0.94345562,
         0.01053673]])

'5'

array([[ 0.        , -0.25745951, -0.3007464 , -0.40093401, -0.43588856,
        -0.47581723, -0.39029597,  0.38529547, -0.43521996,  0.85975235,
         1.55795395, -0.42997312, -0.42453949,  1.4766298 ,  1.02988234,
        -0.2302076 , -0.12338489, -0.00416058, -1.05448726,  0.79132227,
         0.61877672, -0.85906063, -0.96720524, -0.79839288, -0.94345562,
         0.01053673]])

'4'

array([[ 0.        , -0.39205845, -0.3007464 , -0.40093401, -0.39337082,
        -0.3652617 , -0.427277  , -0.46483316, -0.45153046, -1.27394596,
         1.27890949,  1.30902697, -2.60597537,  0.34682117,  0.46766422,
        -0.65212199, -0.15242293, -0.00421321, -1.05448726,  1.08232402,
         1.30265937,  3.12610877, -0.7948544 , -0.85459086, -1.07114791,
        -1.34543864]])

In [152]:
y_test_pred_lr[0:10]

[-578832465.332674,
 -578832465.332674,
 -578832465.332674,
 -578832465.332674,
 -578832465.332674,
 -578832465.332674,
 -578832465.332674,
 -578832465.332674,
 -578832465.332674,
 -578832465.332674]

In [117]:
y_test_pred_base = [5] * len(y_test)
sum(1 for x,y in zip(y_test, y_test_pred_base) if x == y) / len(y_test)

0.7260460269520832

In [118]:
y_test_pred_lr == y_test_pred_base

True

In [119]:
from imblearn.over_sampling import SMOTE
from collections import Counter

print('Original dataset shape:', Counter(y_train))

smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(scaler.fit_transform(X_train), y_train)

print('Resampled dataset shape:', Counter(y_train_resampled))

model.fit(X_train_resampled, y_train_resampled)

y_test = [int(d[3]) for d in interactionsValid]
y_test_pred_lr = []

for user, recipe, date, rating, review in interactionsValid:
    if user not in user_feature_vector or recipe not in recipe_feature_vector:
        pred = 5
    else:
        pred = model.predict(scaler.fit_transform([feature(user, recipe)]))[0]
    y_test_pred_lr.append(pred)

sum(1 for x,y in zip(y_test, y_test_pred_lr) if x == y) / len(y_test)

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (c:\Users\Lenovo\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py)