In [1]:
# Mounting Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [38]:
# Importing Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances

In [40]:
recipes = pd.read_csv('/content/drive/MyDrive/Semester-2/Recommendation System/Assignment_2/RAW_recipes.csv')

In [41]:
recipe_reviews_df = pd.read_csv('/content/drive/MyDrive/Semester-2/Recommendation System/Assignment_2/interactions_train.csv')  # or any other method of loading the data
user_reviews = recipe_reviews_df.groupby('user_id').agg(reviews_count=('recipe_id', 'count')).reset_index()
user_reviews = user_reviews.sort_values('reviews_count', ascending=False).head(10000).reset_index(drop=True)

recipe_reviews = recipe_reviews_df.groupby('recipe_id').agg(reviews_count=('user_id', 'count')).reset_index()
recipe_reviews = recipe_reviews.sort_values('reviews_count', ascending=False).head(10000).reset_index(drop=True)

partially_reviewed = recipe_reviews_df.merge(user_reviews.drop('reviews_count', axis=1)).merge(recipe_reviews.drop('reviews_count', axis=1))

In [42]:
user_reviews.head()

Unnamed: 0,user_id,reviews_count
0,424680,6437
1,37449,4581
2,383346,3656
3,169430,3465
4,128473,3338


In [43]:
recipe_reviews.head()

Unnamed: 0,recipe_id,reviews_count
0,27208,1091
1,89204,1075
2,32204,897
3,39087,894
4,69173,787


In [44]:
partially_reviewed.head()

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,2312,2886,2001-04-18,5.0,1674,106975
1,11044,2886,2002-07-25,5.0,3425,106975
2,10979,2886,2001-06-18,5.0,1112,106975
3,9869,2886,2001-09-17,5.0,506,106975
4,22655,2886,2009-02-07,3.0,1628,106975


In [45]:
partially_reviewed = partially_reviewed[['user_id','recipe_id','rating']]

In [46]:
partially_reviewed.shape

(266168, 3)

In [47]:
print('unique users:',len(partially_reviewed.user_id.unique()))
print('unique recipes:',len(partially_reviewed.recipe_id.unique()))

unique users: 9966
unique recipes: 10000


In [48]:
new_userID = dict(zip(list(partially_reviewed['user_id'].unique()),
                      list(range(len(partially_reviewed['user_id'].unique())))))
new_userID

{2312: 0,
 11044: 1,
 10979: 2,
 9869: 3,
 22655: 4,
 15712: 5,
 26313: 6,
 25455: 7,
 25792: 8,
 10033: 9,
 8688: 10,
 28397: 11,
 29014: 12,
 31083: 13,
 30209: 14,
 36713: 15,
 29063: 16,
 42058: 17,
 43505: 18,
 32772: 19,
 60989: 20,
 55589: 21,
 64583: 22,
 30228: 23,
 76074: 24,
 85714: 25,
 88717: 26,
 96436: 27,
 93997: 28,
 69248: 29,
 25941: 30,
 101376: 31,
 59588: 32,
 98919: 33,
 92641: 34,
 111347: 35,
 91655: 36,
 91584: 37,
 126104: 38,
 60716: 39,
 135887: 40,
 121852: 41,
 122175: 42,
 144671: 43,
 148715: 44,
 130513: 45,
 140806: 46,
 10320: 47,
 155675: 48,
 155595: 49,
 159645: 50,
 142386: 51,
 162086: 52,
 163601: 53,
 151638: 54,
 172369: 55,
 168087: 56,
 162725: 57,
 153918: 58,
 184081: 59,
 188119: 60,
 189475: 61,
 99221: 62,
 186979: 63,
 194611: 64,
 203111: 65,
 201064: 66,
 207375: 67,
 207176: 68,
 215350: 69,
 218535: 70,
 217226: 71,
 215260: 72,
 227039: 73,
 224088: 74,
 227496: 75,
 124268: 76,
 143721: 77,
 235074: 78,
 203741: 79,
 257993: 80,

In [49]:
new_recipeID = dict(zip(list(partially_reviewed['recipe_id'].unique()),
                        list(range(len(partially_reviewed['recipe_id'].unique())))))
new_recipeID

{2886: 0,
 3368: 1,
 11020: 2,
 9023: 3,
 9254: 4,
 13619: 5,
 15364: 6,
 15654: 7,
 14337: 8,
 18513: 9,
 20492: 10,
 16698: 11,
 33454: 12,
 37490: 13,
 31639: 14,
 18487: 15,
 45539: 16,
 35779: 17,
 35547: 18,
 52840: 19,
 61718: 20,
 19901: 21,
 75817: 22,
 14359: 23,
 48537: 24,
 39911: 25,
 78189: 26,
 86868: 27,
 65131: 28,
 106251: 29,
 140466: 30,
 109283: 31,
 21761: 32,
 132411: 33,
 97648: 34,
 109006: 35,
 202935: 36,
 239592: 37,
 200008: 38,
 5478: 39,
 15340: 40,
 19257: 41,
 44995: 42,
 55680: 43,
 24709: 44,
 79462: 45,
 17300: 46,
 55394: 47,
 53767: 48,
 83287: 49,
 51235: 50,
 37638: 51,
 118475: 52,
 30018: 53,
 70224: 54,
 63750: 55,
 57033: 56,
 90822: 57,
 213535: 58,
 2713: 59,
 17222: 60,
 3400: 61,
 19596: 62,
 24685: 63,
 23418: 64,
 4368: 65,
 40621: 66,
 4460: 67,
 16110: 68,
 38618: 69,
 56453: 70,
 99570: 71,
 9168: 72,
 17092: 73,
 3683: 74,
 21320: 75,
 10075: 76,
 8599: 77,
 12668: 78,
 22201: 79,
 16726: 80,
 29184: 81,
 15656: 82,
 14255: 83,
 186

In [50]:
data = partially_reviewed.replace({'user_id': new_userID, 'recipe_id': new_recipeID})
data.head()

Unnamed: 0,user_id,recipe_id,rating
0,0,0,5.0
1,1,0,5.0
2,2,0,5.0
3,3,0,5.0
4,4,0,3.0


In [51]:
recipe = recipes[['name', 'id', 'ingredients']].merge(partially_reviewed[['recipe_id']], 
                                                left_on = 'id', right_on = 'recipe_id', 
                                                how = 'right').drop(['id'], axis = 1).drop_duplicates().reset_index(drop = True)
recipe

Unnamed: 0,name,ingredients,recipe_id
0,best banana bread,"['butter', 'granulated sugar', 'eggs', 'banana...",2886
1,blackberry pie iii,"['sugar', 'all-purpose flour', 'cornstarch', '...",3368
2,casablanca chicken,"['boneless skinless chicken breasts', 'olive o...",11020
3,memaw s collard greens,"['collard greens', 'smoked bacon', 'sugar', 's...",9023
4,grilled salmon,"['fresh salmon', 'butter', 'garlic', 'salt', '...",9254
...,...,...,...
9995,crispy shrimp stuffed pork meatballs rsc,"['breadcrumbs', 'parmesan cheese', 'hidden val...",495134
9996,cheesy potato italian sausage balls 5fix,['simply potatoes traditional mashed potatoes'...,496573
9997,cheddar stuffed potato cake and egg 5fix,"['olive oil', 'simply potatoes traditional mas...",497382
9998,mac n cheese and spinach strata sp5,"['simply macaroni & cheese', 'frozen spinach',...",514423


In [52]:
# Adjusting Rating...
mean = data.groupby(['user_id'], as_index = False, sort = False).mean().rename(columns = {'rating':'rating_mean'})
data = data.merge(mean[['user_id','rating_mean']], how = 'left')
data.insert(2, 'rating_adjusted', data['rating'] - data['rating_mean'])
data

Unnamed: 0,user_id,recipe_id,rating_adjusted,rating,rating_mean
0,0,0,0.358974,5.0,4.641026
1,1,0,0.307692,5.0,4.692308
2,2,0,0.000000,5.0,5.000000
3,3,0,0.094431,5.0,4.905569
4,4,0,-1.000000,3.0,4.000000
...,...,...,...,...,...
266163,9840,9999,1.176471,5.0,3.823529
266164,9841,9999,0.500000,5.0,4.500000
266165,9951,9999,0.333333,5.0,4.666667
266166,9959,9999,0.000000,5.0,5.000000


In [53]:
# Train-Test Split...
train_data, test_data = train_test_split(data, test_size = 0.25)
n_users = data.user_id.unique()
n_items = data.recipe_id.unique()
train_data_matrix = np.zeros((n_users.shape[0], n_items.shape[0]))
for row in train_data.itertuples():
    train_data_matrix[row[1]-1, row[2]-1] = row[3]
display(train_data_matrix.shape)
display(train_data_matrix)

(9966, 10000)

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.30769231],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.09443099],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.35897436, 0.        , 0.        , ..., 0.        , 0.        ,
        0.35897436]])

In [54]:
test_data_matrix = np.zeros((n_users.shape[0], n_items.shape[0]))
for row in test_data.itertuples():
    test_data_matrix[row[1]-1, row[2]-1] = row[3]
display(test_data_matrix.shape)
display(test_data_matrix)

(9966, 10000)

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.35897436, -0.64102564, ...,  0.        ,
         0.        ,  0.        ]])

In [55]:
item_similarity = 1 - pairwise_distances(train_data_matrix.T, metric = 'cosine')
display(item_similarity.shape)
display(item_similarity)

(10000, 10000)

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00249396],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.00523503],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.11873674,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.11873674, 1.        ,
        0.        ],
       [0.00249396, 0.00523503, 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [56]:
def predict(ratings, similarity):
    pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis = 1)]) 
    return pred

In [57]:
item_pred = predict(train_data_matrix, item_similarity)
display(item_pred.shape)
display(item_pred)

(9966, 10000)

array([[ 3.44568621e-05,  3.87895637e-05,  1.12532451e-04, ...,
         0.00000000e+00,  0.00000000e+00,  1.06825716e-02],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-1.14415602e-04,  8.52386386e-04,  3.56489951e-04, ...,
         0.00000000e+00,  0.00000000e+00,  2.91074238e-03],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 3.18367002e-02, -2.42550463e-03, -1.91384079e-03, ...,
         0.00000000e+00,  0.00000000e+00,  1.23995705e-02]])

In [58]:
item_pred_df = pd.DataFrame(item_pred, columns = list(n_items))
item_pred_df.insert(0, 'user_id', list(n_users))

In [59]:
item_pred_df

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0,0.000034,0.000039,0.000113,-0.000033,0.000092,-0.000286,0.000060,-0.000204,0.000488,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010683
1,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,2,-0.000114,0.000852,0.000356,0.001626,0.000032,-0.000266,-0.000598,-0.000147,0.001746,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002911
3,3,-0.019131,0.005912,-0.000083,0.002600,0.001904,0.000824,0.001553,0.002306,-0.001952,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.075542
4,4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9961,9961,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
9962,9962,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
9963,9963,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
9964,9964,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


In [60]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def RMSE(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    
    return sqrt(mean_squared_error(prediction, ground_truth))
item_RMSE = RMSE(item_pred, test_data_matrix)
print('item_RMSE = {}'.format(item_RMSE))

item_RMSE = 0.8661934049161691


In [61]:
def getRecommendations_UserBased(user_id, top_n = 10):
    for old_user, new_user in new_userID.items():
        if user_id == new_user:
            print(f'Top {top_n} Recommended Recipes for Original User ID: {old_user}\n')
    
    movie_rated = list(data['recipe_id'].loc[data['user_id'] == user_id])
    item = item_pred_df.loc[item_pred_df['user_id'] == user_id].copy()
    item.drop(item_pred_df[movie_rated], axis = 1, inplace = True)
    unwatch_sorted = item.iloc[:,1:].sort_values(by = item.index[0], axis = 1, ascending = False)
    dict_top_n = unwatch_sorted.iloc[:, :top_n].to_dict(orient = 'records')

    i = 1
    for recipe_id in list(dict_top_n[0].keys()):
        for old_recipe, new_recipe in new_recipeID.items():
            if recipe_id == new_recipe:
                name = recipe[recipe['recipe_id'] == old_recipe]['name'].values[0]
                ingredients = recipe[recipe['recipe_id'] == old_recipe]['ingredients'].values[0]

                print(f'Top {i} Original Recipe ID: {old_recipe} - {name}\n Ingredients: {ingredients}\n')
                
                i += 1
                
    return dict_top_n[0]

In [62]:
recommendations = getRecommendations_UserBased(2)
recommendations

Top 10 Recommended Recipes for Original User ID: 10979

Top 1 Original Recipe ID: 11661 - oven roast beef
 Ingredients: ['sirloin tip roast', 'dry onion soup mix', 'cream of mushroom soup', 'water']

Top 2 Original Recipe ID: 29984 - honey steak marinade
 Ingredients: ['soy sauce', 'honey', 'vinegar', 'fresh ginger', 'garlic', 'olive oil', 'flank steaks']

Top 3 Original Recipe ID: 33843 - baked salsa chicken breast
 Ingredients: ['boneless skinless chicken breast halves', 'salsa', 'brown sugar', 'balsamic vinegar', 'dijon mustard']

Top 4 Original Recipe ID: 23686 - mean chef s maple brine
 Ingredients: ['brown sugar', 'maple syrup', 'kosher salt', 'garlic', 'bay leaves', 'fresh ginger', 'dried chili pepper flakes', 'soy sauce', 'fresh thyme', 'water']

Top 5 Original Recipe ID: 12354 - mexican flan  baked caramel custard
 Ingredients: ['sugar', 'eggs', 'milk', 'vanilla', 'orange rind', 'fruit']

Top 6 Original Recipe ID: 98345 - pot stickers  chinese dumplings
 Ingredients: ['ground 

{1722: 0.1814800643894581,
 1887: 0.10103749032997845,
 1823: 0.08284437451452778,
 1768: 0.07262397532631026,
 1675: 0.06612550630856073,
 1753: 0.06612475996243122,
 7737: 0.06414057059651325,
 1874: 0.06252412206403096,
 1778: 0.05992197519705414,
 1862: 0.05614749455362678}