In [359]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp
from joblib import Parallel, delayed

In [2]:
recipes = pd.read_csv("recipes.csv")
reviews = pd.read_csv("reviews.csv")

In [3]:
reviews

Unnamed: 0,ReviewId,RecipeId,AuthorId,AuthorName,Rating,Review,DateSubmitted,DateModified
0,2,992,2008,gayg msft,5,better than any you can get at a restaurant!,2000-01-25T21:44:00Z,2000-01-25T21:44:00Z
1,7,4384,1634,Bill Hilbrich,4,"I cut back on the mayo, and made up the differ...",2001-10-17T16:49:59Z,2001-10-17T16:49:59Z
2,9,4523,2046,Gay Gilmore ckpt,2,i think i did something wrong because i could ...,2000-02-25T09:00:00Z,2000-02-25T09:00:00Z
3,13,7435,1773,Malarkey Test,5,easily the best i have ever had. juicy flavor...,2000-03-13T21:15:00Z,2000-03-13T21:15:00Z
4,14,44,2085,Tony Small,5,An excellent dish.,2000-03-28T12:51:00Z,2000-03-28T12:51:00Z
...,...,...,...,...,...,...,...,...
1401977,2090339,139499,2002080368,terrylbiggs,2,I was disappointed. I couldn't wait to make th...,2020-12-27T23:57:54Z,2020-12-27T23:57:54Z
1401978,2090340,148484,41805321,rogerberry,5,Nothing to drain. And I don’t heat up the liqu...,2020-12-28T00:44:42Z,2020-12-28T00:44:42Z
1401979,2090341,264191,2002901848,Reiketsukan 6.,5,Good base recipe for someone to start with. I ...,2020-12-28T01:04:43Z,2020-12-28T01:04:43Z
1401980,2090345,411791,2002901938,Sue M.,5,Thank you so much for this amazing recipe! I l...,2020-12-28T03:07:10Z,2020-12-28T03:07:10Z


### User-User Recommendation
---

In [122]:
author_review_counts = reviews.groupby('AuthorId').size()
author_review_count_greater_than_4 = author_review_counts[author_review_counts > 14]

recipe_review_count = reviews.groupby('RecipeId').size()
recipe_review_count_greater_than_4 = recipe_review_count[recipe_review_count > 19]

recipes_with_many_reviews = reviews[reviews['AuthorId'].isin(author_review_count_greater_than_4.index)]
recipes_with_many_reviews = recipes_with_many_reviews[recipes_with_many_reviews['RecipeId'].isin(recipe_review_count_greater_than_4.index)]

recipes_with_many_reviews.reset_index(inplace=True)

recipes_with_many_reviews = recipes_with_many_reviews.drop('index', axis=1)

In [153]:
recipes_with_many_reviews

Unnamed: 0,ReviewId,RecipeId,AuthorId,AuthorName,Rating,Review,DateSubmitted,DateModified
0,236,4807,2695,Hungarian Gypsy,2,"I'm sorry, but I tried this method for my Chri...",2000-12-27T13:47:50Z,2000-12-27T13:47:50Z
1,243,810,2312,Gay Gilmore,0,"Good, but I wished they were a bit more moist.",2001-01-02T16:15:26Z,2001-01-02T16:15:26Z
2,292,5466,2312,Gay Gilmore,1,"I'm pretty sure this recipe is a joke, but I d...",2001-01-16T10:53:59Z,2001-01-16T10:53:59Z
3,335,12134,2695,Hungarian Gypsy,5,This dish was excellent. The sauce turned out...,2001-01-19T16:40:14Z,2001-01-19T16:40:14Z
4,396,2713,5523,Dave C,4,Good one even for cold nights in Toronto.,2001-02-01T18:37:13Z,2001-02-01T18:37:13Z
...,...,...,...,...,...,...,...,...
344023,2090251,49088,2038463,DownHomeDinner,5,This will definitely be a repeat in my recipe ...,2020-12-25T19:23:03Z,2020-12-25T19:23:03Z
344024,2090259,43023,894666,bubbleyumm82,5,Just got a homemade pasta machine as a gift an...,2020-12-25T21:27:07Z,2020-12-25T21:27:07Z
344025,2090280,73866,843817,shadowgirl...,5,I made this recipe a few years back several ti...,2020-12-26T14:15:09Z,2020-12-26T14:15:09Z
344026,2090287,26370,2038463,DownHomeDinner,0,"While making this, I wasn't so sure I would ev...",2020-12-26T17:00:43Z,2020-12-26T17:00:43Z


In [304]:
average_rating = recipes_with_many_reviews.groupby("RecipeId").mean("Rating")
df_filtered = average_rating[average_rating['Rating'] >= 4.75]

In [311]:
df_filtered.sample(n=10)

Unnamed: 0_level_0,ReviewId,AuthorId,Rating
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
111777,734807.9,31910960.0,4.866142
9351,722956.9,361050.5,4.773292
206391,980452.8,626565.0,4.8
251220,942287.4,15155000.0,4.853659
282232,1185760.0,443714.4,5.0
46665,449622.4,223744.9,4.818182
64961,554070.6,335988.9,4.8125
53484,474756.0,185997.5,4.75
234531,1184386.0,1116603.0,4.909091
154477,732769.5,83635210.0,4.875


---

In [None]:
authorIds = recipes_with_many_reviews["AuthorId"].unique()

In [None]:
sorted_authorIds = np.sort(authorIds)

In [6]:
reviews_pivoted = recipes_with_many_reviews.pivot(index='AuthorId', columns='RecipeId', values='Rating').fillna(0)

In [7]:
binary_matrix = reviews_pivoted.where(reviews_pivoted ==  0, 1)

In [8]:
def density(df):
    total_cells = df.size
    cells_greater_than_zero = df[df > 0].count().sum()
    return (cells_greater_than_zero / total_cells) * 100

In [9]:
reviews_pivoted

RecipeId,44,49,56,62,76,102,116,129,142,153,...,516527,517764,517863,518068,518069,518145,518151,519642,533699,533997
AuthorId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1533,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1676,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2002273175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2002312797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2002321540,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2002404048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
print(density(reviews_pivoted))

0.30801663173620875


In [11]:
avg_ratings = reviews.groupby('RecipeId').agg(avg_rating = ('Rating', 'mean'),
                                                number_of_ratings = ('Rating', 'count')).reset_index()

avg_ratings59 = avg_ratings[avg_ratings['number_of_ratings']>59]

avg_ratings59.sort_values(by='number_of_ratings', ascending=False)["RecipeId"].head()

32406    45809
1109      2886
18447    27208
64074    89204
27386    39087
Name: RecipeId, dtype: int64

In [12]:
similarity_matrix = cosine_similarity(reviews_pivoted)

In [13]:
similarity_matrix

array([[1.        , 0.01233141, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.01233141, 1.        , 0.        , ..., 0.        , 0.00683476,
        0.02124806],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.00683476, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.02124806, 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [14]:
similarity_matrix_df = pd.DataFrame(similarity_matrix, index=reviews_pivoted.index, columns=reviews_pivoted.index)

In [15]:
similarity_matrix_df

AuthorId,1533,1535,1634,1676,1792,1891,1962,2178,2310,2312,...,2002093000,2002144249,2002157583,2002169932,2002256447,2002273175,2002312797,2002321540,2002404048,2002754832
AuthorId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1533,1.000000,0.012331,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
1535,0.012331,1.000000,0.0,0.0,0.0,0.01857,0.037146,0.018175,0.011138,0.027976,...,0.018837,0.000000,0.000000,0.021477,0.014439,0.0,0.028488,0.0,0.006835,0.021248
1634,0.000000,0.000000,1.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
1676,0.000000,0.000000,0.0,1.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.127329,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
1792,0.000000,0.000000,0.0,0.0,1.0,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.050215,0.000000,0.0,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2002273175,0.000000,0.000000,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
2002312797,0.000000,0.028488,0.0,0.0,0.0,0.00000,0.079318,0.000000,0.020295,0.037482,...,0.051484,0.000000,0.097879,0.000000,0.029233,0.0,1.000000,0.0,0.000000,0.000000
2002321540,0.000000,0.000000,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,1.0,0.000000,0.000000
2002404048,0.000000,0.006835,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.015521,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,1.000000,0.000000


In [16]:
row_zeros = similarity_matrix_df.eq(0).sum(axis=1)

row_with_least_zeros = row_zeros.idxmin()

print("Row with the least number of 0s:", row_with_least_zeros)

Row with the least number of 0s: 140132


In [17]:
similarity_matrix_df = pd.DataFrame(similarity_matrix)

row_sums = similarity_matrix_df.sum(axis=1)

row_with_highest_similarity = row_sums.idxmax()

print("Row with the highest accumulated similarity:", row_with_highest_similarity)

Row with the highest accumulated similarity: 2612


In [18]:
flattened_matrix = similarity_matrix.flatten()

filtered_values = flattened_matrix[flattened_matrix < .9]

max_val = filtered_values.max()

In [19]:
result = similarity_matrix_df.where(similarity_matrix_df == max_val).dropna(how='all').dropna(axis=1, how='all')

print(result)

row_index, col_index = result.index[0], result.columns[0]

          664       683
664       NaN  0.816388
683  0.816388       NaN


In [21]:
select_userid =  8200
similarities = similarity_matrix_df[select_userid].drop(select_userid)
weights = similarities/similarities.sum()

In [243]:
n = 10 
user_similarity_threshold =  0.50
# Get top n similar users
similar_users = similarity_matrix_df[similarity_matrix_df[select_userid]>user_similarity_threshold][select_userid].sort_values(ascending=False)[:n]

In [244]:
similar_users

8200    1.0
Name: 8200, dtype: float64

In [245]:
item_similarity = cosine_similarity(reviews_pivoted.T)

In [246]:
item_similarity

array([[1.        , 0.28080141, 0.04377029, ..., 0.        , 0.        ,
        0.        ],
       [0.28080141, 1.        , 0.0469507 , ..., 0.        , 0.        ,
        0.        ],
       [0.04377029, 0.0469507 , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [247]:
def recommend_items(user_id, num_recommendations=10):
    user_ratings = reviews_pivoted.loc[user_id]
    
    user_rating_indices = user_ratings.index.to_numpy()
    user_rating_nums = np.arange(len(user_rating_indices))
    similar_items = item_similarity[user_rating_nums]

    item_scores = similar_items.dot(user_ratings)
    item_scores = item_scores / np.abs(user_ratings).sum()
    
    item_scores_df = pd.DataFrame(item_scores, columns=['scores'])
    item_scores_df.index = reviews_pivoted.columns  

    item_scores_df = item_scores_df.sort_values(by='scores', ascending=False)
    return item_scores_df.index[:num_recommendations]

In [251]:
recommended_items = recommend_items(1533, 20)
print(recommended_items)

Index([ 63869,   2137,  14953,  10457,  48582,  57549,  12027, 100688,  16831,
        37252,  10554,  27141,  27733,  41707, 132916,  60085,  26499,  46877,
       110139,  21812],
      dtype='int64', name='RecipeId')


In [252]:
recommended_recipes = recipes.loc[recommended_items]

In [253]:
recommended_recipes

Unnamed: 0_level_0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
63869,68347,Potato Salad,87877,Toby Jermain,PT30M,PT1H,PT1H30M,2003-08-05T20:05:00Z,This potato salad is great !!! just the way I ...,character(0),...,4.9,154.9,730.0,44.2,4.0,6.1,8.6,,,"c(""Cook potatoes slowly in a large saucepan of..."
2137,3628,Rabbit and Prune Casserole,1543,Doreen Randal,,PT0S,PT0S,1999-09-27T05:36:00Z,Make and share this Rabbit and Prune Casserole...,character(0),...,7.7,32.1,91.9,42.8,3.8,19.4,2.4,4.0,,"c(""Marinate rabbit overnight in a mixture of, ..."
14953,18255,Oven Baked Pork Chops,28662,Tammi,PT35M,PT15M,PT50M,2002-01-27T19:56:00Z,Make and share this Oven Baked Pork Chops reci...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,12.0,207.8,207.0,0.5,0.0,0.1,43.1,4.0,,"c(""Preheat oven tho 425 degrees."", ""Place butt..."
10457,13647,Potato Soup With Two Cheeses,18510,Meryl,PT2H,PT30M,PT2H30M,2001-11-02T16:30:00Z,"I have not tried this recipe, but thought it s...","""https://img.sndimg.com/food/image/upload/w_55...",...,17.6,106.6,1235.2,63.2,5.9,6.2,25.2,4.0,7 cups,"c(""Add olive oil to a large pot. Heat on mediu..."
48582,52630,Best Ever Spaghetti Sauce,34206,AngelicFantasia,PT2H,PT15M,PT2H15M,2003-01-29T20:00:00Z,This is my favorite spaghetti sauce recipe. It...,"""https://img.sndimg.com/food/image/upload/w_55...",...,11.1,64.7,4560.2,37.2,7.5,24.3,28.4,,,"c(""Brown the sausage with the onion and drain...."
57549,61850,Curried Cauliflower and Potatoes,41409,Kozmic Blues,PT20M,PT10M,PT30M,2003-05-09T20:00:00Z,This is a simple vegetarian curry dish that yo...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,0.5,0.0,35.7,26.1,4.5,4.4,4.7,6.0,,"c(""Peel potatoes and cut into cubes, about 1 i..."
12027,15246,Grilled Steak and Asparagus Salad,20754,RecipeNut,PT10M,PT1H,PT1H10M,2001-12-04T13:08:00Z,Make and share this Grilled Steak and Asparagu...,character(0),...,0.3,0.0,24.5,6.3,2.9,2.1,3.1,4.0,,"c(""Blend vinaigrette and steak sauce."", ""Pour ..."
100688,106248,Rebel Rouser Spinach Artichoke Dip,112365,Danny Beason,PT35M,PT30M,PT1H5M,2004-12-16T20:00:00Z,Make and share this Rebel Rouser Spinach Artic...,character(0),...,31.5,160.9,1051.3,18.7,8.6,4.0,41.2,,,"c(""In a large bowl mix hot pepper cheese- mozz..."
16831,20167,Braided Sweet Bread,27395,Manda,PT35M,PT2H,PT2H35M,2002-02-19T18:02:00Z,This recipe comes from my best friend's aunt. ...,"c(""https://img.sndimg.com/food/image/upload/w_...",...,10.1,118.6,2144.1,275.0,8.6,50.8,39.9,,2 loaves,"c(""Mix together water, warm milk, and yeast."",..."
37252,40996,Quick Beef Soup,50281,Beaner in Washington,PT45M,PT45M,PT1H30M,2002-09-21T00:34:00Z,Make and share this Quick Beef Soup recipe fro...,character(0),...,4.9,74.1,739.6,24.6,4.0,7.4,28.4,6.0,,"c(""Cook beef, onion and garlic in large saucep..."


### Item-Item Recommendation
---

In [32]:
recipes['Keywords'].fillna('', inplace=True)

In [33]:
keyword_matrix = recipes['Keywords'].apply(lambda x: [1 if keyword in x else 0 for keyword in unique_keywords])

In [71]:
keyword_matrix

0         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3         [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                                ...                        
522512    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
522513    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
522514    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
522515    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
522516    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: Keywords, Length: 522517, dtype: object

In [73]:
keyword_df = pd.DataFrame.from_records(keyword_matrix.tolist())

In [87]:
keyword_df_names = keyword_df.astype('float64')

In [88]:
keyword_df_names.columns = list(unique_keywords)

In [89]:
keyword_df_names

Unnamed: 0,Salad Dressings,Malaysian,Whole Turkey,Breakfast Eggs,Broil/Grill,< 60 Mins,Halibut,Whole Chicken,Gumbo,Oysters,...,Weeknight,Chicken Stews,Meatloaf,Tomato Sauce,White Rice,Orange Roughy,Pineapple,Lentil,Household Cleaner,Georgian
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
522513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
522514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
522515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [111]:
recipes_w_keywords = recipes.join(keyword_df_names)

In [112]:
recipes_w_keywords = recipes_w_keywords.drop(recipes_w_keywords.columns[1:28], axis=1)

In [318]:
recipes_w_keywords = recipes_w_keywords.astype('int64')

In [324]:
half_df = recipes_w_keywords.iloc[:len(recipes_w_keywords)//10]

In [325]:
recipe_similarities = cosine_similarity(half_df.iloc[:, 1:])

In [339]:
recipe_similarities

array([[1.        , 0.11785113, 0.59628479, ..., 0.42163702, 0.        ,
        0.        ],
       [0.11785113, 1.        , 0.        , ..., 0.1118034 , 0.57735027,
        0.28867513],
       [0.59628479, 0.        , 1.        , ..., 0.56568542, 0.18257419,
        0.        ],
       ...,
       [0.42163702, 0.1118034 , 0.56568542, ..., 1.        , 0.38729833,
        0.12909944],
       [0.        , 0.57735027, 0.18257419, ..., 0.38729833, 1.        ,
        0.16666667],
       [0.        , 0.28867513, 0.        , ..., 0.12909944, 0.16666667,
        1.        ]])

In [346]:
def recommend_recipes(recipe_id, num_recommendations=5):
    idx = recipes_w_keywords[recipes_w_keywords['RecipeId'] == recipe_id].index[0]
    sim_scores = list(enumerate(recipe_similarities[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]

    recipe_indices = [i[0] for i in sim_scores]
    return recipes_w_keywords['RecipeId'].iloc[recipe_indices], sim_scores


In [354]:
recommended_recipes, scores = recommend_recipes(50, 5)
print(recommended_recipes)
print(scores)

326        393
31360    34989
46981    50998
363        434
3692      6433
Name: RecipeId, dtype: int64
[(326, 1.0), (31360, 1.0), (46981, 1.0), (363, 0.912870929175277), (3692, 0.912870929175277)]


In [358]:
recipes.iloc[12]

RecipeId                                                                     50
Name                                                          Biscotti Di Prato
AuthorId                                                                   1752
AuthorName                                                             Bob Ross
CookTime                                                                  PT50M
PrepTime                                                                  PT20M
TotalTime                                                               PT1H10M
DatePublished                                              1999-08-31T21:19:00Z
Description                   Make and share this Biscotti Di Prato recipe f...
Images                        "https://img.sndimg.com/food/image/upload/w_55...
RecipeCategory                                                          Dessert
Keywords                      c("Cookie & Brownie", "European", "Weeknight",...
RecipeIngredientQuantities    c("3 3/4",

In [355]:
recipes.loc[recommended_recipes.index]

Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
326,393,Chocolate and Vanilla Chip Biscotti,1545,Nancy Van Ess,,PT1H50M,PT1H50M,1999-08-15T07:31:00Z,Make and share this Chocolate and Vanilla Chip...,character(0),...,1.4,14.9,39.7,11.8,0.4,6.4,1.4,,48 biscotti,"c(""Heat oven to 350 degrees Fahrenheit."", ""Spr..."
31360,34989,Gourmet Cappuccino Biscotti,45698,OceanIvy,PT40M,PT30M,PT1H10M,2002-07-25T21:23:00Z,I have had this recipe for a while now and hav...,character(0),...,0.6,4.7,56.0,12.9,0.6,7.0,1.3,,36 biscotti,"c(""Preheat oven to 350°."", ""In bowl of a elect..."
46981,50998,Fig Newton Biscotti,37449,Sharon123,PT1H5M,PT30M,PT1H35M,2003-01-13T20:06:00Z,Make and share this Fig Newton Biscotti recipe...,character(0),...,15.6,140.3,307.9,136.8,6.5,75.4,13.2,,,"c(""Filling: Place the fruit in a small saucepa..."
363,434,Cherry-Pistachio Biscotti,1545,Nancy Van Ess,,PT1H35M,PT1H35M,1999-08-15T07:31:00Z,Make and share this Cherry-Pistachio Biscotti ...,character(0),...,0.5,9.5,11.4,5.6,0.3,2.8,1.0,,80 biscotti,"c(""Oven to 350 degrees."", ""Lightly grease a la..."
3692,6433,Spritzgeback (Spritz Cookies),293001,tweetyfan,,PT0S,PT2H10M,2000-03-06T16:07:00Z,Make and share this Spritzgeback (Spritz Cooki...,character(0),...,12.0,90.9,217.1,29.7,0.8,8.0,4.1,10.0,4 dozen cookies,"c(""Beat butter and sugar until light."", ""Beat ..."


#### Extra Work (Not Pertinant as of now)
---

In [30]:
recipes["Keywords"]

0         c("Dessert", "Low Protein", "Low Cholesterol",...
1         c("Chicken Thigh & Leg", "Chicken", "Poultry",...
2         c("Low Protein", "Low Cholesterol", "Healthy",...
3         c("Beans", "Vegetable", "Low Cholesterol", "We...
4         c("Low Protein", "Vegan", "Low Cholesterol", "...
                                ...                        
522512                                          "< 4 Hours"
522513         c("High Protein", "High In...", "< 4 Hours")
522514                            c("Dessert", "< 4 Hours")
522515                               c("< 15 Mins", "Easy")
522516                                          "< 60 Mins"
Name: Keywords, Length: 522517, dtype: object

In [31]:
all_keywords = []

num = 0

for keywords_list in recipes['Keywords']:
    if pd.isna(keywords_list):
        continue
        
    if keywords_list[0] == "c":
        keywords_list = keywords_list[2:-1]

    keywords_list = keywords_list.split(',')
    
    for keyword in keywords_list:
        all_keywords.append(keyword)

    num += 1

all_keywords = [word.strip('" ').strip() for word in all_keywords]

unique_keywords = set(all_keywords)

print(unique_keywords)

{'Salad Dressings', 'Malaysian', 'Whole Turkey', 'Breakfast Eggs', 'Broil/Grill', '< 60 Mins', 'Halibut', 'Whole Chicken', 'Gumbo', 'Oysters', 'Duck', 'Oranges', 'Moroccan', 'Camping', 'Cherries', 'Cheese', 'Bear', 'Kosher', 'Chard', 'South American', 'Halloween', 'Chocolate Chip Cookies', 'Greek', 'Winter', 'Long Grain Rice', 'Plums', 'Bar Cookie', 'Onions', 'Catfish', 'Tilapia', 'Breads', 'Tuna', 'Polynesian', 'Short Grain Rice', 'Perch', 'Bread Machine', 'Pot Roast', 'Labor Day', 'Czech', 'Peruvian', 'Raspberries', 'Pork Crock Pot', 'Frozen Desserts', 'Spicy', 'Spinach', 'Cajun', 'Steak', 'Lemon', 'Reynolds Wrap Contest', 'Coconut Desserts', 'Soy/Tofu', 'Beans', 'Healthy', 'Punch Beverage', 'Free Of...', 'Lebanese', 'Asian', 'Poultry', 'Collard Greens', 'Medium Grain Rice', 'Southwestern U.S.', 'Papaya', 'Venezuelan', 'Beef Barley Soup', 'Southwest Asia (middle East)', 'Potluck', 'Artichoke', 'Veal', 'Ecuadorean', 'Pie', 'Strawberries Desserts', 'Mussels', 'Spring', 'Polish', 'Homeo

In [154]:
recipes_with_many_reviews_complete = recipes_with_many_reviews.merge(recipes_w_keywords, on='RecipeId', how='left')

In [155]:
recipes_with_many_reviews_complete = recipes_with_many_reviews_complete.drop(recipes_with_many_reviews_complete.columns[0], axis=1)
recipes_with_many_reviews_complete = recipes_with_many_reviews_complete.drop(recipes_with_many_reviews_complete.columns[2], axis=1)
recipes_with_many_reviews_completed = recipes_with_many_reviews_complete.drop(recipes_with_many_reviews_complete.columns[3:6], axis=1)

In [231]:
recipes_with_many_reviews_completed.iloc[23333]

RecipeId              61932.0
AuthorId             101914.0
Rating                    5.0
Salad Dressings           0.0
Malaysian                 0.0
                       ...   
Orange Roughy             0.0
Pineapple                 0.0
Lentil                    0.0
Household Cleaner         0.0
Georgian                  0.0
Name: 23333, Length: 317, dtype: float64

In [232]:
recipes_with_many_reviews_completed

Unnamed: 0,RecipeId,AuthorId,Rating,Salad Dressings,Malaysian,Whole Turkey,Breakfast Eggs,Broil/Grill,< 60 Mins,Halibut,...,Weeknight,Chicken Stews,Meatloaf,Tomato Sauce,White Rice,Orange Roughy,Pineapple,Lentil,Household Cleaner,Georgian
0,4807,2695,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,810,2312,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5466,2312,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,12134,2695,5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2713,5523,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344023,49088,2038463,5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
344024,43023,894666,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
344025,73866,843817,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
344026,26370,2038463,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [158]:
grouped_recipes = recipes_with_many_reviews_completed.groupby('AuthorId')

In [183]:
grouped_recipes

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001DD464579B0>

In [164]:
def calculate_recipe_similarity(recipe1, recipe2):
    return cosine_similarity([recipe1], [recipe2])[0][0]

In [165]:
def calculate_author_similarity(author1_recipes, author2_recipes):
    similarities = []
    for recipe1 in author1_recipes:
        for recipe2 in author2_recipes:
            similarities.append(calculate_recipe_similarity(recipe1, recipe2))
    return sum(similarities) / len(similarities)

In [189]:
author_ids = recipes_with_many_reviews_completed['AuthorId'].unique()
similarity_matrix = pd.DataFrame(index=author_ids, columns=author_ids)

In [225]:
similarity_matrix

Unnamed: 0,2695,2312,5523,42189,6406,6702,8526,7802,10033,2178,...,2002312797,2001832057,2002321540,2002169932,2002404048,2001925765,2001201872,2002273175,2000407935,2002754832
2695,1.0,0.728705,0.781411,0.785116,0.77594,0.765874,0.771648,0.794408,0.783261,0.771569,...,,,,,,,,,,
2312,,,,,,,,,,,...,,,,,,,,,,
5523,,,,,,,,,,,...,,,,,,,,,,
42189,,,,,,,,,,,...,,,,,,,,,,
6406,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2001925765,,,,,,,,,,,...,,,,,,,,,,
2001201872,,,,,,,,,,,...,,,,,,,,,,
2002273175,,,,,,,,,,,...,,,,,,,,,,
2000407935,,,,,,,,,,,...,,,,,,,,,,


In [226]:
for author1 in author_ids:
    print(author1)
    for author2 in author_ids:
        if author1 == author2:
            similarity_matrix.loc[author1, author2] = 1.0
        else:
            author1_recipes = grouped_recipes.get_group(author1).drop(['AuthorId', 'RecipeId'], axis=1).values
            author2_recipes = grouped_recipes.get_group(author2).drop(['AuthorId', 'RecipeId'], axis=1).values
            similarity_matrix.loc[author1, author2] = calculate_author_similarity(author1_recipes, author2_recipes)

        # Display the matrix at regular intervals or when a specific condition is met
        if author2 % 100 == 0:
            display(similarity_matrix)

2695


Unnamed: 0,2695,2312,5523,42189,6406,6702,8526,7802,10033,2178,...,2002312797,2001832057,2002321540,2002169932,2002404048,2001925765,2001201872,2002273175,2000407935,2002754832
2695,1.0,0.728705,0.781411,0.785116,0.77594,0.765874,0.771648,0.794408,0.783261,0.771569,...,,,,,,,,,,
2312,,,,,,,,,,,...,,,,,,,,,,
5523,,,,,,,,,,,...,,,,,,,,,,
42189,,,,,,,,,,,...,,,,,,,,,,
6406,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2001925765,,,,,,,,,,,...,,,,,,,,,,
2001201872,,,,,,,,,,,...,,,,,,,,,,
2002273175,,,,,,,,,,,...,,,,,,,,,,
2000407935,,,,,,,,,,,...,,,,,,,,,,


Unnamed: 0,2695,2312,5523,42189,6406,6702,8526,7802,10033,2178,...,2002312797,2001832057,2002321540,2002169932,2002404048,2001925765,2001201872,2002273175,2000407935,2002754832
2695,1.0,0.728705,0.781411,0.785116,0.77594,0.765874,0.771648,0.794408,0.783261,0.771569,...,,,,,,,,,,
2312,,,,,,,,,,,...,,,,,,,,,,
5523,,,,,,,,,,,...,,,,,,,,,,
42189,,,,,,,,,,,...,,,,,,,,,,
6406,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2001925765,,,,,,,,,,,...,,,,,,,,,,
2001201872,,,,,,,,,,,...,,,,,,,,,,
2002273175,,,,,,,,,,,...,,,,,,,,,,
2000407935,,,,,,,,,,,...,,,,,,,,,,


KeyboardInterrupt: 

In [178]:
len(author_ids)

10620

---