In [1]:
import pandas as pd

### Load Data from Source Files

In [2]:

# Raw inputs: one CSV of recipes and one CSV of user/recipe interactions.
recipes_csv = "../data/recipes.csv"
interactions_csv = "../data/interactions.csv"

df_recipes = pd.read_csv(recipes_csv)
df_interactions = pd.read_csv(interactions_csv)

In [3]:
# Global mean rating across all interactions: sum of the "rating" column
# divided by its count of non-null entries.
rating_values = df_interactions["rating"]
interactions_sum = rating_values.sum()
interactions_count = rating_values.count()

# One row per recipe id, each carrying the same global mean in "mean_rating".
# assign() returns a new frame, so df_recipe_ids itself is left unchanged.
df_recipe_ids = df_recipes[["id"]]
df_mean_ratings = df_recipe_ids.assign(
    mean_rating=interactions_sum / interactions_count
)

In [4]:
# Attach recipe details to each interaction. The left join keeps every
# interaction row; only the recipe id, recipe name and rating are retained.
df_merged = df_interactions.merge(
    df_recipes,
    how="left",
    left_on="recipe_id",
    right_on="id",
)[["id", "name", "rating"]]

In [6]:
# Preview the first five rows of the joined interactions.
df_merged.head(n=5)

Unnamed: 0,id,name,rating
0,b355f2ab-a8e3-4c69-833c-3ac31f5b0022,soft snickerdoodle cookies,4
1,5d063835-3353-4072-8742-754a2b6931bf,three bean salad with orange vinaigrette,4
2,4fd380ff-2998-418b-aa0f-7bf262452e5b,sesame maple roasted tofu,5
3,b4a5a1b7-3015-4a5b-ae59-9f79fb6d7bb5,oat n toffee cookies,5
4,4f128c81-e2ab-449c-91ca-194ab31b2d6e,filled strawberry cheesecakes,5


In [7]:
# Number of non-null ratings per recipe id, returned as a flat frame with
# the columns "id" and "ratings_count".
df_ratings_count = (
    df_merged
    .groupby(["id"])[["rating"]]
    .count()
    .reset_index()
    .rename(columns={"rating": "ratings_count"})
)

In [8]:
# Preview the first five per-recipe rating counts.
df_ratings_count.head(n=5)

Unnamed: 0,id,ratings_count
0,000002ad-1d04-438b-b047-81ae2defd87d,30
1,00003604-5c44-4773-8e7e-1693f92410ec,1
2,00007d2c-eb1e-4d13-af2a-1acd6e4da72d,6
3,0000abc9-fb37-46a1-95ee-cd60575dd13d,1
4,0000cb28-764b-4807-8233-04a293c6b876,2


In [9]:
# Mean rating per recipe id, as a flat frame with the columns "id" and
# "ratings_average".
#
# "rating" is selected *before* aggregating: df_merged also carries the text
# column "name", and calling GroupBy.mean() on the whole frame raises a
# TypeError on pandas >= 2.0 (numeric_only now defaults to False) instead of
# silently dropping the non-numeric column.
df_ratings_average = (
    df_merged
    .groupby("id")["rating"]
    .mean()
    .reset_index(name="ratings_average")
)

In [10]:
# Preview the first five per-recipe average ratings.
df_ratings_average.head(n=5)

Unnamed: 0,id,ratings_average
0,000002ad-1d04-438b-b047-81ae2defd87d,4.3
1,00003604-5c44-4773-8e7e-1693f92410ec,3.0
2,00007d2c-eb1e-4d13-af2a-1acd6e4da72d,5.0
3,0000abc9-fb37-46a1-95ee-cd60575dd13d,5.0
4,0000cb28-764b-4807-8233-04a293c6b876,3.5


In [11]:
# Combine the global mean rating with each recipe's rating count and average.
# Both merges use the default inner join on "id".
df_tmp_one = pd.merge(df_mean_ratings, df_ratings_count, on="id")
df_tmp_two = pd.merge(df_tmp_one, df_ratings_average, on="id")

# The minimum-ratings threshold is the quantile of "ratings_count" that leaves
# roughly the TOP_RATED_RECIPES most-rated recipes at or above it.
TOP_RATED_RECIPES = 10000
rated_recipes = df_tmp_two["ratings_count"].count()

# Clamp the quantile to [0, 1]: with fewer than TOP_RATED_RECIPES rows the raw
# value 1 - TOP_RATED_RECIPES / rated_recipes is negative (and undefined for
# an empty frame), which Series.quantile rejects with a ValueError.
if rated_recipes:
    threshold_quantile = min(1.0, max(0.0, 1 - (TOP_RATED_RECIPES / rated_recipes)))
else:
    threshold_quantile = 0.0

df_tmp_two["min_ratings_count"] = df_tmp_two["ratings_count"] \
    .quantile(threshold_quantile)

def weighted_average(row):
    """Blend a recipe's own average rating with the global mean rating.

    Computes

        (ratings_average * ratings_count + mean_rating * min_ratings_count)
        / (ratings_count + min_ratings_count)

    so the recipe's own average is weighted by its number of ratings and the
    global mean is weighted by the minimum-ratings threshold.

    Args:
        row: Any mapping-like object indexable by the keys "ratings_count",
            "ratings_average", "min_ratings_count" and "mean_rating".

    Returns:
        The weighted average of the recipe average and the global mean.
    """
    own_weight = row["ratings_count"]
    prior_weight = row["min_ratings_count"]

    own_total = row["ratings_average"] * own_weight
    prior_total = row["mean_rating"] * prior_weight

    return (own_total + prior_total) / (own_weight + prior_weight)

# weighted_average only indexes columns by name and applies arithmetic, so it
# is evaluated once on the whole frame (column-wise) instead of once per row
# through DataFrame.apply(axis=1) and a pass-through lambda.
df_tmp_two["ratings_weighted_average"] = weighted_average(df_tmp_two)

In [12]:
# Rank recipes by weighted average rating (highest first), attach the recipe
# details via an inner join on "id", and keep only the reporting columns.
popularity_columns = [
    "id",
    "name",
    "mean_rating",
    "ratings_count",
    "ratings_average",
    "min_ratings_count",
    "ratings_weighted_average",
]
df_recipe_popularity = (
    df_tmp_two
    .sort_values(["ratings_weighted_average"], ascending=False)
    .reset_index(drop=True)
    .merge(df_recipes, on="id")
    [popularity_columns]
)

In [13]:
# Preview the five highest-ranked recipes.
df_recipe_popularity.head(n=5)

Unnamed: 0,id,name,mean_rating,ratings_count,ratings_average,min_ratings_count,ratings_weighted_average
0,47dc8f77-42df-4667-929b-4835f2f32073,mexican stack up rsc,4.411666,217,4.990783,16.0,4.951016
1,5bee1ac6-e667-4b13-bebf-15862af52a49,mango salsa 1,4.411666,74,4.959459,16.0,4.862074
2,635d1863-37e1-4146-8b69-1c0bca178db0,caprese salad tomatoes italian marinated toma...,4.411666,52,5.0,16.0,4.861569
3,c3653883-f558-4573-a08f-b87b7a98a90b,brown sugar bundt cake,4.411666,118,4.915254,16.0,4.855124
4,f1f7d3b0-2942-4d8c-acfc-c554a13a9bdc,syrup for blueberry pancakes,4.411666,57,4.964912,16.0,4.843653


In [14]:
# Persist the ranked recipes as CSV, without writing the row index.
df_recipe_popularity.to_csv(
    path_or_buf="../data/recipe_popularity.csv",
    index=False,
)