In [1]:
import os
import torch
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split

### Data loading & pre-processing

In [2]:
# read reviews.parquet file
reviews_df = pd.read_parquet('data/reviews.parquet')

# read recipes.parquet file
recipes_df = pd.read_parquet('data/recipes.parquet')

In [3]:
# print the first 5 rows of reviews_df
reviews_df.head()

Unnamed: 0,ReviewId,RecipeId,AuthorId,AuthorName,Rating,Review,DateSubmitted,DateModified
0,2,992,2008,gayg msft,5,better than any you can get at a restaurant!,2000-01-25 21:44:00+00:00,2000-01-25 21:44:00+00:00
1,7,4384,1634,Bill Hilbrich,4,"I cut back on the mayo, and made up the differ...",2001-10-17 16:49:59+00:00,2001-10-17 16:49:59+00:00
2,9,4523,2046,Gay Gilmore ckpt,2,i think i did something wrong because i could ...,2000-02-25 09:00:00+00:00,2000-02-25 09:00:00+00:00
3,13,7435,1773,Malarkey Test,5,easily the best i have ever had. juicy flavor...,2000-03-13 21:15:00+00:00,2000-03-13 21:15:00+00:00
4,14,44,2085,Tony Small,5,An excellent dish.,2000-03-28 12:51:00+00:00,2000-03-28 12:51:00+00:00


In [4]:
# print the first 5 rows of recipes_df
recipes_df.head()

Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,38.0,Low-Fat Berry Blue Frozen Dessert,1533,Dancer,PT24H,PT45M,PT24H45M,1999-08-09 21:46:00+00:00,Make and share this Low-Fat Berry Blue Frozen ...,[https://img.sndimg.com/food/image/upload/w_55...,...,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0,,"[Toss 2 cups berries with sugar., Let stand fo..."
1,39.0,Biryani,1567,elly9812,PT25M,PT4H,PT4H25M,1999-08-29 13:12:00+00:00,Make and share this Biryani recipe from Food.com.,[https://img.sndimg.com/food/image/upload/w_55...,...,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0,,[Soak saffron in warm milk for 5 minutes and p...
2,40.0,Best Lemonade,1566,Stephen Little,PT5M,PT30M,PT35M,1999-09-05 19:52:00+00:00,This is from one of my first Good House Keepi...,[https://img.sndimg.com/food/image/upload/w_55...,...,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4.0,,"[Into a 1 quart Jar with tight fitting lid, pu..."
3,41.0,Carina's Tofu-Vegetable Kebabs,1586,Cyclopz,PT20M,PT24H,PT24H20M,1999-09-03 14:54:00+00:00,This dish is best prepared a day in advance to...,[https://img.sndimg.com/food/image/upload/w_55...,...,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2.0,4 kebabs,"[Drain the tofu, carefully squeezing out exces..."
4,42.0,Cabbage Soup,1538,Duckie067,PT30M,PT20M,PT50M,1999-09-19 06:19:00+00:00,Make and share this Cabbage Soup recipe from F...,[https://img.sndimg.com/food/image/upload/w_55...,...,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4.0,,"[Mix everything together and bring to a boil.,..."


In [5]:
# number of rows of recipes_df
len(recipes_df)

522517

In [6]:
# number of rows of reviews_df
len(reviews_df)

1401982

In [7]:
data = pd.merge(reviews_df, recipes_df, on='RecipeId')
# print the first 5 rows of merged dataframe
data.head()

Unnamed: 0,ReviewId,RecipeId,AuthorId_x,AuthorName_x,Rating,Review,DateSubmitted,DateModified,Name,AuthorId_y,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,2,992,2008,gayg msft,5,better than any you can get at a restaurant!,2000-01-25 21:44:00+00:00,2000-01-25 21:44:00+00:00,Jalapeno Pepper Poppers,1545,...,4.9,23.7,172.5,3.2,0.6,0.9,4.3,24.0,,"[In a mixing bowl, combine cheeses, bacon and ..."
1,250,992,4242,marie m,5,my family loved these. fresher than any bough...,2001-01-02 16:19:29+00:00,2001-01-02 16:19:29+00:00,Jalapeno Pepper Poppers,1545,...,4.9,23.7,172.5,3.2,0.6,0.9,4.3,24.0,,"[In a mixing bowl, combine cheeses, bacon and ..."
2,242612,992,205100,mommyoftwo,5,Wow! Awesome - Awesome - Awesome! And I don'...,2006-01-27 06:09:46+00:00,2006-01-27 06:09:46+00:00,Jalapeno Pepper Poppers,1545,...,4.9,23.7,172.5,3.2,0.6,0.9,4.3,24.0,,"[In a mixing bowl, combine cheeses, bacon and ..."
3,463486,992,559815,karen.nash-horton,4,This is a good recipe. I much prefer the oven ...,2007-08-15 11:04:13+00:00,2007-08-15 11:04:13+00:00,Jalapeno Pepper Poppers,1545,...,4.9,23.7,172.5,3.2,0.6,0.9,4.3,24.0,,"[In a mixing bowl, combine cheeses, bacon and ..."
4,643932,992,743849,NELady,5,"GREAT recipe! I used fat free cream cheese, p...",2008-06-02 18:24:08+00:00,2008-06-02 18:24:08+00:00,Jalapeno Pepper Poppers,1545,...,4.9,23.7,172.5,3.2,0.6,0.9,4.3,24.0,,"[In a mixing bowl, combine cheeses, bacon and ..."


In [8]:
# number of rows of merged dataframe
len(data)

1401963

In [9]:
# all the columns of merged dataframe
data.columns

Index(['ReviewId', 'RecipeId', 'AuthorId_x', 'AuthorName_x', 'Rating',
       'Review', 'DateSubmitted', 'DateModified', 'Name', 'AuthorId_y',
       'AuthorName_y', 'CookTime', 'PrepTime', 'TotalTime', 'DatePublished',
       'Description', 'Images', 'RecipeCategory', 'Keywords',
       'RecipeIngredientQuantities', 'RecipeIngredientParts',
       'AggregatedRating', 'ReviewCount', 'Calories', 'FatContent',
       'SaturatedFatContent', 'CholesterolContent', 'SodiumContent',
       'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent',
       'RecipeServings', 'RecipeYield', 'RecipeInstructions'],
      dtype='object')

In [10]:
# types of each column
data.dtypes

ReviewId                                    int32
RecipeId                                    int32
AuthorId_x                                  int32
AuthorName_x                               object
Rating                                      int32
Review                                     object
DateSubmitted                 datetime64[ns, UTC]
DateModified                  datetime64[ns, UTC]
Name                                       object
AuthorId_y                                  int32
AuthorName_y                               object
CookTime                                   object
PrepTime                                   object
TotalTime                                  object
DatePublished                 datetime64[ns, UTC]
Description                                object
Images                                     object
RecipeCategory                             object
Keywords                                   object
RecipeIngredientQuantities                 object


In [11]:
# filter out data with low ratings (ratings < 4)
data = data[data['Rating'] >= 4]

In [12]:
len(data)

1241281

In [13]:
# select all relevant columns from original merged dataframe
df = data[['RecipeId', 'CookTime', 'PrepTime', 'TotalTime',
            'Calories', 'FatContent',
            'SaturatedFatContent', 'CholesterolContent', 'SodiumContent',
            'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent',
            ]]

# remove duplicates
df = df[~df.duplicated()]

# reset the index and drop the original index column
df = df.reset_index(drop=True)

# print the updated dataframe
df.head()

Unnamed: 0,RecipeId,CookTime,PrepTime,TotalTime,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent
0,992,,PT30M,PT30M,111.4,9.2,4.9,23.7,172.5,3.2,0.6,0.9,4.3
1,4384,PT2H,PT5M,PT2H5M,4.6,0.1,0.0,0.0,291.3,0.9,0.4,0.1,0.2
2,4523,PT30M,PT30M,PT1H,420.7,5.7,1.4,133.1,2077.6,45.5,1.3,20.3,43.0
3,7435,PT3H20M,PT1H15M,PT4H35M,756.6,44.3,14.6,222.1,2783.7,41.5,9.5,11.8,47.0
4,44,PT3M,PT35M,PT38M,895.5,66.8,31.9,405.8,557.2,29.1,3.1,5.0,45.3


In [14]:
# number of rows of final dataframe
len(df)

255322

In [15]:
df.isna().sum()

RecipeId                   0
CookTime               40960
PrepTime                   0
TotalTime                  0
Calories                   0
FatContent                 0
SaturatedFatContent        0
CholesterolContent         0
SodiumContent              0
CarbohydrateContent        0
FiberContent               0
SugarContent               0
ProteinContent             0
dtype: int64

In [16]:
# fill the NA CookTime value with time 0 ('PT0S')
df['CookTime'].fillna('PT0S', inplace=True)
df.isna().sum()

RecipeId               0
CookTime               0
PrepTime               0
TotalTime              0
Calories               0
FatContent             0
SaturatedFatContent    0
CholesterolContent     0
SodiumContent          0
CarbohydrateContent    0
FiberContent           0
SugarContent           0
ProteinContent         0
dtype: int64

In [17]:
# function for converting time due to iso 8601 format error
def convert(s):
    time = s[2:s.find('H')]
    if len(time) > 2: 
        days = int(time)//24
        hours = int(time)-days*24
        s = s.replace(time, str(days)+'D'+str(hours))
    return s
s = 'PT100H'
s = convert(s)
print(s)
s = 'PT240H'
s = convert(s)
print(s)

PT4D4H
PT10D0H


In [18]:
# convert CookTime, PrepTime and TotalTime from iso format to timedelta format
df['CookTime'] = df['CookTime'].apply(lambda x: pd.to_timedelta(convert(x)).total_seconds())
df['PrepTime'] = df['PrepTime'].apply(lambda x: pd.to_timedelta(convert(x)).total_seconds())
df['TotalTime'] = df['TotalTime'].apply(lambda x: pd.to_timedelta(convert(x)).total_seconds())

In [19]:
# print first 5 rows of finalized df for k-means clustering
df.head()

Unnamed: 0,RecipeId,CookTime,PrepTime,TotalTime,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent
0,992,0.0,1800.0,1800.0,111.4,9.2,4.9,23.7,172.5,3.2,0.6,0.9,4.3
1,4384,7200.0,300.0,7500.0,4.6,0.1,0.0,0.0,291.3,0.9,0.4,0.1,0.2
2,4523,1800.0,1800.0,3600.0,420.7,5.7,1.4,133.1,2077.6,45.5,1.3,20.3,43.0
3,7435,12000.0,4500.0,16500.0,756.6,44.3,14.6,222.1,2783.7,41.5,9.5,11.8,47.0
4,44,180.0,2100.0,2280.0,895.5,66.8,31.9,405.8,557.2,29.1,3.1,5.0,45.3


In [20]:
# Use k-means clustering to select ~2,000 possible alike recipes for each selected recipe for training/validation

# select the cooking time features and convert it to numpy
df_cookingtime = df[df.columns[1:4]]
df_cookingtime = df_cookingtime.to_numpy()

# fit the model
k = 130 # may be modified later
kmeans_cookingtime = KMeans(n_clusters=k, init='k-means++', n_init='auto', tol=1e-4, random_state=540).fit(df_cookingtime) 

# add the cluster labels to the DataFrame
df['label_cooktime'] = kmeans_cookingtime.labels_

# select the ingredient features and convert it to numpy
df_ingredient = df[df.columns[4:]]
df_ingredient = df_ingredient.to_numpy()

# fit the model
k = 130 # may be modified later
kmeans_ingredient = KMeans(n_clusters=k, init='k-means++', n_init='auto', tol=1e-4, random_state=540).fit(df_ingredient) 

# add the cluster labels to the DataFrame
df['label_ingredients'] = kmeans_ingredient.labels_

In [21]:
df.head()

Unnamed: 0,RecipeId,CookTime,PrepTime,TotalTime,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,label_cooktime,label_ingredients
0,992,0.0,1800.0,1800.0,111.4,9.2,4.9,23.7,172.5,3.2,0.6,0.9,4.3,77,85
1,4384,7200.0,300.0,7500.0,4.6,0.1,0.0,0.0,291.3,0.9,0.4,0.1,0.2,58,85
2,4523,1800.0,1800.0,3600.0,420.7,5.7,1.4,133.1,2077.6,45.5,1.3,20.3,43.0,116,55
3,7435,12000.0,4500.0,16500.0,756.6,44.3,14.6,222.1,2783.7,41.5,9.5,11.8,47.0,36,109
4,44,180.0,2100.0,2280.0,895.5,66.8,31.9,405.8,557.2,29.1,3.1,5.0,45.3,77,114


In [22]:
df.dtypes

RecipeId                 int32
CookTime               float64
PrepTime               float64
TotalTime              float64
Calories               float64
FatContent             float64
SaturatedFatContent    float64
CholesterolContent     float64
SodiumContent          float64
CarbohydrateContent    float64
FiberContent           float64
SugarContent           float64
ProteinContent         float64
label_cooktime           int32
label_ingredients        int32
dtype: object

In [26]:
# test (10 random recipes)
for _ in range(10):
    test_recipeId = np.random.randint(0, len(df))
    print(f'test_recipeId: {test_recipeId}')
    label_cooktime = df.loc[test_recipeId, 'label_cooktime']
    label_ingredients = df.loc[test_recipeId, 'label_ingredients']
    print(f'label_cooktime: {label_cooktime}, label_ingredients: {label_ingredients}')
    alikeRecipes = df[df['label_cooktime'] == label_cooktime]
    alikeRecipes = df[df['label_ingredients'] == label_ingredients]
    print(f'number of alike recipes: {len(alikeRecipes)}')


test_recipeId: 49876
label_cooktime: 80, label_ingredients: 106
number of alike recipes: 14695
test_recipeId: 65727
label_cooktime: 0, label_ingredients: 78
number of alike recipes: 718
test_recipeId: 64992
label_cooktime: 47, label_ingredients: 85
number of alike recipes: 17353
test_recipeId: 161442
label_cooktime: 47, label_ingredients: 58
number of alike recipes: 686
test_recipeId: 49412
label_cooktime: 76, label_ingredients: 121
number of alike recipes: 22479
test_recipeId: 131112
label_cooktime: 0, label_ingredients: 15
number of alike recipes: 9633
test_recipeId: 180643
label_cooktime: 127, label_ingredients: 79
number of alike recipes: 27103
test_recipeId: 83162
label_cooktime: 47, label_ingredients: 59
number of alike recipes: 11684
test_recipeId: 243780
label_cooktime: 47, label_ingredients: 51
number of alike recipes: 6500
test_recipeId: 150378
label_cooktime: 76, label_ingredients: 126
number of alike recipes: 9136


### Below part haven't change anything

In [10]:
class RecipeDataset(Dataset):
    def __init__(self, data):
        self.user_encoder = LabelEncoder()
        self.item_encoder = LabelEncoder()
        self.recipe_ids = self.item_encoder.fit_transform(data["RecipeId"].values)
        self.author_ids = self.user_encoder.fit_transform(data["AuthorId_y"].values)

        self.ratings = data["Rating"].astype(float).values
        self.calories = data["Calories"].astype(float).values
        self.review_counts = data["ReviewCount"].astype(float).values

    def __len__(self):
        return len(self.recipe_ids)

    def __getitem__(self, idx):
        recipe_id = self.recipe_ids[idx]
        author_id = self.author_ids[idx]
        rating = self.ratings[idx]
        calories = self.calories[idx]
        review_count = self.review_counts[idx]
        return (recipe_id, author_id, calories, review_count), rating

In [17]:
# Split the dataset into training and validation sets
data = df # TODO
data = RecipeDataset(data)
train_size = int(0.8 * len(data))
val_size = len(data) - train_size
print(f'train size: {train_size}, val size: {val_size}')
train_dataset, val_dataset = random_split(data, [train_size, val_size])
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

train size: 348328, val size: 87082


### Model

In [12]:
class RecipeModel(torch.nn.Module):
    def __init__(self, num_recipes, num_authors, max_calories, max_review_counts, embedding_dim=16, num_heads=4):
        super().__init__()
        self.recipe_embedding = torch.nn.Embedding(num_recipes, embedding_dim)
        self.author_embedding = torch.nn.Embedding(num_authors, embedding_dim)
        self.calorie_embedding = torch.nn.Embedding(max_calories, embedding_dim)
        self.review_count_embedding = torch.nn.Embedding(max_review_counts, embedding_dim)
        self.attention = torch.nn.MultiheadAttention(embed_dim=embedding_dim*4, num_heads=num_heads)
        self.fc1 = torch.nn.Linear(embedding_dim * 4, 16)
        self.fc2 = torch.nn.Linear(16, 1)
        self.activation = torch.nn.Sigmoid()

    def forward(self, recipe_id, author_id, calories, review_counts):
        recipe_embedded = self.recipe_embedding(recipe_id)
        author_embedded = self.author_embedding(author_id)
        calorie_embedded = self.calorie_embedding(calories.long())
        review_count_embedded = self.review_count_embedding(review_counts.long())

        x = torch.cat([recipe_embedded, author_embedded, calorie_embedded, review_count_embedded], dim=-1)
        # x = self.embedding_transform(x)
        
        x = x.unsqueeze(1)

        attn_output, _ = self.attention(x, x, x)
        
        x = attn_output.squeeze(1)

        # x = torch.mean(attn_output, dim=1)  # Compute the mean along the sequence_length dimension

        x = self.fc1(x)
        # x = F.relu(x)
        x = self.fc2(x)
        x = self.activation(x)

        return (x * 5.0).view(-1)

In [None]:
saved_models_dir = 'saved_models'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Calculate the maximum values for Calories and ReviewCount
max_calories = int(df["Calories"].max())
max_review_count = int(df["ReviewCount"].max())

model = RecipeModel(num_recipes=len(data.item_encoder.classes_)+1, 
                    num_authors=len(data.user_encoder.classes_)+1,
                    max_calories=max_calories+1,
                    max_review_counts=max_review_count+1)

#### Training

In [8]:
# Initialize the best validation loss to a large value
best_valid_loss = float('inf')

# Create a directory for the saved models if it doesn't exist
os.makedirs(saved_models_dir, exist_ok=True)
# RecipeModel(num_recipes=data.loc[:,'RecipeId'].max()+1, num_authors=data.loc[:,"AuthorId_y"].max()+1)
model = model.to(device) # Send model to GPU if available

criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch, targets in train_loader:
        optimizer.zero_grad()
        batch = [b.to(device) for b in batch]
        targets = targets.float().to(device)
        preds = model(*batch)

        loss = criterion(preds, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * batch[0].shape[0]
    train_loss /= len(train_dataset)

    model.eval()
    valid_loss = 0
    with torch.no_grad():
        for batch, targets in valid_loader:
            batch = [b.to(device) for b in batch]
            targets = targets.float().to(device)
            preds = model(*batch)
            loss = criterion(preds, targets)
            valid_loss += loss.item() * batch[0].shape[0]
        valid_loss /= len(val_dataset)

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}")

    # Check if the current validation loss is lower than the best validation loss
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        print(f"Validation loss improved. Saving the model to {saved_models_dir}/best_model.pt")
        torch.save(model.state_dict(), f"{saved_models_dir}/best_model.pt")

Epoch 1/10, Train Loss: 1.6173, Valid Loss: 1.6045
Validation loss improved. Saving the model to saved_models/best_model.pt
Epoch 2/10, Train Loss: 1.5919, Valid Loss: 1.5968
Validation loss improved. Saving the model to saved_models/best_model.pt
Epoch 3/10, Train Loss: 1.5712, Valid Loss: 1.5992
Epoch 4/10, Train Loss: 1.5503, Valid Loss: 1.5977
Epoch 5/10, Train Loss: 1.5277, Valid Loss: 1.6096
Epoch 6/10, Train Loss: 1.5040, Valid Loss: 1.6208
Epoch 7/10, Train Loss: 1.4811, Valid Loss: 1.6131
Epoch 8/10, Train Loss: 1.4599, Valid Loss: 1.6312
Epoch 9/10, Train Loss: 1.4402, Valid Loss: 1.6563
Epoch 10/10, Train Loss: 1.4215, Valid Loss: 1.6786


#### Load/save model

In [None]:
model.load_state_dict(torch.load(f"{saved_models_dir}/best_model.pt"))
model = model.to(device) # Send model to GPU if available

#### Test

In [9]:
author_id = 1545
recipe_ids = df["RecipeId"].unique()[:10000]
# recipe_ids = df["RecipeId"].unique()


user_has_ratings = author_id in df["AuthorId_y"].values

if user_has_ratings:
    user_rated_recipe_ids = df[df["AuthorId_y"] == author_id]["RecipeId"].unique()
else:
    user_rated_recipe_ids = []

# Create a recommendation dataset
recommendation_data = []
for recipe_id in recipe_ids:
    if not user_has_ratings or (user_has_ratings and recipe_id not in user_rated_recipe_ids):
        recipe_id_transformed = data.item_encoder.transform([recipe_id])[0]
        recipe_data = df[df["RecipeId"] == recipe_id].iloc[0]
        recommendation_data.append((recipe_id_transformed, author_id, recipe_data["Calories"], recipe_data["ReviewCount"]))

recommendation_dataset = [(torch.tensor(a).to(device), torch.tensor(b).to(device), torch.tensor(c).to(device), torch.tensor(d).to(device)) for a, b, c, d in recommendation_data]
recommendation_loader = DataLoader(recommendation_dataset, batch_size=batch_size, shuffle=False)
# Model evaluation
model.eval()
with torch.no_grad():
    ratings = []
    for inputs in recommendation_loader:
        rating = model(*inputs)
        ratings.extend(rating.detach().cpu().numpy())



In [10]:
top_recipe_ids = [recipe_ids[i] for i in sorted(range(len(ratings)), key=lambda i: ratings[i], reverse=True)[:10]]
print(top_recipe_ids)

[4527, 8927, 10620, 8519, 2940, 10075, 5071, 8682, 4882, 5304]
