# Summary : Food Rating Prediction For Menu Optimization

This project predicts food recipe ratings using a combination of text analysis and numerical features. It processes user reviews with NLP techniques (LSTM), capturing the semantic meaning of reviews, and integrates recipe details like cooking time and nutritional content. The model is trained to predict ratings, helping optimize menus by recommending high-rated recipes. The project demonstrates the application of deep learning and text processing to predict user ratings in the food domain, leveraging both the textual sentiment and content of reviews for more accurate predictions.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/foodcom-recipes-and-reviews/recipes.parquet
/kaggle/input/foodcom-recipes-and-reviews/reviews.parquet
/kaggle/input/foodcom-recipes-and-reviews/reviews.csv
/kaggle/input/foodcom-recipes-and-reviews/recipes.csv


# Data Preprocessing

## Loading the Data

In [2]:
import pandas as pd

# Load the reviews and recipes data
reviews_df = pd.read_csv('/kaggle/input/foodcom-recipes-and-reviews/reviews.csv')
recipes_df = pd.read_csv('/kaggle/input/foodcom-recipes-and-reviews/recipes.csv')

# Display basic info about the datasets
print(reviews_df.info())
print(recipes_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1401982 entries, 0 to 1401981
Data columns (total 8 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   ReviewId       1401982 non-null  int64 
 1   RecipeId       1401982 non-null  int64 
 2   AuthorId       1401982 non-null  int64 
 3   AuthorName     1401982 non-null  object
 4   Rating         1401982 non-null  int64 
 5   Review         1401768 non-null  object
 6   DateSubmitted  1401982 non-null  object
 7   DateModified   1401982 non-null  object
dtypes: int64(4), object(4)
memory usage: 85.6+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522517 entries, 0 to 522516
Data columns (total 28 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   RecipeId                    522517 non-null  int64  
 1   Name                        522517 non-null  object 
 2   AuthorId                    522517

## Merge Reviews with Recipes

In [3]:
# Merge the reviews with recipe details based on RecipeId
merged_df = pd.merge(reviews_df, recipes_df, on='RecipeId')

# Check the first few rows of the merged dataset
print(merged_df.head())


   ReviewId  RecipeId  AuthorId_x      AuthorName_x  Rating  \
0         2       992        2008         gayg msft       5   
1         7      4384        1634     Bill Hilbrich       4   
2         9      4523        2046  Gay Gilmore ckpt       2   
3        13      7435        1773     Malarkey Test       5   
4        14        44        2085        Tony Small       5   

                                              Review         DateSubmitted  \
0       better than any you can get at a restaurant!  2000-01-25T21:44:00Z   
1  I cut back on the mayo, and made up the differ...  2001-10-17T16:49:59Z   
2  i think i did something wrong because i could ...  2000-02-25T09:00:00Z   
3  easily the best i have ever had.  juicy flavor...  2000-03-13T21:15:00Z   
4                                 An excellent dish.  2000-03-28T12:51:00Z   

           DateModified                                           Name  \
0  2000-01-25T21:44:00Z                        Jalapeno Pepper Poppers   
1  2

## Handling Missing Values


In [4]:
# Fill missing values in the 'Review' column
merged_df['Review'] = merged_df['Review'].fillna("")

# Check the columns of merged_df before dropping
print(merged_df.columns)

# Drop irrelevant columns (handling missing ones safely)
columns_to_drop = ['ReviewId', 'AuthorId', 'AuthorName', 'DateSubmitted', 'DateModified']
merged_df.drop(columns=[col for col in columns_to_drop if col in merged_df.columns], inplace=True)

Index(['ReviewId', 'RecipeId', 'AuthorId_x', 'AuthorName_x', 'Rating',
       'Review', 'DateSubmitted', 'DateModified', 'Name', 'AuthorId_y',
       'AuthorName_y', 'CookTime', 'PrepTime', 'TotalTime', 'DatePublished',
       'Description', 'Images', 'RecipeCategory', 'Keywords',
       'RecipeIngredientQuantities', 'RecipeIngredientParts',
       'AggregatedRating', 'ReviewCount', 'Calories', 'FatContent',
       'SaturatedFatContent', 'CholesterolContent', 'SodiumContent',
       'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent',
       'RecipeServings', 'RecipeYield', 'RecipeInstructions'],
      dtype='object')


## Text Preprocessing for Reviews

In [5]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Preprocessing function for the review text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove non-alphabetical characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing to the review text column
merged_df['ProcessedReview'] = merged_df['Review'].apply(preprocess_text)

# Show processed reviews
print(merged_df[['Review', 'ProcessedReview']].head())

                                              Review  \
0       better than any you can get at a restaurant!   
1  I cut back on the mayo, and made up the differ...   
2  i think i did something wrong because i could ...   
3  easily the best i have ever had.  juicy flavor...   
4                                 An excellent dish.   

                                     ProcessedReview  
0                              better get restaurant  
1  cut back mayo made difference sour cream adjus...  
2  think something wrong could taste cornstarch f...  
3  easily best ever juicy flavorful dry vegetable...  
4                                     excellent dish  


## Target Variable

In [6]:
# The target variable is the Rating column
y = merged_df['Rating']

# Feature Engineering

## Text Vectorization


In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize the processed reviews
tokenizer = Tokenizer()
tokenizer.fit_on_texts(merged_df['ProcessedReview'])
sequences = tokenizer.texts_to_sequences(merged_df['ProcessedReview'])

# Define the maximum length for padding
max_sequence_length = 100
X_reviews = pad_sequences(sequences, maxlen=max_sequence_length)

# Show the shape of the tokenized input
print(f"Tokenized Review Input Shape: {X_reviews.shape}")

Tokenized Review Input Shape: (1401963, 100)


## Numerical Features from Recipe Data

In [8]:
# Example: Extract relevant numerical features
numerical_features = merged_df[['CookTime', 'PrepTime', 'TotalTime', 'ProteinContent', 'CarbohydrateContent', 'FatContent']].copy()

# Convert time durations (e.g., PT24H) into minutes
def time_to_minutes(time_str):
    if pd.isna(time_str) or time_str == 'PT0':  # Handle NaN or 'PT0' edge case
        return 0
    
    hours = 0
    minutes = 0

    # Remove 'PT' prefix
    time_str = time_str[2:]

    # If there are 'H' in the string, split to get hours
    if 'H' in time_str:
        hours = int(time_str.split('H')[0])

    # If there are 'M' in the string, split to get minutes
    if 'M' in time_str:
        minutes = int(time_str.split('M')[0].split('H')[-1])

    return hours * 60 + minutes

# Apply the conversion function to the time columns
numerical_features['CookTime'] = numerical_features['CookTime'].apply(time_to_minutes)
numerical_features['PrepTime'] = numerical_features['PrepTime'].apply(time_to_minutes)
numerical_features['TotalTime'] = numerical_features['TotalTime'].apply(time_to_minutes)

# Normalize numerical features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_numerical = scaler.fit_transform(numerical_features)

print(f"Numerical Features Shape: {X_numerical.shape}")

Numerical Features Shape: (1401963, 6)


## Combine All Features

In [9]:
import numpy as np

# Combine reviews and numerical features
X_combined = [X_reviews, X_numerical]

# Building the LSTM Model

In [10]:
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate, Dropout, Bidirectional

# Define the review input layer (for LSTM)
review_input = Input(shape=(max_sequence_length,))

# Define embedding and LSTM layers for reviews
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length)(review_input)
lstm_layer = Bidirectional(LSTM(100))(embedding_layer)

# Define the input layer for numerical features
numerical_input = Input(shape=(X_numerical.shape[1],))

# Combine the LSTM output with the numerical features
merged_input = Concatenate()([lstm_layer, numerical_input])

# Add Dense layers
x = Dense(64, activation='relu')(merged_input)
x = Dropout(0.25)(x)
output = Dense(1, activation='linear')(x)  # Use 'linear' activation for regression output (rating prediction)

# Create the model
model = Model(inputs=[review_input, numerical_input], outputs=output)

# Define the optimizer
optimizer = Adam(learning_rate=1e-4, clipnorm=1.0)

# Compile the model
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mae'])

# Model summary
model.summary()




## Add EarlyStopping

In [11]:
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau,ModelCheckpoint
# Define the callback to save the best model
checkpoint_callback = ModelCheckpoint(
    'best_model.h5',        # The file name to save the model
    monitor='val_loss',     # The metric to monitor
    save_best_only=True,    # Save only the best model
    mode='min',             # 'min' means we want to minimize the validation loss
    verbose=1               # Print a message when the model is saved
)


es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Reduce Learning Rate on Plateau
lrd = ReduceLROnPlateau(monitor="val_loss", factor=0.2, patience=3, verbose=1, mode="max", min_lr=1e-6)

# Model Training

In [12]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train_reviews, X_test_reviews, X_train_numerical, X_test_numerical, y_train, y_test = train_test_split(
    X_reviews, X_numerical, y, test_size=0.2, random_state=42)

# Train the model
history = model.fit(
    [X_train_reviews, X_train_numerical],
    y_train,
    epochs=10,
    batch_size=256,
    callbacks=[es,lrd,checkpoint_callback],
    validation_data=([X_test_reviews, X_test_numerical], y_test)
)

Epoch 1/10
[1m4382/4382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 40ms/step - loss: 2.4219 - mae: 1.0275 - val_loss: 1.1840 - val_mae: 0.6429 - learning_rate: 1.0000e-04
Epoch 2/10
[1m4382/4382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 40ms/step - loss: 1.3583 - mae: 0.7652 - val_loss: 1.1676 - val_mae: 0.6750 - learning_rate: 1.0000e-04
Epoch 3/10
[1m4382/4382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 40ms/step - loss: 1.2915 - mae: 0.7334 - val_loss: 1.1455 - val_mae: 0.6508 - learning_rate: 1.0000e-04
Epoch 4/10
[1m4381/4382[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 38ms/step - loss: 1.2236 - mae: 0.7078
Epoch 4: ReduceLROnPlateau reducing learning rate to 1.9999999494757503e-05.
[1m4382/4382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 40ms/step - loss: 1.2236 - mae: 0.7078 - val_loss: 1.1591 - val_mae: 0.6729 - learning_rate: 1.0000e-04
Epoch 5/10
[1m4382/4382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s

# Evaluate the Model

In [20]:
# Evaluate the model
# from tensorflow.keras.models import load_model

# # Load the best saved model
# best_model = load_model('best_model.h5')

# # Evaluate or use the model for predictions
# test_loss, test_mae = best_model.evaluate([X_test_reviews, X_test_numerical], y_test)
# print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")

test_loss, test_mae = model.evaluate([X_test_reviews, X_test_numerical], y_test)
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")

[1m8763/8763[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 4ms/step - loss: 1.1468 - mae: 0.6508
Test Loss: 1.1454654932022095, Test MAE: 0.650833249092102


# Predictions and Optimization

In [14]:
# Split the data into training and testing sets
X_train_reviews, X_test_reviews, X_train_numerical, X_test_numerical, y_train, y_test = train_test_split(
    X_reviews, X_numerical, y, test_size=0.2, random_state=42
)

# Get the indices for the test set
_, test_indices = train_test_split(merged_df.index, test_size=0.2, random_state=42)

# Generate predictions for the test set
predictions = model.predict([X_test_reviews, X_test_numerical]).flatten()

# Create a DataFrame containing the test set rows
test_df = merged_df.iloc[test_indices].copy()  # Use test indices to filter the original dataset

# Add the predictions to the test DataFrame
test_df['PredictedRating'] = predictions

# Filter for optimized menu (e.g., high predicted ratings and positive sentiment)
optimized_menu = test_df[(test_df['PredictedRating'] > 4) ]

# Display the optimized menu
print(optimized_menu[['Name', 'PredictedRating']].head())

[1m8763/8763[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 4ms/step
                                        Name  PredictedRating
718879   Heathier Banana Pumpkin Spice Bread         4.522332
1006375                  Roasted Green Beans         4.288333
1296712              Parmesan Catfish Filets         4.619602
107386            Best Rub for Grilled Steak         4.351974
1171290                    Simple Irish Stew         4.416887


In [17]:
import numpy as np

# Function to preprocess the input review
def preprocess_input_review(review_text):
    # Preprocess using the same steps as training
    review_text = review_text.lower()
    review_text = re.sub(r'[^a-zA-Z\s]', '', review_text)
    tokens = word_tokenize(review_text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    processed_review = ' '.join(tokens)
    return processed_review

# Function to predict rating and menu inclusion
def predict_menu_inclusion(food_name, review_text, cook_time, prep_time, total_time, protein, carbs, fat):
    # Preprocess the review
    processed_review = preprocess_input_review(review_text)
    
    # Tokenize and pad the review
    review_sequence = tokenizer.texts_to_sequences([processed_review])
    padded_review = pad_sequences(review_sequence, maxlen=max_sequence_length)
    
    # Prepare numerical features
    numerical_features = np.array([[time_to_minutes(cook_time), 
                                     time_to_minutes(prep_time), 
                                     time_to_minutes(total_time), 
                                     protein, carbs, fat]])
    numerical_features = scaler.transform(numerical_features)
    
    # Predict the rating
    predicted_rating = model.predict([padded_review, numerical_features]).flatten()[0]
    
    # Decide whether to include the item on the menu
    include_on_menu = predicted_rating > 4  # Assuming 4 is the threshold for inclusion
    
    return {
        "FoodName": food_name,
        "PredictedRating": predicted_rating,
        "IncludeOnMenu": include_on_menu
    }

# Example usage
food_name = "Paneer Butter Masala"
review_text = "This dish is delicious, rich in flavor, and a favorite among customers!"
cook_time = "PT30M"  # e.g., 30 minutes
prep_time = "PT15M"  # e.g., 15 minutes
total_time = "PT45M"  # e.g., 45 minutes
protein = 20.0  # in grams
carbs = 10.0  # in grams
fat = 15.0  # in grams

result = predict_menu_inclusion(food_name, review_text, cook_time, prep_time, total_time, protein, carbs, fat)
print(result)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
{'FoodName': 'Paneer Butter Masala', 'PredictedRating': 4.5667415, 'IncludeOnMenu': True}


