In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import ast  # To safely parse the 'nutrition' strings into lists

raw_recipes = pd.read_csv("RAW_recipes.csv")
columns_to_keep = ["name", "id", "minutes", "nutrition"]
raw_recipes = raw_recipes[columns_to_keep]
raw_recipes["calories"] = raw_recipes["nutrition"].apply(lambda x: ast.literal_eval(x)[0] if pd.notna(x) else None)
raw_recipes = raw_recipes.drop(columns=["nutrition"])

print("\nStatistics for 'minutes' before filtering:")
print(raw_recipes["minutes"].describe())

plt.figure(figsize=(10, 6))
plt.hist(raw_recipes["minutes"], bins=50, color="skyblue", edgecolor="black")
plt.title("Distribution of Cooking Times (Minutes)")
plt.xlabel("Minutes")
plt.ylabel("Frequency")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

In [None]:
# Filter out rows where "minutes" > 300
initial_count = len(raw_recipes)
raw_recipes_filtered_mins = raw_recipes[raw_recipes["minutes"] <= 300]
raw_recipes_filtered_mins = raw_recipes_filtered_mins[raw_recipes_filtered_mins["minutes"] > 0]
final_count = len(raw_recipes_filtered_mins)
rows_dropped = initial_count - final_count
print(f"Number of rows dropped: {rows_dropped}")

print("\nStatistics for 'minutes' after filtering:")
print(raw_recipes_filtered_mins["minutes"].describe())

plt.figure(figsize=(10, 6))
plt.hist(raw_recipes_filtered_mins["minutes"], bins=50, color="skyblue", edgecolor="black")
plt.title("Distribution of Cooking Times (Minutes)")
plt.xlabel("Minutes")
plt.ylabel("Frequency")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

# Find the median of the "minutes" column
median_minutes = raw_recipes_filtered_mins["minutes"].median()

# Filter recipes with "minutes" less than or equal to the median
quick_recipes = raw_recipes_filtered_mins[raw_recipes_filtered_mins["minutes"] <= median_minutes]
print(f"\nMedian cooking time (minutes): {median_minutes}")

In [None]:
# Plot the distribution of the "minutes" variable
plt.figure(figsize=(10, 6))
plt.hist(quick_recipes["minutes"], bins=50, color="skyblue", edgecolor="black")
plt.title("Distribution of Cooking Times Below Median (Minutes)")
plt.xlabel("Minutes")
plt.ylabel("Frequency")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

In [None]:
raw_interactions = pd.read_csv("RAW_interactions.csv")
columns_to_keep = ["user_id", "recipe_id", "rating"]
raw_interactions = raw_interactions[columns_to_keep]

# Filter out interactions where "recipe_id" is in "quick_recipes"
quick_recipe_ids = set(quick_recipes["id"])  # faster lookup
quick_interactions = raw_interactions[raw_interactions["recipe_id"].isin(quick_recipe_ids)]

print(quick_interactions.head())
print(f"\nNumber of interactions in raw_interactions: {len(raw_interactions)}")
print(f"\nNumber of interactions in quick_interactions: {len(quick_interactions)}")

In [None]:
print("\nStatistics for 'calories' before filtering:")
print(raw_recipes["calories"].describe())

plt.figure(figsize=(10, 6))
plt.hist(raw_recipes["calories"], bins=50, color="skyblue", edgecolor="black")
plt.title("Distribution of Calorie Counts (kCal)")
plt.xlabel("Calories")
plt.ylabel("Frequency")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

In [None]:
# Filter out rows where 10 < "calories" < 2000
initial_count = len(raw_recipes)
raw_recipes_filtered_cals = raw_recipes[raw_recipes["calories"] <= 2000]
raw_recipes_filtered_cals = raw_recipes_filtered_cals[raw_recipes_filtered_cals["calories"] > 10]
final_count = len(raw_recipes_filtered_cals)
rows_dropped = initial_count - final_count
print(f"Number of rows dropped: {rows_dropped}")

print("\nStatistics for 'calories' after filtering:")
print(raw_recipes_filtered_cals["calories"].describe())

plt.figure(figsize=(10, 6))
plt.hist(raw_recipes_filtered_cals["calories"], bins=50, color="skyblue", edgecolor="black")
plt.title("Distribution of Calorie Counts (kCal)")
plt.xlabel("Calories")
plt.ylabel("Frequency")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

# Find the median of the "calories" column
median_calories = raw_recipes_filtered_cals["calories"].median()

# Filter recipes with "calories" less than or equal to the median
low_cal_recipes = raw_recipes_filtered_cals[raw_recipes_filtered_cals["calories"] <= median_calories]
print(f"\nMedian calorie count (kCal): {median_calories}")

In [None]:
# Plot the distribution of the "calories" variable
plt.figure(figsize=(10, 6))
plt.hist(low_cal_recipes["calories"], bins=50, color="skyblue", edgecolor="black")
plt.title("Distribution of Calorie Counts Below Median (kCal)")
plt.xlabel("Calories")
plt.ylabel("Frequency")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

In [None]:
# Filter out interactions where "recipe_id" is in "quick_recipes"
low_cal_recipe_ids = set(low_cal_recipes["id"])  # faster lookup
low_cal_interactions = raw_interactions[raw_interactions["recipe_id"].isin(low_cal_recipe_ids)]

print(low_cal_interactions.head())
print(f"\nNumber of interactions in raw_interactions: {len(raw_interactions)}")
print(f"\nNumber of interactions in low_cal_interactions: {len(low_cal_interactions)}")

In [9]:
quick_recipes.to_csv("quick_recipes.csv", index=False)
quick_interactions.to_csv("quick_interactions.csv", index=False)
low_cal_recipes.to_csv("low_cal_recipes.csv", index=False)
low_cal_interactions.to_csv("low_cal_interactions.csv", index=False)

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Load the low-calorie interactions dataset
low_cal_interactions = pd.read_csv("low_cal_interactions.csv")

# Data splitting: training (80%), validation (10%), testing (10%)
random_seed = 42
train_data, temp_data = train_test_split(
    low_cal_interactions, test_size=0.2, random_state=random_seed
)
validation_data, test_data = train_test_split(
    temp_data, test_size=0.5, random_state=random_seed
)

# Print dataset sizes
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(validation_data)}")
print(f"Testing set size: {len(test_data)}")

# Save splits to CSV (optional)
train_data.to_csv("low_cal_train.csv", index=False)
validation_data.to_csv("low_cal_validation.csv", index=False)
test_data.to_csv("low_cal_test.csv", index=False)

# Check train-test overlap
train_users = set(train_data["user_id"])
train_recipes = set(train_data["recipe_id"])

valid_users_covered = all(user in train_users for user in validation_data["user_id"])
valid_recipes_covered = all(recipe in train_recipes for recipe in validation_data["recipe_id"])
test_users_covered = all(user in train_users for user in test_data["user_id"])
test_recipes_covered = all(recipe in train_recipes for recipe in test_data["recipe_id"])

print(f"Validation users covered: {valid_users_covered}")
print(f"Validation recipes covered: {valid_recipes_covered}")
print(f"Test users covered: {test_users_covered}")
print(f"Test recipes covered: {test_recipes_covered}")

# Bias-Only Model

# Step 1: Compute Global Bias
global_bias = train_data["rating"].mean()
print(f"Global Bias (mean rating): {global_bias}")

# Step 2: Compute User Biases
user_bias = (
    train_data.groupby("user_id")["rating"]
    .mean()
    .subtract(global_bias)
    .to_dict()
)

# Step 3: Compute Recipe Biases
recipe_bias = (
    train_data.groupby("recipe_id")["rating"]
    .mean()
    .subtract(global_bias)
    .to_dict()
)

# Step 4: Prediction Function
def predict(user_id, recipe_id):
    """
    Predict the rating for a given user and recipe based on biases.
    """
    user_b = user_bias.get(user_id, 0)  # Default to 0 if user not in training
    recipe_b = recipe_bias.get(recipe_id, 0)  # Default to 0 if recipe not in training
    return global_bias + user_b + recipe_b

# Step 5: Evaluate Model with MSE
def evaluate(data, name="Dataset"):
    """
    Evaluate the model on a dataset by calculating MSE.
    """
    y_true = data["rating"]
    y_pred = data.apply(lambda row: predict(row["user_id"], row["recipe_id"]), axis=1)
    mse = np.mean((y_true - y_pred) ** 2)
    print(f"MSE on {name}: {mse}")
    return mse

# Evaluate on Training, Validation, and Testing Sets
evaluate(train_data, "Training Set")
evaluate(validation_data, "Validation Set")
evaluate(test_data, "Testing Set")


Training set size: 439821
Validation set size: 54978
Testing set size: 54978
Validation users covered: False
Validation recipes covered: False
Test users covered: False
Test recipes covered: False
Global Bias (mean rating): 4.420857576150297
MSE on Training Set: 0.6844740441212623
MSE on Validation Set: 1.7480310314739416
MSE on Testing Set: 1.772939838651607


1.772939838651607

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load quick recipes and interactions
quick_recipes = pd.read_csv("quick_recipes.csv")
quick_interactions = pd.read_csv("quick_interactions.csv")

# Split the data into train (80%), validation (10%), and test (10%)
train_data, temp_data = train_test_split(quick_interactions, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")

# Global mean (mu)
global_bias = train_data["rating"].mean()

# User biases
lambda_reg = 10  # Regularization strength
user_bias = (
    train_data.groupby("user_id")["rating"]
    .apply(lambda x: (x - global_bias).sum() / (len(x) + lambda_reg))
    .to_dict()
)

# Recipe biases
recipe_bias = (
    train_data.groupby("recipe_id")["rating"]
    .apply(lambda x: (x - global_bias - user_bias.get(x.name, 0)).sum() / (len(x) + lambda_reg))
    .to_dict()
)

# Prediction function
def predict(user_id, recipe_id):
    """Predict rating for a given user and recipe."""
    user_b = user_bias.get(user_id, 0)
    recipe_b = recipe_bias.get(recipe_id, 0)
    return global_bias + user_b + recipe_b

# Evaluate model
def evaluate(data, label="Set"):
    """Evaluate model performance on a given dataset."""
    predictions = data.apply(lambda row: predict(row["user_id"], row["recipe_id"]), axis=1)
    mse = mean_squared_error(data["rating"], predictions)
    print(f"{label} MSE: {mse}")
    return mse

# Evaluate on all splits
print("\nEvaluating the model:")
train_mse = evaluate(train_data, "Train")
val_mse = evaluate(val_data, "Validation")
test_mse = evaluate(test_data, "Test")

# Output the model's predictions on test data (optional)
test_data["predicted_rating"] = test_data.apply(lambda row: predict(row["user_id"], row["recipe_id"]), axis=1)
test_data.to_csv("quick_recipes_test_predictions.csv", index=False)

print("\nPredictions saved to 'quick_recipes_test_predictions.csv'")


Training set size: 428936
Validation set size: 53617
Test set size: 53617

Evaluating the model:
Train MSE: 1.093801375648195
Validation MSE: 1.3541831216510385
Test MSE: 1.3589729169254396

Predictions saved to 'quick_recipes_test_predictions.csv'


In [18]:
import pandas as pd
import numpy as np
import ast
from sklearn.metrics import mean_squared_error

# Step 1: Preprocessing the Data
# Load raw recipes and interactions
raw_recipes = pd.read_csv("RAW_recipes.csv")
raw_interactions = pd.read_csv("RAW_interactions.csv")

# Keep relevant columns
raw_recipes = raw_recipes[["name", "id", "minutes", "nutrition"]]
raw_recipes["calories"] = raw_recipes["nutrition"].apply(lambda x: ast.literal_eval(x)[0] if pd.notna(x) else None)
raw_recipes = raw_recipes.drop(columns=["nutrition"])

raw_interactions = raw_interactions[["user_id", "recipe_id", "rating"]]

# Filter recipes based on calories and minutes
raw_recipes = raw_recipes[(raw_recipes["calories"] <= 2000) & (raw_recipes["calories"] > 10)]
raw_recipes = raw_recipes[(raw_recipes["minutes"] <= 300) & (raw_recipes["minutes"] > 0)]

# Filter interactions to include only valid recipe_ids
all_recipe_ids = set(raw_recipes["id"])
all_interactions = raw_interactions[raw_interactions["recipe_id"].isin(all_recipe_ids)]

# Step 2: Train/Test Split
# Use an 80-10-10 split for training, validation, and testing
np.random.seed(42)  # For reproducibility
shuffled_indices = np.random.permutation(len(all_interactions))
train_end = int(0.8 * len(shuffled_indices))
valid_end = int(0.9 * len(shuffled_indices))

train_indices = shuffled_indices[:train_end]
valid_indices = shuffled_indices[train_end:valid_end]
test_indices = shuffled_indices[valid_end:]

train_data = all_interactions.iloc[train_indices]
valid_data = all_interactions.iloc[valid_indices]
test_data = all_interactions.iloc[test_indices]

# Step 3: Bias-Only Model
# Calculate global bias, user biases, and item biases
global_bias = train_data["rating"].mean()

# Calculate user and item biases
user_bias = train_data.groupby("user_id")["rating"].mean() - global_bias
item_bias = train_data.groupby("recipe_id")["rating"].mean() - global_bias

# Fill missing biases with 0 for new users/items in validation or test
user_bias = user_bias.to_dict()
item_bias = item_bias.to_dict()

def predict(user_id, recipe_id):
    """
    Predict rating based on bias model: global_bias + user_bias + item_bias
    """
    user_b = user_bias.get(user_id, 0)
    item_b = item_bias.get(recipe_id, 0)
    return global_bias + user_b + item_b

# Step 4: Evaluation
def evaluate(data):
    """
    Evaluate the model using Mean Squared Error (MSE).
    """
    y_true = data["rating"]
    y_pred = data.apply(lambda row: predict(row["user_id"], row["recipe_id"]), axis=1)
    mse = mean_squared_error(y_true, y_pred)
    return mse

# Calculate MSE for train, validation, and test sets
train_mse = evaluate(train_data)
valid_mse = evaluate(valid_data)
test_mse = evaluate(test_data)

print(f"Train MSE: {train_mse:.4f}")
print(f"Validation MSE: {valid_mse:.4f}")
print(f"Test MSE: {test_mse:.4f}")


Train MSE: 0.7199
Validation MSE: 1.7852
Test MSE: 1.7891
