In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Note: The following do not work with Python 3.12
import shap
from ydata_profiling import ProfileReport
import sweetviz as sv

Reproducibility:

In [3]:
seed = 2024

# pandas, statsmodels, matplotlib and y_data_profiling rely on numpy's random generator, and thus, we need to set the seed in numpy
np.random.seed(seed)

In [35]:
diet = pd.read_csv('diet.csv', low_memory=False)
requests = pd.read_csv('requests.csv', low_memory=False)
reviews = pd.read_csv('reviews.csv', low_memory=False)
recipes = pd.read_csv('recipes.csv', low_memory=False)

In [19]:
merged_diet_reviews = pd.merge(diet, reviews, on='AuthorId', how='left')
merged_diet_reviews

Unnamed: 0,AuthorId,Diet,Age,RecipeId,Rating,Like,TestSetId
0,10000120E,Vegetarian,46,,,,
1,1000014D,Vegan,18,,,,
2,1000015A,Vegetarian,58,,,,
3,1000016E,Vegetarian,32,,,,
4,1000027E,Vegan,61,,,,
...,...,...,...,...,...,...,...
362749,999917E,Vegetarian,28,169413.0,2.0,False,
362750,999936C,Omnivore,22,,,,
362751,99993D,Vegetarian,58,,,,
362752,99994A,Vegetarian,18,373964.0,,,7555.0


Changing the object data types

In [6]:
diet['Diet'] = diet['Diet'].astype('category')
requests['HighProtein'] = requests['HighProtein'].astype('category')
requests['LowSugar'] = requests['LowSugar'].astype('category')

Data Joining using common attributes

In [25]:
author_ID = 'AuthorId'
merged_diet_requests = pd.merge(diet, requests, on=author_ID, how='left')
merged_request_recipes = pd.merge(requests, recipes, on='RecipeId', how='left')
merged_diet_requests

Unnamed: 0,AuthorId,Diet,Age,RecipeId,Time,HighCalories,HighProtein,LowFat,LowSugar,HighFiber
0,10000120E,Vegetarian,46,,,,,,,
1,1000014D,Vegan,18,,,,,,,
2,1000015A,Vegetarian,58,,,,,,,
3,1000016E,Vegetarian,32,,,,,,,
4,1000027E,Vegan,61,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
362749,999917E,Vegetarian,28,169413.0,3600.387748,0.0,Indifferent,0.0,Indifferent,0.0
362750,999936C,Omnivore,22,,,,,,,
362751,99993D,Vegetarian,58,,,,,,,
362752,99994A,Vegetarian,18,373964.0,7199.509521,0.0,Yes,0.0,0,0.0


In [30]:
#cleaning merge_diet_requests
# Drop 'RecipeId_y' column
#merge_diet_requests = merged_diet_requests.drop('RecipeId_y', axis=1)

# Rename 'RecipeId_x' to 'RecipeId'
#merged_diet_requests = merged_diet_requests.rename(columns={'RecipeId_x': 'RecipeId'})

Plot graphs of the data frames

In [6]:
merged_diet_requests.to_csv('merged_diet_requests.csv')

Impute the missing values

In [None]:
# 1 value missing in diet column. Filled with most occuring value.
merged_diet_requests['Diet'] = merged_diet_requests['Diet'].fillna('Vegetarian')

In [14]:
#impute the values for all dietary preferences for all ages with the most frequent RecipeId for that age in that category
helper_df = merged_diet_requests.groupby(['Age', 'Diet'])['RecipeId'].agg(lambda x: x.mode()[0]).reset_index()
helper_df.columns = ['Age', 'Diet', 'Most Common Recipe']
print(helper_df)
def impute_recipe(row):
    if pd.isnull(row['RecipeId']):
        return helper_df[(helper_df['Age'] == row['Age']) & (helper_df['Diet'] == row['Diet'])]['Most Common Recipe'].values[0]
    else:
        return row['RecipeId']

merged_diet_requests['RecipeId'] = merged_diet_requests.apply(impute_recipe, axis=1)
merged_diet_requests



     Age        Diet  Most Common Recipe
0     18    Omnivore             22782.0
1     18       Vegan             11194.0
2     18  Vegetarian              2886.0
3     19    Omnivore             32204.0
4     19       Vegan              2510.0
..   ...         ...                 ...
181   78       Vegan             43990.0
182   78  Vegetarian             45809.0
183   79    Omnivore             22782.0
184   79       Vegan             73440.0
185   79  Vegetarian             45809.0

[186 rows x 3 columns]


Unnamed: 0,AuthorId,Diet,Age,RecipeId,Time,HighCalories,HighProtein,LowFat,LowSugar,HighFiber
0,10000120E,Vegetarian,46,2886.0,4198.933093,1.0,Yes,1,Indifferent,1
1,1000014D,Vegan,18,11194.0,600.014974,0.0,Indifferent,0,Indifferent,0
2,1000015A,Vegetarian,58,45809.0,14399.381563,1.0,Indifferent,0,Indifferent,0
3,1000016E,Vegetarian,32,150863.0,3600.420005,0.0,Indifferent,0,0,0
4,1000027E,Vegan,61,64446.0,2099.032638,0.0,Indifferent,0,0,1
...,...,...,...,...,...,...,...,...,...,...
362749,999917E,Vegetarian,28,169413.0,3600.387748,0.0,Indifferent,0,Indifferent,0
362750,999936C,Omnivore,22,15072.0,2280.037298,0.0,Indifferent,0,Indifferent,0
362751,99993D,Vegetarian,58,45809.0,14399.381563,1.0,Indifferent,0,Indifferent,0
362752,99994A,Vegetarian,18,373964.0,7199.509521,0.0,Yes,0,0,0


In [39]:
# Fill the rest of the missing values in the merged_diet_requests by mapping them from requests.csv with RecipeId as key 
# Create mapping DataFrames from `requests`
map_time = requests.set_index('RecipeId')['Time'].to_dict()
map_calories = requests.set_index('RecipeId')['HighCalories'].to_dict()
map_protein = requests.set_index('RecipeId')['HighProtein'].to_dict()
map_fat = requests.set_index('RecipeId')['LowFat'].to_dict()
map_sugar = requests.set_index('RecipeId')['LowSugar'].to_dict()
map_fiber = requests.set_index('RecipeId')['HighFiber'].to_dict()

# Apply mapping to `merged_diet_requests`
merged_diet_requests['Time'] = merged_diet_requests['RecipeId'].map(map_time)
merged_diet_requests['HighCalories'] = merged_diet_requests['RecipeId'].map(map_calories)
merged_diet_requests['HighProtein'] = merged_diet_requests['RecipeId'].map(map_protein)
merged_diet_requests['LowFat'] = merged_diet_requests['RecipeId'].map(map_fat)
merged_diet_requests['LowSugar'] = merged_diet_requests['RecipeId'].map(map_sugar)
merged_diet_requests['HighFiber'] = merged_diet_requests['RecipeId'].map(map_fiber)
merged_diet_requests    

TypeError: can only concatenate str (not "int") to str

Merge the file with reviews column to include 'likes' column

In [41]:
# Assuming reviews is already loaded as a DataFrame
total_data = pd.merge(merged_diet_requests, reviews, on='AuthorId')
total_data

Unnamed: 0,AuthorId,Diet,Age,RecipeId_x,Time,HighCalories,HighProtein,LowFat,LowSugar,HighFiber,RecipeId_y,Rating,Like,TestSetId
0,1000036C,Vegetarian,50,320576.0,119.024930,0.0,Indifferent,0.0,Indifferent,1.0,320576,,False,
1,1000216B,Vegetarian,78,189335.0,1199.549763,1.0,Indifferent,0.0,0,0.0,189335,,False,
2,1000221A,Vegetarian,25,133043.0,362.152341,0.0,Yes,0.0,Indifferent,1.0,133043,2.0,False,
3,1000221A,Vegetarian,25,133043.0,362.152341,0.0,Yes,0.0,Indifferent,1.0,90537,2.0,False,
4,1000221A,Vegetarian,25,133043.0,362.152341,0.0,Yes,0.0,Indifferent,1.0,334314,2.0,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7926874,999774A,Vegetarian,57,1171.0,480.233207,1.0,Yes,0.0,0,0.0,29002,2.0,False,
7926875,999774A,Vegetarian,57,1171.0,480.233207,1.0,Yes,0.0,0,0.0,159252,,False,
7926876,999774A,Vegetarian,57,1171.0,480.233207,1.0,Yes,0.0,0,0.0,1171,2.0,True,
7926877,999917E,Vegetarian,28,169413.0,3600.387748,0.0,Indifferent,0.0,Indifferent,0.0,169413,2.0,False,


Data Visualization

Random Forest Classifier

In [26]:
# Step 1: Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [29]:
# Step 2: Load cleaned data
def load_cleaned_data():
    # Replace this function with your data loading and cleaning logic
    # Example assumes X contains features and y contains labels
    data = pd.read_csv('cleaned_data.csv') # can also pass a data frame
    X = data.drop('target_column', axis=1)
    y = data['target_column']
    return X, y

X, y = load_cleaned_data()

In [30]:
# Step 3: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
# Step 4: Choose a Random Forest Classifier model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [32]:
# Step 5: Train the model
rf_model.fit(X_train, y_train)

ValueError: could not convert string to float: '1113424C'

In [ ]:
# Step 6: Make predictions on the test set
predictions = rf_model.predict(X_test)

In [ ]:
# Step 7: Evaluate the model
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")