In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Note: The following do not work with Python 3.12
import shap
from ydata_profiling import ProfileReport
import sweetviz as sv

Reproducibility:

In [2]:
seed = 2024

# pandas, statsmodels, matplotlib and y_data_profiling rely on numpy's random generator, and thus, we need to set the seed in numpy
np.random.seed(seed)

In [3]:
diet = pd.read_csv('diet.csv', low_memory=False)
requests = pd.read_csv('requests.csv', low_memory=False)
reviews = pd.read_csv('reviews.csv', low_memory=False)
reviews['Rating'].unique()

array([ 2., nan])

Changing the object data types

In [4]:
diet['Diet'] = diet['Diet'].astype('category')
requests['HighProtein'] = requests['HighProtein'].astype('category')
requests['LowSugar'] = requests['LowSugar'].astype('category')

Data Joining using common attributes

In [5]:
author_ID = 'AuthorId'
merged_diet_requests = pd.merge(diet, requests, on=author_ID, how='left')
merged_diet_requests

Unnamed: 0,AuthorId,Diet,Age,RecipeId,Time,HighCalories,HighProtein,LowFat,LowSugar,HighFiber
0,10000120E,Vegetarian,46,,,,,,,
1,1000014D,Vegan,18,,,,,,,
2,1000015A,Vegetarian,58,,,,,,,
3,1000016E,Vegetarian,32,,,,,,,
4,1000027E,Vegan,61,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
362749,999917E,Vegetarian,28,169413.0,3600.387748,0.0,Indifferent,0.0,Indifferent,0.0
362750,999936C,Omnivore,22,,,,,,,
362751,99993D,Vegetarian,58,,,,,,,
362752,99994A,Vegetarian,18,373964.0,7199.509521,0.0,Yes,0.0,0,0.0


In [6]:
merged_diet_requests.to_csv('merged_diet_requests.csv')

Impute the missing values

In [7]:
merged_diet_requests['Diet'] = merged_diet_requests['Diet'].fillna('Vegetarian')
merged_diet_requests_cleaned = merged_diet_requests.dropna(subset=['RecipeId'])
merged_diet_requests_cleaned

Unnamed: 0,AuthorId,Diet,Age,RecipeId,Time,HighCalories,HighProtein,LowFat,LowSugar,HighFiber
6,1000036C,Vegetarian,50,320576.0,119.024930,0.0,Indifferent,0.0,Indifferent,1.0
14,1000216B,Vegetarian,78,189335.0,1199.386790,0.0,Yes,0.0,0,1.0
16,1000221A,Vegetarian,25,133043.0,362.152341,0.0,Yes,0.0,Indifferent,1.0
17,1000221A,Vegetarian,25,90537.0,1198.957497,0.0,Yes,0.0,0,1.0
18,1000221A,Vegetarian,25,334314.0,5400.036634,1.0,Indifferent,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...
362736,999774A,Vegetarian,57,29002.0,2402.372535,0.0,Indifferent,0.0,Indifferent,0.0
362737,999774A,Vegetarian,57,159252.0,5999.598903,0.0,Yes,0.0,0,0.0
362738,999774A,Vegetarian,57,1171.0,480.233207,1.0,Yes,0.0,0,0.0
362749,999917E,Vegetarian,28,169413.0,3600.387748,0.0,Indifferent,0.0,Indifferent,0.0


Merge the file with reviews column to include 'likes' column

In [8]:
merged_data = merged_diet_requests_cleaned.merge(reviews, how='inner', on='AuthorId')
merged_data

Unnamed: 0,AuthorId,Diet,Age,RecipeId_x,Time,HighCalories,HighProtein,LowFat,LowSugar,HighFiber,RecipeId_y,Rating,Like,TestSetId
0,1000036C,Vegetarian,50,320576.0,119.024930,0.0,Indifferent,0.0,Indifferent,1.0,320576,,False,
1,1000216B,Vegetarian,78,189335.0,1199.386790,0.0,Yes,0.0,0,1.0,189335,,False,
2,1000221A,Vegetarian,25,133043.0,362.152341,0.0,Yes,0.0,Indifferent,1.0,133043,2.0,False,
3,1000221A,Vegetarian,25,133043.0,362.152341,0.0,Yes,0.0,Indifferent,1.0,90537,2.0,False,
4,1000221A,Vegetarian,25,133043.0,362.152341,0.0,Yes,0.0,Indifferent,1.0,334314,2.0,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7926874,999774A,Vegetarian,57,1171.0,480.233207,1.0,Yes,0.0,0,0.0,29002,2.0,False,
7926875,999774A,Vegetarian,57,1171.0,480.233207,1.0,Yes,0.0,0,0.0,159252,,False,
7926876,999774A,Vegetarian,57,1171.0,480.233207,1.0,Yes,0.0,0,0.0,1171,2.0,True,
7926877,999917E,Vegetarian,28,169413.0,3600.387748,0.0,Indifferent,0.0,Indifferent,0.0,169413,2.0,False,


Data Visualization

Random Forest Classifier

In [9]:
# Step 1: Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [10]:
# Step 2: Load cleaned data
def load_cleaned_data():
    # Replace this function with your data loading and cleaning logic
    # Example assumes X contains features and y contains labels
    data = pd.read_csv('merged_diet_requests.csv')
    X = data.drop('target_column', axis=1)
    y = data['target_column']
    return X, y

X, y = load_cleaned_data()


KeyError: "['target_column'] not found in axis"