# **IMPORTING LIBRARIES**

In [None]:
# Mount Google Drive at /content/drive using the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report , confusion_matrix
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings("ignore")

# **READING THE DATASET**

In [None]:
# Load the CSV, then append a second copy of the rows labelled 10..110
# (.loc slicing is inclusive on both ends), renumbering the index afterwards.
df = pd.read_csv("diet_recommendations_dataset.csv")
repeated_rows = df.loc[10:110]
df = pd.concat([df, repeated_rows], ignore_index=True)
df.head()

# **EXPLORATORY DATA ANALYSIS (EDA)**

In [None]:
print("Initial Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

In [None]:
# Count the missing entries in each column and show them under a heading.
missing_per_column = df.isna().sum()
print("Checking Missing Values:")
print(missing_per_column)

In [None]:
# Summary statistics of the frame (displayed as the cell output).
df.describe()

In [None]:
# Show the column labels of the frame.
df.columns

In [None]:
# duplicated() flags every row that repeats an earlier one (the first
# occurrence is not flagged); summing the flags gives the number of repeats.
repeat_flags = df.duplicated()
duplicates = repeat_flags.sum()
print(f"Number of duplicate rows: {duplicates}")

In [None]:
# Keep only the first occurrence of each repeated row; the result is bound
# back to df rather than mutating the frame in place.
df = df.drop_duplicates()

In [None]:
# Remove the Patient_ID column; errors='ignore' makes this a no-op when the
# column is not present (e.g. when the cell is run a second time).
df = df.drop(columns=["Patient_ID"], errors='ignore')

In [None]:
# Fill missing values in these categorical columns with the column's most
# frequent value (mode).
# The result is assigned back to the column: calling fillna(inplace=True) on a
# df[col] selection is chained assignment, which pandas deprecates and which
# leaves df unchanged when Copy-on-Write is enabled.
for col in ['Disease_Type', 'Dietary_Restrictions', 'Allergies']:
    col_modes = df[col].mode()
    # mode() is empty when the column has no non-missing values; nothing to fill with.
    if not col_modes.empty:
        df[col] = df[col].fillna(col_modes[0])

In [None]:
# Plot the distribution of every int64/float64 column on a 3-column grid.
numerical_cols = df.select_dtypes(include=["int64", "float64"]).columns
num_cols = len(numerical_cols)
grid_cols = 3
# Ceiling division yields exactly enough rows (no spare row when the count is
# a multiple of 3); max(1, ...) keeps plt.subplots valid for an empty selection.
grid_rows = max(1, -(-num_cols // grid_cols))
fig, axes = plt.subplots(nrows=grid_rows, ncols=grid_cols, figsize=(20, 5 * grid_rows))
axes = axes.flatten()

for i, col in enumerate(numerical_cols):
    sns.histplot(df[col], bins=20, kde=True, ax=axes[i], color='teal')
    axes[i].set_title(f"Distribution of {col}")

# Hide grid cells that received no plot so they do not render as empty frames.
for unused_ax in axes[num_cols:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Distribution of each numeric column, split by Diet_Recommendation.
# plt.figure() is used instead of plt.subplots(): the latter also creates a
# full-figure axes that would sit behind the grid built with plt.subplot().
grid_cols = 4
# Three rows as before, growing only if there are more than 12 columns.
grid_rows = max(3, -(-len(numerical_cols) // grid_cols))
plt.figure(figsize=(20, 10))
for i, column in enumerate(numerical_cols):
    plt.subplot(grid_rows, grid_cols, i + 1)
    # seaborn builds the legend from the actual hue levels; it is deliberately
    # not overridden with hard-coded labels, which would not be tied to those levels.
    sns.histplot(data=df, x=column, hue='Diet_Recommendation', kde=True, palette='viridis', bins=30)
    plt.title(f"Distribution of {column}")
    plt.xlabel(column)
    plt.ylabel("Count")
plt.tight_layout()
plt.show()

In [None]:
# Categorical columns (including the target) whose value counts are plotted
# on a 3x3 grid, one panel per column.
categorical_cols = [
    'Disease_Type',
    'Gender',
    'Severity',
    'Physical_Activity_Level',
    'Dietary_Restrictions',
    'Allergies',
    'Preferred_Cuisine',
    'Diet_Recommendation',
]
plt.figure(figsize=(20, 10))
for panel, column in enumerate(categorical_cols, start=1):
    panel_ax = plt.subplot(3, 3, panel)
    sns.countplot(data=df, x=column, palette="coolwarm", ax=panel_ax)
    panel_ax.set_title(f"Distribution of {column}")
plt.tight_layout()
plt.show()

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
plt.figure(figsize=(10, 8))
corr_matrix = df.select_dtypes(include=["number"]).corr()
# Plot corr_matrix itself; calling .corr() on it again would show the
# correlation between rows of the correlation matrix, not between features.
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="RdYlBu")
plt.title("Correlation Matrix")
plt.show()

In [None]:
# BMI spread for each Diet_Recommendation value.
plt.figure(figsize=(8, 6))
bmi_ax = sns.boxplot(data=df, x='Diet_Recommendation', y='BMI', palette="coolwarm")
bmi_ax.set_title("Boxplot of BMI by Diet_Recommendation")
bmi_ax.set_xlabel("Diet_Recommendation (Target)")
bmi_ax.set_ylabel("BMI")
plt.show()

In [None]:
# Value counts of each categorical column, broken down by Diet_Recommendation,
# on a 3x3 grid.
plt.figure(figsize=(20, 10))
for panel, column in enumerate(categorical_cols, start=1):
    panel_ax = plt.subplot(3, 3, panel)
    sns.countplot(
        data=df,
        x=column,
        hue='Diet_Recommendation',
        palette="coolwarm",
        ax=panel_ax,
    )
    panel_ax.set_title(f"Distribution of {column}")
plt.tight_layout()
plt.show()

In [None]:
# Pairwise plots of the numeric columns, coloured by Diet_Recommendation.
pairplot_options = {
    "vars": numerical_cols,
    "hue": 'Diet_Recommendation',
    "palette": "coolwarm",
}
sns.pairplot(df, **pairplot_options)
# y slightly above 1 places the title above the top row of panels.
plt.suptitle("Pair Plot of Numerical Features by Diet_Recommendation", y=1.02)
plt.show()

In [None]:
# One boxplot per numeric column on a 3x4 grid, to eyeball outliers.
plt.figure(figsize=(20, 10))
for slot, column in enumerate(numerical_cols, start=1):
    slot_ax = plt.subplot(3, 4, slot)
    sns.boxplot(data=df, y=column, palette="viridis", ax=slot_ax)
    slot_ax.set_title(f"Boxplot of {column}")
plt.tight_layout()
plt.show()

In [None]:
# Collect the numeric columns that contain at least one value outside the
# Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
outlier_columns = []
for col in numerical_cols:
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    below_fence = df[col] < lower_bound
    above_fence = df[col] > upper_bound
    outliers = df[below_fence | above_fence]
    if len(outliers) > 0:
        outlier_columns.append(col)

print("Columns with potential outliers:", outlier_columns)
print(df.shape)

In [None]:
# Drop the rows that fall outside the Tukey fences of each flagged column.
# The fences are recomputed from the current (already filtered) frame on every
# iteration, so earlier columns influence the bounds used for later ones.
for col in outlier_columns:
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    # between() is inclusive on both ends by default, matching >= / <=.
    inside_fences = df[col].between(lower_bound, upper_bound)
    df = df.loc[inside_fences]

In [None]:
# Re-check the per-column missing-value counts on the current frame.
print("Missing values:\n", df.isnull().sum())

# **ENCODING CATEGORICAL FEATURES**

In [None]:
# Replace every object-typed column with integer codes. Each fitted encoder is
# stored in label_encoders under its column name.
categorical_cols = df.select_dtypes(include=["object"]).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder().fit(df[col])
    label_encoders[col] = le
    df[col] = le.transform(df[col])

categorical_cols

In [None]:
# Preview the first rows after encoding, under a heading.
encoded_preview = df.head()
print("\nEncoded Categorical Variables:", encoded_preview, sep="\n")

In [None]:
# Correlation of every column with Diet_Recommendation, strongest positive first.
corr = df.corr()
target_corr = corr['Diet_Recommendation']
print(target_corr.sort_values(ascending=False))

# **FEATURE SCALING**

In [None]:
# Standardise every int64/float64 feature column (target excluded) to zero mean
# and unit variance, then report the fitted statistics.
# NOTE(review): columns that were label-encoded to integers are selected here
# as well and therefore scaled too; confirm that this is intended.
# NOTE(review): the scaler is fitted on the whole frame before any train/test
# split, so rows later used for testing influence the scaling statistics.
numerical_cols = df.drop(columns=["Diet_Recommendation"]).select_dtypes(include=["int64", "float64"]).columns
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
print("\nScaled Numerical Features:")
print(df.head())

mean = scaler.mean_
# scale_ is the divisor StandardScaler actually applies in (X - mean) / scale.
# It equals sqrt(var_) except for zero-variance columns, where the scaler uses
# 1.0 instead of 0, so reporting scale_ always matches the transform.
std = scaler.scale_

print("Mean:", mean)
print("Std Dev:", std)



Scaled Numerical Features:
        Age    Gender  Weight_kg  Height_cm       BMI  Disease_Type  Severity  \
0  0.338975  0.952914  -1.302914  -1.044832 -0.669464      1.497711 -0.039619   
1  1.057315  0.952914   0.836109  -0.414303  0.922936     -1.393219 -1.272776   
2 -0.213593 -1.049412  -1.048031  -0.134068 -0.871673      0.052246 -1.272776   
3 -0.987190  0.952914  -1.317908  -0.764597 -0.821121      0.052246 -1.272776   
4  0.560003  0.952914  -0.248396   1.547342 -0.960140     -1.393219 -0.039619   

   Physical_Activity_Level  Daily_Caloric_Intake  Cholesterol_mg/dL  \
0                 0.003697              1.066792          -0.909240   
1                 0.003697              0.983459          -0.019427   
2                 1.231227             -1.312616          -0.644701   
3                 0.003697              0.318572          -1.084454   
4                 1.231227              1.806146           0.021800   

   Blood_Pressure_mmHg  Glucose_mg/dL  Dietary_Restriction

In [None]:
X = df.drop(columns=["Diet_Recommendation"])
y = df["Diet_Recommendation"]

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


In [None]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# **MODELING**

# RANDOM FOREST CLASSIFIER

In [None]:
# Fit a random forest on the training set and report train/test accuracy, the
# per-class report and the confusion matrix.
# random_state is fixed (42, as for the split and SMOTE) so that the forest and
# every score below are reproducible from one run to the next.
model = RandomForestClassifier(n_estimators=200, min_samples_leaf=2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(f'train score: {model.score(X_train,y_train)}')
print(f'test score: {model.score(X_test,y_test)}')
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
# Side-by-side view of true and predicted labels for the test rows.
pd.DataFrame({'True_values':y_test,'pred_values':y_pred})

In [None]:
# Requires skl2onnx (and an onnxruntime build to run the exported model):
# !pip install skl2onnx
# !pip install onnxruntime_gpu
import joblib
import skl2onnx
import onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Declare one float input named "float_input" with a free batch dimension and
# one column per training feature, then convert the fitted model.
n_features = X_train.shape[1]
initial_type = [("float_input", FloatTensorType([None, n_features]))]
onnx_model = convert_sklearn(model, initial_types=initial_type)

# Save ONNX model
serialized_model = onnx_model.SerializeToString()
with open("/content/model.onnx", "wb") as onnx_file:
    onnx_file.write(serialized_model)