In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('final_data.csv')

In [3]:
df['menstrual_cycle_regular'] = df['menstrual_cycle_regular'].fillna('None')

In [4]:
df.isnull().sum()

age                        0
gender                     0
height_cm                  0
weight_kg                  0
bmi                        0
goal                       0
medical_conditions         0
allergies                  0
menstrual_cycle_regular    0
cycle_length_days          0
avg_water_intake           0
avg_sleep_hours            0
avg_daily_steps            0
avg_heart_rate             0
avg_calorie_burned         0
calorie_target             0
protein_g                  0
carbs_g                    0
fat_g                      0
recommended_diet_type      0
category                   0
dtype: int64

In [5]:
df['goal'].value_counts()

goal
maintain                         240
improve cardiovascular health    235
gain muscle                      205
maintain weight                  200
improve endurance                196
lose weight                      126
maintain performance             109
increase strength                 97
recover from injury               89
reduce stress                     73
reduce blood pressure             70
gain_mus                          57
improve_                          56
weight_l                          47
Name: count, dtype: int64

In [6]:
fix_dict = {
    'gain_mus': 'gain muscle',
    'improve_': 'improve cardiovascular health',
    'weight_l': 'lose weight'
}
df['goal'] = df['goal'].replace(fix_dict)

In [7]:
df['goal'].value_counts()

goal
improve cardiovascular health    291
gain muscle                      262
maintain                         240
maintain weight                  200
improve endurance                196
lose weight                      173
maintain performance             109
increase strength                 97
recover from injury               89
reduce stress                     73
reduce blood pressure             70
Name: count, dtype: int64

In [8]:
df['menstrual_cycle_regular'].value_counts()

menstrual_cycle_regular
None     874
FALSE    324
yes      227
TRUE     190
no       185
Name: count, dtype: int64

In [9]:
mapping = {
    "TRUE": "yes",
    "yes": "yes",
    "FALSE": "no",
    "no": "no",
    "None": "None"  
}

df['menstrual_cycle_regular'] = df['menstrual_cycle_regular'].replace(mapping)

In [10]:
df['medical_conditions'].value_counts()

medical_conditions
none                                    1075
asthma                                   118
joint_pain                                91
diabetes                                  82
hypertension                              71
sports_injury                             70
diab                                      48
asth                                      45
obes                                      35
anem                                      32
arthritis                                 26
cardiovascular disease                    20
arthritis, hypertension                    9
none, arthritis                            8
arthritis, diabetes                        7
diabetes, cardiovascular disease           7
diabetes, hypertension                     6
none, hypertension                         6
diabetes, arthritis                        5
none, diabetes                             5
hypertension, arthritis                    5
hypertension, diabetes              

In [11]:
import pandas as pd

# Your column
# Replace abbreviations
df['medical_conditions'] = df['medical_conditions'].replace({
    "diab": "diabetes",
    "asth": "asthma",
    "anem": "anemia",
    "obes": "obesity"
})

# Function to clean each entry
def clean_conditions(entry):
    if entry == "none":
        return "none"
    
    # Split by comma, strip spaces, and map values
    parts = [p.strip() for p in entry.split(",")]
    parts = [p for p in parts if p != "none"]  # remove 'none' from combined entries
    parts = sorted(set(parts))  # remove duplicates, sort
    return ", ".join(parts) if parts else "none"

df['medical_conditions'] = df['medical_conditions'].apply(clean_conditions)


In [12]:
df['medical_conditions'].value_counts()

medical_conditions
none                                    1075
asthma                                   163
diabetes                                 139
joint_pain                                91
hypertension                              80
sports_injury                             70
obesity                                   35
arthritis                                 34
anemia                                    32
cardiovascular disease                    27
arthritis, hypertension                   14
cardiovascular disease, diabetes          12
arthritis, diabetes                       12
diabetes, hypertension                    11
arthritis, cardiovascular disease          3
cardiovascular disease, hypertension       2
Name: count, dtype: int64

In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
obj_cols = df.select_dtypes(include='object')
for col in obj_cols.columns:
    df[col] = df[col].astype(str)

In [15]:
label_mapping = {}
le = LabelEncoder()
for col in obj_cols.columns:
    df[col] = le.fit_transform(df[col])
    label_mapping[col] = dict(zip(le.classes_,le.transform(le.classes_)))

In [20]:
label_encodings = {
    'gender': {'female': 0, 'male': 1},

    'goal': {
        'gain muscle': 0,
        'improve cardiovascular health': 1,
        'improve endurance': 2,
        'increase strength': 3,
        'lose weight': 4,
        'maintain': 5,
        'maintain performance': 6,
        'maintain weight': 7,
        'recover from injury': 8,
        'reduce blood pressure': 9,
        'reduce stress': 10
    },

    'medical_conditions': {
        'anemia': 0,
        'arthritis': 1,
        'arthritis, cardiovascular disease': 2,
        'arthritis, diabetes': 3,
        'arthritis, hypertension': 4,
        'asthma': 5,
        'cardiovascular disease': 6,
        'cardiovascular disease, diabetes': 7,
        'cardiovascular disease, hypertension': 8,
        'diabetes': 9,
        'diabetes, hypertension': 10,
        'hypertension': 11,
        'joint_pain': 12,
        'none': 13,
        'obesity': 14,
        'sports_injury': 15
    },

    'allergies': {
        'gluten': 0,
        'lactose': 1,
        'none': 2,
        'nuts': 3,
        'peanuts': 4,
        'pollen': 5,
        'seafood': 6,
        'shellfish': 7
    },

    'menstrual_cycle_regular': {
        'none': 0,
        'no': 1,
        'yes': 2
    },

    'recommended_diet_type': {
        'Balanced': 0,
        'DASH': 1,
        'High Protein': 2,
        'Ketogenic': 3,
        'Low Carb': 4,
        'Mediterranean': 5
    },

    'category': {
        'athlete': 0,
        'normal': 1
    }
}

In [22]:
with open("label_encoding.json", "w") as f:
    json.dump(label_encodings, f)

In [23]:
X = df.drop(['calorie_target','protein_g','carbs_g','fat_g','recommended_diet_type'],axis=1)
y= df[['calorie_target','protein_g','carbs_g','fat_g']]

In [24]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [25]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1,stratify=df['category'])

In [26]:
xgb_r = XGBRegressor(objective ='reg:linear',
                  n_estimators = 10, seed = 123)
cat_r = CatBoostRegressor(verbose=100,loss_function='RMSE')

In [27]:
model_1 = MultiOutputRegressor(xgb_r)
model_2 = MultiOutputRegressor(cat_r)

In [28]:
model_1.fit(X_train,y_train)

  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,estimator,"XGBRegressor(...ree=None, ...)"
,n_jobs,

0,1,2
,objective,'reg:linear'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [29]:
model_2.fit(X_train,y_train)

Learning rate set to 0.043371
0:	learn: 570.1686213	total: 155ms	remaining: 2m 35s
100:	learn: 178.7312375	total: 427ms	remaining: 3.8s
200:	learn: 146.6994033	total: 770ms	remaining: 3.06s
300:	learn: 124.5052022	total: 1.02s	remaining: 2.38s
400:	learn: 108.2108680	total: 1.28s	remaining: 1.91s
500:	learn: 96.2315307	total: 1.64s	remaining: 1.64s
600:	learn: 86.0650709	total: 1.94s	remaining: 1.29s
700:	learn: 77.9001560	total: 2.16s	remaining: 922ms
800:	learn: 70.6800767	total: 2.35s	remaining: 585ms
900:	learn: 64.7837306	total: 2.52s	remaining: 277ms
999:	learn: 60.0812088	total: 2.69s	remaining: 0us
Learning rate set to 0.043371
0:	learn: 30.0538554	total: 1.79ms	remaining: 1.78s
100:	learn: 12.0027821	total: 169ms	remaining: 1.5s
200:	learn: 10.4607802	total: 389ms	remaining: 1.54s
300:	learn: 9.2912149	total: 666ms	remaining: 1.55s
400:	learn: 8.3261411	total: 950ms	remaining: 1.42s
500:	learn: 7.6223862	total: 1.22s	remaining: 1.22s
600:	learn: 6.9992779	total: 1.49s	remainin

0,1,2
,estimator,<catboost.cor...0021FFD63EBA0>
,n_jobs,


In [30]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [31]:
y_pred_xgb = model_1.predict(X_test)
y_pred_cat = model_2.predict(X_test)

In [32]:
for i, col in enumerate(y_test.columns):
    print(f"=== {col} ===")
    print("MAE:", mean_absolute_error(y_test[col], y_pred_xgb[:, i]))
    print("MSE:", mean_squared_error(y_test[col], y_pred_xgb[:, i]))
    print("R²:", r2_score(y_test[col], y_pred_xgb[:, i]))
    print()

=== calorie_target ===
MAE: 149.4416046142578
MSE: 44615.140625
R²: 0.8774406909942627

=== protein_g ===
MAE: 11.120753288269043
MSE: 186.62539672851562
R²: 0.7974385619163513

=== carbs_g ===
MAE: 23.221250534057617
MSE: 890.402587890625
R²: 0.8619889616966248

=== fat_g ===
MAE: 7.443862438201904
MSE: 88.590576171875
R²: 0.8228431940078735



In [33]:
for i, col in enumerate(y_test.columns):
    print(f"=== {col} ===")
    print("MAE:", mean_absolute_error(y_test[col], y_pred_cat[:, i]))
    print("MSE:", mean_squared_error(y_test[col], y_pred_cat[:, i]))
    print("R²:", r2_score(y_test[col], y_pred_cat[:, i]))
    print()

=== calorie_target ===
MAE: 143.52671486433616
MSE: 43848.585290494455
R²: 0.8795464603740077

=== protein_g ===
MAE: 11.151198104641635
MSE: 188.4925410091112
R²: 0.7954119624018273

=== carbs_g ===
MAE: 23.55916205211187
MSE: 941.1471824229827
R²: 0.8541236193970825

=== fat_g ===
MAE: 7.645553365037362
MSE: 90.14594626707701
R²: 0.8197329192265469



In [35]:
import joblib
joblib.dump(model_2, "dietra_catboost_model.pkl")

['dietra_catboost_model.pkl']

In [36]:
X.dtypes.astype(str).to_json('input_variables.json')