In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df= pd.read_csv('finalised_obesity.csv')

In [4]:
df.columns

Index(['Unnamed: 0', 'Gender', 'Age', 'family_history_with_overweight', 'FAVC',
       'SMOKE', 'CH2O', 'SCC', 'CALC', 'MTRANS', 'obesity_level', 'BMI',
       'SedentaryScore', 'DietScore'],
      dtype='object')

In [5]:
df = df.rename(columns={'NObeyesdad': 'obesity_level'})
df= df.drop(columns={'Unnamed: 0'})

In [6]:
df.head()

Unnamed: 0,Gender,Age,family_history_with_overweight,FAVC,SMOKE,CH2O,SCC,CALC,MTRANS,obesity_level,BMI,SedentaryScore,DietScore
0,Female,21.0,yes,no,no,2.0,no,no,Public_Transportation,Normal_Weight,24.386526,1.0,4.0
1,Female,21.0,yes,no,yes,3.0,yes,Sometimes,Public_Transportation,Normal_Weight,24.238227,-3.0,5.0
2,Male,23.0,yes,no,no,2.0,no,Frequently,Public_Transportation,Normal_Weight,23.765432,-1.0,4.0
3,Male,27.0,no,no,no,2.0,no,Frequently,Walking,Overweight_Level_I,26.851852,-2.0,5.0
4,Male,22.0,no,no,no,2.0,no,Sometimes,Public_Transportation,Overweight_Level_II,28.342381,0.0,2.0


In [7]:
missing_data = df.isnull().sum()
print("\nMissing Data:")
print(missing_data)


Missing Data:
Gender                            0
Age                               0
family_history_with_overweight    0
FAVC                              0
SMOKE                             0
CH2O                              0
SCC                               0
CALC                              0
MTRANS                            0
obesity_level                     0
BMI                               0
SedentaryScore                    0
DietScore                         0
dtype: int64


# Machine Learning Modeling

In [8]:
# Label encoding for categorical variables
from sklearn.preprocessing import LabelEncoder

# Assuming `data` is your original DataFrame
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Identify categorical columns
categorical_columns = ['Gender', 'family_history_with_overweight','FAVC', 
                       'SMOKE', 'SCC', 'CALC', 'MTRANS', 'obesity_level']

# Create a new DataFrame to store encoded data
encoded_data = df.copy()

# Apply Label Encoding to each categorical column
for column in categorical_columns:
    encoded_data[column] = label_encoder.fit_transform(df[column])

# Display the newly encoded DataFrame
print(encoded_data.head())

   Gender   Age  family_history_with_overweight  FAVC  SMOKE  CH2O  SCC  CALC  \
0       0  21.0                               1     0      0   2.0    0     3   
1       0  21.0                               1     0      1   3.0    1     2   
2       1  23.0                               1     0      0   2.0    0     1   
3       1  27.0                               0     0      0   2.0    0     1   
4       1  22.0                               0     0      0   2.0    0     2   

   MTRANS  obesity_level        BMI  SedentaryScore  DietScore  
0       3              1  24.386526             1.0        4.0  
1       3              1  24.238227            -3.0        5.0  
2       3              1  23.765432            -1.0        4.0  
3       4              5  26.851852            -2.0        5.0  
4       3              6  28.342381             0.0        2.0  


### Modelling Techniques

In [9]:
import warnings
# Example: Ignore warnings globally
warnings.filterwarnings('ignore')  # Suppresses all warnings
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV


In [10]:
X = encoded_data.drop(columns=['obesity_level'])  # Features (all columns except the target)
y = encoded_data['obesity_level']                # Target (obesity_level)
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Logistic Regression with Default Parameters

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
log_model = LogisticRegression(multi_class='multinomial',solver='lbfgs', max_iter=1000, random_state=42)
log_model.fit(X_train, y_train)
y_predlog=log_model.predict(X_test)
print(accuracy_score(y_test,y_predlog))
print("\nLogistic Regression:")
print("Accuracy:", accuracy_score(y_test, y_predlog))
print(classification_report(y_test, y_predlog))

0.8912529550827423

Logistic Regression:
Accuracy: 0.8912529550827423
              precision    recall  f1-score   support

           0       0.93      1.00      0.97        56
           1       0.96      0.81      0.88        62
           2       0.90      0.82      0.86        78
           3       0.89      0.93      0.91        58
           4       0.89      1.00      0.94        63
           5       0.82      0.88      0.84        56
           6       0.85      0.82      0.84        50

    accuracy                           0.89       423
   macro avg       0.89      0.89      0.89       423
weighted avg       0.89      0.89      0.89       423



### Logistic Regression with Hyperparameters

In [13]:
import numpy as np
#Define parameter grid for GridSearchCV
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'multi_class': ['multinomial', 'ovr'],  # Multi-class handling options
    'solver': ['lbfgs', 'newton-cg', 'saga'],  # Solvers suitable for multi-class classification
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength (inverse of regularization)
    'max_iter': [100, 500, 1000]  # Number of iterations for convergence
}

#Initialize Logistic Regression model
log_model = LogisticRegression(random_state=42)

#Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=log_model, param_grid=param_grid, 
                           cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Step 6: Extract best parameters and evaluate model
best_params = grid_search.best_params_
best_log_model = grid_search.best_estimator_
y_pred = best_log_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Best Hyperparameters: {best_params}")
print(f"Test Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 270 candidates, totalling 1350 fits
Best Hyperparameters: {'C': 1, 'max_iter': 100, 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'newton-cg'}
Test Accuracy: 0.96

Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.97        56
           1       1.00      0.87      0.93        62
           2       0.99      0.99      0.99        78
           3       0.98      1.00      0.99        58
           4       1.00      1.00      1.00        63
           5       0.91      0.93      0.92        56
           6       0.92      0.96      0.94        50

    accuracy                           0.96       423
   macro avg       0.96      0.96      0.96       423
weighted avg       0.97      0.96      0.96       423



### Decision Tree Classifier 

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Step 1: Initialize and train the Decision Tree Classifier
dt_model = DecisionTreeClassifier()  # You can tune max_depth and criterion
dt_model.fit(X_train, y_train)

# Step 2: Predict on the test set
y_pred_dt = dt_model.predict(X_test)

# Step 3: Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred_dt)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_dt))

Accuracy: 0.96

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.97        56
           1       0.94      0.98      0.96        62
           2       0.99      0.94      0.96        78
           3       0.92      0.98      0.95        58
           4       1.00      1.00      1.00        63
           5       0.93      0.96      0.95        56
           6       0.98      0.94      0.96        50

    accuracy                           0.96       423
   macro avg       0.96      0.96      0.96       423
weighted avg       0.97      0.96      0.96       423



### Decision Tree with HyperParameters

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Initialize the parameter grid
param_grid = {
    'criterion': ['gini', 'entropy', 'log_loss'],  # Measures for splitting
    'max_depth': [None, 5, 10, 15, 20, 30, 50],  # Range of tree depths
    'min_samples_split': [2, 5, 10, 15, 20],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4, 8, 10],  # Minimum samples at leaf nodes
    'max_features': [None, 'sqrt', 'log2'],  # Features considered for splits
    'splitter': ['best', 'random'],  # Splitting strategy
    'class_weight': [None, 'balanced']  # Handle class imbalance
}

# Initialize Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)

# Perform Grid Search CV
grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model and hyperparameters
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Display the best parameters
print("\nBest Hyperparameters:", best_params)

# Predict and evaluate the best model
y_pred_dt = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_dt)
print(f"\nAccuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_dt))

Fitting 5 folds for each of 6300 candidates, totalling 31500 fits

Best Hyperparameters: {'class_weight': None, 'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}

Accuracy: 0.97

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        56
           1       0.94      0.98      0.96        62
           2       0.97      0.95      0.96        78
           3       0.93      0.98      0.96        58
           4       1.00      1.00      1.00        63
           5       0.98      0.95      0.96        56
           6       0.98      0.98      0.98        50

    accuracy                           0.97       423
   macro avg       0.97      0.97      0.97       423
weighted avg       0.97      0.97      0.97       423




### Random Forest Classifier

In [None]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
print("Random Forest Classifier:")
print("Accuracy:", accuracy_score(y_test, rf_predictions))
print(classification_report(y_test, rf_predictions))

Random Forest Classifier:
Accuracy: 0.9810874704491725
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        56
           1       0.94      0.98      0.96        62
           2       1.00      0.99      0.99        78
           3       0.98      1.00      0.99        58
           4       1.00      1.00      1.00        63
           5       0.98      0.93      0.95        56
           6       0.96      1.00      0.98        50

    accuracy                           0.98       423
   macro avg       0.98      0.98      0.98       423
weighted avg       0.98      0.98      0.98       423



### Random Forest Classifier with HyperParameter

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import pandas as pd

# Example: Replace X_train, y_train, X_test, y_test with your data

# Define the parameter grid for Grid Search
param_grid = {
    'n_estimators': [50, 100, 150, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],      # Maximum depth of trees
    'min_samples_split': [2, 5, 10],      # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],        # Minimum samples at a leaf node
    'max_features': ['sqrt', 'log2'],     # Number of features to consider at each split
    'criterion': ['gini', 'entropy'],     # Splitting criteria
    'class_weight': ['balanced', 'balanced_subsample']  # Handling class imbalance
}

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters and best model
best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

# Print the best parameters
print("Best Parameters:", best_params)

# Evaluate the best model on the test set
rf_predictions = best_rf_model.predict(X_test)
print("\nRandom Forest Classifier with Best Parameters:")
print("Accuracy:", accuracy_score(y_test, rf_predictions))
print(classification_report(y_test, rf_predictions))

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits
Best Parameters: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}

Random Forest Classifier with Best Parameters:
Accuracy: 0.983451536643026
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        56
           1       0.95      1.00      0.98        62
           2       0.99      0.99      0.99        78
           3       0.98      0.98      0.98        58
           4       1.00      1.00      1.00        63
           5       1.00      0.95      0.97        56
           6       0.96      1.00      0.98        50

    accuracy                           0.98       423
   macro avg       0.98      0.98      0.98       423
weighted avg       0.98      0.98      0.98       423



### XGBoost Classifier

In [None]:
from xgboost import XGBClassifier
# XGBoost Classifier
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)
print("\nXGBoost Classifier:")
print("Accuracy:", accuracy_score(y_test, xgb_predictions))
print(classification_report(y_test, xgb_predictions))


XGBoost Classifier:
Accuracy: 0.9858156028368794
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        56
           1       0.94      1.00      0.97        62
           2       1.00      0.99      0.99        78
           3       0.98      1.00      0.99        58
           4       1.00      1.00      1.00        63
           5       1.00      0.95      0.97        56
           6       0.98      1.00      0.99        50

    accuracy                           0.99       423
   macro avg       0.99      0.99      0.99       423
weighted avg       0.99      0.99      0.99       423



### XGBoost Classifier with Hyperparameter Tuning

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Step 1: Define the parameter grid
param_grid = {
    'n_estimators': [50, 100],               # Fewer options for tree count
    'learning_rate': [0.1, 0.2],             # Focused range for convergence
    'max_depth': [5, 7],                     # Reduced tree depth for simplicity
    'min_child_weight': [3],                 # Single, commonly used value
    'gamma': [0, 0.1],                       # Minimal options for loss reduction
    'subsample': [0.7, 1.0],                 # Most effective sampling fractions
    'colsample_bytree': [0.7, 1.0],          # Focused range for feature selection
    'scale_pos_weight': [1]                  # Standard balance for imbalanced data
}

# Step 2: Initialize the XGBoost Classifier
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42, use_label_encoder=False)

# Step 3: Perform Grid Search CV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, 
                           cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Step 4: Get the best parameters and best model
best_params = grid_search.best_params_
best_xgb_model = grid_search.best_estimator_
print("\nBest Hyperparameters:", best_params)

# Step 5: Predict on the test set using the best model
xgb_predictions = best_xgb_model.predict(X_test)

# Step 6: Evaluate the model
accuracy = accuracy_score(y_test, xgb_predictions)
print(f"\nAccuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, xgb_predictions))

Fitting 5 folds for each of 64 candidates, totalling 320 fits



Best Hyperparameters: {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 100, 'scale_pos_weight': 1, 'subsample': 1.0}

Accuracy: 0.99

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        56
           1       0.95      1.00      0.98        62
           2       0.99      0.99      0.99        78
           3       0.98      0.98      0.98        58
           4       1.00      1.00      1.00        63
           5       1.00      0.96      0.98        56
           6       0.98      1.00      0.99        50

    accuracy                           0.99       423
   macro avg       0.99      0.99      0.99       423
weighted avg       0.99      0.99      0.99       423



### Support Vector Classifier

In [None]:
# Support Vector Machine (SVM)
from sklearn.svm import SVC
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)
print("\nSupport Vector Machine (SVM):")
print("Accuracy:", accuracy_score(y_test, svm_predictions))
print(classification_report(y_test, svm_predictions))


Support Vector Machine (SVM):
Accuracy: 0.9645390070921985
              precision    recall  f1-score   support

           0       0.95      0.96      0.96        56
           1       0.95      0.94      0.94        62
           2       0.97      0.97      0.97        78
           3       0.97      1.00      0.98        58
           4       1.00      1.00      1.00        63
           5       0.96      0.93      0.95        56
           6       0.94      0.94      0.94        50

    accuracy                           0.96       423
   macro avg       0.96      0.96      0.96       423
weighted avg       0.96      0.96      0.96       423



### Support Vector Classifier with Hyperparameters

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Step 1: Define the parameter grid
param_grid = {
    'kernel': ['linear', 'rbf'],                       # Focused kernel types
    'C': [0.1, 1, 10, 100],                            # Narrowed range of regularization strengths
    'gamma': ['scale', 0.01, 0.1],                     # Simplified kernel coefficients
    'degree': [2, 3],                                  # Reduced polynomial degrees
    'shrinking': [True],                               # Single value (shrinking heuristic often preferred              # One-vs-One strategy (commonly used for SVM)
    'class_weight': [None, 'balanced']               # Handle imbalanced class distribution
}


# Step 2: Initialize the SVC model
svc_model = SVC(random_state=42)

# Step 3: Perform Grid Search CV
grid_search = GridSearchCV(estimator=svc_model, param_grid=param_grid, 
                           cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Step 4: Get the best parameters and best model
best_params = grid_search.best_params_
best_svc_model = grid_search.best_estimator_
print("\nBest Hyperparameters:", best_params)

# Step 5: Predict on the test set using the best model
svc_predictions = best_svc_model.predict(X_test)

# Step 6: Evaluate the model's performance
accuracy = accuracy_score(y_test, svc_predictions)
print(f"\nAccuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, svc_predictions))

Fitting 5 folds for each of 96 candidates, totalling 480 fits



Best Hyperparameters: {'C': 10, 'class_weight': None, 'degree': 2, 'gamma': 0.1, 'kernel': 'rbf', 'shrinking': True}

Accuracy: 0.97

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96        56
           1       0.92      0.92      0.92        62
           2       0.97      0.99      0.98        78
           3       0.98      0.98      0.98        58
           4       1.00      1.00      1.00        63
           5       0.93      0.95      0.94        56
           6       1.00      0.96      0.98        50

    accuracy                           0.97       423
   macro avg       0.97      0.97      0.97       423
weighted avg       0.97      0.97      0.97       423

