In [1]:
# Supress warnings:
import warnings
warnings.filterwarnings("ignore")

In [2]:
#loading data
import pandas as pd
from pathlib import Path

file_path = Path("obesityData/Obesity Classification.csv")
obesity = pd.read_csv(file_path)

#first 5 records
obesity.head()

Unnamed: 0,ID,Age,Gender,Height,Weight,BMI,Label
0,1,25,Male,175,80,25.3,Normal Weight
1,2,30,Female,160,60,22.5,Normal Weight
2,3,35,Male,180,90,27.3,Overweight
3,4,40,Female,150,50,20.0,Underweight
4,5,45,Male,190,100,31.2,Obese


In [3]:
#verifying last records
obesity.tail()

Unnamed: 0,ID,Age,Gender,Height,Weight,BMI,Label
103,106,11,Male,175,10,3.9,Underweight
104,107,16,Female,160,10,3.9,Underweight
105,108,21,Male,180,15,5.6,Underweight
106,109,26,Female,150,15,5.6,Underweight
107,110,31,Male,190,20,8.3,Underweight


In [4]:
#dropping irrelevant column/s
columns_to_drop = ['ID']

obesity.drop(columns=columns_to_drop, axis=1, inplace=True)
obesity.head()

Unnamed: 0,Age,Gender,Height,Weight,BMI,Label
0,25,Male,175,80,25.3,Normal Weight
1,30,Female,160,60,22.5,Normal Weight
2,35,Male,180,90,27.3,Overweight
3,40,Female,150,50,20.0,Underweight
4,45,Male,190,100,31.2,Obese


In [5]:
#replacing Gender column with numerical labels for algorithm

from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object for the "Gender" column
le_gender = LabelEncoder()
# Encode the "Gender" column
obesity['Gender'] = le_gender.fit_transform(obesity['Gender'])

print("Gender Encoder Classes:")
for gender, encoded_gender in zip(le_gender.classes_, le_gender.transform(le_gender.classes_)):
    # print original gender labels and the encoded values
    print(f"{gender}: {encoded_gender}")  # Print the gender label and its encoded value


obesity.head()

Gender Encoder Classes:
Female: 0
Male: 1


Unnamed: 0,Age,Gender,Height,Weight,BMI,Label
0,25,1,175,80,25.3,Normal Weight
1,30,0,160,60,22.5,Normal Weight
2,35,1,180,90,27.3,Overweight
3,40,0,150,50,20.0,Underweight
4,45,1,190,100,31.2,Obese


In [6]:
#checking for null values
obesity.isnull().sum()

Age       0
Gender    0
Height    0
Weight    0
BMI       0
Label     0
dtype: int64

In [7]:

# manually replacing Label column with numerical labels for algorithm

label_mapping = {
    "Normal Weight": 0,
    "Overweight": 1,
    "Obese": 2,
    "Underweight": 3
}
# Manually assigning the encoder using the label mapping
obesity['Label'] = obesity['Label'].map(label_mapping)

print("Manually Assigned Encoder:")
for label, encoded_label in label_mapping.items():
    # # Print the original label and its corresponding encoded value
    print(f"{label}: {encoded_label}")


obesity.head()


Manually Assigned Encoder:
Normal Weight: 0
Overweight: 1
Obese: 2
Underweight: 3


Unnamed: 0,Age,Gender,Height,Weight,BMI,Label
0,25,1,175,80,25.3,0
1,30,0,160,60,22.5,0
2,35,1,180,90,27.3,1
3,40,0,150,50,20.0,3
4,45,1,190,100,31.2,2


In [8]:
#import NumPy library
import numpy as np

#data partition by LABEL (Dependent variable)
X = obesity.drop(['Label'], axis=1)
Y = obesity['Label']

# Taking 80% of House data as training set, and remaining 20% as test set.
X_train = np.array(X[0:int(0.80*len(X))])
Y_train = np.array(Y[0:int(0.80*len(Y))])
X_test = np.array(X[int(0.80*len(X)):])
Y_test = np.array(Y[int(0.80*len(Y)):])
len(X_train), len(Y_train), len(X_test), len(Y_test)

print("The size of training input is", X_train.shape)
print("The size of training output is", Y_train.shape)
print(50 *'*')
print("The size of testing input is", X_test.shape)
print("The size of testing output is", Y_test.shape)

The size of training input is (86, 5)
The size of training output is (86,)
**************************************************
The size of testing input is (22, 5)
The size of testing output is (22,)


In [9]:
#measuring initial execution time
import time
start_time = time.time()

#****ALL ML ALGORITHMS*****

#importing algorithms

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

#initializing variables
LR = LogisticRegression()
KNN = KNeighborsClassifier()
NB = GaussianNB()
LSVM = LinearSVC()
NLSVM = SVC(kernel='rbf')
DT = DecisionTreeClassifier()
RF = RandomForestClassifier()

# Training data on Data Set
LR_fit = LR.fit(X_train, Y_train)
KNN_fit = KNN.fit(X_train, Y_train)
NB_fit = NB.fit(X_train, Y_train)
LSVM_fit = LSVM.fit(X_train, Y_train)
NLSVM_fit = NLSVM.fit(X_train, Y_train)
DT_fit = DT.fit(X_train, Y_train)
RF_fit = RF.fit(X_train, Y_train)


# Predicting on test data set
LR_pred = LR_fit.predict(X_test)
KNN_pred = KNN_fit.predict(X_test)
NB_pred = NB_fit.predict(X_test)
LSVM_pred = LSVM_fit.predict(X_test)
NLSVM_pred = NLSVM_fit.predict(X_test)
DT_pred = DT_fit.predict(X_test)
RF_pred = RF_fit.predict(X_test)

#printing accuracy of the prediction
from sklearn.metrics import accuracy_score

print("Logistic Regression is %f percent accurate" % (accuracy_score(LR_pred, Y_test)*100))
print("KNN is %f percent accurate" % (accuracy_score(KNN_pred, Y_test)*100))
print("Naive Bayes is %f percent accurate" % (accuracy_score(NB_pred, Y_test)*100))
print("Linear SVMs is %f percent accurate" % (accuracy_score(LSVM_pred, Y_test)*100))
print("Non Linear SVMs is %f percent accurate" % (accuracy_score(NLSVM_pred, Y_test)*100))
print("Decision Trees is %f percent accurate" % (accuracy_score(DT_pred, Y_test)*100))
print("Random Forests is %f percent accurate" % (accuracy_score(RF_pred, Y_test)*100))

#measuring final execution time
end_time = time.time()

elapsed_time = end_time - start_time
elapsed_minutes = elapsed_time / 60

print("Elapsed time: %.2f minutes" % elapsed_minutes)

Logistic Regression is 95.454545 percent accurate
KNN is 90.909091 percent accurate
Naive Bayes is 95.454545 percent accurate
Linear SVMs is 95.454545 percent accurate
Non Linear SVMs is 95.454545 percent accurate
Decision Trees is 100.000000 percent accurate
Random Forests is 100.000000 percent accurate
Elapsed time: 0.03 minutes


In [10]:
!pip install tabulate
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from tabulate import tabulate

seed = 0
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=seed)

# KFold cross-validation with various models
kf = KFold(n_splits=5, random_state=2, shuffle=True)
kfold_scores = []

for model in [LogisticRegression(), KNeighborsClassifier(), GaussianNB(), LinearSVC(), SVC(kernel='rbf'), DecisionTreeClassifier(), RandomForestClassifier()]:
    scores = []
    for train_index, test_index in kf.split(X_train):
        model.fit(X_train.iloc[train_index], Y_train.iloc[train_index])
        score = model.score(X_train.iloc[test_index], Y_train.iloc[test_index])
        scores.append(score)

    kfold_scores.append([type(model).__name__, np.mean(scores)])


# StratifiedKFold cross-validation with various models
skf = StratifiedKFold(n_splits=5, random_state=2, shuffle=True)
stratified_kfold_scores = []

for model in [LogisticRegression(), KNeighborsClassifier(), GaussianNB(), LinearSVC(), SVC(kernel='rbf'), DecisionTreeClassifier(), RandomForestClassifier()]:
    scores = []
    for train_index, test_index in skf.split(X_train, Y_train):
        model.fit(X_train.iloc[train_index], Y_train.iloc[train_index])
        score = model.score(X_train.iloc[test_index], Y_train.iloc[test_index])
        scores.append(score)

    stratified_kfold_scores.append([type(model).__name__, np.mean(scores)])


# Print the scores in a table
table_headers = ["Model", "K-Fold Score", "Stratified K-Fold Score"]
table_data = []

for kfold_score, stratified_kfold_score in zip(kfold_scores, stratified_kfold_scores):
    model_name = kfold_score[0]
    kfold_score_val = kfold_score[1]
    stratified_kfold_score_val = stratified_kfold_score[1]
    table_data.append([model_name, kfold_score_val, stratified_kfold_score_val])

table = tabulate(table_data, headers=table_headers, floatfmt=".4f", tablefmt="grid")
print(table)


+------------------------+----------------+---------------------------+
| Model                  |   K-Fold Score |   Stratified K-Fold Score |
| LogisticRegression     |         0.7562 |                    0.7431 |
+------------------------+----------------+---------------------------+
| KNeighborsClassifier   |         0.8124 |                    0.8261 |
+------------------------+----------------+---------------------------+
| GaussianNB             |         0.9078 |                    0.8837 |
+------------------------+----------------+---------------------------+
| LinearSVC              |         0.6752 |                    0.6046 |
+------------------------+----------------+---------------------------+
| SVC                    |         0.6967 |                    0.7562 |
+------------------------+----------------+---------------------------+
| DecisionTreeClassifier |         0.9301 |                    0.9529 |
+------------------------+----------------+---------------------

In [11]:
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# The dependent variable is the outcome or the target variable we try to predict or classify based on the independent variables. Therefore, in the given Obesity dataset, the dependent column is the "Label" column, which represents the obesity classification of each individual.
#
# The independent columns in the dataset are;
# ID: A unique identifier for each individual. This column serves as an identifier and provides no meaningful information for predicting the obesity classification.
# Age: The age of the individual. It is a continuous variable that can be relevant in determining obesity classification.
# Gender: The gender of the individual. It is a categorical variable and can be considered an independent variable for predicting obesity.
#     Height: The height of the individual in centimeters. It is a continuous variable that can be used as an independent variable to predict obesity.
# Weight: The weight of the individual in kilograms. It is a continuous variable that can be used as an independent variable to predict obesity.
# BMI: The individual's body mass index, calculated as weight divided by height squared. It is a continuous variable and can be considered an independent variable for predicting obesity.
#
# To summarize, the dependent column is "Label" (obesity classification), and the independent columns are "Age," "Gender," "Height," "Weight," and "BMI." I used these independent variables to build the predictive model that classifies individuals into different obesity categories.
#
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# Here are my steps for data cleaning for the given dataset:
#
# Handling Missing/Null Values: The dataset had no missing values. If there were, options include removing rows with missing values and filling in missing values with the mean or median.
# Handling Outliers: No outliers exist in the numerical columns (Age, Height, Weight, and BMI). I recommend removing outliers or applying appropriate transformations to mitigate their effects.
# Data Type Conversion: Check the data types of the columns. Ensure that they are correctly assigned. Example, the ID column should be of a numerical type float or int, while Gender and Label columns should be categorical.
# Encoding Categorical Variables: Since Gender and Label columns are categorical variables, I encoded them into numerical values for analysis and modeling.
# Removing Unnecessary Columns: Reviewed the dataset and identified any columns that may not be relevant to the analysis or modeling task. Thereby removing the ID column to simplify the dataset and reduce noise.
# Handling Duplicates: Checked to remove any duplicate rows in the dataset.
# Data Validation and Sanity Checks: I performed sanity checks on the data to identify any inconsistencies/errors. I checked that some BMI values are correctly calculated based on Height and Weight. To help validate the range and logical relationships between columns/variables.
#
#
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# **RESULTS**(after 4 runs)
# My above table displays the K-Fold and Stratified K-Fold scores for each model. Here are some observations based on the results:
#
# 1. Logistic Regression achieves a K-Fold score of 0.7562 and a slightly lower Stratified K-Fold score of 0.7431.
#
# 2. K-Nearest Neighbors (KNN) Classifier performs well with a K-Fold score of 0.8124 and a higher Stratified K-Fold score of 0.8261.
#
# 3. Gaussian Naive Bayes shows good performance with a high K-Fold score of 0.9078 and a slightly lower Stratified K-Fold score of 0.8837.
#
# 4. Linear Support Vector Classifier (LinearSVC) achieves a K-Fold score of 0.6379, indicating moderate performance, and a lower Stratified K-Fold score of 0.5935.
#
# 5. Support Vector Classifier (SVC) has a K-Fold score of 0.6967 and a higher Stratified K-Fold score of 0.7562.
#
# 6. Decision Tree Classifier performs well, showing high accuracy with a K-Fold score of 0.9183 and an even higher Stratified K-Fold score of 0.9412.
#
# 7. Random Forest Classifier achieves the highest accuracy among all models, with a K-Fold score of 0.9882 and a slightly lower Stratified K-Fold score of 0.9765.
#
# Based on these results, the Random Forest Classifier performs exceptionally well on the dataset, followed by the Decision Tree Classifier and Gaussian Naive Bayes. However, it's important to note that these scores are based on a smaller dataset than our other labs, so we may need to provide more diverse examples for our ML models to learn complex patterns and generalize well.