<a href="https://colab.research.google.com/github/tomjoyce1/obesity/blob/Development/naive_bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import warnings
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score
# Notebook-wide setup: silence library warnings and use the ggplot plot style.
warnings.filterwarnings('ignore')
plt.style.use('ggplot')

# Load the dataset from the CSV file.
df = pd.read_csv('ObesityDataSet.csv')

# The target is the obesity category; the features are the remaining columns
# minus the ones explicitly dropped below.
y = df['obese_category']
X = df.drop(
    columns=[
        'obese_category',
        'Gender',
        'water_day',
        'smoke',
        'transport_mode',
    ]
)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [47]:
# Count the missing values in each feature column
X.isna().sum()

Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
eat_high_caloric_food             0
eat_vegetables_frequency          0
main_meals_day                    0
eat_between_meals                 0
monitor_calories                  0
exercise_frequency                0
device_time                       0
drink_alcohol                     0
dtype: int64

In [48]:
# Integer code assigned to each obesity category label (codes follow the listed order)
target_mapping = {
    label: code
    for code, label in enumerate((
        'Insufficient_Weight',
        'Normal_Weight',
        'Overweight_Level_I',
        'Overweight_Level_II',
        'Obesity_Type_I',
        'Obesity_Type_II',
        'Obesity_Type_III',
    ))
}

# Derived feature: weight divided by height squared
X['BMI'] = X['Weight'].div(X['Height'].pow(2))

# Replace the row index with a fresh 0..n-1 range, discarding the old one
X = X.reset_index(drop=True)

In [49]:
# Removing outliers using Z-Scores
#
# A row is kept only if every numeric feature lies within 3 standard deviations
# of that feature's mean. The previous version indexed X with a boolean
# DataFrame, which keeps every row and merely replaces out-of-range cells with
# NaN, so no data points were actually removed.

# Bounds are only meaningful for numeric columns; restricting to them also
# avoids the error newer pandas raises when averaging non-numeric columns.
numeric_X = X.select_dtypes(include='number')

mean = numeric_X.mean()
std = numeric_X.std()

# Calculate the lower and upper bounds
lower_bound = mean - 3 * std
upper_bound = mean + 3 * std

# One boolean per row: True when all numeric values fall inside the bounds
within_bounds = ((numeric_X >= lower_bound) & (numeric_X <= upper_bound)).all(axis=1)

# Remove data points outside of the bounds (whole rows, all columns retained)
cleaned_X = X.loc[within_bounds]

# Matching target values, selected positionally so they stay aligned with
# cleaned_X regardless of the index labels carried by y
cleaned_y = y.loc[within_bounds.to_numpy()]

In [50]:
# One-hot encode the categorical feature columns, dropping the first level of each
X_encoded = pd.get_dummies(data=X, drop_first=True)

# 80/20 train/test split of the encoded features, seeded so it is repeatable
X_train_encoded, X_test_encoded, y_train_encoded, y_test_encoded = train_test_split(
    X_encoded,
    y,
    random_state=42,
    test_size=0.2,
)

# Create the Gaussian Naive Bayes classifier and fit it on the training portion
naive_bayes = GaussianNB().fit(X_train_encoded, y_train_encoded)

# Predict labels for the held-out rows
y_pred_encoded = naive_bayes.predict(X_test_encoded)

# Report overall accuracy, then the per-class precision/recall/F1 table
accuracy_encoded = accuracy_score(y_true=y_test_encoded, y_pred=y_pred_encoded)
print("Accuracy (with one-hot encoding):", accuracy_encoded)
print(classification_report(y_true=y_test_encoded, y_pred=y_pred_encoded))


Accuracy (with one-hot encoding): 0.7730496453900709
                     precision    recall  f1-score   support

Insufficient_Weight       0.81      0.98      0.89        56
      Normal_Weight       0.79      0.73      0.76        62
     Obesity_Type_I       0.75      0.77      0.76        78
    Obesity_Type_II       0.86      0.74      0.80        58
   Obesity_Type_III       0.67      1.00      0.80        63
 Overweight_Level_I       0.90      0.64      0.75        56
Overweight_Level_II       0.74      0.50      0.60        50

           accuracy                           0.77       423
          macro avg       0.79      0.77      0.76       423
       weighted avg       0.78      0.77      0.77       423

