# Midterm Project

> Develop classification models to predict a target variable.
> Evaluate the classification models based on the different performance metrics.

[Link to Dataset](https://www.kaggle.com/datasets/adeniranstephen/obesity-prediction-dataset)

In [1]:
# Import dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

raw = pd.read_csv('data/obesity_dataset.csv')

raw = raw.sample(frac=1).reset_index(drop=True)

raw

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Male,24,1.70,84.85,yes,yes,2.00,3.0,Sometimes,no,2.06,no,0.00,1.000,Sometimes,Public_Transportation,Overweight_Level_II
1,Female,21,1.52,42.00,no,no,3.00,1.0,Frequently,no,1.00,no,0.00,0.000,Sometimes,Public_Transportation,Insufficient_Weight
2,Female,20,1.53,42.00,no,yes,3.00,1.0,Frequently,no,1.32,no,0.00,0.479,Sometimes,Public_Transportation,Insufficient_Weight
3,Male,24,1.73,97.91,yes,yes,2.00,3.0,Sometimes,no,2.84,no,1.31,1.339,no,Public_Transportation,Obesity_Type_I
4,Female,36,1.63,80.00,yes,no,3.00,3.0,Sometimes,no,1.00,no,0.00,0.000,Sometimes,Automobile,Obesity_Type_I
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Male,18,1.78,50.87,no,yes,2.05,3.0,Sometimes,no,2.00,no,0.52,1.000,Sometimes,Public_Transportation,Insufficient_Weight
2107,Female,21,1.80,152.47,yes,yes,3.00,3.0,Sometimes,no,2.32,no,0.89,0.843,Sometimes,Public_Transportation,Obesity_Type_III
2108,Male,24,1.77,97.45,yes,yes,2.00,3.0,Sometimes,no,2.97,no,2.49,1.366,no,Public_Transportation,Obesity_Type_I
2109,Female,20,1.68,68.00,no,yes,3.00,1.0,Sometimes,no,1.00,no,1.00,0.000,no,Public_Transportation,Normal_Weight


### Data Cleaning

1. CALC, CAEC Column must only have the values [Never, Sometimes, Frequently, Always]
2. Standardize CAEC, CALC to numeric values
3. Change underscores to whitespace
4. Simplify Gender to M and F
5. Change yes/no fields to binary 1/0
6. Rearrange Columns
7. Set aside 10% of the data

In [2]:
# Cleanup CALC and CAEC [Transform no to Never] - John Mihael
raw['CALC'] = raw['CALC'].replace("no", "Never")
raw['CAEC'] = raw['CAEC'].replace("no", "Never")

# Cleanup CALC and CAEC [0-3 scale] - Shaun
mapping = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}

raw['CAEC'] = raw['CAEC'].map(mapping)
raw['CALC'] = raw['CALC'].map(mapping)

# Change underscore values to whitespace and standardized - Cazindra
# Replace underscores in COLUMN NAMES (if they exist)
raw.columns = raw.columns.str.replace('_', ' ')

# Apply title case ONLY to column names containing spaces (multi-word columns)
raw.columns = [
    col.title() if ' ' in col else col  # Title case only if space exists
    for col in raw.columns
]

# Replace underscores in DATA ROWS for specific columns
for col in ['MTRANS', 'NObeyesdad']:
    raw[col] = raw[col].astype(str).str.replace('_', ' ', regex=True)

# Columns to standardize to title case
title_case_columns = [
    'Family History With Overweight',  # After underscore replacement
    'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC'
]

# Convert values in these columns to title case
raw[title_case_columns] = raw[title_case_columns].apply(lambda x: x.astype(str).str.title())

# Simplified Gender to M or F
raw['Gender'] = raw['Gender'].replace({'Female': 'F', 'Male': 'M'})

# Transform No/Yes to 0/1 - Joyce
columns_with_yes_no = ['Family History With Overweight', 'FAVC', 'SMOKE', 'SCC']
raw[columns_with_yes_no] = raw[columns_with_yes_no].replace({'Yes': 1, 'No': 0})

# Rearrange columns - Jude (Autofill)
raw = raw[[
    # Yes/No columns
    'Gender', 'Age', 'Height', 'Weight', 'Family History With Overweight', 'FAVC', 'SMOKE', 'SCC',
    # Decimal (numeric) columns
    'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
    # Target column (must remain last)
    'NObeyesdad'
]]

# Change M - 0 and F - 1
raw['Gender'] = raw['Gender'].replace({'M': 0, 'F': 1})

# Get unseen dataset
ctgnb_unseen = raw.iloc[1900:2110].reset_index(drop=True) # Get unseen sample
ctgnb_data = raw.iloc[0:1899].reset_index(drop=True) # Cut original dataset

ctgnb_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw['Gender'] = raw['Gender'].replace({'M': 0, 'F': 1})


Unnamed: 0,Gender,Age,Height,Weight,Family History With Overweight,FAVC,SMOKE,SCC,FCVC,NCP,CH2O,FAF,TUE,NObeyesdad
0,0,24,1.70,84.85,1,1,0,0,2.00,3.00,2.06,0.00,1.000,Overweight Level II
1,1,21,1.52,42.00,0,0,0,0,3.00,1.00,1.00,0.00,0.000,Insufficient Weight
2,1,20,1.53,42.00,0,1,0,0,3.00,1.00,1.32,0.00,0.479,Insufficient Weight
3,0,24,1.73,97.91,1,1,0,0,2.00,3.00,2.84,1.31,1.339,Obesity Type I
4,1,36,1.63,80.00,1,0,0,0,3.00,3.00,1.00,0.00,0.000,Obesity Type I
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1894,1,16,1.62,65.06,1,1,0,1,2.39,1.00,1.44,0.11,1.166,Overweight Level I
1895,1,26,1.62,110.80,1,1,0,0,3.00,3.00,2.70,0.00,0.270,Obesity Type III
1896,1,24,1.69,112.78,1,1,0,0,3.00,3.00,2.72,0.34,0.153,Obesity Type III
1897,0,23,1.77,90.00,1,1,0,0,2.12,3.00,1.00,0.00,1.067,Overweight Level II


### Prepare Data

1. Split features and target column
2. Prepare training and testing data.

In [3]:
# Split Features and Target Column
features = ctgnb_data.drop('NObeyesdad', axis=1)
target_col = ctgnb_data['NObeyesdad']

# TODO: Cross Validation (10-fold)
features_train, features_test, target_train, target_test = train_test_split(
    features, target_col, test_size=0.2
)

### Perform Multinomial Training

In [None]:
import matplotlib.pyplot as plt
import numpy
from sklearn import metrics
# Import Naive Bayes model
model = CategoricalNB()

model.fit(features_train, target_train)

y_pred = model.predict(features_test)

print("Confusion Matrix:", confusion_matrix(target_test, y_pred))

cm = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix(target_test, y_pred))
cm.plot()
plt.show()
print("Accuracy:", accuracy_score(target_test, y_pred))
print(classification_report(target_test, y_pred))

### Use the model

In [None]:
ctgnb_unseen.drop('NObeyesdad', axis=1, inplace=True)
results = model.predict(ctgnb_unseen)

ctgnb_unseen['Predictions'] = results

ctgnb_unseen

In [None]:
# Original
raw.iloc[1900:2110].reset_index(drop=True) 

### Create a Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

dts_unseen = raw.iloc[1900:2110].reset_index(drop=True) # Get unseen sample
dts_data = raw.iloc[0:1899].reset_index(drop=True) # Cut original dataset

# Training data
X_train = dts_data.drop('NObeyesdad', axis=1)  # Features
y_train = dts_data['NObeyesdad']              # Target

# Test data
X_test = dts_unseen.drop('NObeyesdad', axis=1)
y_test = dts_unseen['NObeyesdad']

dt_model = DecisionTreeClassifier(
    max_depth=5,          # Control tree depth to avoid overfitting
    min_samples_split=10, # Minimum samples to split a node
    random_state=42       # For reproducibility
)
dt_model.fit(X_train, y_train)

y_pred = dt_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

from sklearn.tree import plot_tree

plt.figure(figsize=(25, 15))
plot_tree(
    dt_model,
    feature_names=X_train.columns,
    class_names=dt_model.classes_,
    filled=True,
    rounded=True,
    proportion=True,
    fontsize=10
)
plt.show()

# Huh
dts_unseen.drop('NObeyesdad', axis=1, inplace=True)
results = model.predict(dts_unseen)

dts_unseen['Predictions'] = results
dts_unseen

In [None]:
# Original
raw.iloc[1900:2110].reset_index(drop=True) 