# Midterm Project

> Develop classification models to predict a target variable.
> Evaluate the classification models based on the different performance metrics.

[Link to Dataset](https://www.kaggle.com/datasets/adeniranstephen/obesity-prediction-dataset)

In [299]:
# Import dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.pipeline import Pipeline

raw = pd.read_csv('data/obesity_dataset.csv')

raw = raw.sample(frac=1).reset_index(drop=True)

raw

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,23,1.63,83.10,yes,yes,2.98,1.63,Sometimes,no,2.84,no,1.09,0.890,no,Public_Transportation,Obesity_Type_I
1,Male,31,1.87,127.51,yes,yes,2.94,3.00,Sometimes,yes,1.34,no,0.92,1.432,Sometimes,Public_Transportation,Obesity_Type_II
2,Male,25,1.77,114.07,yes,yes,1.62,3.00,Sometimes,no,2.08,no,1.54,0.357,Sometimes,Public_Transportation,Obesity_Type_II
3,Female,22,1.73,132.90,yes,yes,3.00,3.00,Sometimes,no,1.82,no,1.58,0.931,Sometimes,Public_Transportation,Obesity_Type_III
4,Male,25,1.76,112.20,yes,yes,1.26,3.00,Sometimes,no,2.00,no,1.33,0.002,Sometimes,Public_Transportation,Obesity_Type_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Male,18,1.72,53.78,yes,yes,2.00,3.13,Sometimes,no,2.07,no,1.49,2.000,Sometimes,Public_Transportation,Insufficient_Weight
2107,Male,23,1.61,100.64,yes,yes,3.00,1.15,Sometimes,no,1.24,no,1.13,1.666,no,Public_Transportation,Obesity_Type_II
2108,Male,24,1.70,75.00,yes,yes,2.00,3.00,Sometimes,no,2.00,no,1.00,0.000,Sometimes,Automobile,Overweight_Level_I
2109,Male,25,1.77,112.23,yes,yes,1.44,3.00,Sometimes,no,2.00,no,1.39,0.001,Sometimes,Public_Transportation,Obesity_Type_II


### Data Cleaning

1. CALC, CAEC Column must only have the values [Never, Sometimes, Frequently, Always]
2. Change underscores to whitespace
3. Apply One Hot Encoding for Categorical Values
4. Apply Standard Scaler for Numeric values

In [300]:
# Cleanup CALC and CAEC [Transform no to Never]
raw['CALC'] = raw['CALC'].replace("no", "Never")
raw['CAEC'] = raw['CAEC'].replace("no", "Never")

# Change underscore values to whitespace and standardized
# Replace underscores in COLUMN NAMES (if they exist)
raw.columns = raw.columns.str.replace('_', ' ')

# Apply title case ONLY to column names containing spaces (multi-word columns)
raw.columns = [
    col.title() if ' ' in col else col  # Title case only if space exists
    for col in raw.columns
]

# Replace underscores in DATA ROWS for specific columns
for col in ['MTRANS', 'NObeyesdad']:
    raw[col] = raw[col].astype(str).str.replace('_', ' ', regex=True)

# Columns to standardize to title case
title_case_columns = [
    'Family History With Overweight',  # After underscore replacement
    'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC'
]

# Convert values in these columns to title case
raw[title_case_columns] = raw[title_case_columns].apply(lambda x: x.astype(str).str.title())
# Split data into categorical and numeric
categorical_cols = [
    'Gender', 'Family History With Overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS'
]
numerical_cols = [
    'Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE'
]

# Create Preprocessor
cnb_preprocessor = ColumnTransformer(
    transformers=[
        ('num', KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform'), numerical_cols),  # Discretize numerical features
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)  # Encode categoricals
    ]
)
svm_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),      # Scale numerical features
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)  # Encode categoricals
    ]
)

### Prepare Data

1. Split features and target column
2. Prepare training and testing data.

In [301]:
# Split the dataset (unseen = 90% of dataset), (data = 10% of dataset)
unseen = raw.iloc[1900:2110].reset_index(drop=True) # Get unseen sample
data = raw.iloc[0:1899].reset_index(drop=True) # Cut original dataset

# Split Features and Target Column
features = data.drop('NObeyesdad', axis=1).copy()
target_col = data['NObeyesdad'].copy()

features_train, features_test, target_train, target_test = train_test_split(
    features, target_col, test_size=0.2
)

### Train Categorical Naive Bayes Model

In [302]:
cnb_model = Pipeline([
    ('preprocessor', cnb_preprocessor),
    ('classifier', CategoricalNB())
])

cnb_model.fit(features_train, target_train)

cnb_pred_train = cnb_model.predict(features_train)
cnb_pred_test = cnb_model.predict(features_test)

### Train Support Vector Machine Model

In [303]:
svm_model = Pipeline([
    ('preprocessor', svm_preprocessor),
    ('classifier', SVC(
        C=1.0,
        gamma='scale',
        class_weight='balanced'  # Handles class imbalance
    ))
])

svm_model.fit(features_train, target_train)

svm_pred_train = svm_model.predict(features_train)
svm_pred_test = svm_model.predict(features_test)

### Apply Ten-fold Cross Validation

### Display Confusion Matrices for every model

### Get Accuracy, Precision, and Recall Scores for all Models

In [304]:
apr_results = pd.DataFrame({
    'Results': ['Categorical Naive Bayes', 'Support Vector Machine', '(WiP)'],
    'Testing Accuracy': [accuracy_score(target_test, cnb_pred_test),
                         accuracy_score(target_test, svm_pred_test),
                         0.0],
    'Testing Precision': [precision_score(target_test, cnb_pred_test, average='weighted'),
                          precision_score(target_test, svm_pred_test, average='weighted'),
                          0.0],
    'Testing Recall': [recall_score(target_test, cnb_pred_test, average='weighted'),
                       recall_score(target_test, svm_pred_test, average='weighted'),
                       0.0],
    'Training Accuracy': [accuracy_score(target_train, cnb_pred_train),
                          accuracy_score(target_train, svm_pred_train),
                          0.0],
    'Training Precision': [precision_score(target_train, cnb_pred_train, average='weighted'),
                           precision_score(target_train, svm_pred_train, average='weighted'),
                           0.0],
    'Training Recall': [recall_score(target_train, cnb_pred_train, average='weighted'),
                        recall_score(target_train, svm_pred_train, average='weighted'),
                        0.0]
})

apr_results

Unnamed: 0,Results,Testing Accuracy,Testing Precision,Testing Recall,Training Accuracy,Training Precision,Training Recall
0,Categorical Naive Bayes,0.710526,0.712704,0.710526,0.714944,0.710068,0.714944
1,Support Vector Machine,0.902632,0.903168,0.902632,0.963792,0.964287,0.963792
2,(WiP),0.0,0.0,0.0,0.0,0.0,0.0


### Get ROC-AUC scores for every model

### Test models using the unseen dataset

In [305]:
expected = unseen['NObeyesdad'].copy()

# Categorical Naive Bayes
cnb_unseen = unseen.copy()
cnb_unseen.drop('NObeyesdad', axis=1, inplace=True)
cnb_unseen['Predictions'] = cnb_model.predict(cnb_unseen)
cnb_comparison = pd.DataFrame({
    'Expected':unseen['NObeyesdad'],
    'Actual':cnb_unseen['Predictions'],
})

# Support Vector Model

In [306]:
print("Categorical Naive Bayes Test")
# cnb_comparison

Categorical Naive Bayes Test


In [307]:
print("Support Vector Machine Test")
# svm_comparison

Support Vector Machine Test
