# Midterm Project

> Develop classification models to predict a target variable.
> Evaluate the classification models based on the different performance metrics.

[Link to Dataset](https://www.kaggle.com/datasets/adeniranstephen/obesity-prediction-dataset)

In [478]:
# Import dataset
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.pipeline import Pipeline

raw = pd.read_csv('data/obesity_dataset.csv')

raw = raw.sample(frac=1).reset_index(drop=True)

raw

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Male,20,1.83,89.43,yes,yes,2.34,3.61,Sometimes,no,2.88,no,2.00,0.208,Sometimes,Public_Transportation,Overweight_Level_I
1,Male,27,1.85,75.00,yes,yes,2.00,1.00,Sometimes,no,2.00,no,1.00,0.000,no,Walking,Normal_Weight
2,Male,39,1.77,117.79,yes,yes,2.23,2.92,Sometimes,no,1.83,no,0.76,0.000,Sometimes,Automobile,Obesity_Type_II
3,Male,31,1.78,102.87,yes,yes,2.27,3.00,Sometimes,no,1.77,no,2.00,0.414,Sometimes,Automobile,Obesity_Type_I
4,Female,22,1.74,135.35,yes,yes,3.00,3.00,Sometimes,no,2.61,no,1.62,0.869,Sometimes,Public_Transportation,Obesity_Type_III
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,37,1.53,62.42,yes,yes,2.29,2.85,Sometimes,no,2.64,no,0.00,0.991,Sometimes,Automobile,Overweight_Level_I
2107,Female,18,1.70,50.00,no,yes,1.59,3.54,Sometimes,no,1.09,no,0.62,1.000,Sometimes,Public_Transportation,Insufficient_Weight
2108,Male,21,1.85,125.00,yes,yes,3.00,1.00,Always,no,1.00,no,0.00,0.000,Sometimes,Public_Transportation,Obesity_Type_II
2109,Male,20,1.80,65.00,no,yes,2.00,3.00,Frequently,no,1.00,no,2.00,0.000,Sometimes,Motorbike,Normal_Weight


### Data Cleaning

1. CALC, CAEC Column must only have the values [Never, Sometimes, Frequently, Always]
2. Change underscores to whitespace
3. Apply One Hot Encoding for Categorical Values
4. Apply Standard Scaler for Numeric values

In [479]:
# Cleanup CALC and CAEC [Transform no to Never]
raw['CALC'] = raw['CALC'].replace("no", "Never")
raw['CAEC'] = raw['CAEC'].replace("no", "Never")

# Change underscore values to whitespace and standardized
# Replace underscores in COLUMN NAMES (if they exist)
raw.columns = raw.columns.str.replace('_', ' ')

# Apply title case ONLY to column names containing spaces (multi-word columns)
raw.columns = [
    col.title() if ' ' in col else col  # Title case only if space exists
    for col in raw.columns
]

# Replace underscores in DATA ROWS for specific columns
for col in ['MTRANS', 'NObeyesdad']:
    raw[col] = raw[col].astype(str).str.replace('_', ' ', regex=True)

# Columns to standardize to title case
title_case_columns = [
    'Family History With Overweight',  # After underscore replacement
    'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC'
]

# Convert values in these columns to title case
raw[title_case_columns] = raw[title_case_columns].apply(lambda x: x.astype(str).str.title())
# Split data into categorical and numeric
categorical_cols = [
    'Gender', 'Family History With Overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS'
]
numerical_cols = [
    'Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE'
]

# Create Preprocessor
cnb_preprocessor = ColumnTransformer(
    transformers=[
        ('num', KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform'), numerical_cols),  # Discretize numerical features
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)  # Encode categoricals
    ]
)
svm_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),      # Scale numerical features
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)  # Encode categoricals
    ]
)

### Prepare Data

1. Split features and target column
2. Prepare training and testing data.

In [480]:
# Split the dataset (unseen = 90% of dataset), (data = 10% of dataset)
unseen = raw.iloc[1900:2110].reset_index(drop=True) # Get unseen sample
data = raw.iloc[0:1899].reset_index(drop=True) # Cut original dataset

# Split Features and Target Column
features = data.drop('NObeyesdad', axis=1).copy()
target_col = data['NObeyesdad'].copy()

features_train, features_test, target_train, target_test = train_test_split(
    features, target_col, test_size=0.2
)

### Train Categorical Naive Bayes Model

In [481]:
cnb_model = Pipeline([
    ('preprocessor', cnb_preprocessor),
    ('classifier', CategoricalNB())
])

cnb_model.fit(features_train, target_train)

cnb_pred_train = cnb_model.predict(features_train)
cnb_pred_test = cnb_model.predict(features_test)

### Train Support Vector Machine Model

In [482]:
svm_model = Pipeline([
    ('preprocessor', svm_preprocessor),
    ('classifier', SVC(
        C=1.0,
        gamma='scale',
        probability=True,
        class_weight='balanced'  # Handles class imbalance
    ))
])

svm_model.fit(features_train, target_train)

svm_pred_train = svm_model.predict(features_train)
svm_pred_test = svm_model.predict(features_test)

### Display Confusion Matrices for every model

### Get Accuracy, Precision, Recall, and ROC-AUC Scores for all Models
The mean value of every result from the Ten-Fold Validation is displayed

In [483]:
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
apr_results = pd.DataFrame({
    'Results': ['Categorical Naive Bayes', 'Support Vector Machine', '(WiP)'],
    'Testing Accuracy': [cross_val_score(cnb_model, features_test, target_test, cv=cv, scoring='accuracy').mean(),
                         cross_val_score(svm_model, features_test, target_test, cv=cv, scoring='accuracy').mean(),
                         0.0],
    'Testing Precision': [cross_val_score(cnb_model, features_test, target_test, cv=cv, scoring='precision_weighted').mean(),
                          cross_val_score(svm_model, features_test, target_test, cv=cv, scoring='precision_weighted').mean(),
                          0.0],
    'Testing Recall': [cross_val_score(cnb_model, features_test, target_test, cv=cv, scoring='recall_weighted').mean(),
                       cross_val_score(svm_model, features_test, target_test, cv=cv, scoring='recall_weighted').mean(),
                       0.0],
    'Testing ROC-AUC': [cross_val_score(cnb_model, features_test, target_test, cv=cv, scoring='roc_auc_ovr').mean(),
                       cross_val_score(svm_model, features_test, target_test, cv=cv, scoring='roc_auc_ovr').mean(),
                       0.0],
    'Training Accuracy': [cross_val_score(cnb_model, features_train, target_train, cv=cv, scoring='accuracy').mean(),
                          cross_val_score(svm_model, features_train, target_train, cv=cv, scoring='accuracy').mean(),
                          0.0],
    'Training Precision': [cross_val_score(cnb_model, features_train, target_train, cv=cv, scoring='precision_weighted').mean(),
                           cross_val_score(svm_model, features_train, target_train, cv=cv, scoring='precision_weighted').mean(),
                           0.0],
    'Training Recall': [cross_val_score(cnb_model, features_train, target_train, cv=cv, scoring='recall_weighted').mean(),
                        cross_val_score(svm_model, features_train, target_train, cv=cv, scoring='recall_weighted').mean(),
                        0.0],
    'Training ROC-AUC': [cross_val_score(cnb_model, features_train, target_train, cv=cv, scoring='roc_auc_ovr').mean(),
                       cross_val_score(svm_model, features_train, target_train, cv=cv, scoring='roc_auc_ovr').mean(),
                       0.0]
})

apr_results

Unnamed: 0,Results,Testing Accuracy,Testing Precision,Testing Recall,Testing ROC-AUC,Training Accuracy,Training Precision,Training Recall,Training ROC-AUC
0,Categorical Naive Bayes,0.644737,0.662264,0.644737,0.921968,0.695186,0.696367,0.695186,0.933585
1,Support Vector Machine,0.760526,0.780639,0.760526,0.96058,0.903895,0.907615,0.903895,0.99213
2,(WiP),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Test models using the unseen dataset

In [484]:
expected = unseen['NObeyesdad'].copy()

# Categorical Naive Bayes
cnb_unseen = unseen.copy()
cnb_unseen.drop('NObeyesdad', axis=1, inplace=True)
cnb_unseen['Predictions'] = cnb_model.predict(cnb_unseen)
cnb_comparison = pd.DataFrame({
    'Expected':expected,
    'Actual':cnb_unseen['Predictions'],
})

# Support Vector Model
svm_unseen = unseen.copy()
svm_unseen.drop('NObeyesdad', axis=1, inplace=True)
svm_unseen['Predictions'] = svm_model.predict(svm_unseen)
svm_comparison = pd.DataFrame({
    'Expected':expected,
    'Actual':svm_unseen['Predictions'],
})


# Match Counter
def match_count(df):
    col1, col2 = df.columns
    matches = (df[col1] == df[col2]).sum()
    mismatches = len(df) - matches
    return [matches, mismatches]

In [485]:
matches = match_count(cnb_comparison)
print("Categorical Naive Bayes Test")
print('Matches: ', matches[0], ' | Mismatches: ', matches[1])
cnb_comparison

Categorical Naive Bayes Test
Matches:  154  | Mismatches:  56


Unnamed: 0,Expected,Actual
0,Obesity Type III,Obesity Type III
1,Overweight Level II,Obesity Type I
2,Obesity Type I,Obesity Type I
3,Obesity Type III,Obesity Type III
4,Obesity Type I,Obesity Type I
...,...,...
205,Obesity Type III,Obesity Type III
206,Overweight Level I,Overweight Level I
207,Insufficient Weight,Insufficient Weight
208,Obesity Type II,Obesity Type II


In [486]:
matches = match_count(svm_comparison)
print("Support Vector Machine Test")
print('Matches: ', matches[0], ' | Mismatches: ', matches[1])
svm_comparison

Support Vector Machine Test
Matches:  196  | Mismatches:  14


Unnamed: 0,Expected,Actual
0,Obesity Type III,Obesity Type III
1,Overweight Level II,Overweight Level II
2,Obesity Type I,Obesity Type I
3,Obesity Type III,Obesity Type III
4,Obesity Type I,Obesity Type I
...,...,...
205,Obesity Type III,Obesity Type III
206,Overweight Level I,Overweight Level I
207,Insufficient Weight,Insufficient Weight
208,Obesity Type II,Obesity Type II
