# AAI 500 Final Team Project

**Authors:** Zach Artman, Olga Pospelova, Narendra Fadnavis

**Date:** 06/02/2024

## Data Cleaning and Preparation

In [1]:
# Importing necessary modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the dataset and swap the abbreviated column codes for descriptive
# names in a single chained expression.
df = pd.read_csv("dataset.csv").rename(
    columns={
        "FAVC": "high_calories_frequently",
        "FCVC": "vegetable_consumption",
        "NCP": "meals_per_day",
        "CAEC": "eat_between_meals",
        "CH2O": "daily_water_intake",
        "SCC": "monitor_calories",
        "FAF": "physical_activity_frequency",
        "TUE": "technology_usage_frequency",
        "CALC": "alcohol_consumption_frequency",
        "MTRANS": "transportation_method",
        "NObeyesdad": "obesity_level",
    }
)

# Preview the first rows to confirm the new column names.
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,high_calories_frequently,vegetable_consumption,meals_per_day,eat_between_meals,SMOKE,daily_water_intake,monitor_calories,physical_activity_frequency,technology_usage_frequency,alcohol_consumption_frequency,transportation_method,obesity_level
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [3]:
# Label-encode the categorical columns.
# Each listed column is replaced in `df` by integer codes, and the fitted
# encoder is kept in `label_encoders` so codes can be mapped back to labels
# later (e.g. with `label_encoders[col].inverse_transform`).
#
# LabelEncoder is imported from its public location, sklearn.preprocessing;
# sklearn.calibration only imports it for its own internal use.
from sklearn.preprocessing import LabelEncoder

categorical_variables = ["Gender", "family_history_with_overweight", "high_calories_frequently", "eat_between_meals", "SMOKE", "monitor_calories", "alcohol_consumption_frequency", "transportation_method", "obesity_level"]
label_encoders = {}

# NOTE: LabelEncoder assigns codes by sorted class label, so the integers do
# not follow any natural ordering of the categories (e.g. for obesity_level,
# 'Overweight_Level_I' is coded 5, after 'Obesity_Type_III' at 4). Treat the
# codes as nominal identifiers, not as an ordinal scale.
# NOTE: this cell overwrites the string columns of `df`; re-run the loading
# cell first if it needs to be executed again.
for variable in categorical_variables:
    le = LabelEncoder()
    df[variable] = le.fit_transform(df[variable])
    label_encoders[variable] = le
    # The code of each class is its position in `classes_`. Building the
    # mapping from plain str/int keeps the printed output free of NumPy
    # scalar reprs (such as `np.int64(0)`) on newer NumPy versions.
    print(f"{variable}: {dict(zip(map(str, le.classes_), range(len(le.classes_))))}")

df.head()

Gender: {'Female': 0, 'Male': 1}
family_history_with_overweight: {'no': 0, 'yes': 1}
high_calories_frequently: {'no': 0, 'yes': 1}
eat_between_meals: {'Always': 0, 'Frequently': 1, 'Sometimes': 2, 'no': 3}
SMOKE: {'no': 0, 'yes': 1}
monitor_calories: {'no': 0, 'yes': 1}
alcohol_consumption_frequency: {'Always': 0, 'Frequently': 1, 'Sometimes': 2, 'no': 3}
transportation_method: {'Automobile': 0, 'Bike': 1, 'Motorbike': 2, 'Public_Transportation': 3, 'Walking': 4}
obesity_level: {'Insufficient_Weight': 0, 'Normal_Weight': 1, 'Obesity_Type_I': 2, 'Obesity_Type_II': 3, 'Obesity_Type_III': 4, 'Overweight_Level_I': 5, 'Overweight_Level_II': 6}


Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,high_calories_frequently,vegetable_consumption,meals_per_day,eat_between_meals,SMOKE,daily_water_intake,monitor_calories,physical_activity_frequency,technology_usage_frequency,alcohol_consumption_frequency,transportation_method,obesity_level
0,0,21.0,1.62,64.0,1,0,2.0,3.0,2,0,2.0,0,0.0,1.0,3,3,1
1,0,21.0,1.52,56.0,1,0,3.0,3.0,2,1,3.0,1,3.0,0.0,2,3,1
2,1,23.0,1.8,77.0,1,0,2.0,3.0,2,0,2.0,0,2.0,1.0,1,3,1
3,1,27.0,1.8,87.0,0,0,3.0,3.0,2,0,2.0,0,2.0,0.0,1,4,5
4,1,22.0,1.78,89.8,0,0,2.0,1.0,2,0,2.0,0,0.0,0.0,2,3,6


## Exploratory Data Analysis

## Model Selection