# CMU Portugal Case Study
> Q4 2024

In [2]:
import numpy as np
import pandas as pd

# Dataset - Estimation of Obesity Levels in Central America
(<a href = "https://archive.ics.uci.edu/dataset/544/estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition">Link to data source</a>)

This dataset include data for the estimation of obesity levels in individuals from the countries of Mexico, Peru and Colombia, based on their eating habits and physical condition. The data contains 17 attributes and 2111 records, the records are labeled with the class variable NObesity (Obesity Level), that allows classification of the data using the values of Insufficient Weight, Normal Weight, Overweight Level I, Overweight Level II, Obesity Type I, Obesity Type II and Obesity Type III.

Dataset Characteristics: Multivariate

Subject Area: Health and Medicine

Associated Tasks: Classification, Regression, Clustering

Feature Type: Integer

Instance Count: 2111

Feature Count: 16

In [3]:
dataset_path = "dataset/ObesityDataSet_raw_and_data_sinthetic.csv"
df = pd.read_csv(dataset_path)

# Preview data

In [4]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


# Simplify data labels

In [5]:
# generate column names to relabel

df.columns

Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

In [13]:
# simplify the column names

new_column_names = {
    'family_history_with_overweight': 'Family Overweight',
    'FAVC': 'High Cal Food', 
    'FCVC': 'Qty Vegetables', 
    'NCP': 'Qty Meals', 
    'CAEC': 'Eat Btw Meals', 
    'SMOKE': 'Smoker', 
    'CH2O': 'Qty Water', 
    'SCC': 'Monitor Calories', 
    'FAF': 'Qty Phys Activity', 
    'TUE': 'Hrs Use Technology',
    'CALC': 'Frq Alcohol', 
    'MTRANS': 'Mtd Transit', 
    'NObeyesdad': 'Obesity Level',      
}

# rename the columns based on the new_collumn_names dictionary
df = df.rename(columns=new_column_names)

df.head()

Unnamed: 0,Gender,Age,Height,Weight,Family Overweight,High Cal Food,Qty Vegetables,Qty Meals,Eat Btw Meals,Smoker,Qty Water,Monitor Calories,Qty Phys Activity,Hrs Use Technology,Frq Alcohol,Mtd Transit,Obesity Level
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


# Find and address null values

In [37]:
# Find nulls
# Nothing to address in this dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Gender              2111 non-null   object 
 1   Age                 2111 non-null   float64
 2   Height              2111 non-null   float64
 3   Weight              2111 non-null   float64
 4   Family Overweight   2111 non-null   object 
 5   High Cal Food       2111 non-null   object 
 6   Qty Vegetables      2111 non-null   float64
 7   Qty Meals           2111 non-null   float64
 8   Eat Btw Meals       2111 non-null   object 
 9   Smoker              2111 non-null   object 
 10  Qty Water           2111 non-null   float64
 11  Monitor Calories    2111 non-null   object 
 12  Qty Phys Activity   2111 non-null   float64
 13  Hrs Use Technology  2111 non-null   float64
 14  Frq Alcohol         2111 non-null   object 
 15  Mtd Transit         2111 non-null   object 
 16  Obesit

# Look for Outliers

In [15]:
# By comparing the mean to the max & min, we dont see outliers (in the numeric variables) from this method
df.describe()

Unnamed: 0,Age,Height,Weight,Qty Vegetables,Qty Meals,Qty Water,Qty Phys Activity,Hrs Use Technology
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.419043,2.685628,2.008011,1.010298,0.657866
std,6.345968,0.093305,26.191172,0.533927,0.778039,0.612953,0.850592,0.608927
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,19.947192,1.63,65.473343,2.0,2.658738,1.584812,0.124505,0.0
50%,22.77789,1.700499,83.0,2.385502,3.0,2.0,1.0,0.62535
75%,26.0,1.768464,107.430682,3.0,3.0,2.47742,1.666678,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


In [None]:
# Look for outliers by graphing histograms (numeric and non numeric variables)

# Use pdf and notebook from session 4!  Try all methods


# Descriptive Statistics

In [17]:
# Show unique values for each column
df['Frq Alcohol'].unique()

array(['no', 'Sometimes', 'Frequently', 'Always'], dtype=object)

In [34]:
float_df=df.iloc[:,[1,2,3,6,7,10,12,13]]
float_df

Unnamed: 0,Age,Height,Weight,Qty Vegetables,Qty Meals,Qty Water,Qty Phys Activity,Hrs Use Technology
0,21.000000,1.620000,64.000000,2.0,3.0,2.000000,0.000000,1.000000
1,21.000000,1.520000,56.000000,3.0,3.0,3.000000,3.000000,0.000000
2,23.000000,1.800000,77.000000,2.0,3.0,2.000000,2.000000,1.000000
3,27.000000,1.800000,87.000000,3.0,3.0,2.000000,2.000000,0.000000
4,22.000000,1.780000,89.800000,2.0,1.0,2.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...
2106,20.976842,1.710730,131.408528,3.0,3.0,1.728139,1.676269,0.906247
2107,21.982942,1.748584,133.742943,3.0,3.0,2.005130,1.341390,0.599270
2108,22.524036,1.752206,133.689352,3.0,3.0,2.054193,1.414209,0.646288
2109,24.361936,1.739450,133.346641,3.0,3.0,2.852339,1.139107,0.586035


In [36]:
# Calculate coorelation between numeric variables 

import itertools

# 1st, make df of just float variables (find way to do this with col==float)
float_df=df.iloc[:,[1,2,3,6,7,10,12,13]]

# Show coor (see session 3b soluions, exercise #2)
subset_columns = float_df
for col1_i, col2_i in itertools.combinations(range(len(subset_columns)), 2):
    col1, col2 = subset_columns[col1_i], subset_columns[col2_i]
    correlation = float_df[col1].corr(float_df[col2])
    print(f"Correlations between columns:\n\t- {col1}\n\t- {col2}\n\t- Correlation: {correlation}")
    print()

KeyError: 0