# Classification Obesity Dataset

In [474]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load Data

In [475]:
data = pd.read_csv('data/ObesityDataSet_raw_and_data_sinthetic.csv')
data

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21,1.62,64.00,yes,no,2.0,3.0,Sometimes,no,2.00,no,0.00,1.000,no,Public_Transportation,Normal_Weight
1,Female,21,1.52,56.00,yes,no,3.0,3.0,Sometimes,yes,3.00,yes,3.00,0.000,Sometimes,Public_Transportation,Normal_Weight
2,Male,23,1.80,77.00,yes,no,2.0,3.0,Sometimes,no,2.00,no,2.00,1.000,Frequently,Public_Transportation,Normal_Weight
3,Male,27,1.80,87.00,no,no,3.0,3.0,Sometimes,no,2.00,no,2.00,0.000,Frequently,Walking,Overweight_Level_I
4,Male,22,1.78,89.80,no,no,2.0,1.0,Sometimes,no,2.00,no,0.00,0.000,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,21,1.71,131.41,yes,yes,3.0,3.0,Sometimes,no,1.73,no,1.68,0.906,Sometimes,Public_Transportation,Obesity_Type_III
2107,Female,22,1.75,133.74,yes,yes,3.0,3.0,Sometimes,no,2.01,no,1.34,0.599,Sometimes,Public_Transportation,Obesity_Type_III
2108,Female,23,1.75,133.69,yes,yes,3.0,3.0,Sometimes,no,2.05,no,1.41,0.646,Sometimes,Public_Transportation,Obesity_Type_III
2109,Female,24,1.74,133.35,yes,yes,3.0,3.0,Sometimes,no,2.85,no,1.14,0.586,Sometimes,Public_Transportation,Obesity_Type_III


In [476]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   int64  
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [477]:
data.columns

Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

In [478]:
numerics_columns = data.select_dtypes(include=[np.number]).columns
categorical_columns = data.select_dtypes(exclude=[np.number]).columns
print(f'Numeric columns: {numerics_columns}')
print(f'Categorical columns: {categorical_columns}')

Numeric columns: Index(['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE'], dtype='object')
Categorical columns: Index(['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
       'SCC', 'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')


## Transform data

In [479]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder, LabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

transform = ColumnTransformer(
    [
        ('numeric', StandardScaler(), numerics_columns),
        ('categorical', OrdinalEncoder(), categorical_columns[:-1]),
    ],
    remainder='passthrough'
)

data = transform.fit_transform(data)
data

array([[-0.5217412208658921, -0.8743795837127092, -0.8625576043025508,
        ..., 3.0, 3.0, 'Normal_Weight'],
       [-0.5217412208658921, -1.9456595031657193, -1.1680765153365942,
        ..., 2.0, 3.0, 'Normal_Weight'],
       [-0.2070567302236356, 1.0539242713027066, -0.36608937387223034,
        ..., 1.0, 3.0, 'Normal_Weight'],
       ...,
       [-0.2070567302236356, 0.5182843115762016, 1.798894009442759, ...,
        2.0, 3.0, 'Obesity_Type_III'],
       [-0.049714484902507336, 0.41115631963090055, 1.7859094557238122,
        ..., 2.0, 3.0, 'Obesity_Type_III'],
       [-0.049714484902507336, 0.41115631963090055, 1.790492239389323,
        ..., 2.0, 3.0, 'Obesity_Type_III']], dtype=object)

In [480]:
columns = list(numerics_columns) + list(categorical_columns)
columns

['Age',
 'Height',
 'Weight',
 'FCVC',
 'NCP',
 'CH2O',
 'FAF',
 'TUE',
 'Gender',
 'family_history_with_overweight',
 'FAVC',
 'CAEC',
 'SMOKE',
 'SCC',
 'CALC',
 'MTRANS',
 'NObeyesdad']

In [481]:
data = pd.DataFrame(data, columns=columns)
data

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,NObeyesdad
0,-0.521741,-0.87438,-0.862558,-0.78481,0.404102,-0.013141,-1.188028,0.562005,0.0,1.0,0.0,2.0,0.0,0.0,3.0,3.0,Normal_Weight
1,-0.521741,-1.94566,-1.168077,1.088307,0.404102,1.618701,2.339676,-1.080619,0.0,1.0,0.0,2.0,1.0,1.0,2.0,3.0,Normal_Weight
2,-0.207057,1.053924,-0.366089,-0.78481,0.404102,-0.013141,1.163774,0.562005,1.0,1.0,0.0,2.0,0.0,0.0,1.0,3.0,Normal_Weight
3,0.422312,1.053924,0.015809,1.088307,0.404102,-0.013141,1.163774,-1.080619,1.0,0.0,0.0,2.0,0.0,0.0,1.0,4.0,Overweight_Level_I
4,-0.364399,0.839668,0.122741,-0.78481,-2.166941,-0.013141,-1.188028,-1.080619,1.0,0.0,0.0,2.0,0.0,0.0,2.0,3.0,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,-0.521741,0.089772,1.711821,1.088307,0.404102,-0.453739,0.787486,0.407599,0.0,1.0,1.0,2.0,0.0,0.0,2.0,3.0,Obesity_Type_III
2107,-0.364399,0.518284,1.800804,1.088307,0.404102,0.003177,0.38768,-0.096687,0.0,1.0,1.0,2.0,0.0,0.0,2.0,3.0,Obesity_Type_III
2108,-0.207057,0.518284,1.798894,1.088307,0.404102,0.068451,0.469993,-0.019483,0.0,1.0,1.0,2.0,0.0,0.0,2.0,3.0,Obesity_Type_III
2109,-0.049714,0.411156,1.785909,1.088307,0.404102,1.373925,0.152499,-0.118041,0.0,1.0,1.0,2.0,0.0,0.0,2.0,3.0,Obesity_Type_III


In [482]:
lista = [i for i in LabelBinarizer().fit_transform(data['NObeyesdad'])]
lista

[array([0, 1, 0, 0, 0, 0, 0]),
 array([0, 1, 0, 0, 0, 0, 0]),
 array([0, 1, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 1, 0]),
 array([0, 0, 0, 0, 0, 0, 1]),
 array([0, 1, 0, 0, 0, 0, 0]),
 array([0, 1, 0, 0, 0, 0, 0]),
 array([0, 1, 0, 0, 0, 0, 0]),
 array([0, 1, 0, 0, 0, 0, 0]),
 array([0, 1, 0, 0, 0, 0, 0]),
 array([0, 0, 1, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 1]),
 array([0, 1, 0, 0, 0, 0, 0]),
 array([0, 0, 1, 0, 0, 0, 0]),
 array([0, 1, 0, 0, 0, 0, 0]),
 array([0, 1, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 1]),
 array([0, 0, 1, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 1]),
 array([0, 0, 0, 0, 0, 1, 0]),
 array([0, 0, 0, 0, 0, 0, 1]),
 array([0, 0, 1, 0, 0, 0, 0]),
 array([0, 1, 0, 0, 0, 0, 0]),
 array([0, 0, 1, 0, 0, 0, 0]),
 array([0, 1, 0, 0, 0, 0, 0]),
 array([0, 1, 0, 0, 0, 0, 0]),
 array([0, 1, 0, 0, 0, 0, 0]),
 array([0, 1, 0, 0, 0, 0, 0]),
 array([0, 1, 0, 0, 0, 0, 0]),
 array([0, 1, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 1, 0]),
 array([0, 0, 0, 0, 0, 0, 1]),
 array([

In [483]:
for column in categorical_columns:
    data[column] = data[column].astype('category')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   Age                             2111 non-null   object  
 1   Height                          2111 non-null   object  
 2   Weight                          2111 non-null   object  
 3   FCVC                            2111 non-null   object  
 4   NCP                             2111 non-null   object  
 5   CH2O                            2111 non-null   object  
 6   FAF                             2111 non-null   object  
 7   TUE                             2111 non-null   object  
 8   Gender                          2111 non-null   category
 9   family_history_with_overweight  2111 non-null   category
 10  FAVC                            2111 non-null   category
 11  CAEC                            2111 non-null   category
 12  SMOKE               

## Save Clean Dataset

In [484]:
data.to_csv('data/obesity_clean.csv', index=False)

## Spliting Dataset

In [485]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('NObeyesdad', axis=1), data['NObeyesdad'], test_size=0.2, random_state=42)
pd.concat([X_train, y_train], axis=1).to_csv('data/X_obesity_train.csv', index=False)
pd.concat([X_test, y_test], axis=1).to_csv('data/X_obesity_test.csv', index=False)

## Training LogisticRegression

In [486]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

print(f'Acurácia Logistic Regressor: {logreg.score(X_test, y_test) * 100:.2f}%')

Acurácia Logistic Regressor: 87.23%


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Training SVC

In [487]:
from sklearn.svm import SVC
svc = SVC(random_state=42)
svc.fit(X_train, y_train)

print(f'Acurácia SVC: {svc.score(X_test, y_test) * 100:.2f}%')

Acurácia SVC: 91.49%


In [488]:
svc = SVC(kernel='linear', random_state=42)
svc.fit(X_train, y_train)

print(f'Acurácia SVC: {svc.score(X_test, y_test) * 100:.2f}%')

Acurácia SVC: 95.98%


## Random Forest

In [489]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)

print(f'Acurácia Random Forest: {random_forest.score(X_test, y_test) * 100:.2f}%')

Acurácia Random Forest: 95.27%


## Rede Neural - Perceptron

In [490]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

In [491]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

class ObesityDataset(Dataset):
    def __init__(self, csv_file, transform=None):
        self.data = pd.read_csv(csv_file)
        self.transform = transform

        # Separar features e labels
        self.features = self.data.iloc[:, :-1]  # Todas as colunas, exceto a última
        self.labels = self.data.iloc[:, -1]  # Última coluna (nível de obesidade)

        # Transformar colunas categóricas em numéricas
        self.label_encoder = LabelEncoder()
        self.labels = self.label_encoder.fit_transform(self.labels)

        categorical_columns = self.features.select_dtypes(include=['object']).columns
        for col in categorical_columns:
            self.features[col] = LabelEncoder().fit_transform(self.features[col])

        # Normalizar os dados
        self.scaler = StandardScaler()
        self.features = self.scaler.fit_transform(self.features)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = torch.tensor(self.features[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample, label

In [492]:
# Caminho do dataset
csv_file = 'data/ObesityDataSet_raw_and_data_sinthetic.csv'

# Criar dataset
full_dataset = ObesityDataset(csv_file)

# Dividir treino e teste
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])

# Criar DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [493]:
class ObesityClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(ObesityClassifier, self).__init__()
        self.hidden1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.hidden2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax()

    def forward(self, x):
        x = self.relu(self.hidden1(x))
        return self.softmax(self.hidden2(x))

In [494]:
args = {
    'input_size': full_dataset.features.shape[1],
    'hidden_size': 48,
    'output_size': len(full_dataset.labels),
    'epochs': 500,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu'
}

In [495]:
input_size = full_dataset.features.shape[1]  # Número de colunas do dataset
num_classes = len(set(full_dataset.labels))  # Número de classes
model = ObesityClassifier(args['input_size'], args['hidden_size'], args['output_size']).to(args['device'])

In [496]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss().to()

In [497]:
def train(model, optimizer, criterion, dataloader_train, args):
    for epoch in range(args['epochs']):
        print(f'Epoch: {epoch + 1}')
        for batch, (X, y) in enumerate(dataloader_train):
            X = X.to(args['device'])
            y = y.to(args['device'])
            # Forward
            optimizer.zero_grad()
            y_pred = model(X)
            loss = criterion(y_pred, y)

            # Backpropagation
            loss.backward()
            optimizer.step()
            print(f'Loss {loss}')

In [502]:
train(model, optimizer, criterion, train_loader, args)

Epoch: 1
Loss 6.65583610534668
Loss 6.655805587768555
Loss 6.655797958374023
Loss 6.68705940246582
Loss 6.718300819396973
Loss 6.7183074951171875
Loss 6.655808448791504
Loss 6.655887603759766
Loss 6.655829906463623
Loss 6.655806064605713
Loss 6.655793190002441
Loss 6.655786514282227
Loss 6.6558027267456055
Loss 6.655745983123779
Loss 6.6870317459106445
Loss 6.6558146476745605
Loss 6.655953407287598
Loss 6.655879497528076
Loss 6.655820369720459
Loss 6.655819416046143
Loss 6.655788898468018
Loss 6.655773639678955
Loss 6.687049388885498
Loss 6.655796527862549
Loss 6.6557536125183105
Loss 6.655851364135742
Loss 6.655779838562012
Loss 6.655849456787109
Loss 6.718351364135742
Loss 6.655857086181641
Loss 6.655814170837402
Loss 6.687088489532471
Loss 6.655765056610107


  return self._call_impl(*args, **kwargs)


Loss 6.687333106994629
Loss 6.655770778656006
Loss 6.686988830566406
Loss 6.655834674835205
Loss 6.655796527862549
Loss 6.655930519104004
Loss 6.655755519866943
Loss 6.687028408050537
Loss 6.65573787689209
Loss 6.7182817459106445
Loss 6.687102317810059
Loss 6.655968189239502
Loss 6.687134742736816
Loss 6.655825138092041
Loss 6.655881881713867
Loss 6.687000274658203
Loss 6.655847549438477
Loss 6.718425750732422
Loss 6.687199115753174
Loss 6.697425842285156
Epoch: 2
Loss 6.655801773071289
Loss 6.655777454376221
Loss 6.687077522277832
Loss 6.655756950378418
Loss 6.655957221984863
Loss 6.655882358551025
Loss 6.687050819396973
Loss 6.68708610534668
Loss 6.687027931213379
Loss 6.65580940246582
Loss 6.718284606933594
Loss 6.6870503425598145
Loss 6.718242645263672
Loss 6.655871868133545
Loss 6.687025547027588
Loss 6.6558122634887695
Loss 6.655811309814453
Loss 6.655840873718262
Loss 6.718262672424316
Loss 6.65585470199585
Loss 6.655848979949951
Loss 6.655858516693115
Loss 6.655852794647217
Los

In [501]:
from sklearn.metrics import accuracy_score

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for features, labels in test_loader:
        features, labels = features.to(args['device']), labels.to(args['device'])
        outputs = model(features)
        _, predicted = torch.max(outputs, 1)
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

acc = accuracy_score(all_labels, all_preds)
print(f"Acurácia no teste: {acc * 100:.2f}%")

Acurácia no teste: 91.73%


  return self._call_impl(*args, **kwargs)
