# This notebook preprocesses data to feed into logistic regression model

In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [20]:
# Visualize data
df = pd.read_csv("ObesityDataSet_raw_and_data_sinthetic.csv")
df.head()

# Check balance or unbalance
total = len(df)
print("tot", total)
cols_to_count = ['Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III', 'Overweight_Level_I', 'Overweight_Level_II', 'Overweight_Level_III']

each_type = df["NObeyesdad"].value_counts()
print("obese count:", each_type)

obese_count = len(df["NObeyesdad"].apply(lambda x: x in ["Overweight_Level_I", "Overweight_Level_II", "Overweight_Level_III"]))

print("overweight count", obese_count)


tot 2111
obese count: NObeyesdad
Obesity_Type_I         351
Obesity_Type_III       324
Obesity_Type_II        297
Overweight_Level_I     290
Overweight_Level_II    290
Normal_Weight          287
Insufficient_Weight    272
Name: count, dtype: int64
overweight count 2111


In [21]:
# Normalize numerical variables:
numerical_columns = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF']
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(X=df[numerical_columns])

# Transform label column into binary
obese = (df['NObeyesdad'] == "Obesity_Type_I") |\
               (df['NObeyesdad'] == "Obesity_Type_II") |\
               (df['NObeyesdad'] == "Obesity_Type_III")|\
               (df['NObeyesdad'] == "Overweight_Level_I")|\
               (df['NObeyesdad'] == "Overweight_Level_II")|\
               (df['NObeyesdad'] == "Overweight_Level_III")

df['NObeyesdad'] = np.where(obese, 1, df['NObeyesdad'])

not_obese = (df['NObeyesdad'] == "Normal_Weight") |\
               (df['NObeyesdad'] == "Insufficient_Weight")
df['NObeyesdad'] = np.where(not_obese, 0, df['NObeyesdad'])

#print(df['NObeyesdad'])

# Encoding categorical variables: one-hot-encoding
categorical_columns = ['Gender', 'CALC', 'FAVC', 'SCC', 'SMOKE', 'family_history_with_overweight', 'CAEC', 'MTRANS']
one_hot = OneHotEncoder()
encoded_features = one_hot.fit_transform(df[categorical_columns]) 
encoded_array = encoded_features.toarray()
encoded_df = pd.DataFrame(encoded_array, columns= one_hot.get_feature_names_out(categorical_columns))


df = pd.concat([df[numerical_columns], encoded_df, df['NObeyesdad']], axis=1)
#df = df.drop(columns= ['NObeyesdad_Obesity_Type_I','NObeyesdad_Obesity_Type_II'])

obese = len(df[df["NObeyesdad"] == 1])
not_obese = len(df[df["NObeyesdad"] == 0])

print(f"obese{obese}")
print(f"not obese {not_obese}")



df.head(10)

obese1552
not obese 559


Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,Gender_Female,Gender_Male,CALC_Always,...,CAEC_Always,CAEC_Frequently,CAEC_Sometimes,CAEC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking,NObeyesdad
0,-0.522124,-0.875589,-0.862558,-0.785019,0.404153,-0.013073,-1.188039,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1,-0.522124,-1.947599,-1.168077,1.088342,0.404153,1.618759,2.33975,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2,-0.206889,1.054029,-0.36609,-0.785019,0.404153,-0.013073,1.16382,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
3,0.423582,1.054029,0.015808,1.088342,0.404153,-0.013073,1.16382,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,-0.364507,0.839627,0.12274,-0.785019,-2.167023,-0.013073,-1.188039,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
5,0.738817,-0.875589,-1.282647,-0.785019,0.404153,-0.013073,-1.188039,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0
6,-0.206889,-2.162001,-1.206267,1.088342,0.404153,-0.013073,-0.012109,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
7,-0.364507,-0.661187,-1.282647,-0.785019,0.404153,-0.013073,2.33975,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
8,-0.049271,0.839627,-0.862558,1.088342,0.404153,-0.013073,-0.012109,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
9,-0.364507,0.196421,-0.709799,-0.785019,0.404153,-0.013073,-0.012109,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0


In [22]:
df.to_csv("processed_data.csv")