# This notebook preprocesses the data to feed into a logistic regression model

In [61]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [62]:
# Load the raw dataset from CSV and preview its first rows
raw_csv_path = "ObesityDataSet_raw_and_data_sinthetic.csv"
df = pd.read_csv(raw_csv_path)
df.head()


Unnamed: 0,Age,Gender,Height,Weight,CALC,FAVC,FCVC,NCP,SCC,SMOKE,CH2O,family_history_with_overweight,FAF,TUE,CAEC,MTRANS,NObeyesdad
0,21.0,Female,1.62,64.0,no,no,2.0,3.0,no,no,2.0,yes,0.0,1.0,Sometimes,Public_Transportation,Normal_Weight
1,21.0,Female,1.52,56.0,Sometimes,no,3.0,3.0,yes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,23.0,Male,1.8,77.0,Frequently,no,2.0,3.0,no,no,2.0,yes,2.0,1.0,Sometimes,Public_Transportation,Normal_Weight
3,27.0,Male,1.8,87.0,Frequently,no,3.0,3.0,no,no,2.0,no,2.0,0.0,Sometimes,Walking,Overweight_Level_I
4,22.0,Male,1.78,89.8,Sometimes,no,2.0,1.0,no,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [63]:
# Standardize the numeric features with StandardScaler (per-column zero mean,
# unit variance), overwriting the original columns in place.
# NOTE(review): the numeric column 'TUE' shown in the raw preview is not listed
# here, so it is dropped from the output built below -- confirm this is intended.
# NOTE(review): the scaler is fitted on the full dataset; if a train/test split
# happens downstream, the held-out rows influence the scaling statistics.
numerical_columns = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF']
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(X=df[numerical_columns])

# One-hot encode the categorical features: each category value becomes its own
# 0/1 indicator column named "<column>_<value>".
categorical_columns = ['Gender', 'CALC', 'FAVC', 'SCC', 'SMOKE', 'family_history_with_overweight', 'CAEC', 'MTRANS']
one_hot = OneHotEncoder()
encoded_features = one_hot.fit_transform(df[categorical_columns])
encoded_array = encoded_features.toarray()  # densify the encoder's sparse output
# Reuse df's index so the column-wise concat below pairs rows by the same labels
# as df, instead of relying on df happening to carry a default 0..n-1 index
# (a mismatch would silently produce misaligned / NaN-padded rows).
encoded_df = pd.DataFrame(
    encoded_array,
    index=df.index,
    columns=one_hot.get_feature_names_out(categorical_columns),
)

# Keep only the model inputs: scaled numeric columns followed by the one-hot columns.
# NOTE(review): the target column 'NObeyesdad' is not carried over -- presumably
# the labels are taken from the raw file downstream; confirm.
df = pd.concat([df[numerical_columns], encoded_df], axis=1)

df.head(10)

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,Gender_Female,Gender_Male,CALC_Always,...,family_history_with_overweight_yes,CAEC_Always,CAEC_Frequently,CAEC_Sometimes,CAEC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,-0.522124,-0.875589,-0.862558,-0.785019,0.404153,-0.013073,-1.188039,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.522124,-1.947599,-1.168077,1.088342,0.404153,1.618759,2.33975,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.206889,1.054029,-0.36609,-0.785019,0.404153,-0.013073,1.16382,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.423582,1.054029,0.015808,1.088342,0.404153,-0.013073,1.16382,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.364507,0.839627,0.12274,-0.785019,-2.167023,-0.013073,-1.188039,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5,0.738817,-0.875589,-1.282647,-0.785019,0.404153,-0.013073,-1.188039,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
6,-0.206889,-2.162001,-1.206267,1.088342,0.404153,-0.013073,-0.012109,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
7,-0.364507,-0.661187,-1.282647,-0.785019,0.404153,-0.013073,2.33975,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
8,-0.049271,0.839627,-0.862558,1.088342,0.404153,-0.013073,-0.012109,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
9,-0.364507,0.196421,-0.709799,-0.785019,0.404153,-0.013073,-0.012109,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [64]:
# Write the preprocessed feature table to disk.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column; confirm the consumer reads it back with index_col=0
# (otherwise it shows up as an extra "Unnamed: 0" column).
output_csv_path = "processed_data.csv"
df.to_csv(output_csv_path)