In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder,StandardScaler,FunctionTransformer,LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Location of the obesity CSV. The default is the original machine-specific
# path; set the OBESITY_DATA_PATH environment variable to run the notebook
# elsewhere without editing this cell.
DATA_PATH = os.environ.get(
    "OBESITY_DATA_PATH",
    r"C:\Users\phunk\Desktop\MyProject\machine learning\Obesity\Data\ObesityDataSet.csv",
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [4]:
# Count missing values per column.
df.isna().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [5]:
# Print the frame summary: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   int64  
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [6]:
# Features: every column except the target.
X = df.drop(columns="NObeyesdad")

# Encode the string target as integer class ids. The fitted encoder is kept
# (rather than discarded) so the ids can be mapped back to the original label
# names via `label_encoder.classes_` / `label_encoder.inverse_transform`.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["NObeyesdad"])

In [7]:
# Numeric feature columns (scaled by the numeric preprocessing branch).
num_col = [
    "Age",
    "Height",
    "Weight",
    "FCVC",
    "NCP",
    "CH2O",
    "FAF",
    "TUE",
]

# String-valued feature columns (encoded by the categorical preprocessing branch).
cat_col = [
    "Gender",
    "family_history_with_overweight",
    "FAVC",
    "CAEC",
    "SMOKE",
    "SCC",
    "CALC",
    "MTRANS",
]

In [8]:
def _round_to_int(values):
    """Round imputed category codes to the nearest integer code.

    Defined as a named function (not a lambda) so that pipelines containing
    it can be pickled.
    """
    return np.round(values).astype(int)


# Numeric branch: fill gaps from the 5 nearest neighbours, then standardise.
num_tranform = Pipeline(steps=[
    ("impute", KNNImputer(n_neighbors=5)),
    ("scale", StandardScaler()),
])

# Categorical branch: map each category to an integer code, impute missing
# codes from the 5 nearest neighbours, then round back to a valid integer.
# Categories that were not seen during fit are encoded as NaN instead of
# raising, so they flow through the same impute + round steps.
# (handle_unknown / unknown_value need scikit-learn >= 0.24.)
cat_tranform = Pipeline(steps=[
    ("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)),
    ("impute", KNNImputer(n_neighbors=5)),
    ("round", FunctionTransformer(_round_to_int)),
])

# Route each column group to its branch; columns listed in neither group are
# dropped (ColumnTransformer's default `remainder`).
preprocess = ColumnTransformer(transformers=[
    ("num", num_tranform, num_col),
    ("cat", cat_tranform, cat_col),
])

In [9]:
# Seed for the estimators that use randomness while fitting, so the reported
# metrics are repeatable from run to run. Same value as the train/test split.
SEED = 42

# Candidate classifiers, keyed by the display name used in the reports.
models = {
    "logistic regression": LogisticRegression(max_iter=500),
    "decision tree": DecisionTreeClassifier(random_state=SEED),
    "random forrest": RandomForestClassifier(n_estimators=300, random_state=SEED),
    "SVM": SVC(),
    "KNeighbors": KNeighborsClassifier(n_neighbors=3),
}

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Fit every candidate behind the shared preprocessing step and report its
# per-class metrics on the held-out split.
for name, model in models.items():
    clf = Pipeline([
        ("preprocess", preprocess),
        ("classifier", model),
    ])
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(f"{name}")
    print(classification_report(y_test, y_pred))

# Hard-voting ensemble built from the same candidates, evaluated the same way.
voting_clf = VotingClassifier(list(models.items()), voting="hard")
vclf = Pipeline([
    ("voteing preprocess", preprocess),
    ("voting classifier", voting_clf),
])
y_pred_voting = vclf.fit(X_train, y_train).predict(X_test)
print("Voting Result(Hard)")
print(classification_report(y_test, y_pred_voting))

logistic regression
              precision    recall  f1-score   support

           0       0.84      1.00      0.91        56
           1       0.90      0.61      0.73        62
           2       0.95      0.90      0.92        78
           3       0.89      0.97      0.93        58
           4       1.00      1.00      1.00        63
           5       0.76      0.79      0.77        56
           6       0.77      0.86      0.81        50

    accuracy                           0.87       423
   macro avg       0.87      0.87      0.87       423
weighted avg       0.88      0.87      0.87       423

decision tree
              precision    recall  f1-score   support

           0       0.91      0.95      0.93        56
           1       0.81      0.84      0.83        62
           2       0.95      0.94      0.94        78
           3       0.95      0.95      0.95        58
           4       1.00      1.00      1.00        63
           5       0.87      0.84      0.85 