Data background, cleaning and EDA --> [Link](https://www.kaggle.com/code/ustcer1984/obesity-eda-cluster-playground-s4e2)

## K-Nearest Neighbors (KNN)

KNN model is powerful and easy to understand, but to achieve best performance it is important to apply suitable scaler to the independent variables.

Let's try below 3 scalers and compare their performance:  
1. MinMaxScaler (linearly transform the scale to range[0, 1])
2. StandardScaler (linearly transform the scale to standard deviation and mean location)
3. PowerTransformer (normalize and standardlize the variables)

In [3]:
# Environment setup
fast_mode = True # If true, use simple param_grid to reduce running time, for drafting stage
input_path = './'

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None) # show all columns

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme() # I like seaborn default theme

from sklearn.preprocessing import StandardScaler, PowerTransformer, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier, KernelDensity

import warnings
warnings.filterwarnings('ignore') # suppress warning msg

In [None]:
# data transform
df_train_raw = pd.read_csv(input_path + 'train.csv')
df_test_raw = pd.read_csv(input_path + 'test.csv')

df0 = df_train_raw.copy()
df0.drop(columns=['id'], inplace=True)
df0.columns = df0.columns.str.lower()
df0.rename(columns={'family_history_with_overweight':'history'}, inplace=True)

# tranform boolean columns
for col in ['history', 'favc', 'smoke', 'scc']:
    df0[col] = df0[col].map({'yes': True, 'no': False})

# transfer categorical columns
df0['gender'] = pd.Categorical(df0['gender'], 
                               categories=['Male', 'Female'],
                               ordered=True)
df0['caec'] = pd.Categorical(df0['caec'],
                             categories=['Frequently', 'Always', 'no', 'Sometimes'],
                             ordered=True)
df0['calc'] = pd.Categorical(df0['calc'],
                             categories=['Frequently', 'no', 'Sometimes'],
                             ordered=True)
df0['mtrans'] = pd.Categorical(df0['mtrans'],
                               categories=['Walking', 'Bike', 'Motorbike', 
                                           'Automobile', 'Public_Transportation'],
                               ordered=True)
df0['nobeyesdad'] = pd.Categorical(df0['nobeyesdad'],
                                   categories=['Insufficient_Weight', 'Normal_Weight', 
                                               'Overweight_Level_I', 'Overweight_Level_II', 
                                               'Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III'], 
                                   ordered=True)

df0['bmi'] = df0['weight'] / np.square(df0['height'])
df0.drop(columns=['height', 'weight'], inplace=True)

# ordinary encoding all categorical variables
for col in df0.columns:
    if df0[col].dtype == 'category':
        df0[col] = df0[col].cat.codes

# split train and val datasets
## prepare stratify standard column
df0['stratify'] = np.zeros(df0.shape[0])
for col in ['gender', 'favc', 'smoke', 'scc']:
    df0['stratify'] = df0['stratify'] * 10 + df0[col]
df0['stratify'] = df0['stratify'].convert_dtypes('int')

# select X, y
X = df0.drop(columns=['stratify', 'nobeyesdad'])
y = df0['nobeyesdad']

# split train and validate datasets
X_train, X_val, y_train, y_val = \
    train_test_split(X, y, test_size=0.25, stratify=df0['stratify'], random_state=42)

# reset index for split datasets, otherwise later correlation factor calculation will be wrong
X_train.reset_index(drop=True, inplace=True)
X_val.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_val.reset_index(drop=True, inplace=True)

# build and tune models
model_summary = {'id': [1, 2, 3],
                 'model': ['knn clf'] *3,
                 'scaler': ['MinMaxScaler', 'StandardScaler', 'PowerTransformer'],
                 'accuracy_default': [],
                 'accuracy_tuned': [],
                 'comment': [''] * 3}
scalers = [MinMaxScaler(), StandardScaler(), PowerTransformer()]

knn = KNeighborsClassifier(n_jobs=-1)
if fast_mode == True:
    param_grid = {'n_neighbors': np.arange(10, 11),
                'weights': ['distance'],
                'p': [1]}
else:
    param_grid = {'n_neighbors': np.arange(1, 51),
                'weights': ['uniform', 'distance'],
                'p': [1, 2]}

for scaler in scalers:
    X_train_scale = scaler.fit_transform(X_train)
    X_val_scale = scaler.transform(X_val)
    # default KNN model performance
    knn.fit(X_train_scale, y_train)
    score = metrics.accuracy_score(y_val, knn.predict(X_val_scale))
    model_summary['accuracy_default'].append(score)
    # hyper parameter tuning
    grid = GridSearchCV(knn, param_grid, cv=4,
                        scoring=['accuracy'], refit='accuracy', n_jobs=-1)
    grid.fit(X_train_scale, y_train)
    score = metrics.accuracy_score(y_val, grid.predict(X_val_scale))
    model_summary['accuracy_tuned'].append(score)

pd.DataFrame(model_summary)

Looks like `StandardScaler` and `PowerTransformer` perform better than `MinMaxScaler`. However, the tuned model score is still not idea.

One assumption of KNN model is all features have same prediction power, which is obviously not valid according to our EDA result.

How to improve? Maybe for those features having better correlation with target, we should expand their range so the 'distance' can be longer. We can try to adjust their range based on their **correlation factor** with target. 

In [None]:
model_summary['id'] = model_summary['id'] + [4, 5, 6]
model_summary['model'] = model_summary['model'] * 2
model_summary['scaler'] = model_summary['scaler'] +\
    ['MinMaxScaler * w', 'StandardScaler * w', 'PowerTransformer * w']
model_summary['comment'] = model_summary['comment'] + ['weight: r']*3
knn = KNeighborsClassifier(n_jobs=-1)
for scaler in scalers:
    X_train_scale = scaler.fit_transform(X_train)
    X_val_scale = scaler.transform(X_val)
    # correlation factor
    X_train_scale = pd.DataFrame(X_train_scale, columns=X_train.columns)
    X_val_scale = pd.DataFrame(X_val_scale, columns=X_val.columns)
    r_score = X_train_scale.corrwith(y_train)
    for col in X_train.columns:
        r = r_score[col]
        X_train_scale[col] = X_train_scale[col] * r
        X_val_scale[col] = X_val_scale[col] * r
    # default KNN model performance
    knn.fit(X_train_scale, y_train)
    score = metrics.accuracy_score(y_val, knn.predict(X_val_scale))
    model_summary['accuracy_default'].append(score)
    # hyper parameter tuning
    grid = GridSearchCV(knn, param_grid, cv=4,
                        scoring=['accuracy'], refit='accuracy', n_jobs=-1)
    grid.fit(X_train_scale, y_train)
    score = metrics.accuracy_score(y_val, grid.predict(X_val_scale))
    model_summary['accuracy_tuned'].append(score)

pd.DataFrame(model_summary)

Unnamed: 0,id,model,scaler,accuracy_default,accuracy_tuned,comment
0,1,knn clf,MinMaxScaler,0.726397,0.778998,
1,2,knn clf,StandardScaler,0.749711,0.794027,
2,3,knn clf,PowerTransformer,0.750674,0.792678,
3,4,knn clf,MinMaxScaler * w,0.83025,0.852408,weight: r
4,5,knn clf,StandardScaler * w,0.840655,0.85973,weight: r
5,6,knn clf,PowerTransformer * w,0.840077,0.860501,weight: r


Very significant improvement (+0.06~0.1 accuracy score)!

Is $r$ the best weight choice? Let's try $\sqrt{|r|}$ and $r^2$ also.

In [None]:
model_summary['id'] = model_summary['id'] + [7, 8, 9, 10, 11, 12]
model_summary['model'] = model_summary['model'] * 2
model_summary['scaler'] = model_summary['scaler'] +\
    ['MinMaxScaler * w', 'StandardScaler * w', 'PowerTransformer * w'] * 2
model_summary['comment'] = model_summary['comment'] + ['weight: sqrt(abs(r))'] * 3 + ['weight: r^2']*3
knn = KNeighborsClassifier(n_jobs=-1)
for scaler in scalers:
    X_train_scale = scaler.fit_transform(X_train)
    X_val_scale = scaler.transform(X_val)
    # correlation factor
    X_train_scale = pd.DataFrame(X_train_scale, columns=X_train.columns)
    X_val_scale = pd.DataFrame(X_val_scale, columns=X_val.columns)
    r_score = X_train_scale.corrwith(y_train)
    for r_score_ in [r_score.apply(lambda x: np.sqrt(abs(x))),
                    r_score.apply(lambda x: x*x)]:
        for col in X_train.columns:
            r = r_score_[col]
            X_train_scale[col] = X_train_scale[col] * r
            X_val_scale[col] = X_val_scale[col] * r
        # default KNN model performance
        knn.fit(X_train_scale, y_train)
        score = metrics.accuracy_score(y_val, knn.predict(X_val_scale))
        model_summary['accuracy_default'].append(score)
        # hyper parameter tuning
        grid = GridSearchCV(knn, param_grid, cv=4,
                            scoring=['accuracy'], refit='accuracy', n_jobs=-1)
        grid.fit(X_train_scale, y_train)
        score = metrics.accuracy_score(y_val, grid.predict(X_val_scale))
        model_summary['accuracy_tuned'].append(score)

pd.DataFrame(model_summary)

Unnamed: 0,id,model,scaler,accuracy_default,accuracy_tuned,comment
0,1,knn clf,MinMaxScaler,0.726397,0.778998,
1,2,knn clf,StandardScaler,0.749711,0.794027,
2,3,knn clf,PowerTransformer,0.750674,0.792678,
3,4,knn clf,MinMaxScaler * w,0.83025,0.852408,weight: r
4,5,knn clf,StandardScaler * w,0.840655,0.85973,weight: r
5,6,knn clf,PowerTransformer * w,0.840077,0.860501,weight: r
6,7,knn clf,MinMaxScaler * w,0.784778,0.82158,weight: sqrt(abs(r))
7,8,knn clf,StandardScaler * w,0.842582,0.857418,weight: sqrt(abs(r))
8,9,knn clf,PowerTransformer * w,0.807322,0.83738,weight: sqrt(abs(r))
9,10,knn clf,MinMaxScaler * w,0.847592,0.860501,weight: r^2


$\sqrt{|r|}$ is best with `StandardScaler`, $r^2$ can slightly imporve with `MinMaxScaler` and `PowerTransformer`.

Is there any other way to improve? You may notice that all features are NOT strictly linearly correlated to the target, so using linear correlation factor may not be the optimized option.

Let's try another approach:  
- Categorical variables, encode them with average target code value
- Continuous variables, transfer them with average target code value determined by density function

In [None]:
# feature transform
scaler = PowerTransformer()
X_train_scale = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_val_scale = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
for col in X_train_scale.columns:
    if X_train_scale[col].unique().size < 6: # categorical
        XY = pd.concat([X_train_scale[col], y_train], axis=1)
        XY = XY.groupby(col).mean()
        X_train_scale[col] = X_train_scale[col].apply(lambda x: XY.loc[x])
        X_val_scale[col] = X_val_scale[col].apply(lambda x: XY.loc[x])
    else: # continuous
        train_dw = np.zeros(X_train_scale.shape[0]) # _dw: density * weight
        train_d = np.zeros(X_train_scale.shape[0]) # _d: density
        val_dw = np.zeros(X_val_scale.shape[0])
        val_d = np.zeros(X_val_scale.shape[0])
        for i in range(7): # 7 obesity levels
            kde = KernelDensity(bandwidth=1/30) # bandwidth
            kde.fit(X_train_scale[y_train==i][[col]])
            train_density = np.exp(kde.score_samples(X_train_scale[[col]]))
            train_dw = train_dw + train_density * i
            train_d = train_d + train_density
            val_density = np.exp(kde.score_samples(X_val_scale[[col]]))
            val_dw = val_dw + val_density * i
            val_d = val_d + val_density
        X_train_scale[col] = train_dw / train_d
        X_val_scale[col] = val_dw / val_d

# update model_summary
model_summary['id'].append(model_summary['id'][-1] + 1)
model_summary['model'].append('knn clf')
model_summary['scaler'].append('PowerTransformer + kde')
model_summary['comment'].append('')

# fit model
knn = KNeighborsClassifier(n_jobs=-1)

# default KNN model performance
knn.fit(X_train_scale, y_train)
score = metrics.accuracy_score(y_val, knn.predict(X_val_scale))
model_summary['accuracy_default'].append(score)

# hyper parameter tuning
grid = GridSearchCV(knn, param_grid, cv=4,
                    scoring=['accuracy'], refit='accuracy', n_jobs=-1)
grid.fit(X_train_scale, y_train)
score = metrics.accuracy_score(y_val, grid.predict(X_val_scale))
model_summary['accuracy_tuned'].append(score)

pd.DataFrame(model_summary)

Unnamed: 0,id,model,scaler,accuracy_default,accuracy_tuned,comment
0,1,knn clf,MinMaxScaler,0.726397,0.778998,
1,2,knn clf,StandardScaler,0.749711,0.794027,
2,3,knn clf,PowerTransformer,0.750674,0.792678,
3,4,knn clf,MinMaxScaler * w,0.83025,0.852408,weight: r
4,5,knn clf,StandardScaler * w,0.840655,0.85973,weight: r
5,6,knn clf,PowerTransformer * w,0.840077,0.860501,weight: r
6,7,knn clf,MinMaxScaler * w,0.784778,0.82158,weight: sqrt(abs(r))
7,8,knn clf,StandardScaler * w,0.842582,0.857418,weight: sqrt(abs(r))
8,9,knn clf,PowerTransformer * w,0.807322,0.83738,weight: sqrt(abs(r))
9,10,knn clf,MinMaxScaler * w,0.847592,0.860501,weight: r^2


Performance is worse...

Lastly, from EDA we know male and female with severe obesity conditions show big difference in features. If we build models for male and female respectively, will it imporve overall performance?

In [None]:
# split datasets
XY_train = pd.concat([X_train, y_train], axis=1)
XY_train_male = XY_train[XY_train['gender']==0].reset_index(drop=True)
X_train_male = XY_train_male.drop(columns=['nobeyesdad', 'gender'])
y_train_male = XY_train_male['nobeyesdad']
XY_train_female = XY_train[XY_train['gender']==1].reset_index(drop=True)
X_train_female = XY_train_female.drop(columns=['nobeyesdad', 'gender'])
y_train_female = XY_train_female['nobeyesdad']

XY_val = pd.concat([X_val, y_val], axis=1)
XY_val_male = XY_val[XY_val['gender']==0].reset_index(drop=True)
X_val_male = XY_val_male.drop(columns=['nobeyesdad', 'gender'])
y_val_male = XY_val_male['nobeyesdad']
XY_val_female = XY_val[XY_val['gender']==1].reset_index(drop=True)
X_val_female = XY_val_female.drop(columns=['nobeyesdad', 'gender'])
y_val_female = XY_val_female['nobeyesdad']

# model for male
scaler = PowerTransformer()
X_train_male = pd.DataFrame(scaler.fit_transform(X_train_male), columns=X_train_male.columns)
X_val_male = pd.DataFrame(scaler.transform(X_val_male), columns=X_val_male.columns)
r_score = X_train_male.corrwith(y_train_male)
for col in X_train_male.columns:
    r = r_score[col]
    X_train_male[col] = X_train_male[col] * r * r
    X_val_male[col] = X_val_male[col] * r * r
knn_male = KNeighborsClassifier(n_jobs=-1)
knn_male.fit(X_train_male, y_train_male)

# model for female
X_train_female = pd.DataFrame(scaler.fit_transform(X_train_female), columns=X_train_female.columns)
X_val_female = pd.DataFrame(scaler.transform(X_val_female), columns=X_val_female.columns)
r_score = X_train_female.corrwith(y_train_female)
for col in X_train_female.columns:
    r = r_score[col]
    X_train_female[col] = X_train_female[col] * r * r
    X_val_female[col] = X_val_female[col] * r * r
knn_female = KNeighborsClassifier(n_jobs=-1)
knn_female.fit(X_train_female, y_train_female)

# update default models score
score_male = metrics.accuracy_score(y_val_male, knn_male.predict(X_val_male))
score_female = metrics.accuracy_score(y_val_female, knn_female.predict(X_val_female))
score_all = (score_male * X_train_male.shape[0] + score_female * X_train_female.shape[0])/X_train.shape[0]

model_summary['id'].append(model_summary['id'][-1] + 1)
model_summary['model'].append('knn clf')
model_summary['scaler'].append('PowerTransformer * r^2')
model_summary['comment'].append('Models for male/female respectively')
model_summary['accuracy_default'].append(score_all)

# hyper parameter tuning
grid_male = GridSearchCV(knn_male, param_grid, cv=4,
                         scoring=['accuracy'], refit='accuracy', n_jobs=-1)
grid_male.fit(X_train_male, y_train_male)
score_male = metrics.accuracy_score(y_val_male, grid_male.predict(X_val_male))

grid_female = GridSearchCV(knn_female, param_grid, cv=4,
                           scoring=['accuracy'], refit='accuracy', n_jobs=-1)
grid_female.fit(X_train_female, y_train_female)
score_female = metrics.accuracy_score(y_val_female, grid_female.predict(X_val_female))

score_all = (score_male * X_train_male.shape[0] + score_female * X_train_female.shape[0])/X_train.shape[0]
model_summary['accuracy_tuned'].append(score_all)

pd.DataFrame(model_summary)

Unnamed: 0,id,model,scaler,accuracy_default,accuracy_tuned,comment
0,1,knn clf,MinMaxScaler,0.726397,0.778998,
1,2,knn clf,StandardScaler,0.749711,0.794027,
2,3,knn clf,PowerTransformer,0.750674,0.792678,
3,4,knn clf,MinMaxScaler * w,0.83025,0.852408,weight: r
4,5,knn clf,StandardScaler * w,0.840655,0.85973,weight: r
5,6,knn clf,PowerTransformer * w,0.840077,0.860501,weight: r
6,7,knn clf,MinMaxScaler * w,0.784778,0.82158,weight: sqrt(abs(r))
7,8,knn clf,StandardScaler * w,0.842582,0.857418,weight: sqrt(abs(r))
8,9,knn clf,PowerTransformer * w,0.807322,0.83738,weight: sqrt(abs(r))
9,10,knn clf,MinMaxScaler * w,0.847592,0.860501,weight: r^2


Slight improvement of +0.007 accuracy score.

In [None]:
# prepare final model
# split datasets
XY = pd.concat([X, y], axis=1)
XY_male = XY[XY['gender']==0].reset_index(drop=True)
X_male = XY_male.drop(columns=['nobeyesdad', 'gender'])
y_male = XY_male['nobeyesdad']
XY_female = XY[XY['gender']==1].reset_index(drop=True)
X_female = XY_female.drop(columns=['nobeyesdad', 'gender'])
y_female = XY_female['nobeyesdad']

# model for male
scaler_male = PowerTransformer()
X_male = pd.DataFrame(scaler_male.fit_transform(X_male), columns=X_male.columns)
r_score_male = X_male.corrwith(y_male)
for col in X_male.columns:
    r = r_score_male[col]
    X_male[col] = X_male[col] * r * r
knn_male = KNeighborsClassifier(n_jobs=-1)
grid_male = GridSearchCV(knn_male, param_grid, cv=4,
                         scoring=['accuracy'], refit='accuracy', n_jobs=-1)
grid_male.fit(X_male, y_male)

# model for female
scaler_female = PowerTransformer()
X_female = pd.DataFrame(scaler_female.fit_transform(X_female), columns=X_female.columns)
r_score_female = X_female.corrwith(y_female)
for col in X_female.columns:
    r = r_score_female[col]
    X_female[col] = X_female[col] * r * r
knn_female = KNeighborsClassifier(n_jobs=-1)
grid_female = GridSearchCV(knn_female, param_grid, cv=4,
                           scoring=['accuracy'], refit='accuracy', n_jobs=-1)
grid_female.fit(X_female, y_female)

# prepare test data
df1 = df_test_raw.copy()
df1.columns = df1.columns.str.lower()
df1.rename(columns={'family_history_with_overweight':'history'}, inplace=True)

# tranform boolean columns
for col in ['history', 'favc', 'smoke', 'scc']:
    df1[col] = df1[col].map({'yes': True, 'no': False})

# transfer categorical columns
df1['gender'] = pd.Categorical(df1['gender'], 
                               categories=['Male', 'Female'],
                               ordered=True)
df1['caec'] = pd.Categorical(df1['caec'],
                             categories=['Frequently', 'Always', 'no', 'Sometimes'],
                             ordered=True)
df1['calc'] = pd.Categorical(df1['calc'],
                             categories=['Frequently', 'no', 'Sometimes'],
                             ordered=True)
df1['mtrans'] = pd.Categorical(df1['mtrans'],
                               categories=['Walking', 'Bike', 'Motorbike', 
                                           'Automobile', 'Public_Transportation'],
                               ordered=True)

df1['bmi'] = df1['weight'] / np.square(df1['height'])
df1.drop(columns=['height', 'weight'], inplace=True)

# ordinary encoding all categorical variables
for col in df1.columns:
    if df1[col].dtype == 'category':
        df1[col] = df1[col].cat.codes

# split into male and female
df1_male = df1[df1['gender']==0].reset_index(drop=True)
X_test_male_id = df1_male[['id']]
X_test_male = df1_male.drop(columns=['id', 'gender'])
X_test_male = pd.DataFrame(scaler_male.fit_transform(X_test_male), columns=X_test_male.columns)
for col in X_test_male.columns:
    r = r_score_male[col]
    X_test_male[col] = X_test_male[col] * r * r

df1_female = df1[df1['gender']==1].reset_index(drop=True)
X_test_female_id = df1_female[['id']]
X_test_female = df1_female.drop(columns=['id', 'gender'])
X_test_female = pd.DataFrame(scaler_female.fit_transform(X_test_female), columns=X_test_female.columns)
for col in X_test_female.columns:
    r = r_score_female[col]
    X_test_female[col] = X_test_female[col] * r * r

# prepare submission file
nobeyesdad_list = ['Insufficient_Weight', 'Normal_Weight', 'Overweight_Level_I', 
                   'Overweight_Level_II', 'Obesity_Type_I', 'Obesity_Type_II', 
                   'Obesity_Type_III']

y_male_pred = grid_male.predict(X_test_male)
df_male = X_test_male_id.copy()
df_male['NObeyesdad'] = y_male_pred
df_male['NObeyesdad'] = df_male['NObeyesdad'].apply(lambda x: nobeyesdad_list[x])

y_female_pred = grid_female.predict(X_test_female)
df_female = X_test_female_id.copy()
df_female['NObeyesdad'] = y_female_pred
df_female['NObeyesdad'] = df_female['NObeyesdad'].apply(lambda x: nobeyesdad_list[x])

df_submit = pd.concat([df_male, df_female], axis=0)
df_submit = df_submit.sort_values('id').reset_index(drop=True)
df_submit.to_csv('knn_submission.csv', index=False)