Data background, cleaning and EDA --> [Link](https://www.kaggle.com/code/ustcer1984/obesity-eda-cluster-playground-s4e2)  
KNN model scaler tuning --> [Link](https://www.kaggle.com/code/ustcer1984/obesity-prediction-knn-model-scaler-tuning-s4e2)  
Ethics concern on model and metric selection --> [Link](https://www.kaggle.com/code/ustcer1984/obesity-ethics-concern-on-model-selection)

## Random seed effect on model performance

Out of curiosity: how much does model performance vary when only the random seed changes?

#### **Plan**

- Compare RF, XGB and LGBM models.
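    - All use default hyperparameters; only `random_state` varies (`n_jobs=-1`, plus `verbose=-1` for LGBM, are also passed but are not tuning parameters)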
- Study the statistical distribution of the validation accuracy scores.

In [1]:
# Environment setup: bring in every dependency first, then configure the session
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Experiment knobs
num_seeds = 10     # how many consecutive random seeds each model is trained with
input_path = './'  # prefix prepended to 'train.csv' / 'test.csv' when loading

# Display and plotting preferences
pd.set_option('display.max_columns', None)  # never truncate columns when a frame is displayed
sns.set_theme()                             # use seaborn's default theme for figures

# Blanket-suppress warning messages for the rest of the session
# (kept after the imports so import-time warnings are unaffected).
warnings.filterwarnings('ignore')

In [2]:
# data transform
# Load both competition files; only the training frame is processed below.
df_train_raw = pd.read_csv(input_path + 'train.csv')
df_test_raw = pd.read_csv(input_path + 'test.csv')

# Derive the working frame without touching the raw one:
# drop the id column, lower-case every column name, shorten one long name.
df0 = (
    df_train_raw
    .drop(columns=['id'])
    .rename(columns=str.lower)
    .rename(columns={'family_history_with_overweight': 'history'})
)

# transform yes/no columns into booleans
yes_no_to_bool = {'yes': True, 'no': False}
for col in ['history', 'favc', 'smoke', 'scc']:
    df0[col] = df0[col].map(yes_no_to_bool)

# convert string columns into ordered categoricals; the list order defines the
# category order (and therefore the integer codes used for ordinal encoding).
# NOTE(review): the 'caec' / 'calc' orders are not monotonic in frequency --
# presumably chosen in the linked EDA notebook; confirm before changing.
category_levels = {
    'gender': ['Male', 'Female'],
    'caec': ['Frequently', 'Always', 'no', 'Sometimes'],
    'calc': ['Frequently', 'no', 'Sometimes'],
    'mtrans': ['Walking', 'Bike', 'Motorbike',
               'Automobile', 'Public_Transportation'],
    'nobeyesdad': ['Insufficient_Weight', 'Normal_Weight',
                   'Overweight_Level_I', 'Overweight_Level_II',
                   'Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III'],
}
for col, levels in category_levels.items():
    df0[col] = pd.Categorical(df0[col], categories=levels, ordered=True)

# engineered feature: weight divided by height squared
height_squared = np.square(df0['height'])
df0['bmi'] = df0['weight'] / height_squared

# ordinal encoding: every categorical column is replaced by its integer category codes
df0_ordinal = df0.copy()
for col in df0.select_dtypes(include='category').columns:
    df0_ordinal[col] = df0_ordinal[col].cat.codes

# one hot encoding: features only (the target column is removed first)
features_only = df0.drop(columns=['nobeyesdad'])
df0_onehot = pd.get_dummies(features_only)

# prepare stratify standard column
# Pack gender / favc / smoke / scc into one integer key, one decimal digit per
# column (e.g. 1010), so the train/validation split can be stratified on their
# joint distribution.
# The key is accumulated in explicit int64. The previous
# `.convert_dtypes('int')` call did not request an integer dtype:
# `convert_dtypes` takes no target dtype, so 'int' was silently bound to its
# first boolean flag and the integer result was only a side effect of the
# default conversion rules.
stratify_key = pd.Series(0, index=df0_ordinal.index, dtype='int64')
for col in ['gender', 'favc', 'smoke', 'scc']:
    stratify_key = stratify_key * 10 + df0_ordinal[col].astype('int64')
df0_ordinal['stratify'] = stratify_key

# select X, y
# Target is the encoded obesity class; the ordinal feature matrix excludes both
# the target and the helper stratify column.
y = df0_ordinal['nobeyesdad']
X_ordinal = df0_ordinal.drop(columns=['stratify', 'nobeyesdad'])
X_onehot = df0_onehot.copy()

# split train and validate datasets
# All three objects are split together so their rows stay aligned; 25% of the
# rows go to validation, stratified on the combined key column.
split_parts = train_test_split(
    X_ordinal, X_onehot, y,
    test_size=0.25,
    stratify=df0_ordinal['stratify'],
    random_state=42,
)

# reset index for all split datasets (fresh 0..n-1 index, old index discarded)
(X_ordinal_train, X_ordinal_val,
 X_onehot_train, X_onehot_val,
 y_train, y_val) = [part.reset_index(drop=True) for part in split_parts]

In [6]:
# Registry of the models under comparison.
# model_list  : short names, in the order they are trained
# model_dict  : per model, the estimator class plus the best fitted model and
#               best score seen so far (0 is the "nothing yet" placeholder)
# model_score : per model, the list of scores collected across seeds
model_list = ['RF', 'XGB', 'LGBM']
estimator_classes = (RandomForestClassifier, XGBClassifier, LGBMClassifier)
model_dict = {
    name: {'model': estimator, 'best_model': 0, 'best_score': 0}
    for name, estimator in zip(model_list, estimator_classes)
}
model_score = {name: [] for name in model_list}

In [7]:
%%time
# Reset the per-model results first so this cell is idempotent: without this,
# re-running the cell appends a second copy of every score to model_score
# (defined in an earlier cell) and keeps the previous run's best model.
for model in model_list:
    model_score[model] = []
    model_dict[model]['best_model'] = 0
    model_dict[model]['best_score'] = 0

# Train every model once per seed (seeds 2000 .. 2000 + num_seeds - 1) on the
# one-hot features and record its accuracy on the validation split.
for seed in range(2000, 2000 + num_seeds):
    for model in model_list:
        # Only the seed differs between runs; n_jobs=-1 is passed to every model.
        model_kwargs = {'random_state': seed, 'n_jobs': -1}
        if model == 'LGBM':
            # LGBM alone also receives verbose=-1 (presumably to quiet its log output).
            model_kwargs['verbose'] = -1
        clf = model_dict[model]['model'](**model_kwargs)
        clf.fit(X_onehot_train, y_train)
        y_pred = clf.predict(X_onehot_val)
        score = metrics.accuracy_score(y_val, y_pred)
        model_score[model].append(score)
        # Keep the fitted estimator with the highest validation accuracy so far
        # (strict '>' means the earliest seed wins ties).
        if score > model_dict[model]['best_score']:
            model_dict[model]['best_model'] = clf
            model_dict[model]['best_score'] = score

CPU times: total: 13min 17s
Wall time: 58.7 s


In [5]:
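# # NOTE(review): this disabled submission cell calls `final_model`, which is not
# # defined in the visible cells -- assign it first (presumably one of
# # model_dict[...]['best_model']) before re-enabling the code below.
# # prepare test dataset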
# df1 = df_test_raw.copy()
# df_submit = df1[['id']] # reserve id column

# df1.drop(columns=['id'], inplace=True)
# df1.columns = df1.columns.str.lower()
# df1.rename(columns={'family_history_with_overweight':'history'}, inplace=True)

# # transform boolean columns
# for col in ['history', 'favc', 'smoke', 'scc']:
#     df1[col] = df1[col].map({'yes': True, 'no': False})

# # convert categorical columns
# df1['gender'] = pd.Categorical(df1['gender'], 
#                                categories=['Male', 'Female'],
#                                ordered=True)
# df1['caec'] = pd.Categorical(df1['caec'],
#                              categories=['Frequently', 'Always', 'no', 'Sometimes'],
#                              ordered=True)
# df1['calc'] = pd.Categorical(df1['calc'],
#                              categories=['Frequently', 'no', 'Sometimes'],
#                              ordered=True)
# df1['mtrans'] = pd.Categorical(df1['mtrans'],
#                                categories=['Walking', 'Bike', 'Motorbike', 
#                                            'Automobile', 'Public_Transportation'],
#                                ordered=True)

# df1['bmi'] = df1['weight'] / np.square(df1['height'])

# # one hot encoding all categorical variables
# X_test = pd.get_dummies(df1)

# nobeyesdad_list = ['Insufficient_Weight', 'Normal_Weight', 'Overweight_Level_I', 
#                    'Overweight_Level_II', 'Obesity_Type_I', 'Obesity_Type_II', 
#                    'Obesity_Type_III']
# df_submit['NObeyesdad'] = final_model.predict(X_test)
# df_submit['NObeyesdad'] = df_submit['NObeyesdad'].apply(lambda x: nobeyesdad_list[x])

# df_submit.to_csv('lgbm_submission.csv', index=False)