# Multi-Class Prediction of Obesity Risk

## Exploring datasets

In [1]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = '.'

In [3]:
!kaggle competitions download -c playground-series-s4e2

Downloading playground-series-s4e2.zip to f:\KaggleCompetition




  0%|          | 0.00/917k [00:00<?, ?B/s]
100%|██████████| 917k/917k [00:00<00:00, 996kB/s]
100%|██████████| 917k/917k [00:00<00:00, 993kB/s]


In [4]:
import zipfile
with zipfile.ZipFile('playground-series-s4e2.zip') as f:
    f.extractall('data')

In [5]:
import pandas as pd

In [6]:
raw_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
sub_df = pd.read_csv('data/sample_submission.csv')

In [7]:
raw_df

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.669950,yes,yes,2.000000,2.983297,Sometimes,no,2.763573,no,0.000000,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.000000,1.560000,57.000000,yes,yes,2.000000,3.000000,Frequently,no,2.000000,no,1.000000,1.000000,no,Automobile,Normal_Weight
2,2,Female,18.000000,1.711460,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.710730,131.274851,yes,yes,3.000000,3.000000,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20753,20753,Male,25.137087,1.766626,114.187096,yes,yes,2.919584,3.000000,Sometimes,no,2.151809,no,1.330519,0.196680,Sometimes,Public_Transportation,Obesity_Type_II
20754,20754,Male,18.000000,1.710000,50.000000,no,yes,3.000000,4.000000,Frequently,no,1.000000,no,2.000000,1.000000,Sometimes,Public_Transportation,Insufficient_Weight
20755,20755,Male,20.101026,1.819557,105.580491,yes,yes,2.407817,3.000000,Sometimes,no,2.000000,no,1.158040,1.198439,no,Public_Transportation,Obesity_Type_II
20756,20756,Male,33.852953,1.700000,83.520113,yes,yes,2.671238,1.971472,Sometimes,no,2.144838,no,0.000000,0.973834,no,Automobile,Overweight_Level_II


In [8]:
raw_df.isna().sum()

id                                0
Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [13]:
raw_df['NObeyesdad'].value_counts().index.tolist()

['Obesity_Type_III',
 'Obesity_Type_II',
 'Normal_Weight',
 'Obesity_Type_I',
 'Insufficient_Weight',
 'Overweight_Level_II',
 'Overweight_Level_I']

In [14]:
def view_categoricals_feature(cat_col: str):
    d = dict()    
    total = raw_df[cat_col].value_counts()
    idx = raw_df['NObeyesdad'].value_counts().index.tolist()
    for t in idx:
        x = raw_df[raw_df['NObeyesdad']==t][cat_col].value_counts()
        d[t] = x/total*100
    
    return pd.DataFrame(data=d, columns=idx)

### Gender

In [15]:
view_categoricals_feature('Gender')

Unnamed: 0_level_0,Obesity_Type_III,Obesity_Type_II,Normal_Weight,Obesity_Type_I,Insufficient_Weight,Overweight_Level_II,Overweight_Level_I
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Female,38.773748,0.076761,15.927845,12.156976,15.553637,7.244291,10.266743
Male,0.048375,31.346749,13.75774,15.895898,8.72678,17.095588,13.12887


### Age

In [16]:
raw_df['Age'].describe()

count    20758.000000
mean        23.841804
std          5.688072
min         14.000000
25%         20.000000
50%         22.815416
75%         26.000000
max         61.000000
Name: Age, dtype: float64

In [17]:
raw_df['Age'] = pd.cut(raw_df['Age'], 
                       bins=[13,18,30,52,100],
                       labels=['Teenager','Adult','OldAdult','Old'])

In [18]:
view_categoricals_feature('Age')

Unnamed: 0_level_0,Obesity_Type_III,Obesity_Type_II,Normal_Weight,Obesity_Type_I,Insufficient_Weight,Overweight_Level_II,Overweight_Level_I
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Teenager,0.033647,0.235532,30.148048,14.401077,34.522207,10.935397,9.724092
Adult,27.041123,14.282849,14.135741,12.751588,9.936476,9.655634,12.19659
OldAdult,0.036153,39.913232,2.603037,20.715835,0.397686,25.37961,10.954447
Old,0.0,1.538462,0.0,3.076923,0.0,78.461538,16.923077


### Thường xuyên tiêu thụ thực phẩm có lượng calo cao (FAVC)

In [19]:
view_categoricals_feature('FAVC')

Unnamed: 0_level_0,Obesity_Type_III,Obesity_Type_II,Normal_Weight,Obesity_Type_I,Insufficient_Weight,Overweight_Level_II,Overweight_Level_I
FAVC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
yes,21.309662,16.826467,13.628701,14.840375,11.342324,10.446739,11.605732
no,0.056306,3.040541,27.871622,5.236486,20.833333,30.349099,12.612613


### Lịch sử gia đình có người thừa cân

In [20]:
view_categoricals_feature('family_history_with_overweight')

Unnamed: 0_level_0,Obesity_Type_III,Obesity_Type_II,Normal_Weight,Obesity_Type_I,Insufficient_Weight,Overweight_Level_II,Overweight_Level_I
family_history_with_overweight,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
no,0.026709,0.133547,34.508547,1.282051,38.701923,5.154915,20.192308
yes,23.774539,19.060773,10.520748,16.821441,6.312449,13.688727,9.821324


### Tiêu thụ thức ăn giữa các bữa ăn (CAEC)

In [21]:
view_categoricals_feature('CAEC')

Unnamed: 0_level_0,Obesity_Type_III,Obesity_Type_II,Normal_Weight,Obesity_Type_I,Insufficient_Weight,Overweight_Level_II,Overweight_Level_I
CAEC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Always,0.209205,2.09205,57.531381,13.389121,7.322176,6.903766,12.552301
Frequently,0.121359,0.485437,28.559871,2.265372,49.029126,8.495146,11.043689
Sometimes,23.058931,18.392378,11.780478,15.870843,7.233727,12.967083,10.69656
no,,0.716846,12.903226,2.867384,2.867384,2.150538,78.494624


### Hút thuốc

In [22]:
view_categoricals_feature('SMOKE')

Unnamed: 0_level_0,Obesity_Type_III,Obesity_Type_II,Normal_Weight,Obesity_Type_I,Insufficient_Weight,Overweight_Level_II,Overweight_Level_I
SMOKE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
no,19.704578,15.278116,14.80037,14.069127,12.265393,12.172768,11.709648
yes,1.632653,46.530612,18.77551,9.795918,2.857143,10.204082,10.204082


### Theo dõi lượng calo tiêu thụ (SCC)

In [23]:
view_categoricals_feature('SCC')

Unnamed: 0_level_0,Obesity_Type_III,Obesity_Type_II,Normal_Weight,Obesity_Type_I,Insufficient_Weight,Overweight_Level_II,Overweight_Level_I
SCC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
no,20.158438,16.162623,14.558318,14.408849,11.414479,12.445817,10.851477
yes,,0.582242,23.289665,2.620087,33.770015,3.49345,36.244541


### Tiêu thụ rượu (CALC)

In [24]:
view_categoricals_feature('CALC')

Unnamed: 0_level_0,Obesity_Type_III,Obesity_Type_II,Normal_Weight,Obesity_Type_I,Insufficient_Weight,Overweight_Level_II,Overweight_Level_I
CALC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Frequently,,2.457467,12.854442,14.36673,2.646503,43.100189,24.574669
Sometimes,26.855171,18.432232,12.876676,9.584495,10.752688,8.489314,13.009425
no,,8.870812,20.801859,26.922332,17.218671,19.659113,6.527213


### Phương tiện đi lại được sử dụng (MTRANS)

In [25]:
view_categoricals_feature('MTRANS')

Unnamed: 0_level_0,Obesity_Type_III,Obesity_Type_II,Normal_Weight,Obesity_Type_I,Insufficient_Weight,Overweight_Level_II,Overweight_Level_I
MTRANS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Automobile,,26.711941,8.743633,21.137521,7.838144,22.1279,13.44086
Bike,,3.125,43.75,6.25,6.25,12.5,28.125
Motorbike,,7.894737,44.736842,10.526316,10.526316,7.894737,18.421053
Public_Transportation,24.246419,13.747228,15.371247,12.776413,12.980164,9.881944,10.996584
Walking,,1.284797,37.901499,5.353319,15.845824,17.987152,21.627409


## Preprocessing

In [26]:
import numpy as np

In [28]:
raw_df.drop(columns='id',inplace=True)

In [29]:
numeric_cols = raw_df.select_dtypes(include=np.number).columns.tolist()
numeric_cols

['Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

In [32]:
categorical_cols = raw_df.select_dtypes(exclude=np.number).columns.tolist()
categorical_cols

['Gender',
 'Age',
 'family_history_with_overweight',
 'FAVC',
 'CAEC',
 'SMOKE',
 'SCC',
 'CALC',
 'MTRANS',
 'NObeyesdad']

In [33]:
categorical_cols.remove('NObeyesdad')
target_col = 'NObeyesdad'

In [34]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

### Scaler

In [35]:
scaler = MinMaxScaler()
scaler.fit(raw_df[numeric_cols])

In [36]:
raw_df[numeric_cols] = scaler.transform(raw_df[numeric_cols])
test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])

In [37]:
raw_df[numeric_cols].describe()

Unnamed: 0,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0
mean,0.476056,0.387822,0.722954,0.587111,0.514709,0.327249,0.308378
std,0.166099,0.209266,0.266609,0.235125,0.304234,0.279434,0.301057
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.345955,0.214188,0.5,0.666667,0.396011,0.002671,0.0
50%,0.47559,0.357495,0.696918,0.666667,0.5,0.333333,0.286944
75%,0.595224,0.575933,1.0,0.666667,0.774809,0.529135,0.5
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [38]:
test_df[numeric_cols].describe()

Unnamed: 0,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,13840.0,13840.0,13840.0,13840.0,13840.0,13840.0,13840.0
mean,0.473562,0.38383,0.721449,0.583537,0.516022,0.324844,0.305517
std,0.168855,0.207143,0.265803,0.236976,0.305615,0.28012,0.304003
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.345586,0.206255,0.5,0.666667,0.385891,0.000362,0.0
50%,0.47559,0.356608,0.679043,0.666667,0.5,0.333333,0.276249
75%,0.591082,0.572421,1.0,0.666667,0.776194,0.523955,0.5
max,1.008251,1.0,1.0,1.0,1.0,1.0,1.0


### Encode

In [39]:
test_df['Age'].describe()

count    13840.000000
mean        23.952740
std          5.799814
min         14.000000
25%         20.000000
50%         22.906342
75%         26.000000
max         61.000000
Name: Age, dtype: float64

In [40]:
test_df['Age'] = pd.cut(test_df['Age'],
                        bins=[13,18,30,52,100],
                        labels=['Teenager','Adult','OldAdult','Old'])

In [45]:
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(raw_df[categorical_cols])

In [46]:
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
len(encoded_cols)

26

In [47]:
encoded_df = pd.DataFrame(data=encoder.transform(raw_df[categorical_cols]), columns=encoded_cols)
encoded_df.index = raw_df.index
raw_df = pd.concat([raw_df, encoded_df], axis=1).copy()

In [48]:
encoded_df = pd.DataFrame(data=encoder.transform(test_df[categorical_cols]), columns=encoded_cols)
encoded_df.index = test_df.index
test_df = pd.concat([test_df, encoded_df], axis=1).copy()

### Train, val, test

In [53]:
inputs = raw_df[numeric_cols+encoded_cols]

In [54]:
test_inputs = test_df[numeric_cols+encoded_cols]

In [55]:
inputs

Unnamed: 0,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,Gender_Male,Age_Adult,...,SCC_no,SCC_yes,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,0.475586,0.338497,0.500000,0.661099,0.881787,0.000000,0.488237,0.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.209260,0.142792,0.500000,0.666667,0.500000,0.333333,0.500000,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,0.497391,0.088577,0.440267,0.137228,0.455189,0.288682,0.836792,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.496002,0.732007,1.000000,0.666667,0.337031,0.489288,0.390099,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.883049,0.434708,0.839832,0.323824,0.489924,0.655991,0.465861,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20753,0.602336,0.596452,0.959792,0.666667,0.575905,0.443506,0.098340,0.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
20754,0.494613,0.087262,1.000000,1.000000,0.000000,0.666667,0.500000,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
20755,0.703030,0.528177,0.703909,0.666667,0.500000,0.386013,0.599220,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
20756,0.475590,0.353174,0.835619,0.323824,0.572419,0.000000,0.486917,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [57]:
target_map = {
    'Insufficient_Weight': 0,
    'Normal_Weight': 1,
    'Overweight_Level_I': 2,
    'Overweight_Level_II': 3,
    'Obesity_Type_I': 4,
    'Obesity_Type_II': 5,
    'Obesity_Type_III': 6
}
targets = raw_df[target_col].map(target_map)

In [58]:
targets

0        3
1        1
2        0
3        6
4        3
        ..
20753    5
20754    0
20755    5
20756    3
20757    5
Name: NObeyesdad, Length: 20758, dtype: int64

In [60]:
from sklearn.model_selection import train_test_split

In [61]:
train_inputs, val_inputs, train_targets, val_targets = train_test_split(inputs, targets,
                                                                        test_size=0.2,
                                                                        random_state=42)

## Training Model

In [62]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [63]:
classifier = {
    'RF': RandomForestClassifier(),
    'XGB': XGBClassifier(),
    'LGBM': LGBMClassifier()
}
result = {'Classifier': [], 'Accurary': []}

In [64]:
from sklearn.metrics import accuracy_score

In [65]:
for clf_name, clf in classifier.items():
    clf.fit(train_inputs, train_targets)
    preds = clf.predict(val_inputs)
    acc = accuracy_score(val_targets, preds)
    result['Classifier'].append(clf_name)
    result['Accurary'].append(acc)
result_df = pd.DataFrame(result)
result_df

Unnamed: 0,Classifier,Accurary
0,RF,0.879335
1,XGB,0.900048
2,LGBM,0.901252


In [66]:
def test_params(**params):
    model = LGBMClassifier(**params)
    model.fit(train_inputs, train_targets)
    train_preds = model.predict(train_inputs)
    train_acc = accuracy_score(train_targets, train_preds)
    val_preds = model.predict(val_inputs)
    val_acc = accuracy_score(val_targets, val_preds)
    return train_acc, val_acc

In [67]:
test_params()

(0.9757316632542454, 0.901252408477842)

In [70]:
for i in ['gbdt','dart']:
    train_acc, val_acc = test_params(boosting_type=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test gbdt: train_acc = 0.975732, val_acc = 0.901252
Test dart: train_acc = 0.943093, val_acc = 0.904143


In [72]:
for i in [24, 30, 32, 34, 36, 42]:
    train_acc, val_acc = test_params(boosting_type='dart', num_leaves=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 24: train_acc = 0.935325, val_acc = 0.904143
Test 30: train_acc = 0.942792, val_acc = 0.904143
Test 32: train_acc = 0.943936, val_acc = 0.904624
Test 34: train_acc = 0.947489, val_acc = 0.905588
Test 36: train_acc = 0.948753, val_acc = 0.905347
Test 42: train_acc = 0.954896, val_acc = 0.903661


In [73]:
for i in [-1, 6, 8, 10, 12]:
    train_acc, val_acc = test_params(boosting_type='dart', num_leaves=34,
                                     max_depth=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test -1: train_acc = 0.947489, val_acc = 0.905588
Test 6: train_acc = 0.931531, val_acc = 0.904143
Test 8: train_acc = 0.942129, val_acc = 0.904143
Test 10: train_acc = 0.945201, val_acc = 0.904383
Test 12: train_acc = 0.946766, val_acc = 0.904143


In [74]:
for i in [0.01, 0.05, 0.1, 0.2, 0.4, 0.5]:
    train_acc, val_acc = test_params(boosting_type='dart', num_leaves=34,
                                     learning_rate=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 0.01: train_acc = 0.904011, val_acc = 0.889451
Test 0.05: train_acc = 0.924907, val_acc = 0.901493
Test 0.1: train_acc = 0.947489, val_acc = 0.905588
Test 0.2: train_acc = 0.976153, val_acc = 0.903420
Test 0.4: train_acc = 0.827653, val_acc = 0.767341
Test 0.5: train_acc = 0.317355, val_acc = 0.324663


In [76]:
for i in [80,90,100,110,120]:
    train_acc, val_acc = test_params(boosting_type='dart', num_leaves=34,
                                     n_estimators=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 80: train_acc = 0.945201, val_acc = 0.905347
Test 90: train_acc = 0.944899, val_acc = 0.905347
Test 100: train_acc = 0.947489, val_acc = 0.905588
Test 110: train_acc = 0.947549, val_acc = 0.905106
Test 120: train_acc = 0.950018, val_acc = 0.905347


In [77]:
for i in [None, 17, 42, 1212]:
    train_acc, val_acc = test_params(boosting_type='dart', num_leaves=34,
                                     random_state=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test None: train_acc = 0.947489, val_acc = 0.905588
Test 17: train_acc = 0.938697, val_acc = 0.903420
Test 42: train_acc = 0.939179, val_acc = 0.905106
Test 1212: train_acc = 0.941768, val_acc = 0.904865


In [78]:
for i in [0, 0.25, 0.5, 0.75, 1]:
    train_acc, val_acc = test_params(boosting_type='dart', num_leaves=34,
                                     random_state=42, reg_alpha=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 0: train_acc = 0.939179, val_acc = 0.905106
Test 0.25: train_acc = 0.937613, val_acc = 0.904143
Test 0.5: train_acc = 0.935987, val_acc = 0.904383
Test 0.75: train_acc = 0.932494, val_acc = 0.905829
Test 1: train_acc = 0.932253, val_acc = 0.902938


In [79]:
for i in [0, 0.25, 0.5, 0.75, 1]:
    train_acc, val_acc = test_params(boosting_type='dart', num_leaves=34,
                                     random_state=42, reg_alpha=0.75,
                                     reg_lambda=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 0: train_acc = 0.932494, val_acc = 0.905829
Test 0.25: train_acc = 0.932675, val_acc = 0.903179
Test 0.5: train_acc = 0.932494, val_acc = 0.903661
Test 0.75: train_acc = 0.931892, val_acc = 0.905106
Test 1: train_acc = 0.931952, val_acc = 0.903902


In [80]:
lgbm_model = LGBMClassifier(boosting_type='dart', num_leaves=34,
                            random_state=42, reg_alpha=0.75)

In [81]:
lgbm_model.fit(train_inputs, train_targets)

In [82]:
lgbm_model.score(val_inputs, val_targets)

0.9058285163776493

In [83]:
test_preds = lgbm_model.predict(test_inputs)

In [87]:
target_map_reverse = {
    0: 'Insufficient_Weight',
    1: 'Normal_Weight',
    2: 'Overweight_Level_I',
    3: 'Overweight_Level_II',
    4: 'Obesity_Type_I',
    5: 'Obesity_Type_II',
    6: 'Obesity_Type_III'
}

In [96]:
preds = []
for i in range(len(test_preds)):
    preds.append(target_map_reverse.get(test_preds[i]))

In [97]:
preds

['Obesity_Type_II',
 'Overweight_Level_I',
 'Obesity_Type_III',
 'Obesity_Type_I',
 'Obesity_Type_III',
 'Insufficient_Weight',
 'Insufficient_Weight',
 'Normal_Weight',
 'Overweight_Level_II',
 'Normal_Weight',
 'Insufficient_Weight',
 'Obesity_Type_III',
 'Obesity_Type_III',
 'Obesity_Type_I',
 'Obesity_Type_III',
 'Overweight_Level_II',
 'Obesity_Type_I',
 'Obesity_Type_I',
 'Obesity_Type_III',
 'Obesity_Type_I',
 'Normal_Weight',
 'Obesity_Type_III',
 'Obesity_Type_III',
 'Obesity_Type_II',
 'Overweight_Level_I',
 'Obesity_Type_I',
 'Obesity_Type_II',
 'Overweight_Level_II',
 'Insufficient_Weight',
 'Obesity_Type_I',
 'Overweight_Level_II',
 'Obesity_Type_III',
 'Obesity_Type_I',
 'Obesity_Type_II',
 'Overweight_Level_I',
 'Overweight_Level_I',
 'Obesity_Type_I',
 'Normal_Weight',
 'Insufficient_Weight',
 'Overweight_Level_II',
 'Obesity_Type_II',
 'Overweight_Level_II',
 'Obesity_Type_II',
 'Overweight_Level_II',
 'Normal_Weight',
 'Insufficient_Weight',
 'Obesity_Type_I',
 'Obesi

In [100]:
sub_df['NObeyesdad'] = preds
sub_df.to_csv('data/sub1.csv',index=None)