# 1. Data and Library Load

In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import os
import random

# ML
from sklearn.ensemble import RandomForestRegressor  # Bagging
from xgboost.sklearn import XGBRFRegressor           # GBM
from sklearn.linear_model import LogisticRegression  # LogisticRegression


# DL
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, ReLU, Softmax, Dropout
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# for checking multi-collinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

# KFold(CV), partial : for optuna
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from functools import partial
from imblearn.over_sampling import SMOTE

# AutoML framework
import optuna
from optuna.samplers import TPESampler

In [112]:
# set configs
is_tuning = True
if is_tuning:
    sampler = TPESampler(seed=42)  # Make the sampler behave in a deterministic way.
is_scaling = True
is_pca = False
apply_vif = False
is_cuml = True
is_debug = True
sampling_method = 'hybrid' # 'under' or 'over'
if is_tuning:
    n_trials=50

# import SVC
# if is_cuml:
#     from cuml.svm import SVC, LinearSVC
# else:
#     from sklearn.svm import SVC
    
from sklearn.svm import SVC

    
# Keras model compile
learning_rate = 1e-2
batch_size = 32
epochs = 10

In [13]:
train = pd.read_csv('/USER/teeth/train.csv')
test = pd.read_csv('/USER/teeth/test.csv')

print(train.shape, test.shape)

(432, 12) (286, 11)


In [14]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432 entries, 0 to 431
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   filename   432 non-null    object 
 1   operator   432 non-null    object 
 2   date       432 non-null    object 
 3   ID         432 non-null    object 
 4   sex        432 non-null    object 
 5   age        432 non-null    int64  
 6   ext_tooth  432 non-null    int64  
 7   time_min   432 non-null    int64  
 8   mmo        432 non-null    int64  
 9   height     427 non-null    float64
 10  weight     427 non-null    float64
 11  bmi        427 non-null    float64
dtypes: float64(3), int64(4), object(5)
memory usage: 40.6+ KB


# 2. EDA

In [15]:
lb = LabelEncoder()
train.sex = lb.fit_transform(train.sex)  # M->0, F->1

In [16]:
train = train.drop(columns=["ID","filename","ext_tooth","date"])

In [17]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432 entries, 0 to 431
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   operator  432 non-null    object 
 1   sex       432 non-null    int64  
 2   age       432 non-null    int64  
 3   time_min  432 non-null    int64  
 4   mmo       432 non-null    int64  
 5   height    427 non-null    float64
 6   weight    427 non-null    float64
 7   bmi       427 non-null    float64
dtypes: float64(3), int64(4), object(1)
memory usage: 27.1+ KB


In [30]:
# 결측치 채우기
train.loc[train['height'] != train['height'], 'height'] = train['height'].mean()
train.loc[train['weight'] != train['weight'], 'weight'] = train['weight'].mean()
train.loc[train['bmi'] != train['bmi'], 'bmi'] = train['bmi'].mean()

In [31]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432 entries, 0 to 431
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   operator  432 non-null    object 
 1   sex       432 non-null    int64  
 2   age       432 non-null    int64  
 3   time_min  432 non-null    int64  
 4   mmo       432 non-null    int64  
 5   height    432 non-null    float64
 6   weight    432 non-null    float64
 7   bmi       432 non-null    float64
dtypes: float64(3), int64(4), object(1)
memory usage: 27.1+ KB


In [32]:
train[['A','K','P','Y']] = pd.get_dummies(train['operator'], dtype=int)

In [34]:
train = train.drop(columns=["operator"])

In [35]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432 entries, 0 to 431
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   sex       432 non-null    int64  
 1   age       432 non-null    int64  
 2   time_min  432 non-null    int64  
 3   mmo       432 non-null    int64  
 4   height    432 non-null    float64
 5   weight    432 non-null    float64
 6   bmi       432 non-null    float64
 7   A         432 non-null    int64  
 8   K         432 non-null    int64  
 9   P         432 non-null    int64  
 10  Y         432 non-null    int64  
dtypes: float64(3), int64(8)
memory usage: 37.3 KB


In [39]:
# check missing data
train[train.isnull().any(axis=1)]

Unnamed: 0,sex,age,time_min,mmo,height,weight,bmi,A,K,P,Y


In [40]:
def check_vif(df):
    vifs = [variance_inflation_factor(df, i) for i in range(df.shape[1])]
    vif_df = pd.DataFrame({"features":df.columns, "VIF" : vifs})
    vif_df = vif_df.sort_values(by="VIF", ascending=False)
    remove_col = vif_df.iloc[0, 0]
    top_vif = vif_df.iloc[0, 1]
    return vif_df, remove_col, top_vif

In [43]:
# remove all features when VIF is over 10.
apply_vif = False
if apply_vif:
    top_vif = 100

    while(top_vif > 5):
        vif_df, remove_col, top_vif = check_vif(train)
        print(remove_col, top_vif)
        if top_vif < 5:
            break
        train = train.drop(columns=remove_col)

    display(train)

In [47]:
# feature selection via Feature Importance
X = train.drop(columns=["time_min"])
y = train['time_min']

rf = RandomForestRegressor()
rf.fit(X, y)
fi_df = pd.DataFrame({'feature':X.columns, 'importance':rf.feature_importances_})
selected_cols = fi_df.sort_values(by="importance", ascending=False)[:10]["feature"].values
selected_cols

array(['Y', 'age', 'bmi', 'height', 'mmo', 'weight', 'A', 'sex', 'P', 'K'],
      dtype=object)

# 3. Data preprocessing

In [52]:
selected_cols = ['sex', 'age', 'mmo', 'height', 'weight', 'bmi', 'A', 'K','P', 'Y']
target_cols = ['time_min']

In [54]:
from sklearn.model_selection import train_test_split

#X = train.drop(columns=["Class"])
X = train[selected_cols]
y = train['time_min']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(367, 10) (65, 10) (367,) (65,)


In [59]:
from sklearn.preprocessing import StandardScaler

if is_scaling:
    scaler = StandardScaler()
    data_ = scaler.fit_transform(X_train)
    X_train = pd.DataFrame(data=data_, columns=X_train.columns)
    data_ = scaler.transform(X_val)
    X_val = pd.DataFrame(data=data_, columns=X_val.columns)
    display(X_train)

Unnamed: 0,sex,age,mmo,height,weight,bmi,A,K,P,Y
0,1.013718,0.719932,0.187817,-0.175799,-0.248856,-0.182346,-0.864650,-0.536395,2.674987,-0.540605
1,1.013718,1.256046,0.002025,0.778479,1.999712,1.997820,1.156538,-0.536395,-0.373834,-0.540605
2,-0.986468,-0.888411,1.488359,-0.772222,-1.119270,-1.036708,-0.864650,1.864299,-0.373834,-0.540605
3,1.013718,1.166694,1.859943,1.374902,0.911695,0.318402,-0.864650,-0.536395,2.674987,-0.540605
4,-0.986468,-0.441649,0.002025,-0.414368,-0.466460,-0.333549,-0.864650,1.864299,-0.373834,-0.540605
...,...,...,...,...,...,...,...,...,...,...
362,-0.986468,1.434751,-1.484309,-0.891507,-0.829132,-0.561180,1.156538,-0.536395,-0.373834,-0.540605
363,1.013718,-0.977763,0.930984,2.090610,1.709574,0.762472,-0.864650,-0.536395,2.674987,-0.540605
364,1.013718,-0.441649,-0.555350,1.494187,1.129299,0.492714,1.156538,-0.536395,-0.373834,-0.540605
365,-0.986468,0.005113,1.302567,-1.607215,-1.046735,-0.460847,-0.864650,-0.536395,2.674987,-0.540605


In [60]:
if is_pca:
    from sklearn.decomposition import PCA
    
    pca = PCA(n_components=0.90, random_state=42)
    data_ = pca.fit_transform(X_train)
    X_train = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])
    data_ = pca.transform(X_val)
    X_val = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])

    display(X_train)

# 4. Fitting and Evaluation

In [None]:
# from sklearn.metrics import mean_absolute_error
# evaluation_metric = balance_logloss

In [134]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

def rf_optimizer(trial, X, y, K):
    # define parameter to tune
    n_estimators = trial.suggest_categorical('n_estimators', [50, 100, 200])
    max_depth = trial.suggest_int('max_depth', 4, 10)
    max_features = trial.suggest_categorical('max_features', [0.6, 0.7, 0.8])
    
    
    # set model
    model = RandomForestRegressor(n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   max_features=max_features,
                                   criterion='absolute_error', # log_loss
                                #    class_weight='balanced'
                                  )
    
    # K-Fold Cross validation
    folds = StratifiedKFold(n_splits=K, shuffle=True)
    losses = []
    
    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        loss = mean_absolute_percentage_error(y_val, preds)
        losses.append(loss)
    
    
    # return mean score of CV
    return np.mean(losses)

In [135]:
def xgb_optimizer(trial, X, y, K):
    n_estimators = trial.suggest_categorical('n_estimators', [500, 1000, 2000])
    max_depth = trial.suggest_int('max_depth', 4, 10)
    colsample_bytree = trial.suggest_categorical('colsample_bytree', [0.5, 0.6, 0.7, 0.8])
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1e-2)
    reg_lambda = trial.suggest_categorical('reg_lambda', [0.1, 0.5, 1, 2])
    
    
    model = XGBRFRegressor(n_estimators=n_estimators,
                          max_depth=max_depth,
                          colsample_bytree=colsample_bytree,
                          learning_rate=learning_rate,
                          reg_lambda=reg_lambda)
#                          scale_pos_weight=4.71)  ## we set class imbalance by using sampling method.
    
    
    folds = StratifiedKFold(n_splits=K, shuffle=True)
    losses = []
    
    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        loss = mean_absolute_percentage_error(y_val, preds)
        losses.append(loss)
    
    
    return np.mean(losses)

In [115]:
K = 4 # set K of K-Fold
opt_func = partial(rf_optimizer, X=X, y=y, K=K)

if is_tuning:
    rf_study = optuna.create_study(direction="minimize", sampler=sampler) # determine minimize or maximize sth
    rf_study.optimize(opt_func, n_trials=n_trials)

[I 2023-11-23 13:04:18,242] A new study created in memory with name: no-name-33e8b61b-5d87-442d-8bff-0c88f9103a5c
[I 2023-11-23 13:04:19,325] Trial 0 finished with value: 4.873645833333333 and parameters: {'n_estimators': 100, 'max_depth': 8, 'max_features': 0.6}. Best is trial 0 with value: 4.873645833333333.
[I 2023-11-23 13:04:19,729] Trial 1 finished with value: 4.6673379629629625 and parameters: {'n_estimators': 50, 'max_depth': 4, 'max_features': 0.6}. Best is trial 1 with value: 4.6673379629629625.
[I 2023-11-23 13:04:22,073] Trial 2 finished with value: 4.852447916666666 and parameters: {'n_estimators': 200, 'max_depth': 7, 'max_features': 0.8}. Best is trial 1 with value: 4.6673379629629625.
[I 2023-11-23 13:04:24,042] Trial 3 finished with value: 4.685763888888889 and parameters: {'n_estimators': 200, 'max_depth': 7, 'max_features': 0.6}. Best is trial 1 with value: 4.6673379629629625.
[I 2023-11-23 13:04:26,020] Trial 4 finished with value: 4.731579861111111 and parameters: 

In [116]:
K = 4
opt_func = partial(xgb_optimizer, X=X, y=y, K=K)

if is_tuning:
    xgb_study = optuna.create_study(direction="minimize", sampler=sampler)
    xgb_study.optimize(opt_func, n_trials=n_trials)

[I 2023-11-23 13:05:03,665] A new study created in memory with name: no-name-64461f1b-08dc-4d77-a8b8-72344a7d0ddc
[I 2023-11-23 13:05:05,463] Trial 0 finished with value: 6.239804111145161 and parameters: {'n_estimators': 500, 'max_depth': 9, 'colsample_bytree': 0.7, 'learning_rate': 0.004226191556898453, 'reg_lambda': 0.5}. Best is trial 0 with value: 6.239804111145161.
[I 2023-11-23 13:05:11,681] Trial 1 finished with value: 6.233454863230388 and parameters: {'n_estimators': 2000, 'max_depth': 9, 'colsample_bytree': 0.6, 'learning_rate': 0.0074192030850069556, 'reg_lambda': 1}. Best is trial 1 with value: 6.233454863230388.
[I 2023-11-23 13:05:12,251] Trial 2 finished with value: 6.23047090680511 and parameters: {'n_estimators': 500, 'max_depth': 4, 'colsample_bytree': 0.6, 'learning_rate': 0.009168098265334837, 'reg_lambda': 1}. Best is trial 2 with value: 6.23047090680511.
[I 2023-11-23 13:05:15,702] Trial 3 finished with value: 6.243838023256373 and parameters: {'n_estimators': 10

In [117]:
# visualize experiment logs
def display_experiment_log(study):
    display(study.trials_dataframe())
    print("Best Score: %.4f" % study.best_value)
    print("Best params: ", study.best_trial.params)
    history = study.trials_dataframe()
    display(history[history.value == study.best_value])
    # optuna.visualization.plot_optimization_history(study).show()
    # optuna.visualization.plot_param_importances(study).show()

In [118]:
if is_tuning:
    display_experiment_log(rf_study)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_max_depth,params_max_features,params_n_estimators,state
0,0,4.873646,2023-11-23 13:04:18.244597,2023-11-23 13:04:19.325609,0 days 00:00:01.081012,8,0.6,100,COMPLETE
1,1,4.667338,2023-11-23 13:04:19.326664,2023-11-23 13:04:19.729646,0 days 00:00:00.402982,4,0.6,50,COMPLETE
2,2,4.852448,2023-11-23 13:04:19.730665,2023-11-23 13:04:22.073336,0 days 00:00:02.342671,7,0.8,200,COMPLETE
3,3,4.685764,2023-11-23 13:04:22.074522,2023-11-23 13:04:24.042536,0 days 00:00:01.968014,7,0.6,200,COMPLETE
4,4,4.73158,2023-11-23 13:04:24.044527,2023-11-23 13:04:26.020799,0 days 00:00:01.976272,5,0.8,200,COMPLETE
5,5,4.61419,2023-11-23 13:04:26.022140,2023-11-23 13:04:26.650080,0 days 00:00:00.627940,8,0.8,50,COMPLETE
6,6,4.893912,2023-11-23 13:04:26.651081,2023-11-23 13:04:27.899016,0 days 00:00:01.247935,8,0.8,100,COMPLETE
7,7,4.835602,2023-11-23 13:04:27.900020,2023-11-23 13:04:29.254353,0 days 00:00:01.354333,10,0.8,100,COMPLETE
8,8,4.693542,2023-11-23 13:04:29.255496,2023-11-23 13:04:30.351510,0 days 00:00:01.096014,6,0.8,100,COMPLETE
9,9,4.753218,2023-11-23 13:04:30.352520,2023-11-23 13:04:32.128310,0 days 00:00:01.775790,4,0.8,200,COMPLETE


Best Score: 4.5773
Best params:  {'n_estimators': 100, 'max_depth': 4, 'max_features': 0.6}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_max_depth,params_max_features,params_n_estimators,state
42,42,4.577257,2023-11-23 13:04:56.835736,2023-11-23 13:04:57.614133,0 days 00:00:00.778397,4,0.6,100,COMPLETE


In [119]:
if is_tuning:
    display_experiment_log(xgb_study)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_n_estimators,params_reg_lambda,state
0,0,6.239804,2023-11-23 13:05:03.668753,2023-11-23 13:05:05.463400,0 days 00:00:01.794647,0.7,0.004226,9,500,0.5,COMPLETE
1,1,6.233455,2023-11-23 13:05:05.464689,2023-11-23 13:05:11.681177,0 days 00:00:06.216488,0.6,0.007419,9,2000,1.0,COMPLETE
2,2,6.230471,2023-11-23 13:05:11.683561,2023-11-23 13:05:12.251024,0 days 00:00:00.567463,0.6,0.009168,4,500,1.0,COMPLETE
3,3,6.243838,2023-11-23 13:05:12.252259,2023-11-23 13:05:15.702314,0 days 00:00:03.450055,0.7,0.002679,10,1000,2.0,COMPLETE
4,4,6.239931,2023-11-23 13:05:15.703943,2023-11-23 13:05:16.633387,0 days 00:00:00.929444,0.6,0.004757,6,500,2.0,COMPLETE
5,5,6.2427,2023-11-23 13:05:16.634880,2023-11-23 13:05:20.329350,0 days 00:00:03.694470,0.5,0.003708,6,2000,1.0,COMPLETE
6,6,6.23186,2023-11-23 13:05:20.332153,2023-11-23 13:05:23.375864,0 days 00:00:03.043711,0.7,0.007049,5,2000,0.1,COMPLETE
7,7,6.237689,2023-11-23 13:05:23.378602,2023-11-23 13:05:24.494387,0 days 00:00:01.115785,0.5,0.006318,4,1000,0.1,COMPLETE
8,8,6.230482,2023-11-23 13:05:24.496464,2023-11-23 13:05:28.265109,0 days 00:00:03.768645,0.5,0.009322,6,2000,0.1,COMPLETE
9,9,6.241145,2023-11-23 13:05:28.267966,2023-11-23 13:05:28.846179,0 days 00:00:00.578213,0.6,0.004143,4,500,0.5,COMPLETE


Best Score: 6.2239
Best params:  {'n_estimators': 1000, 'max_depth': 8, 'colsample_bytree': 0.8, 'learning_rate': 0.009966161429329495, 'reg_lambda': 1}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_n_estimators,params_reg_lambda,state
19,19,6.223905,2023-11-23 13:05:44.005680,2023-11-23 13:05:47.151601,0 days 00:00:03.145921,0.8,0.009966,8,1000,1.0,COMPLETE


# 6. Test Prediction and Make Submission

In [120]:
test

Unnamed: 0,filename,operator,date,ID,sex,age,ext_tooth,mmo,height,weight,bmi,A,K,P,Y
0,test_0000.png,A,2021.01.27,0025d728ca,1,25,38,42,178.0,79.0,24.933720,1,0,0,0
1,test_0001.png,A,2020.03.10,0025254212,1,41,38,45,178.0,80.0,25.249337,1,0,0,0
2,test_0002.png,K,2020.03.20,00c1240c5e,0,25,38,44,162.0,71.0,27.053803,0,1,0,0
3,test_0003.png,A,2020.06.13,00afffb836,0,25,38,34,158.0,52.0,20.829995,1,0,0,0
4,test_0004.png,P,2020.05.18,001075979c,1,21,38,50,178.0,67.0,21.146320,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,test_0281.png,A,2020.08.21,000bd141a0,1,26,38,35,174.0,54.0,17.835910,1,0,0,0
282,test_0282.png,A,2020.04.23,00ac217bb0,0,22,48,42,162.0,62.0,23.624447,1,0,0,0
283,test_0283.png,Y,2021.02.03,002fb89e3a,1,21,48,40,179.0,81.0,25.280110,0,0,0,1
284,test_0284.png,K,2021.03.13,0018cd0b04,0,25,38,38,162.0,62.0,23.624447,0,1,0,0


In [121]:
test.isna().sum()

filename     0
operator     0
date         0
ID           0
sex          0
age          0
ext_tooth    0
mmo          0
height       0
weight       0
bmi          0
A            0
K            0
P            0
Y            0
dtype: int64

In [122]:
test.loc[test['height'] != test['height'], 'height'] = test['height'].mean()
test.loc[test['weight'] != test['weight'], 'weight'] = test['weight'].mean()
test.loc[test['bmi'] != test['bmi'], 'bmi'] = test['bmi'].mean()

In [123]:
test[['A','K','P','Y']] = pd.get_dummies(test['operator'], dtype=int)

In [124]:
test.sex = lb.fit_transform(test.sex) 

In [125]:
test

Unnamed: 0,filename,operator,date,ID,sex,age,ext_tooth,mmo,height,weight,bmi,A,K,P,Y
0,test_0000.png,A,2021.01.27,0025d728ca,1,25,38,42,178.0,79.0,24.933720,1,0,0,0
1,test_0001.png,A,2020.03.10,0025254212,1,41,38,45,178.0,80.0,25.249337,1,0,0,0
2,test_0002.png,K,2020.03.20,00c1240c5e,0,25,38,44,162.0,71.0,27.053803,0,1,0,0
3,test_0003.png,A,2020.06.13,00afffb836,0,25,38,34,158.0,52.0,20.829995,1,0,0,0
4,test_0004.png,P,2020.05.18,001075979c,1,21,38,50,178.0,67.0,21.146320,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,test_0281.png,A,2020.08.21,000bd141a0,1,26,38,35,174.0,54.0,17.835910,1,0,0,0
282,test_0282.png,A,2020.04.23,00ac217bb0,0,22,48,42,162.0,62.0,23.624447,1,0,0,0
283,test_0283.png,Y,2021.02.03,002fb89e3a,1,21,48,40,179.0,81.0,25.280110,0,0,0,1
284,test_0284.png,K,2021.03.13,0018cd0b04,0,25,38,38,162.0,62.0,23.624447,0,1,0,0


In [126]:
test.isna().sum()

filename     0
operator     0
date         0
ID           0
sex          0
age          0
ext_tooth    0
mmo          0
height       0
weight       0
bmi          0
A            0
K            0
P            0
Y            0
dtype: int64

In [127]:
## preprocessing in same way
X_test = test[X.columns].fillna(X.mean())
if is_scaling:
    X_test = scaler.transform(X_test)
    X_test = pd.DataFrame(data=X_test, columns=X.columns)

if is_pca:
    data_ = pca.transform(X_test)
    X_test = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])

X_test

Unnamed: 0,sex,age,mmo,height,weight,bmi,A,K,P,Y
0,1.013718,-0.352297,0.373608,1.255617,1.056764,0.556759,1.156538,-0.536395,-0.373834,-0.540605
1,1.013718,1.077342,0.930984,1.255617,1.129299,0.640161,1.156538,-0.536395,-0.373834,-0.540605
2,-0.986468,-0.352297,0.745192,-0.652938,0.476488,1.116992,-0.864650,1.864299,-0.373834,-0.540605
3,-0.986468,-0.352297,-1.112726,-1.130076,-0.901666,-0.527654,1.156538,-0.536395,-0.373834,-0.540605
4,1.013718,-0.709706,1.859943,1.255617,0.186351,-0.444065,-0.864650,-0.536395,2.674987,-0.540605
...,...,...,...,...,...,...,...,...,...,...
281,1.013718,-0.262944,-0.926934,0.778479,-0.756597,-1.318844,1.156538,-0.536395,-0.373834,-0.540605
282,-0.986468,-0.620354,0.373608,-0.652938,-0.176322,0.210782,1.156538,-0.536395,-0.373834,-0.540605
283,1.013718,-0.709706,0.002025,1.374902,1.201833,0.648293,-0.864650,-0.536395,-0.373834,1.849780
284,-0.986468,-0.352297,-0.369559,-0.652938,-0.176322,0.210782,-0.864650,1.864299,-0.373834,-0.540605


In [128]:
# Finalize Models
if is_tuning:
    rf_best_params = rf_study.best_params
    # lr_best_params = lr_study.best_params
    xgb_best_params = xgb_study.best_params

    best_rf = RandomForestRegressor(**rf_best_params)
    # best_lr = LogisticRegression(**lr_best_params)
    best_xgb = XGBRFRegressor(**xgb_best_params)

In [129]:
# first ensebmle model, then check it.
best_rf.fit(X_train, y_train)
# best_lr.fit(X_train, y_train)
best_xgb.fit(X_train, y_train)
# best_svm.fit(X_train, y_train)

# OOF-prediction
v_rf = best_rf.predict(X_val)
# v_lr = best_lr.predict_proba(X_val)
v_xgb = best_xgb.predict(X_val)
# v_svm = best_svm.predict_proba(X_val)
# print(v_rf.shape, v_lr.shape, v_xgb.shape, v_svm.shape)
print(v_rf.shape, v_xgb.shape)

(65,) (65,)


In [130]:
# model finalization and make prediction
best_rf.fit(X, y)
# best_lr.fit(X, y)
best_xgb.fit(X, y)
# best_svm.fit(X, y)

preds_rf = best_rf.predict(X_test)
# preds_lr = best_lr.predict(X_test)
preds_xgb = best_xgb.predict(X_test)
# preds_svm = best_svm.predict(X_test).values
# print(preds_rf.shape, preds_lr.shape, preds_xgb.shape, preds_svm.shape)
print(preds_rf.shape, preds_xgb.shape)

(286,) (286,)


In [139]:
v_rf = best_rf.predict(X_val)
# v_lr = best_lr.predict(X_val)
v_xgb = best_xgb.predict(X_val)
# v_svm = best_svm.predict(X_val).values
ensembles = np.mean([v_rf, v_xgb], axis=0)
ensembles = 0.7*v_rf+0.3*v_xgb
print("(After finalization)OOF prediction logloss : %.4f" % mean_absolute_percentage_error(y_val, ensembles))

(After finalization)OOF prediction logloss : 0.6642


In [132]:
preds_rf.shape

(286,)

In [96]:
submission = pd.read_csv('/USER/teeth/sample_submission.csv')
submission

Unnamed: 0,filename,time_min
0,test_0000.png,1
1,test_0001.png,1
2,test_0002.png,1
3,test_0003.png,1
4,test_0004.png,1
...,...,...
281,test_0281.png,1
282,test_0282.png,1
283,test_0283.png,1
284,test_0284.png,1


In [140]:
voting_weights = [0.7, 0.3]
voting_weights = [1, 0]
submission['time_min'] = voting_weights[0]*preds_rf[:] + voting_weights[1]*preds_xgb[:]
submission

Unnamed: 0,filename,time_min
0,test_0000.png,8.919955
1,test_0001.png,8.919955
2,test_0002.png,13.715483
3,test_0003.png,8.546059
4,test_0004.png,14.756694
...,...,...
281,test_0281.png,8.919955
282,test_0282.png,8.546059
283,test_0283.png,16.832840
284,test_0284.png,13.715483


In [141]:
submission['time_min'].describe()

count    286.000000
mean      11.848678
std        3.269477
min        8.546059
25%        8.919955
50%       11.317719
75%       14.710652
max       16.832840
Name: time_min, dtype: float64

In [142]:
submission.to_csv("submission.csv", index=False)