<a href="https://www.kaggle.com/code/awesomeharris/ps3e18-baseline-modelling-separately-ensemble?scriptVersionId=135671536" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Getting started


In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import scipy

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

from sklearn.preprocessing import StandardScaler, FunctionTransformer, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, cross_val_score, cross_val_predict, cross_validate
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, HistGradientBoostingClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Input folders: the playground competition files and the "original" dataset files.
path1 = "/kaggle/input/playground-series-s3e18/"
path2 = "/kaggle/input/multi-label-classification-of-enzyme-substrates/"

sample = pd.read_csv(path1 + "sample_submission.csv")
train = pd.read_csv(path1 + "train.csv", index_col='id')
test = pd.read_csv(path1 + "test.csv", index_col='id')
original = pd.read_csv(path2 + "original.csv", index_col='id')
original_1 = pd.read_csv(path2 + "original_1.csv", index_col='id')

id_test = test.index  # ids needed for the submission file
targets = ['EC1', 'EC2']
# Every test column whose name does not start with "EC" is treated as a feature.
features = [col for col in test.columns if not col.startswith('EC')]
features_EC1EC2 = features + targets

# Stack the playground train set on top of each original file and drop exact duplicate rows.
train1 = pd.concat([train, original]).drop_duplicates(ignore_index=True)
train2 = pd.concat([train, original_1]).drop_duplicates(ignore_index=True)

# "a" variants: unique rows when only the feature columns are kept.
train1a = train1[features].drop_duplicates(ignore_index=True)
train2a = train2[features].drop_duplicates(ignore_index=True)

# "b" variants: unique rows over the feature columns plus EC1 and EC2.
train1b = train1[features_EC1EC2].drop_duplicates(ignore_index=True)
train2b = train2[features_EC1EC2].drop_duplicates(ignore_index=True)

# Show the shape of every frame side by side.
tuple(frame.shape for frame in (train, train1, train1a, train1b, train2, train2a, train2b))

((14838, 37),
 (15859, 37),
 (15769, 31),
 (15826, 33),
 (15841, 37),
 (15769, 31),
 (15805, 33))

Notes:-  
- The original dataset has multiple files with different target (EC1~EC6) values.
- The original.csv has a set of target values from _ecfp file. This is combined with the trainset as train1 dataset here.
- The original_1.csv has a set of target values from _desc file. This is combined with the trainset as train2 dataset here.

Further notes:-
- train1a and train2a keep only the feature columns of train1 and train2, with duplicate rows dropped.
- train1b and train2b do the same, but keep the feature columns plus the EC1 and EC2 columns before dropping duplicates.
- train1a/train2a end up with fewer rows than train1b/train2b, which means that some rows share an identical set of features but have different target values.


# Modelling the problem

There are two general approaches here:
1. To treat the two targets separately, with two separate models, to make the best of each model.
2. To use a wrapper such as MultiOutputClassifier from the scikit-learn package to predict them together in one model.

For a binary classification problem like this one, the following are some popular modelling implementations:
1. RandomForestClassifier
2. KNeighborsClassifier
3. ExtraTreesClassifier
4. LogisticRegression
5. BaggingClassifier with LogisticRegression
6. LGBMClassifier
7. XGBClassifier
8. CatBoostClassifier


# First approach: Modelling Separately

# 1. Using RandomForestClassifier

In [3]:
# Stratified, shuffled 5-fold splitter shared by the cross_val_score calls in this notebook
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [4]:
%%time
rfc_ec1 = RandomForestClassifier(n_estimators=500, min_samples_leaf=47, random_state=42)
score = cross_val_score(rfc_ec1, train[features], train.EC1, scoring='roc_auc', cv=cv).mean()
print(f'AUC score for RandomForestClassifier = {score.round(3)}')

AUC score for RandomForestClassifier = 0.708
CPU times: user 1min 1s, sys: 56 ms, total: 1min 2s
Wall time: 1min 2s


In [5]:
%%time
rfc_ec2 = RandomForestClassifier(n_estimators=500, min_samples_leaf=80, random_state=42)
score = cross_val_score(rfc_ec2, train[features], train.EC2, scoring='roc_auc', cv=cv).mean()
print(f'AUC score for RandomForestClassifier = {score.round(3)}')

AUC score for RandomForestClassifier = 0.586
CPU times: user 1min, sys: 68.6 ms, total: 1min
Wall time: 1min


In [6]:
# Disabled experiment: the whole search below is wrapped in a string literal, so
# nothing in it executes; the cell only evaluates to (and displays) that string.
# The quoted code sweeps min_samples_leaf over range(10, 100, 5) for a
# RandomForestClassifier on EC1, prints the mean ROC AUC per value and plots it.
# NOTE(review): the quoted code computes best_index on its last line but never uses it.
'''
%%time
# finding the optimum min_samples_leaf parameter

l = []

for min_samples_leaf in range(10, 100, 5):
    model = RandomForestClassifier(n_estimators=500, min_samples_leaf=min_samples_leaf, random_state=42)
    score = cross_val_score(model, train[features], train.EC1,
                            scoring='roc_auc', cv=cv).mean()
    print(f"{min_samples_leaf:3}: {score:.5f}")
    l.append((min_samples_leaf, score))

df = pd.DataFrame(l, columns=['min_samples_leaf', 'score'])

plt.scatter(df.min_samples_leaf, df.score, s=15)
plt.title('Random Forest for EC1')
plt.xlabel('min_samples_leaf')
plt.ylabel('auc')
plt.show()
best_index = df.score.argsort().iloc[-1]
'''

'\n%%time\n# finding the optimum min_samples_leaf parameter\n\nl = []\n\nfor min_samples_leaf in range(10, 100, 5):\n    model = RandomForestClassifier(n_estimators=500, min_samples_leaf=min_samples_leaf, random_state=42)\n    score = cross_val_score(model, train[features], train.EC1,\n                            scoring=\'roc_auc\', cv=cv).mean()\n    print(f"{min_samples_leaf:3}: {score:.5f}")\n    l.append((min_samples_leaf, score))\n\ndf = pd.DataFrame(l, columns=[\'min_samples_leaf\', \'score\'])\n\nplt.scatter(df.min_samples_leaf, df.score, s=15)\nplt.title(\'Random Forest for EC1\')\nplt.xlabel(\'min_samples_leaf\')\nplt.ylabel(\'auc\')\nplt.show()\nbest_index = df.score.argsort().iloc[-1]\n'

# 2. Using KNeighborsClassifier

In [7]:
%%time
# Right-skewed features will be transformed
log_features = [f for f in features if (train[f] >= 0).all() and scipy.stats.skew(train[f]) > 0]

knc_ec1 = make_pipeline(ColumnTransformer([('log', FunctionTransformer(np.log1p), log_features)],
                                        remainder='passthrough'),
                        StandardScaler(),
                        KNeighborsClassifier(n_neighbors=1190, weights='distance'))
score = cross_val_score(knc_ec1, train[features], train.EC1,
                        scoring='roc_auc', cv=cv).mean()
print(f'AUC score for KNeighborsClassifier = {score.round(3)}')

AUC score for KNeighborsClassifier = 0.698
CPU times: user 18 s, sys: 139 ms, total: 18.2 s
Wall time: 5.13 s


In [8]:
%%time
# Right-skewed features will be transformed
log_features = [f for f in features if (train[f] >= 0).all() and scipy.stats.skew(train[f]) > 0]

knc_ec2 = make_pipeline(ColumnTransformer([('log', FunctionTransformer(np.log1p), log_features)],
                                        remainder='passthrough'),
                        StandardScaler(),
                        KNeighborsClassifier(n_neighbors=1190, weights='distance'))
score = cross_val_score(knc_ec2, train[features], train.EC2,
                        scoring='roc_auc', cv=cv).mean()
print(f'AUC score for KNeighborsClassifier = {score.round(3)}')

AUC score for KNeighborsClassifier = 0.584
CPU times: user 17.7 s, sys: 77.8 ms, total: 17.7 s
Wall time: 4.94 s


In [9]:
# Disabled experiment: the whole search below is wrapped in a string literal, so
# nothing in it executes; the cell only evaluates to (and displays) that string.
# The quoted code sweeps n_neighbors over range(100, 1500, 10) for the
# log1p -> StandardScaler -> KNeighborsClassifier pipeline on EC1, prints the
# mean ROC AUC per value, plots it and picks the best-scoring row.
"""
%%time

# Right-skewed features will be transformed
log_features = [f for f in features if (train[f] >= 0).all() and scipy.stats.skew(train[f]) > 0]

l = []
for n_neighbors in range(100, 1500, 10):
    model = make_pipeline(ColumnTransformer([('log', FunctionTransformer(np.log1p), log_features)],
                                            remainder='passthrough'),
                          StandardScaler(),
                          KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance'))
    score = cross_val_score(model, train[features], train.EC1,
                            scoring='roc_auc', cv=cv).mean()
    print(f"# {n_neighbors:4}: {score:.5f}")
    l.append((n_neighbors, score))
df = pd.DataFrame(l, columns=['n_neighbors', 'score'])
plt.title('KNN for EC1')
plt.scatter(df.n_neighbors, df.score, s=15)
plt.xlabel('n_neighbors')
plt.ylabel('auc')
plt.show()
best_index = df.score.argsort().iloc[-1]
best_row = df.iloc[best_index]
"""

'\n%%time\n\n# Right-skewed features will be transformed\nlog_features = [f for f in features if (train[f] >= 0).all() and scipy.stats.skew(train[f]) > 0]\n\nl = []\nfor n_neighbors in range(100, 1500, 10):\n    model = make_pipeline(ColumnTransformer([(\'log\', FunctionTransformer(np.log1p), log_features)],\n                                            remainder=\'passthrough\'),\n                          StandardScaler(),\n                          KNeighborsClassifier(n_neighbors=n_neighbors, weights=\'distance\'))\n    score = cross_val_score(model, train[features], train.EC1,\n                            scoring=\'roc_auc\', cv=cv).mean()\n    print(f"# {n_neighbors:4}: {score:.5f}")\n    l.append((n_neighbors, score))\ndf = pd.DataFrame(l, columns=[\'n_neighbors\', \'score\'])\nplt.title(\'KNN for EC1\')\nplt.scatter(df.n_neighbors, df.score, s=15)\nplt.xlabel(\'n_neighbors\')\nplt.ylabel(\'auc\')\nplt.show()\nbest_index = df.score.argsort().iloc[-1]\nbest_row = df.iloc[best_inde

# 3. Using ExtraTreesClassifier

In [10]:
%%time
etc_ec1 = ExtraTreesClassifier(n_estimators=1000,
                               min_samples_leaf=12,
                               random_state=42)
score = cross_val_score(etc_ec1, train[features], train.EC1, 
                        scoring='roc_auc', cv=cv).mean()
print(f'AUC score for ExtraTreesClassifier = {score.round(3)}')

AUC score for ExtraTreesClassifier = 0.707
CPU times: user 51.2 s, sys: 161 ms, total: 51.4 s
Wall time: 51.4 s


In [11]:
%%time
etc_ec2 = ExtraTreesClassifier(n_estimators=1000,
                               min_samples_leaf=12,
                               random_state=42)
score = cross_val_score(etc_ec2, train[features], train.EC2, 
                        scoring='roc_auc', cv=cv).mean()
print(f'AUC score for ExtraTreesClassifier = {score.round(3)}')

AUC score for ExtraTreesClassifier = 0.586
CPU times: user 54.7 s, sys: 82.8 ms, total: 54.8 s
Wall time: 54.9 s


In [12]:
# Disabled experiment: the whole search below is wrapped in a string literal, so
# nothing in it executes; the cell only evaluates to (and displays) that string.
# The quoted code sweeps min_samples_leaf over range(4, 30, 2) for an
# ExtraTreesClassifier on EC1, prints the mean ROC AUC per value and plots it.
# NOTE(review): the quoted code writes to `result_dict`, which is not defined in
# any cell of this notebook, and uses random_state=1 whereas the active
# ExtraTrees cells use random_state=42 — fix both before re-enabling.
'''
%%time
l = []
for min_samples_leaf in range(4, 30, 2):
    model = ExtraTreesClassifier(n_estimators=1000,
                                 min_samples_leaf=min_samples_leaf,
                                 random_state=1)
    score = cross_val_score(model, train[features], train.EC1, 
                            scoring='roc_auc', cv=cv).mean()
    print(f"# {min_samples_leaf:3}: {score:.5f}")
    l.append((min_samples_leaf, score))
df = pd.DataFrame(l, columns=['min_samples_leaf', 'score'])
plt.scatter(df.min_samples_leaf, df.score, s=15)
plt.xlabel('min_samples_leaf')
plt.ylabel('auc')
plt.title('ExtraTrees for EC1')
plt.show()
best_index = df.score.argsort().iloc[-1]
best_row = df.iloc[best_index]
result_dict[('EC1', f'ET {best_row.min_samples_leaf.astype(int)}')] = best_row.score
'''

'\n%%time\nl = []\nfor min_samples_leaf in range(4, 30, 2):\n    model = ExtraTreesClassifier(n_estimators=1000,\n                                 min_samples_leaf=min_samples_leaf,\n                                 random_state=1)\n    score = cross_val_score(model, train[features], train.EC1, \n                            scoring=\'roc_auc\', cv=cv).mean()\n    print(f"# {min_samples_leaf:3}: {score:.5f}")\n    l.append((min_samples_leaf, score))\ndf = pd.DataFrame(l, columns=[\'min_samples_leaf\', \'score\'])\nplt.scatter(df.min_samples_leaf, df.score, s=15)\nplt.xlabel(\'min_samples_leaf\')\nplt.ylabel(\'auc\')\nplt.title(\'ExtraTrees for EC1\')\nplt.show()\nbest_index = df.score.argsort().iloc[-1]\nbest_row = df.iloc[best_index]\nresult_dict[(\'EC1\', f\'ET {best_row.min_samples_leaf.astype(int)}\')] = best_row.score\n'

# 4. Using Logistic Regression

In [13]:
%%time
lr_ec1 = make_pipeline(ColumnTransformer([('log', FunctionTransformer(np.log1p), log_features)],
                                         remainder='passthrough'),
                       StandardScaler(),
                       PolynomialFeatures(2, include_bias=False),
                       LogisticRegression(C=0.001, max_iter=500))
score = cross_val_score(lr_ec1, train[features], train.EC1,
                        scoring='roc_auc', cv=cv).mean()
print(f'AUC score for Logistic Regression = {score.round(3)}')

AUC score for Logistic Regression = 0.698
CPU times: user 13.8 s, sys: 3.35 s, total: 17.2 s
Wall time: 4.37 s


In [14]:
%%time
lr_ec2 = make_pipeline(ColumnTransformer([('log', FunctionTransformer(np.log1p), log_features)],
                                        remainder='passthrough'),
                       StandardScaler(),
                       PolynomialFeatures(2, include_bias=False),
                       LogisticRegression(C=0.001, max_iter=500))
score = cross_val_score(lr_ec2, train[features], train.EC2,
                        scoring='roc_auc', cv=cv).mean()
print(f'AUC score for Logistic Regression = {score.round(3)}')

AUC score for Logistic Regression = 0.582
CPU times: user 14.9 s, sys: 3.78 s, total: 18.7 s
Wall time: 4.82 s


# 5. Using BaggingClassifier with Logistic Regression

In [15]:
%%time
bclr_ec1 = make_pipeline(ColumnTransformer([('log', FunctionTransformer(np.log1p), log_features)],
                                            remainder='passthrough'),
                         StandardScaler(),
                         BaggingClassifier(make_pipeline(PolynomialFeatures(2, include_bias=False),
                                                         LogisticRegression(C=0.001, max_iter=500)),
                                                         n_estimators=30,
                                                         bootstrap=False,
                                                         max_features=24,
                                                         random_state=42))
score = cross_val_score(bclr_ec1, train[features], train.EC1,
                        scoring='roc_auc', cv=cv).mean()
print(f'AUC score for BaggingClassifier with Logistic Regression = {score.round(3)}')

AUC score for BaggingClassifier with Logistic Regression = 0.698
CPU times: user 2min 57s, sys: 1min 9s, total: 4min 6s
Wall time: 1min 2s


In [16]:
%%time
bclr_ec2 = make_pipeline(ColumnTransformer([('log', FunctionTransformer(np.log1p), log_features)],
                                            remainder='passthrough'),
                         StandardScaler(),
                         BaggingClassifier(make_pipeline(PolynomialFeatures(2, include_bias=False),
                                                         LogisticRegression(C=0.001, max_iter=500)),
                                                         n_estimators=30,
                                                         bootstrap=False,
                                                         max_features=24,
                                                         random_state=42))
score = cross_val_score(bclr_ec2, train[features], train.EC2,
                        scoring='roc_auc', cv=cv).mean()
print(f'AUC score for BaggingClassifier with Logistic Regression = {score.round(3)}')

AUC score for BaggingClassifier with Logistic Regression = 0.584
CPU times: user 2min 47s, sys: 1min 3s, total: 3min 51s
Wall time: 58.7 s


# 6. Using LGBMClassifier

In [17]:
%%time
from lightgbm import LGBMClassifier
params_ec1 = {'n_estimators': 753, 'colsample_bytree': 0.1, 'min_child_samples': 38, 'num_leaves': 15, 'learning_rate': 0.01, 'reg_lambda': 1}
lgbm_ec1 = LGBMClassifier(**params_ec1, random_state=42)
score = cross_val_score(lgbm_ec1, train[features], train.EC1,
                        scoring='roc_auc', cv=cv).mean()
print(f'AUC score for LGBMClassifier = {score.round(3)}')

AUC score for LGBMClassifier = 0.71
CPU times: user 13.4 s, sys: 10.3 s, total: 23.7 s
Wall time: 18 s


In [18]:
%%time
from lightgbm import LGBMClassifier
params_ec2 = {'n_estimators': 753, 'colsample_bytree': 0.1, 'min_child_samples': 38, 'num_leaves': 15, 'learning_rate': 0.01, 'reg_lambda': 1}
lgbm_ec2 = LGBMClassifier(**params_ec2, random_state=42)
score = cross_val_score(lgbm_ec2, train[features], train.EC2,
                        scoring='roc_auc', cv=cv).mean()
print(f'AUC score for LGBMClassifier = {score.round(3)}')

AUC score for LGBMClassifier = 0.583
CPU times: user 12.4 s, sys: 9.71 s, total: 22.1 s
Wall time: 16.6 s


# 7. Using XGBClassifier

In [19]:
%%time
from xgboost import XGBClassifier
params_ec1 = {'learning_rate':0.1,
              'n_estimators':1000,  
              'max_depth':6, 
              'subsample':0.6,
              'colsample_bytree':0.5,
              'reg_alpha':0.01, 
              'reg_lambda':0.01, 
              'min_child_weight':10,
              'gamma':0,
              'objective':'binary:logistic'} #objective can be 'multi:softmax' for multiclass classification
xgb_ec1 = XGBClassifier(**params_ec1, random_state=42)
score = cross_val_score(xgb_ec1, train[features], train.EC1,
                        scoring='roc_auc', cv=cv).mean()
print(f'AUC score for XGBClassifier = {score.round(3)}')

AUC score for XGBClassifier = 0.668
CPU times: user 1min 35s, sys: 174 ms, total: 1min 35s
Wall time: 1min 35s


In [20]:
%%time
from xgboost import XGBClassifier
params_ec2 = {'learning_rate':0.1,
              'n_estimators':1000,  
              'max_depth':6, 
              'subsample':0.6,
              'colsample_bytree':0.5,
              'reg_alpha':0.01, 
              'reg_lambda':0.01, 
              'min_child_weight':10,
              'gamma':0,
              'objective':'binary:logistic'} #objective can be 'multi:softmax' for multiclass classification
xgb_ec2 = XGBClassifier(**params_ec2, random_state=42)
score = cross_val_score(xgb_ec1, train[features], train.EC2,
                        scoring='roc_auc', cv=cv).mean()
print(f'AUC score for XGBClassifier = {score.round(3)}')

AUC score for XGBClassifier = 0.541
CPU times: user 1min 34s, sys: 160 ms, total: 1min 34s
Wall time: 1min 34s


# 8. Using CatBoostClassifier

In [21]:
%%time
from catboost import CatBoostClassifier
params_ec1 = {'n_estimators':5000, 
              'learning_rate':0.01, 
              'depth':6, 
              'subsample':0.6, 
              'colsample_bylevel':0.5, 
              'l2_leaf_reg':2, 
              'min_data_in_leaf':0.5, 
              'max_bin':128,
              'early_stopping_rounds':10}
cb_ec1 = CatBoostClassifier(**params_ec1, random_state=42, verbose=False)
score = cross_val_score(cb_ec1, train[features], train.EC1,
                        scoring='roc_auc', cv=cv).mean()
print(f'AUC score for CatBoostClassifier = {score.round(3)}')

AUC score for CatBoostClassifier = 0.7
CPU times: user 6min 48s, sys: 35.6 s, total: 7min 23s
Wall time: 1min 58s


In [22]:
%%time
from catboost import CatBoostClassifier
params_ec2 = {'n_estimators':5000, 
              'learning_rate':0.01, 
              'depth':6, 
              'subsample':0.6, 
              'colsample_bylevel':0.5, 
              'l2_leaf_reg':2, 
              'min_data_in_leaf':0.5, 
              'max_bin':128,
              'early_stopping_rounds':10}
cb_ec2 = CatBoostClassifier(**params_ec2, random_state=42, verbose=False)
score = cross_val_score(cb_ec2, train[features], train.EC2,
                        scoring='roc_auc', cv=cv).mean()
print(f'AUC score for CatBoostClassifier = {score.round(3)}')

AUC score for CatBoostClassifier = 0.563
CPU times: user 6min 50s, sys: 36 s, total: 7min 26s
Wall time: 1min 58s


# Ensembling the models

In [23]:
%%time
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score
models = [rfc_ec1,knc_ec1,etc_ec1,lr_ec1,bclr_ec1,lgbm_ec1,xgb_ec1,cb_ec1]
model_names = ['rfc_ec1','knc_ec1','etc_ec1','lr_ec1','bclr_ec1','lgbm_ec1','xgb_ec1','cb_ec1']

ensemble_ec1 = VotingClassifier(estimators=[('rfc_ec1', rfc_ec1),
                                            ('knc_ec1', knc_ec1),
                                            ('etc_ec1', etc_ec1),
                                            ('lr_ec1', lr_ec1),
                                            ('bclr_ec1', bclr_ec1),
                                            ('lgbm_ec1', lgbm_ec1),
                                            ('xgb_ec1', xgb_ec1),
                                            ('cb_ec1', cb_ec1)],
                                voting='soft')
score = cross_val_score(ensemble_ec1, train[features], train.EC1,
                        scoring='roc_auc', cv=cv).mean()
print(f'AUC score for the Ensemble model = {score.round(3)}')

AUC score for the Ensemble model = 0.708
CPU times: user 14min 9s, sys: 2min 3s, total: 16min 12s
Wall time: 7min 6s


In [24]:
%%time
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score
models = [rfc_ec2,knc_ec2,etc_ec2,lr_ec2,bclr_ec2,lgbm_ec2,xgb_ec2,cb_ec2]
model_names = ['rfc_ec2','knc_ec2','etc_ec2','lr_ec2','bclr_ec2','lgbm_ec2','xgb_ec2','cb_ec2']

ensemble_ec2 = VotingClassifier(estimators=[('rfc_ec2', rfc_ec2),
                                            ('knc_ec2', knc_ec2),
                                            ('etc_ec2', etc_ec2),
                                            ('lr_ec2', lr_ec2),
                                            ('bclr_ec2', bclr_ec2),
                                            ('lgbm_ec2', lgbm_ec2),
                                            ('xgb_ec2', xgb_ec2),
                                            ('cb_ec2', cb_ec2)],
                                voting='soft')
score = cross_val_score(ensemble_ec2, train[features], train.EC2,
                        scoring='roc_auc', cv=cv).mean()
print(f'AUC score for the Ensemble model = {score.round(3)}')

AUC score for the Ensemble model = 0.581
CPU times: user 13min 37s, sys: 1min 54s, total: 15min 32s
Wall time: 6min 54s


# Second Approach: Using MultiOutputClassifier

to be explored later

# Final prediction and submission


In [25]:
%%time
X_train = train[features]
y_train_EC1 = train.EC1
y_train_EC2 = train.EC2
X_test = test

ensemble_ec1.fit(X_train,y_train_EC1)
pred_ec1 = ensemble_ec1.predict_proba(X_test)

ensemble_ec2.fit(X_train,y_train_EC2)
pred_ec2 = ensemble_ec2.predict_proba(X_test)


CPU times: user 7min 4s, sys: 59 s, total: 8min 3s
Wall time: 3min 37s


In [26]:
sub = pd.DataFrame({'EC1':pred_ec1[:,1], 'EC2':pred_ec2[:,1]},index=test.index)

sub.to_csv('submission.csv')
!head submission.csv

id,EC1,EC2
14838,0.4380516893091811,0.7901953198177006
14839,0.8333989566954694,0.8229668982705554
14840,0.785062571417183,0.7641057448352698
14841,0.7068100648508504,0.820360886817204
14842,0.7868260130684259,0.8133271785413876
14843,0.5651707450844874,0.8246739756770008
14844,0.5444887960324278,0.8629905496923573
14845,0.6081195814566478,0.8475070386979278
14846,0.675624904683796,0.760425484088733


# References
@AmbrosM: https://www.kaggle.com/code/ambrosm/pss3e18-eda-which-makes-sense  
@AmbrosM: https://www.kaggle.com/competitions/playground-series-s3e18/discussion/420127  