In [1]:
import numpy as np # linear algebra 
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os 
from matplotlib import pyplot as plt
import os
from sklearn import tree
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer
from sklearn.metrics import log_loss
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline as imbpipeline

for dirname, _, filenames in os.walk('/kaggle/input'): 
    for filename in filenames: 
        print(os.path.join(dirname, filename))

/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv
/kaggle/input/icr-identify-age-related-conditions/greeks.csv
/kaggle/input/icr-identify-age-related-conditions/train.csv
/kaggle/input/icr-identify-age-related-conditions/test.csv


In [2]:
input_path = "/kaggle/input/icr-identify-age-related-conditions/"
work_path = os.getcwd()
df_train_file = pd.read_csv(input_path + 'train.csv')

In [3]:
# from https://www.kaggle.com/code/datafan07/icr-simple-eda-baseline
# see also https://www.kaggle.com/code/dan3dewey/icr-2023-balanced-log-loss/notebook
# see also https://www.kaggle.com/competitions/icr-identify-age-related-conditions/discussion/409691

def balance_loglossv2(y_true, y_pred):
    target_mean = y_true.mean()
    w0 = 1/(1-target_mean)
    w1 = 1/target_mean
    sample_weight = [w0 if y == 0 else w1 for y in y_true]
    loss = log_loss(y_true, y_pred, sample_weight=sample_weight)
    
    return loss

In [4]:
def prepare_initial(df):
    df.sort_values(by=['Id'], axis=0, ascending=True, inplace=True)
    
    # training set labels
    df_y = None
    if 'Class' in df.columns:
      df_y = df['Class']
    
    # training set input
    best_columns = ['AB', 'DI', 'DH', 'GL', 'DE', 'DA', 'EB', 'CR', 'EE', 'EH', 'FD ', 'CC', 'DL', 'FE', 'DU', 'AF', 'FI', 'BC', 'FL', 
                    'FR', 'BR', 'AM']
    df_input = df.loc[:, best_columns]
    
    return df_input, df_y

In [5]:
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3)
competition_balanced_log_loss = make_scorer(balance_loglossv2, needs_proba=True, greater_is_better=False)
scores = []
grid_results = []

def compute_scores(clf, cv_strat, df_x, df_y, col_name):
    for i in ['accuracy', 'neg_log_loss', competition_balanced_log_loss]:
        scores.append([col_name, i, cross_val_score(clf, df_x, df_y, cv=cv_strat, scoring=i).mean()])
    return scores

In [6]:
df_ready_x, df_ready_y = prepare_initial(df_train_file)

imputer_params = {'n_neighbors': 2, 'weights': "uniform", 'missing_values': np.nan, 'copy': False}

In [7]:
imbalance_ratio = (df_ready_y == 0).sum() / (df_ready_y == 1).sum() # maj /  min classes
# upsampling and cleaning
clf = KNNImputer(**imputer_params).fit_transform(df_ready_x, df_ready_y)

In [8]:
clf_smote = ADASYN()
print(f'Raw data: ratio is {imbalance_ratio:.4f} with {df_ready_x.shape}')
print(df_ready_y.value_counts())
x_resampled, y_resampled = clf_smote.fit_resample(df_ready_x, df_ready_y)

resampled_ratio = (y_resampled == 0).sum() / (y_resampled == 1).sum() 
print(f'After baseline SMOTE: ratio is {resampled_ratio:.4f} with {x_resampled.shape}.')
print(y_resampled.value_counts())

Raw data: ratio is 4.7130 with (617, 22)
0    509
1    108
Name: Class, dtype: int64
After baseline SMOTE: ratio is 1.0200 with (1008, 22).
0    509
1    499
Name: Class, dtype: int64


In [9]:
clf = imbpipeline(steps=[("a", KNNImputer(**imputer_params)), ("b", RobustScaler()), ("c", ADASYN()), ("d", XGBClassifier())])
clf.fit(df_ready_x, df_ready_y)
compute_scores(clf, cv, df_ready_x, df_ready_y, 'baseline')
scores

[['baseline', 'accuracy', 0.9146254043185593],
 ['baseline', 'neg_log_loss', -0.24031096049225212],
 ['baseline',
  make_scorer(balance_loglossv2, greater_is_better=False, needs_proba=True),
  -0.38229210176063455]]

In [10]:
#clf = Pipeline(steps=[("a", KNNImputer(**imputer_params)), ("b", RobustScaler()), ("c", SMOTEENN()), ("d", XGBClassifier())])
clf = imbpipeline(steps=[("a", KNNImputer(**imputer_params)), ("b", RobustScaler()), ("c", ADASYN()), ("d", XGBClassifier())])

param_grid = {
#'d__scale_pos_weight': [imbalance_ratio], 
    'a__n_neighbors': [2, 4, 6, 8, 10, 12, 14, 16], 
    'a__weights': ["uniform"], 
    'a__missing_values': [np.nan], 
    'a__copy': [False],
    'c__n_neighbors': [4, 6, 8, 10, 12, 14, 16, 18, 20], 
    'c__sampling_strategy': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'd__n_estimators': [30, 40, 50, 50, 70, 80, 90, 100, 120, 130, 140, 150, 160, 170, 180, 190, 200], 
    'd__max_depth': [3, 5, 7, 9, 11, 13, 15], 
    'd__learning_rate': [1, 0.5, 0.1, 0.05, 0.01,0.005],
}


In [11]:
#parameters = {
#    'alpha': [0.8, 1, 2, 3], 
#   'hidden_layer_sizes': [4, 5, 6], 
#    'max_iter': [700, 800, 900], 
#}


In [12]:
#parameters = {
#    'max_depth': [9, 10, 11],
#    'max_features': [10, 15, 17], 
#    'n_estimators': [28, 30, 32],
#}


In [13]:
#parameters = {
#    'verbose': [0], 
#    'auto_class_weights' : ['Balanced'],
#    'learning_rate': [0.1, 1],
#    'iterations': [100, 1000],
#    'max_depth': [7],
#}

In [14]:
grid_search = RandomizedSearchCV(
   estimator = clf,
   #param_grid = param_grid,
   param_distributions = param_grid,
   scoring = 'neg_log_loss', #competition_balanced_log_loss,
   n_iter = 1000,
   n_jobs = 10,
   cv = 5,
   verbose = True
)

In [15]:
#grid_search.fit(df_ready_x, df_ready_y)

In [16]:
#grid_search.best_params_

{'d__n_estimators': 120,
 'd__max_depth': 5,
 'd__learning_rate': 0.1,
 'c__sampling_strategy': 0.6,
 'c__n_neighbors': 10,
 'a__weights': 'uniform',
 'a__n_neighbors': 4,
 'a__missing_values': nan,
 'a__copy': False}
 
 {'d__n_estimators': 200,
 'd__max_depth': 11,
 'd__learning_rate': 0.05,
 'c__sampling_strategy': 0.5,
 'c__n_neighbors': 8,
 'a__weights': 'uniform',
 'a__n_neighbors': 6,
 'a__missing_values': nan,
 'a__copy': False}
 
 {'d__n_estimators': 160,
 'd__max_depth': 3,
 'd__learning_rate': 0.05,
 'c__sampling_strategy': 1,
 'c__n_neighbors': 18,
 'a__weights': 'uniform',
 'a__n_neighbors': 6,
 'a__missing_values': nan,
 'a__copy': False}
 
 {'d__n_estimators': 140,
 'd__max_depth': 11,
 'd__learning_rate': 0.05,
 'c__sampling_strategy': 0.5,
 'c__n_neighbors': 18,
 'a__weights': 'uniform',
 'a__n_neighbors': 8,
 'a__missing_values': nan,
 'a__copy': False}

In [17]:
smote_params = {'n_neighbors': 18, 'sampling_strategy': 0.5}
xgb_params = {'learning_rate': 0.05, 'max_depth': 11, 'n_estimators': 140}
imputer_params = {'n_neighbors': 8, 'weights': 'uniform','missing_values': np.nan, 'copy': False}
clf = imbpipeline(steps=[("a", KNNImputer(**imputer_params)), ("b", RobustScaler()), ("c", ADASYN(**smote_params)), ("d", XGBClassifier(**xgb_params))])

clf.fit(df_ready_x, df_ready_y)
#compute_scores(clf, cv, df_ready_x, df_ready_y, 'optim')
#scores

* -0.3258 XGB {'learning_rate': 0.09, 'max_depth': 2, 'n_estimators': 65, 'scale_pos_weight': 4.712962962962963}
* -0.4539 MLP {'alpha': 2, 'hidden_layer_sizes': 5, 'max_iter': 800}
* -0.4461 RF  {'max_depth': 10, 'max_features': 15, 'n_estimators': 30}
* -0.3428 Cat {'auto_class_weights': 'Balanced', 'iterations': 100, 'learning_rate': 0.1, 'max_depth': 7, 'verbose': 0}
* -0.3231 XGB fe select {'learning_rate': 0.1, 'max_depth': 2,'n_estimators': 65, 'scale_pos_weight': 4.712962962962963}


* XGB with KNN, RobustScaler and Boruta scored **0.27**
** optim accuracy 0.9027755922720518
** optim neg_log_loss -0.2641812458160158
** optim -0.325687961912535
* Now adding SMOTE
* -0.46142 with SMOTEEN and {'learning_rate': 0.1,'max_depth': 3,'n_estimators': 65,'scale_pos_weight': 0.7865497076023392}  ERROR? why is it better with the unbalanced ratio?
** optim accuracy 0.9238482384823851
** optim neg_log_loss -0.20840998742484473
** optim loglossv2 -0.4498135188099039
* -0.350951 with SMOTE and {'b__k_neighbors': 11, 'b__sampling_strategy': 0.75, 'd__learning_rate': 0.1, 'd__max_depth': 
 'd__n_estimators': 50}
** optim accuracy 0.9108269953667277
** optim neg_log_loss -0.2246716546812891
** optim balanc_loglossv2 -0.350951546247298
* -0.36220598 with SMOTE and {'b__k_neighbors': 14, 'b__sampling_strategy': 0.75, 'd__learning_rate': 0.11, 'd__max_depth': 8, 'd__n_estimators': 47}
** optim accuracy 0.9114083398898505
** optim neg_log_loss -0.23217032137811883
** optim balance_loglossv2 -0.36220598518369224
* -0.320258 with SMOTE and {'c__k_neighbors': 7, 'c__sampling_strategy': 1, 'd__learning_rate': 0.15, 'd__max_depth': 3, 'd__n_estimators': 38}
** optim accuracy 0.9054899903837748
** optim neg_log_loss -0.22611896007194524
** optim balance_loglossv2 -0.3202587222164088
* -0.361626 and acc 0.921151 with ADASYN and {'d__n_estimators': 120, 'd__max_depth': 5, 'd__learning_rate': 0.1, 'c__sampling_strategy': 0.6, 'c__n_neighbors': 10, 'a__weights': 'uniform', 'a__n_neighbors': 4, 'a__missing_values': n an, 'a__copy': False}
* -0.374 and acc with ADASYN and {'d__n_estimators': 200, 'd__max_depth': 11, 'd__learning_rate': 0.05, 'c__sampling_strategy': 0.5, 'c__n_neighbors': 8, 'a__weights': 'uniform', 'a__n_neighbors': 6, 'a__missing_values': nan, 'a__copy': False}
* -0.297683and acc with ADASYN and  {'d__n_estimators': 160, 'd__max_depth': 3, 'd__learning_rate': 0.05, 'c__sampling_strategy': 1, 'c__n_neighbors': 18, 'a__weights': 'uniform', 'a__n_neighbors': 6, 'a__missing_values': nan, 'a__copy': False}
* -0.388858 and acc 0.921 and neg_log_loss -0.19661032 with ADASYN and {'d__n_estimators': 140, 'd__max_depth': 11, 'd__learning_rate': 0.05, 'c__sampling_strategy': 0.5, 'c__n_neighbors': 18, 'a__weights': 'uniform', 'a__n_neighbors': 8, 'a__missing_values': nan, 'a__copy': False}
 


In [18]:
df_test_file = pd.read_csv(input_path + 'test.csv')
df_test_x, df_test_y = prepare_initial(df_test_file)
df_test_preds = pd.DataFrame(clf.predict_proba(df_test_x))
df_test_preds

Unnamed: 0,0,1
0,0.91256,0.08744
1,0.91256,0.08744
2,0.91256,0.08744
3,0.91256,0.08744
4,0.91256,0.08744


In [19]:
df_test_y = pd.concat([df_test_file['Id'], df_test_preds], axis=1)
df_test_y

Unnamed: 0,Id,0,1
0,00eed32682bb,0.91256,0.08744
1,010ebe33f668,0.91256,0.08744
2,02fa521e1838,0.91256,0.08744
3,040e15f562a2,0.91256,0.08744
4,046e85c7cc7f,0.91256,0.08744


In [20]:
df_test_y.to_csv('/kaggle/working/submission.csv', header=['Id', 'class_0', 'class_1'], index=False)
!head /kaggle/working/submission.csv

Id,class_0,class_1
00eed32682bb,0.9125601,0.08743991
010ebe33f668,0.9125601,0.08743991
02fa521e1838,0.9125601,0.08743991
040e15f562a2,0.9125601,0.08743991
046e85c7cc7f,0.9125601,0.08743991


### version 4 scored **0.27** with Boruta, KNN, Robust and XGB optim
### version 5 scored **0.31** with Boruta, KNN, Robust, SMOTEEN and XGB optim
### version 6 scored **0.31** with Boruta, KNN, Robust, SMOTE and XGB optim