In [1]:
import numpy as np  
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler
#from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
#from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import make_scorer
from sklearn.metrics import log_loss
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline as imbpipeline

In [2]:
input_path = "/kaggle/input/icr-identify-age-related-conditions/"
#work_path = os.getcwd()
df_train_file = pd.read_csv(input_path + 'train.csv')
#df_greek_file = pd.read_csv(input_path + 'greeks.csv')
#df_test_file = pd.read_csv(input_path + 'test.csv')
#df_greek_file.sample(10)

## Strategy: build ensemble averaging predictions with best algorithms so far, no PCA, Boruta feature reduction, RobustScaler, KNN imputer 

In [3]:
#def balance_loglossv2(y_true, y_pred):
#    target_mean = y_true.mean()
#    w0 = 1/(1-target_mean)
#    w1 = 1/target_mean
#    sample_weight = [w0 if y == 0 else w1 for y in y_true]
#    loss = log_loss(y_true, y_pred, sample_weight=sample_weight)
#    
#    return loss

In [4]:
#cv = RepeatedStratifiedKFold(n_splits=5)
#competition_balanced_log_loss = make_scorer(balance_loglossv2, needs_proba=True)
#scores = []

#def compute_scores(clf, cv_strat, df_x, df_y, col_name):
#    for i in ['accuracy', 'neg_log_loss', competition_balanced_log_loss]:
#        scores.append([col_name, i, cross_val_score(clf, df_x, df_y, cv=cv_strat, scoring=i).mean()])
#    return scores

In [5]:
def prepare_initial(df):
    df.sort_values(by=['Id'], axis=0, ascending=True, inplace=True)
    
    # training set labels
    df_y = None
    if 'Class' in df.columns:
      df_y = df['Class']
    
    # training set input
    best_columns = ['AB', 'DI', 'DH', 'GL', 'DE', 'DA', 'EB', 'CR', 'EE', 'EH', 'FD ', 'CC', 'DL', 'FE', 'DU', 'AF', 'FI', 'BC', 'FL', 
                    'FR', 'BR', 'AM']
    df_input = df.loc[:, best_columns]
    
    return df_input, df_y

In [6]:
df_ready_x, df_ready_y = prepare_initial(df_train_file)

Let's do an ensemble of models

In [7]:
ensemble = {}
#imputer_params = {'strategy':'median', 'missing_values': pd.NA}
imputer_params = {'n_neighbors': 2, 'weights': "uniform", 'missing_values': np.nan, 'copy': False}

In [8]:
# upsampling and cleaning
#clf = KNNImputer(**imputer_params).fit_transform(df_ready_x, df_ready_y)

#clf_smote = ADASYN()
#print(f'Raw data: ratio is {imbalance_ratio:.4f} with {df_ready_x.shape}')
#print(df_ready_y.value_counts())
#x_resampled, y_resampled = clf_smote.fit_resample(df_ready_x, df_ready_y)

#resampled_ratio = (y_resampled == 0).sum() / (y_resampled == 1).sum() 
#print(f'After ADASYN: ratio is {resampled_ratio:.4f} with {x_resampled.shape}.')
#print(y_resampled.value_counts())

In [9]:
#imbalance_ratio = (df_ready_y == 0).sum() / (df_ready_y == 1).sum() # maj /  min classes
#parameters = {'learning_rate': 0.09, 'max_depth': 2, 'n_estimators': 65, 'scale_pos_weight': imbalance_ratio}

smote_params = {'n_neighbors': 18, 'sampling_strategy': 0.5}
parameters = {'learning_rate': 0.05, 'max_depth': 11, 'n_estimators': 140}
imputer_params = {'n_neighbors': 8, 'weights': 'uniform','missing_values': np.nan, 'copy': False}

clf = imbpipeline(steps=[("a", KNNImputer(**imputer_params)), ("b", RobustScaler()), ("c", ADASYN(**smote_params)), ("d", XGBClassifier(**parameters))])

clf.fit(df_ready_x, df_ready_y)
#compute_scores(clf, cv, df_ready_x, df_ready_y, 'XGB')
ensemble['XGB'] = clf

In [10]:
#parameters = {'auto_class_weights': 'Balanced', 'iterations': 100, 'learning_rate': 0.1, 'max_depth': 7, 'verbose': 0}
#smote_params = {'n_neighbors': 18, 'sampling_strategy': 0.5}
smote_params = {'n_neighbors': 8, 'sampling_strategy': 0.6}
parameters = {'verbose': 0, 'max_depth': 9, 'learning_rate': 0.1, 'iterations': 200}

clf = imbpipeline(steps=[("a", KNNImputer(**imputer_params)), ("b", RobustScaler()), ("c", ADASYN(**smote_params)), ("d", CatBoostClassifier(**parameters))])

clf.fit(df_ready_x, df_ready_y)
#compute_scores(clf, cv, df_ready_x, df_ready_y, 'CatB')
ensemble['CatB'] = clf

In [11]:
parameters = {'alpha': 2, 'hidden_layer_sizes': 5, 'max_iter': 800}
smote_params = {'n_neighbors': 18, 'sampling_strategy': 0.5}

clf = imbpipeline(steps=[("a", KNNImputer(**imputer_params)), ("b", RobustScaler()), ("c", ADASYN(**smote_params)), ("d", MLPClassifier(**parameters))])
#clf = make_pipeline(KNNImputer(**imputer_params), RobustScaler(), MLPClassifier(**parameters))

clf.fit(df_ready_x, df_ready_y)
#compute_scores(clf, cv, df_ready_x, df_ready_y, 'MLP')
ensemble['MLP'] = clf

In [12]:
#scores

In [13]:
estimators = [(k, v) for k, v in ensemble.items()]
estimators

[('XGB',
  Pipeline(steps=[('a', KNNImputer(copy=False, n_neighbors=8)),
                  ('b', RobustScaler()),
                  ('c', ADASYN(n_neighbors=18, sampling_strategy=0.5)),
                  ('d',
                   XGBClassifier(base_score=None, booster=None, callbacks=None,
                                 colsample_bylevel=None, colsample_bynode=None,
                                 colsample_bytree=None,
                                 early_stopping_rounds=None,
                                 enable_categorical=False, eval_metric=None,
                                 feature_types=None, g...one,
                                 grow_policy=None, importance_type=None,
                                 interaction_constraints=None, learning_rate=0.05,
                                 max_bin=None, max_cat_threshold=None,
                                 max_cat_to_onehot=None, max_delta_step=None,
                                 max_depth=11, max_leaves=None,
     

In [14]:
## try with stacking

#stacker = StackingClassifier(estimators, stack_method='predict_proba', cv=StratifiedKFold(n_splits=5), final_estimator=LogisticRegression(class_weight='balanced'))
#stacker.fit(df_ready_x, df_ready_y)
#compute_scores(stacker, cv, df_ready_x, df_ready_y, 'Stacking')                                                

With all ensemble stacking => 0.276120 largely overfitting
* ['RF', 'accuracy', 0.907601188915115],
* ['RF', 'neg_log_loss', -0.2667133453792237],
* ['RF', make_scorer(balance_loglossv2, needs_proba=True), 0.4991141391198179],
* ['XGB', 'accuracy', 0.9032957426348456],
* ['XGB', 'neg_log_loss', -0.261652610888779],
* ['XGB', make_scorer(balance_loglossv2, needs_proba=True), 0.3296584422965175],
* ['CatB', 'accuracy', 0.9275941952967917],
* ['CatB', 'neg_log_loss', -0.1858073385100156],
* ['CatB', make_scorer(balance_loglossv2, needs_proba=True), 0.33843747450124706],
* ['MLP', 'accuracy', 0.9044059795436663],
* ['MLP', 'neg_log_loss', -0.3032586188188822],
* ['MLP', make_scorer(balance_loglossv2, needs_proba=True), 0.6351087131152136],
* ['Stacking', 'accuracy', 0.9140965119328611],
* ['Stacking', 'neg_log_loss', -0.27113339775863726],
* ['Stacking', make_scorer(balance_loglossv2, needs_proba=True), 0.27612014609236973]]

XGB, CatB and MLP stacking => 0.2842077 largely overfitting
* ['XGB', 'accuracy', 0.9065390331322667],
* ['XGB', 'neg_log_loss', -0.2694250966625059],
* ['XGB', make_scorer(balance_loglossv2, needs_proba=True), 0.33095943225544733],
* ['CatB', 'accuracy', 0.9270390768423813],
* ['CatB', 'neg_log_loss', -0.18895375315789595],
* ['CatB', make_scorer(balance_loglossv2, needs_proba=True), 0.32433161130442684],
* ['MLP', 'accuracy', 0.9076186729609231],
* ['MLP', 'neg_log_loss', -0.26615718540054334],
* ['MLP', make_scorer(balance_loglossv2, needs_proba=True), 0.571068775679251],
* ['Stacking', 'accuracy', 0.9157137861701198],
* ['Stacking', 'neg_log_loss', -0.27339630727668607],
* ['Stacking', make_scorer(balance_loglossv2, needs_proba=True), 0.28420778284662507]]

XGB, CatB and MLP stacking + median imputer => 0.2775539 largely overfitting
* ['XGB', 'accuracy', 0.9070766675408689],
* ['XGB', 'neg_log_loss', -0.27578745834853036],
* ['XGB', make_scorer(balance_loglossv2, needs_proba=True), 0.33812925985610964],
* ['CatB', 'accuracy', 0.9367995454148089],
* ['CatB', 'neg_log_loss', -0.19564630341230985],
* ['CatB', make_scorer(balance_loglossv2, needs_proba=True), 0.3419744543909858],
* ['MLP', 'accuracy', 0.9043884954978582],
* ['MLP', 'neg_log_loss', -0.302471357758935],
* ['MLP', make_scorer(balance_loglossv2, needs_proba=True), 0.6099516179380783],
* ['Stacking', 'accuracy', 0.9178643238045284],
* ['Stacking', 'neg_log_loss', -0.2732104797654024],
* ['Stacking', make_scorer(balance_loglossv2, needs_proba=True), 0.2775539457629785]]

XGB, CatB and MLP stacking + knn imputer => 0.283525 largely overfitting
* ['XGB', 'accuracy', 0.907614301949471],
* ['XGB', 'neg_log_loss', -0.2734284490401531],
* ['XGB', make_scorer(balance_loglossv2, needs_proba=True), 0.33533867402966666],
* ['CatB', 'accuracy', 0.9319433516915814],
* ['CatB', 'neg_log_loss', -0.18600481218275972],
* ['CatB', make_scorer(balance_loglossv2, needs_proba=True), 0.3522857255339534],
* ['MLP', 'accuracy', 0.9065390331322669],
* ['MLP', 'neg_log_loss', -0.2873080264105889],
* ['MLP', make_scorer(balance_loglossv2, needs_proba=True), 0.5035410498876508],
* ['Stacking', 'accuracy', 0.9167715709415157],
* ['Stacking', 'neg_log_loss', -0.2763960346923296],
* ['Stacking', make_scorer(balance_loglossv2, needs_proba=True), 0.2835253646985966]]

TabPFN, XGB averaging + knn imputer, RobustScaler, Boruta => ??
* ['TabPFN', 'accuracy', 0.9319695777602938],
* ['TabPFN', 'neg_log_loss', -0.1952366313150684],
* ['TabPFN', make_scorer(balance_loglossv2, needs_proba=True), 0.40145512164254554],
* ['XGB', 'accuracy', 0.9141358510359296],
* ['XGB', 'neg_log_loss', -0.2636786792332117],
* ['XGB', make_scorer(balance_loglossv2, needs_proba=True), 0.31515798371908404]]

In [15]:
#from sklearn.ensemble import VotingClassifier
#
#voter = VotingClassifier(estimators, weights = weights, voting = 'soft')
#voter.fit(df_ready_x, df_ready_y)
#compute_scores(voter, cv, df_ready_x, df_ready_y, 'Voting')

* Voting is not overfitting but not delivering any benefits either
* Stacking is much better on training set but not better on test: largely overfitting 
* Averaging is even better

In [16]:
#df_scores = pd.DataFrame(scores, columns=['model', 'metric', 'score'])
#df_scores[(df_scores['metric'] != 'neg_log_loss') & (df_scores['metric'] != 'accuracy')]

In [17]:
df_test_file = pd.read_csv(input_path + 'test.csv')
df_test_x, df_test_y = prepare_initial(df_test_file)
df_test_preds = None
for k, v in estimators:
    preds = pd.DataFrame(v.predict_proba(df_test_x), columns=[k + '_0', k + '_1'])
    #preds.renamecolumns([k + '_0', k + '_1'])
    df_test_preds = pd.concat([df_test_preds, preds], axis=1)
#df_test_preds = pd.DataFrame(stacker.predict_proba(df_test_x))
df_test_preds['0'] = (df_test_preds['XGB_0'] + df_test_preds['CatB_0'] + df_test_preds['MLP_0']) / 3
df_test_preds['1'] = (df_test_preds['XGB_1'] + df_test_preds['CatB_1'] + df_test_preds['MLP_1']) / 3
df_test_preds

Unnamed: 0,XGB_0,XGB_1,CatB_0,CatB_1,MLP_0,MLP_1,0,1
0,0.885691,0.114309,0.964992,0.035008,0.105818,0.894182,0.652167,0.347833
1,0.885691,0.114309,0.964992,0.035008,0.105818,0.894182,0.652167,0.347833
2,0.885691,0.114309,0.964992,0.035008,0.105818,0.894182,0.652167,0.347833
3,0.885691,0.114309,0.964992,0.035008,0.105818,0.894182,0.652167,0.347833
4,0.885691,0.114309,0.964992,0.035008,0.105818,0.894182,0.652167,0.347833


In [18]:
df_test_y = pd.concat([df_test_file['Id'], df_test_preds[['0', '1']]], axis=1)
df_test_y

Unnamed: 0,Id,0,1
0,00eed32682bb,0.652167,0.347833
1,010ebe33f668,0.652167,0.347833
2,02fa521e1838,0.652167,0.347833
3,040e15f562a2,0.652167,0.347833
4,046e85c7cc7f,0.652167,0.347833


In [19]:
df_test_y.to_csv('/kaggle/working/submission.csv', header=['Id', 'class_0', 'class_1'], index=False)

In [20]:
df_test_y

Unnamed: 0,Id,0,1
0,00eed32682bb,0.652167,0.347833
1,010ebe33f668,0.652167,0.347833
2,02fa521e1838,0.652167,0.347833
3,040e15f562a2,0.652167,0.347833
4,046e85c7cc7f,0.652167,0.347833


In [21]:
!head /kaggle/working/submission.csv

Id,class_0,class_1
00eed32682bb,0.6521670904701113,0.3478328995957811
010ebe33f668,0.6521670904701113,0.3478328995957811
02fa521e1838,0.6521670904701113,0.3478328995957811
040e15f562a2,0.6521670904701113,0.3478328995957811
046e85c7cc7f,0.6521670904701112,0.3478328995957812


## 20230603 version 2 scored **0.46** with a straight GB
## 20230610 version 3 scored **0.46** with a straight ensemble RF and boosters
## 20230616 version 4 scored **0.42** with a straight ensemble RF, XGB, MLP
## 20230617 version 5 scored **0.43** with an ensemble and reduced features
## 20230623 version 6 scored **0.29** with an ensemble, RobustSclarer and Boruta features
## 20230624 version 7 scored **0.32** with XGB, CatB, RF (no MLP) RobustScaler and Boruta features
## 20230626 version 9 scored **0.29** with XGB, CatB, MLP (no RF) RobustScaler and Boruta features
## 20230629 version 10 scored **0.26** with XGB, CatB, MLP (no RF) RobustScaler and Boruta features averaging


TODO: 
* Add tabpfn 
* Make data more diverse to avoid correlation inside the ensemble