In [1]:
import numpy as np  
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline as imbpipeline



In [2]:
# Directory holding the competition CSVs; the trailing slash is kept so file
# names can be appended with plain string concatenation.
input_path = "/kaggle/input/icr-identify-age-related-conditions/"

# Raw training table, read straight from disk with pandas' default parsing.
train_csv_path = input_path + 'train.csv'
df_train_file = pd.read_csv(train_csv_path)

In [3]:
def prepare_initial(df):
    """Sort a raw frame by ``Id`` and split it into model inputs and labels.

    The frame is sorted **in place** and its index is renumbered ``0..n-1``.
    Later cells build prediction frames with a fresh positional index and join
    them back to ``df['Id']`` with a label-aligned ``pd.concat``; without the
    renumbering, a file that is not already sorted by ``Id`` would pair every
    prediction with the wrong ``Id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``Id`` column and every column listed in
        ``best_columns``. A ``Class`` column is optional.
        NOTE: this argument is mutated (sorted, index reset).

    Returns
    -------
    tuple
        ``(df_input, df_y)`` where ``df_input`` holds only the selected
        feature columns and ``df_y`` is the ``Class`` column, or ``None`` when
        the frame has no ``Class`` column.

    Raises
    ------
    KeyError
        If ``Id`` or any selected feature column is missing from ``df``.
    """
    # ignore_index=True relabels the sorted rows 0..n-1 so that row position
    # and index label agree for every frame derived from `df`.
    df.sort_values(by=['Id'], axis=0, ascending=True, inplace=True, ignore_index=True)

    # Labels exist only when the frame carries a 'Class' column.
    df_y = df['Class'] if 'Class' in df.columns else None

    # Feature subset fed to the models. 'FD ' is written with a trailing space
    # and must stay byte-identical to the CSV header it selects
    # (presumably the header itself carries that space - verify against the data).
    best_columns = ['AB', 'DI', 'DH', 'GL', 'DE', 'DA', 'EB', 'CR', 'EE', 'EH', 'FD ', 'CC', 'DL', 'FE', 'DU', 'AF', 'FI', 'BC', 'FL',
                    'FR', 'BR', 'AM']
    df_input = df.loc[:, best_columns]

    return df_input, df_y

In [4]:
# Registry of fitted pipelines keyed by a short model name; the training
# cells below add one entry each.
ensemble = dict()

# Feature frame and label series for training, both ordered by Id.
df_ready_x, df_ready_y = prepare_initial(df_train_file)

In [5]:
# XGBoost pipeline: impute -> scale -> oversample -> classify.
#
# Seeds are pinned so that Restart & Run All reproduces the same fitted
# pipeline. `smote_params` keeps its historical name although the sampler
# used here is ADASYN.
smote_params = {'n_neighbors': 18, 'sampling_strategy': 0.5, 'random_state': 42}
parameters = {'learning_rate': 0.05, 'max_depth': 11, 'n_estimators': 140, 'random_state': 42}
# 'copy' is left at its default (True). With copy=False the imputer is allowed
# to fill values in place, which can write into df_ready_x - the same frame the
# other training cells fit on - and make results depend on cell order.
imputer_params = {'n_neighbors': 8, 'weights': 'uniform', 'missing_values': np.nan}

clf = imbpipeline(steps=[
    ("a", KNNImputer(**imputer_params)),
    ("b", RobustScaler()),
    ("c", ADASYN(**smote_params)),
    ("d", XGBClassifier(**parameters)),
])

clf.fit(df_ready_x, df_ready_y)
ensemble['XGB'] = clf

In [6]:
# CatBoost pipeline: impute -> scale -> oversample -> classify.
#
# Seeds are pinned so that Restart & Run All reproduces the same fitted
# pipeline. `smote_params` keeps its historical name although the sampler
# used here is ADASYN.
parameters = {'auto_class_weights': 'Balanced', 'iterations': 100, 'learning_rate': 0.1, 'max_depth': 7, 'verbose': 0,
              'random_seed': 42}
smote_params = {'n_neighbors': 18, 'sampling_strategy': 0.5, 'random_state': 42}
# 'copy' is left at its default (True). With copy=False the imputer is allowed
# to fill values in place, which can write into df_ready_x - the same frame the
# other training cells fit on - and make results depend on cell order.
imputer_params = {'n_neighbors': 8, 'weights': 'uniform', 'missing_values': np.nan}

clf = imbpipeline(steps=[
    ("a", KNNImputer(**imputer_params)),
    ("b", RobustScaler()),
    ("c", ADASYN(**smote_params)),
    ("d", CatBoostClassifier(**parameters)),
])

clf.fit(df_ready_x, df_ready_y)
ensemble['CatB'] = clf

In [7]:
# MLP pipeline: impute -> scale -> oversample -> classify.
#
# Seeds are pinned so that Restart & Run All reproduces the same fitted
# pipeline. `smote_params` keeps its historical name although the sampler
# used here is ADASYN.
parameters = {'alpha': 2, 'hidden_layer_sizes': 5, 'max_iter': 800, 'random_state': 42}
smote_params = {'n_neighbors': 18, 'sampling_strategy': 0.5, 'random_state': 42}
# 'copy' is left at its default (True). With copy=False the imputer is allowed
# to fill values in place, which can write into df_ready_x - the same frame the
# other training cells fit on - and make results depend on cell order.
imputer_params = {'n_neighbors': 8, 'weights': 'uniform', 'missing_values': np.nan}

clf = imbpipeline(steps=[
    ("a", KNNImputer(**imputer_params)),
    ("b", RobustScaler()),
    ("c", ADASYN(**smote_params)),
    ("d", MLPClassifier(**parameters)),
])

clf.fit(df_ready_x, df_ready_y)
ensemble['MLP'] = clf

In [8]:
# Snapshot of the registry as (name, fitted pipeline) pairs, in insertion order.
estimators = list(ensemble.items())

In [9]:
df_test_file = pd.read_csv(input_path + 'test.csv')
# The second element is None whenever the frame has no 'Class' column.
df_test_x, df_test_y = prepare_initial(df_test_file)

if not estimators:
    raise ValueError("no fitted models in `estimators`; run the training cells first")

# One two-column probability frame per fitted pipeline, named '<model>_0' and
# '<model>_1', then a single column-wise concat instead of growing a frame
# inside the loop.
per_model_preds = [
    pd.DataFrame(v.predict_proba(df_test_x), columns=[k + '_0', k + '_1'])
    for k, v in estimators
]
df_test_preds = pd.concat(per_model_preds, axis=1)

# Unweighted mean over every registered model, so adding or removing a model
# cannot leave a stale name or divisor behind. For the three current models
# this adds the same columns in the same order and divides by 3.
n_models = len(estimators)
for label in ('0', '1'):
    df_test_preds[label] = sum(df_test_preds[k + '_' + label] for k, _ in estimators) / n_models

In [10]:
# Pair each Id with its averaged probabilities by ROW POSITION.
# df_test_file was sorted by Id in place, so its row order matches the
# prediction rows, but its index labels may still be the pre-sort positions.
# pd.concat(axis=1) aligns on index labels, so both sides are renumbered
# 0..n-1 first to keep every Id next to its own prediction.
df_test_y = pd.concat(
    [
        df_test_file['Id'].reset_index(drop=True),
        df_test_preds[['0', '1']].reset_index(drop=True),
    ],
    axis=1,
)
df_test_y

Unnamed: 0,Id,0,1
0,00eed32682bb,0.660558,0.339442
1,010ebe33f668,0.660558,0.339442
2,02fa521e1838,0.660558,0.339442
3,040e15f562a2,0.660558,0.339442
4,046e85c7cc7f,0.660558,0.339442


In [11]:
# Persist the Id / probability table; the columns are renamed on the way out
# and the positional index is not written.
submission_header = ['Id', 'class_0', 'class_1']
df_test_y.to_csv('/kaggle/working/submission.csv', index=False, header=submission_header)

In [12]:
# Shell escape: print the first lines of the written submission file as a quick visual check.
!head /kaggle/working/submission.csv

Id,class_0,class_1
00eed32682bb,0.6605581666097762,0.33944182345611634
010ebe33f668,0.6605581666097762,0.33944182345611634
02fa521e1838,0.6605581666097762,0.33944182345611634
040e15f562a2,0.6605581666097762,0.33944182345611634
046e85c7cc7f,0.6605581666097762,0.33944182345611634
