In [32]:
import numpy as np # linear algebra 
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os 
from sklearn.model_selection import cross_val_score
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import log_loss

In [33]:
input_path = "/kaggle/input/icr-identify-age-related-conditions/"
work_path = os.getcwd()
df_train_file = pd.read_csv(input_path + 'train.csv')
df_greek_file = pd.read_csv(input_path + 'greeks.csv')
df_test_file = pd.read_csv(input_path + 'test.csv')

In [34]:
def balance_loglossv2(y_true, y_pred):
    target_mean = y_true.mean()
    w0 = 1/(1-target_mean)
    w1 = 1/target_mean
    sample_weight = [w0 if y == 0 else w1 for y in y_true]
    loss = log_loss(y_true, y_pred, sample_weight=sample_weight)
    
    return loss

In [35]:
#cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3)
cv = StratifiedKFold(n_splits=5)
competition_balanced_log_loss = make_scorer(balance_loglossv2, needs_proba=True)
scores = []

def compute_scores(clf, cv_strat, df_x, df_y, col_name):
    for i in ['accuracy', 'neg_log_loss', competition_balanced_log_loss]:
        scores.append([col_name, i, cross_val_score(clf, df_x, df_y, cv=cv_strat, scoring=i).mean()])
    return scores

In [36]:
def prepare_initial(df, df_greek_f, robust=True):
    df.sort_values(by=['Id'], axis=0, ascending=True, inplace=True)
    
    # training set labels
    df_y = None
    if 'Class' in df.columns:
      df_y = df['Class']
    
    # training set input
    df['EJ_dummy'] = (df['EJ'] == 'B')
    df['EJ_dummy'] = df['EJ_dummy'].astype('float64')
    temp_col = df['EJ_dummy']
    df_input = df.drop(['Id', 'EJ', 'EJ_dummy'], axis=1)
    if 'Class' in df.columns:
        df_input = df_input.drop(['Class'], axis=1)
    for i in df_input.columns:
        df_input[i].fillna(df_input[i].mode()[0], inplace=True)
    
    scaler = StandardScaler()
    if robust:
        scaler = RobustScaler()
    df_x = pd.DataFrame(scaler.fit_transform(df_input), columns=df_input.columns)
    df_x['EJ_dummy'] = temp_col
    # no feature selection
    
    # additional data
    df_greek_f.sort_values(by=['Id'], axis=0, ascending=True, inplace=True)
    df_greek_f.loc[df_greek_f['Epsilon'] == 'Unknown', 'Epsilon'] = pd.to_datetime('1/1/2021')
    df_greek_f['Epsilon_date'] = pd.to_datetime(df_greek_f.Epsilon)
    df_greek_f['Age'] = pd.to_datetime('1/1/2023') - df_greek_f.Epsilon_date
    df_greek_f['Age_days'] = df_greek_f['Age'].dt.total_seconds() / (24 * 60 * 60)
    
    scaler = StandardScaler()
    if robust:
        scaler = RobustScaler()
    df_greek_f[['Age_days']] = pd.DataFrame(scaler.fit_transform(df_greek_f[['Age_days']]), columns=['Age_days'])
    
    df_greek_x = pd.get_dummies(df_greek_f, columns = ['Alpha', 'Beta', 'Gamma', 'Delta'])
    df_greek_x = df_greek_x.drop(['Age_days', 'Alpha_A', 'Alpha_B', 'Alpha_D', 'Alpha_G', 'Age', 'Epsilon_date', 'Id', 'Epsilon'], axis=1)
    # Age_days must be dropped for RF multy categories

    return df_x, df_y, df_greek_x

In [37]:
df_ready_x, df_ready_y, df_greek_x = prepare_initial(df_train_file, df_greek_file, True)

In [38]:
imbalance_ratio = (df_ready_y == 0).sum() / (df_ready_y == 1).sum() # maj /  min classes

# Part 1: Boruta

In [39]:
#from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

parameters = {}

clf = RandomForestClassifier(**parameters)
clf.fit(df_ready_x, df_greek_x)
compute_scores(clf, cv, df_ready_x, df_ready_y, 'x2g RF baseline')

#trans = BorutaPy(clf, verbose=0)
#sel = trans.fit_transform(df_ready_x.to_numpy(), df_ready_y.to_numpy())


[['x2g RF baseline', 'accuracy', 0.9157356412273799],
 ['x2g RF baseline', 'neg_log_loss', -0.2923499607620197],
 ['x2g RF baseline',
  make_scorer(balance_loglossv2, needs_proba=True),
  0.46567892637215047]]

In [46]:
parameters = {
    'max_depth': [14, 15, 16, 17, 18],
    'max_features': [13, 14, 15, 16, 17], 
    'n_estimators': [28, 30, 32],
}

grid_search = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    scoring = competition_balanced_log_loss,
    n_jobs = 10,
    cv = 5,
    verbose=True
)

grid_search.fit(df_ready_x, df_greek_x)
grid_results = []
grid_results.append((grid_search.best_score_, grid_search.best_params_))
grid_results

Fitting 5 folds for each of 75 candidates, totalling 375 fits


Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 327, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "/tmp/ipykernel_32/2732884102.py", line 6, in balance_loglossv2
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 2587, in log_loss
    y_pred = check_array(
  File "/opt/conda/lib/python3.10/site-packages/sklearn/utils/validation.py", line 915, in check_array
    raise ValueError(
ValueError: Found array with dim 3. None expected <= 2.

Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, 

[(nan, {'max_depth': 14, 'max_features': 13, 'n_estimators': 28})]

In [45]:
parameters = {'max_depth': 15, 'max_features': 15, 'n_estimators': 30}

clf = RandomForestClassifier(**parameters)
clf.fit(df_ready_x, df_greek_x)
compute_scores(clf, cv, df_ready_x, df_ready_y, 'x2g RF optim')
scores

[['x2g RF baseline', 'accuracy', 0.9157356412273799],
 ['x2g RF baseline', 'neg_log_loss', -0.2923499607620197],
 ['x2g RF baseline',
  make_scorer(balance_loglossv2, needs_proba=True),
  0.46567892637215047],
 ['x2g RF optim', 'accuracy', 0.9125360608444794],
 ['x2g RF optim', 'neg_log_loss', -0.2838981872758064],
 ['x2g RF optim',
  make_scorer(balance_loglossv2, needs_proba=True),
  1.2270986108290043]]

In [10]:
trans.ranking_

array([ 1,  1, 25,  1, 17,  2, 29, 27,  1, 19,  4,  9,  7,  1, 32,  7,  1,
        3, 16, 19, 30,  1, 14,  2, 31,  1,  1, 10,  1,  1,  1, 17,  1, 33,
        5,  1,  1, 12,  1, 24, 15, 22, 13,  1,  1,  1,  1,  1, 26, 21, 28,
       10,  6, 23,  1, 34])

In [11]:
# number of selected features
print ('\n Number of selected features:')
print (trans.n_features_)

feature_df = pd.DataFrame(df_ready_x.columns.tolist(), columns=['features'])
feature_df['rank'] = trans.ranking_
feature_df = feature_df.sort_values('rank', ascending=True).reset_index(drop=True)
print ('\n Top %d features:' % trans.n_features_)
print (feature_df.head(trans.n_features_))


 Number of selected features:
22

 Top 22 features:
   features  rank
0        AB     1
1        DI     1
2        DH     1
3        GL     1
4        DE     1
5        DA     1
6        EB     1
7        CR     1
8        EE     1
9        EH     1
10      FD      1
11       CC     1
12       DL     1
13       FE     1
14       DU     1
15       AF     1
16       FI     1
17       BC     1
18       FL     1
19       FR     1
20       BR     1
21       AM     1


In [19]:
list(feature_df.head(trans.n_features_)['features'].to_numpy())

['AB',
 'DI',
 'DH',
 'GL',
 'DE',
 'DA',
 'EB',
 'CR',
 'EE',
 'EH',
 'FD ',
 'CC',
 'DL',
 'FE',
 'DU',
 'AF',
 'FI',
 'BC',
 'FL',
 'FR',
 'BR',
 'AM']