In [None]:
import numpy as np # linear algebra 
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os 
from sklearn.model_selection import cross_val_score
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import make_scorer
from sklearn.metrics import log_loss

In [None]:
# Directory holding the competition CSV files, and the notebook's working directory.
input_path = "/kaggle/input/icr-identify-age-related-conditions/"
work_path = os.getcwd()

# Raw tables, loaded in the order train -> greeks -> test.
df_train_file = pd.read_csv(os.path.join(input_path, 'train.csv'))
df_greek_file = pd.read_csv(os.path.join(input_path, 'greeks.csv'))
df_test_file = pd.read_csv(os.path.join(input_path, 'test.csv'))

# Show ten randomly drawn rows of the greeks table as the cell output.
df_greek_file.sample(10)

In [None]:
def balance_loglossv2(y_true, y_pred):
    """Log loss with every sample weighted by the inverse frequency of its class.

    Parameters
    ----------
    y_true : array-like exposing ``.mean()`` (e.g. numpy array, pandas Series)
        Binary labels; samples equal to 0 get the class-0 weight, all other
        samples get the class-1 weight.
    y_pred : array-like
        Predictions forwarded unchanged to ``log_loss``.

    Returns
    -------
    float
        The value returned by ``log_loss`` for the weighted samples.
    """
    positive_rate = y_true.mean()
    negative_weight = 1 / (1 - positive_rate)
    positive_weight = 1 / positive_rate
    # One weight per sample, chosen by whether the label equals 0.
    per_sample_weight = np.where(np.asarray(y_true) == 0, negative_weight, positive_weight)
    return log_loss(y_true, y_pred, sample_weight=per_sample_weight)

In [None]:
# Repeated stratified 5-fold CV (3 repeats). The fixed seed pins the fold
# assignment so the reported scores are reproducible across kernel restarts.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

# Scorer wrapping balance_loglossv2, fed with predicted probabilities.
# NOTE(review): make_scorer defaults to greater_is_better=True, so this scorer
# reports the raw (positive, lower-is-better) loss, unlike the built-in
# 'neg_log_loss'. Do not use it for model selection without flipping the sign.
# NOTE(review): `needs_proba` is deprecated in scikit-learn >= 1.4 in favour of
# response_method="predict_proba"; kept here for older versions - confirm the
# installed version before switching.
competition_balanced_log_loss = make_scorer(balance_loglossv2, needs_proba=True)

# Shared accumulator of [col_name, metric, mean score] rows, appended to by compute_scores().
scores = []

def compute_scores(clf, cv_strat, df_x, df_y, col_name):
    """Cross-validate ``clf`` on three metrics and record the mean of each.

    For 'accuracy', 'neg_log_loss' and the ``competition_balanced_log_loss``
    scorer, one ``[col_name, metric, mean_cv_score]`` row is appended to the
    module-level ``scores`` list.

    Parameters
    ----------
    clf : estimator passed to ``cross_val_score``.
    cv_strat : cross-validation strategy passed as ``cv``.
    df_x, df_y : features and labels passed to ``cross_val_score``.
    col_name : label stored in the first position of every appended row.

    Returns
    -------
    list
        The shared ``scores`` list itself (not a copy), so it also contains
        the rows appended by earlier calls.
    """
    metrics = ('accuracy', 'neg_log_loss', competition_balanced_log_loss)
    for metric in metrics:
        fold_scores = cross_val_score(clf, df_x, df_y, cv=cv_strat, scoring=metric)
        scores.append([col_name, metric, fold_scores.mean()])
    return scores

In [None]:
def _make_scaler(robust):
    """Return a new RobustScaler when ``robust`` is truthy, otherwise a StandardScaler."""
    return RobustScaler() if robust else StandardScaler()


def prepare_initial(df, df_greek_f, robust=True):
    """Build the scaled feature matrix, the labels and the one-hot encoded greeks.

    Both input frames are modified in place (kept from the original behaviour,
    since other cells may read the added columns):

    * ``df`` is sorted by ``Id`` and gains a float ``EJ_dummy`` column.
    * ``df_greek_f`` is sorted by ``Id``, has its ``'Unknown'`` ``Epsilon``
      entries replaced by the timestamp 2021-01-01, and gains ``Epsilon_date``,
      ``Age`` and a scaled ``Age_days`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Id`` and ``EJ``; ``Class`` is optional. Every remaining
        column is imputed with its mode and scaled.
    df_greek_f : pandas.DataFrame
        Must contain ``Id``, ``Alpha``, ``Beta``, ``Gamma``, ``Delta`` and
        ``Epsilon``.
    robust : bool, default True
        Scale with ``RobustScaler`` when true, ``StandardScaler`` otherwise.

    Returns
    -------
    df_x : pandas.DataFrame
        Scaled features plus the unscaled ``EJ_dummy``, rows in ``Id`` order,
        indexed 0..n-1.
    df_y : pandas.Series or None
        ``df['Class']`` after sorting (keeps the labels of ``df``'s index), or
        ``None`` when ``df`` has no ``Class`` column.
    df_greek_x : pandas.DataFrame
        Dummy columns for ``Beta``, ``Gamma`` and ``Delta`` only.
    """
    df.sort_values(by=['Id'], axis=0, ascending=True, inplace=True)

    # training set labels (absent for unlabeled data)
    df_y = df['Class'] if 'Class' in df.columns else None

    # training set input: EJ becomes 1.0 for 'B' and 0.0 for anything else
    df['EJ_dummy'] = (df['EJ'] == 'B').astype('float64')
    non_feature_cols = ['Id', 'EJ', 'EJ_dummy']
    if df_y is not None:
        non_feature_cols.append('Class')
    df_input = df.drop(columns=non_feature_cols)

    # Impute with the column mode. The result is assigned back because a
    # chained ``df_input[col].fillna(..., inplace=True)`` is deprecated and
    # silently does nothing under pandas Copy-on-Write.
    for col in df_input.columns:
        modes = df_input[col].mode()
        if not modes.empty:  # an all-NaN column has no mode; leave it as is
            df_input[col] = df_input[col].fillna(modes.iloc[0])

    scaled_features = np.asarray(_make_scaler(robust).fit_transform(df_input))
    df_x = pd.DataFrame(scaled_features, columns=df_input.columns)
    # df_x has a fresh 0..n-1 index while df keeps its pre-sort labels, so copy
    # by position: assigning the Series would align on labels and put values
    # on the wrong rows whenever the sort reordered df.
    df_x['EJ_dummy'] = df['EJ_dummy'].to_numpy()

    # additional data
    df_greek_f.sort_values(by=['Id'], axis=0, ascending=True, inplace=True)
    unknown_epsilon = df_greek_f['Epsilon'] == 'Unknown'
    if unknown_epsilon.any():
        df_greek_f.loc[unknown_epsilon, 'Epsilon'] = pd.to_datetime('1/1/2021')
    df_greek_f['Epsilon_date'] = pd.to_datetime(df_greek_f['Epsilon'])
    df_greek_f['Age'] = pd.to_datetime('1/1/2023') - df_greek_f['Epsilon_date']
    df_greek_f['Age_days'] = df_greek_f['Age'].dt.total_seconds() / (24 * 60 * 60)

    # Written back by position for the same index-alignment reason as EJ_dummy.
    scaled_age = np.asarray(_make_scaler(robust).fit_transform(df_greek_f[['Age_days']]))
    df_greek_f['Age_days'] = scaled_age.ravel()

    df_greek_x = pd.get_dummies(df_greek_f, columns=['Alpha', 'Beta', 'Gamma', 'Delta'])
    # Age_days must be dropped for RF multi categories
    df_greek_x = df_greek_x.drop(columns=['Age_days', 'Age', 'Epsilon_date', 'Id', 'Epsilon'])
    # The Alpha dummies are removed as well (presumably because Alpha is tied to
    # the target - TODO confirm); errors='ignore' because a category that does
    # not occur in df_greek_f produces no dummy column to drop.
    df_greek_x = df_greek_x.drop(columns=['Alpha_A', 'Alpha_B', 'Alpha_D', 'Alpha_G'], errors='ignore')

    return df_x, df_y, df_greek_x

In [None]:
# Prepare the training data with StandardScaler (robust=False).
# Note: prepare_initial sorts and adds columns to df_train_file and df_greek_file in place.
df_ready_x, df_ready_y, df_greek_x = prepare_initial(df_train_file, df_greek_file, robust=False)