In [1]:
import numpy as np # linear algebra 
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os 
from matplotlib import pyplot as plt
import os
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
import seaborn as sns

for dirname, _, filenames in os.walk('/kaggle/input'): 
    for filename in filenames: 
        print(os.path.join(dirname, filename))

/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv
/kaggle/input/icr-identify-age-related-conditions/greeks.csv
/kaggle/input/icr-identify-age-related-conditions/train.csv
/kaggle/input/icr-identify-age-related-conditions/test.csv


In [2]:
input_path = "/kaggle/input/icr-identify-age-related-conditions/"
work_path = os.getcwd()
df_train_file = pd.read_csv(input_path + 'train.csv')
df_greek_file = pd.read_csv(input_path + 'greeks.csv')

In [3]:
def prepare_initial(df, df_greek_f, robust=True):
    df.sort_values(by=['Id'], axis=0, ascending=True, inplace=True)
    
    # training set labels
    df_y = None
    if 'Class' in df.columns:
      df_y = df['Class']
    
    # training set input
    df['EJ_dummy'] = (df['EJ'] == 'B')
    df['EJ_dummy'] = df['EJ_dummy'].astype('float64')
    temp_col = df['EJ_dummy']
    df_input = df.drop(['Id', 'EJ', 'EJ_dummy'], axis=1)
    if 'Class' in df.columns:
        df_input = df_input.drop(['Class'], axis=1)
    for i in df_input.columns:
        df_input[i].fillna(df_input[i].mode()[0], inplace=True)
    
    scaler = StandardScaler()
    if robust:
        scaler = RobustScaler()
    df_x = pd.DataFrame(scaler.fit_transform(df_input), columns=df_input.columns)
    df_x['EJ_dummy'] = temp_col
    
    # additional data
    df_greek_f.sort_values(by=['Id'], axis=0, ascending=True, inplace=True)
    df_greek_f.loc[df_greek_f['Epsilon'] == 'Unknown', 'Epsilon'] = pd.to_datetime('1/1/2021')
    df_greek_f['Epsilon_date'] = pd.to_datetime(df_greek_f.Epsilon)
    df_greek_f['Age'] = pd.to_datetime('1/1/2023') - df_greek_f.Epsilon_date
    df_greek_f['Age_days'] = df_greek_f['Age'].dt.total_seconds() / (24 * 60 * 60)
    
    scaler = StandardScaler()
    if robust:
        scaler = RobustScaler()
    df_greek_f[['Age_days']] = pd.DataFrame(scaler.fit_transform(df_greek_f[['Age_days']]), columns=['Age_days'])
    
    df_greek_x = pd.get_dummies(df_greek_f, columns = ['Alpha', 'Beta', 'Gamma', 'Delta'])
    df_greek_x = df_greek_x.drop(['Age_days', 'Alpha_A', 'Alpha_B', 'Alpha_D', 'Alpha_G', 'Age', 'Epsilon_date', 'Id', 'Epsilon'], axis=1)
    # Age_days must be dropped for RF multy categories

    return df_x, df_y, df_greek_x

In [95]:
# from https://www.kaggle.com/competitions/icr-identify-age-related-conditions/discussion/409691
def balanced_log_loss(y_true, y_pred, **kwargs):
  df_preds = pd.DataFrame(y_pred, columns=['pos'])
  df_preds['neg'] = 1 - df_preds['pos']
  y_pred = df_preds.to_numpy()
  # calculate the number of observations for each class
  N_0 = np.sum(1 - y_true)
  N_1 = np.sum(y_true)
   # calculate the weights for each class
  w_0 = 1 / N_0
  w_1 = 1 / N_1
   # calculate the predicted probabilities for each class
  p_0 = np.clip(y_pred[:, 0], 1e-15, 1 - 1e-15)
  p_1 = np.clip(y_pred[:, 1], 1e-15, 1 - 1e-15)
   # calculate the log loss for each class
  log_loss_0 = -w_0 * np.sum(y_true * np.log(p_0))
  log_loss_1 = -w_1 * np.sum(y_true * np.log(p_1))
   # calculate the balanced logarithmic loss
  balanced_log_loss = (log_loss_0 + log_loss_1) / (w_0 + w_1)
  return balanced_log_loss

In [96]:
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3)


from sklearn.metrics import make_scorer
competition_balanced_log_loss = make_scorer(balanced_log_loss, needs_proba=True)
competition_balanced_log_loss

make_scorer(balanced_log_loss, needs_proba=True)

In [97]:
scores = []
def compute_scores(clf, cv_strat, df_x, df_y, col_name):
    for i in ['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'neg_log_loss', competition_balanced_log_loss]:
        scores.append([col_name, i, cross_val_score(clf, df_x, df_y, cv=cv_strat, scoring=i).mean()])
    return scores

In [98]:
df_ready_x, df_ready_y, df_ready_greek = prepare_initial(df_train_file, df_greek_file, False)

In [99]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier(max_depth=13).fit(df_ready_x, df_ready_y)

In [100]:
preds = clf.predict_proba(df_ready_x)
print(preds)
balanced_log_loss(df_ready_y, preds[:,0])

[[1.90000000e-01 8.10000000e-01]
 [9.79722567e-01 2.02774329e-02]
 [9.89798516e-01 1.02014836e-02]
 ...
 [9.89223789e-01 1.07762113e-02]
 [9.99924051e-01 7.59493671e-05]
 [9.10000000e-01 9.00000000e-02]]


52.347478682563214

In [101]:
competition_balanced_log_loss(clf, df_ready_x, df_ready_y)

164.24878020440957

In [102]:
cross_val_score(clf, df_ready_x, df_ready_y, cv=cv, scoring=competition_balanced_log_loss)

array([16.88440922, 17.64237375, 17.336333  , 16.77465013, 16.5331671 ,
       16.99533762, 16.793869  , 19.16733074, 15.18500014, 15.85252528,
       16.88798521, 18.58427973, 17.4386841 , 16.00115558, 17.57021209])

In [103]:
compute_scores(clf, cv, df_ready_x, df_ready_y, 'RF')
scores


[['RF', 'accuracy', 0.9189833027362531],
 ['RF', 'precision', 0.8743304356462251],
 ['RF', 'recall', 0.6326118326118326],
 ['RF', 'f1', 0.7214677695563385],
 ['RF', 'roc_auc', 0.9456813048251604],
 ['RF', 'neg_log_loss', -0.24963390458653154],
 ['RF', make_scorer(balanced_log_loss, needs_proba=True), 17.170726821598805]]

#Legacy

### Step one: RF from train to greek

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

clf_1 = RandomForestClassifier(max_depth=3, random_state=0).fit(df_ready_x, df_ready_greek)
scores_1 = cross_val_score(clf_1, df_ready_x, df_ready_greek, cv=5)
scores_1.mean()

array([0.7983871 , 0.75806452, 0.74796748, 0.67479675, 0.80487805])

* all except Alpha_A and Age array([0.4516129 , 0.5       , 0.43089431, 0.36585366, 0.42276423])
* with only Gamma_M and Gamma_N array([0.81451613, 0.75806452, 0.75609756, 0.70731707, 0.78861789])
* with only Gamma_M and Gamma_H array([0.76612903, 0.75      , 0.70731707, 0.64227642, 0.76422764])
* with only Gamma_M, Gamma_N and Delta_C array([0.73387097, 0.71774194, 0.69105691, 0.64227642, 0.65853659])
* with only Gamma_M, Delta_D, Beta_A array([0.7983871 , 0.75806452, 0.74796748, 0.67479675, 0.80487805])
same with RobustScaler



### Step two: from greek to class 

In [18]:
from sklearn.ensemble import GradientBoostingClassifier

clf_2 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
clf_2.fit(df_ready_greek, df_ready_y)
scores_2 = cross_val_score(clf_2, df_ready_greek, df_ready_y, cv=5)#, scoring='neg_log_loss')
scores_2.mean()

0.8962627852084971

### Step 3: predict through the two steps

In [19]:
df_test_file = pd.read_csv(input_path + 'test.csv')
df_test_x, df_test_y, df_test_greek = prepare_initial(df_test_file, df_greek_file)

    AB   AF   AH   AM   AR   AX   AY   AZ   BC  BD   ...   FL   FR   FS   GB  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   

    GE   GF   GH   GI   GL  EJ_dummy  
0  0.0  0.0  0.0  0.0  0.0       0.0  
1  0.0  0.0  0.0  0.0  0.0       0.0  
2  0.0  0.0  0.0  0.0  0.0       0.0  
3  0.0  0.0  0.0  0.0  0.0       0.0  
4  0.0  0.0  0.0  0.0  0.0       0.0  

[5 rows x 56 columns]
   Gamma_M  Beta_A  Delta_D
0        0       0        1
1        1       0        0
2        1       0        0
3        1       0        0
4        0       0        0


In [20]:
intermed = pd.DataFrame(clf_1.predict(df_test_x), columns=df_ready_greek.columns)
intermed

Unnamed: 0,Gamma_M,Beta_A,Delta_D
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


In [21]:
preds = pd.DataFrame(clf_2.predict_proba(intermed))
preds

Unnamed: 0,0,1
0,1.0,1.799544e-19
1,1.0,1.799544e-19
2,1.0,1.799544e-19
3,1.0,1.799544e-19
4,1.0,1.799544e-19


In [22]:
#df_test_y = pd.concat([df_test_file['Id'], preds], axis=1)
#df_test_y

In [23]:
#df_test_y.to_csv('/kaggle/working/submission.csv', header=['Id', 'class_0', 'class_1'], index=False)
#!head /kaggle/working/submission.csv