In [1]:
import numpy as np # linear algebra 
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os 
from matplotlib import pyplot as plt
import os
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import seaborn as sns

# Walk the Kaggle input directory and echo the full path of every file found,
# so the available data files are visible in the cell output.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv
/kaggle/input/icr-identify-age-related-conditions/greeks.csv
/kaggle/input/icr-identify-age-related-conditions/train.csv
/kaggle/input/icr-identify-age-related-conditions/test.csv


In [2]:
# Location of the competition files; later cells reuse `input_path` to read test.csv.
input_path = "/kaggle/input/icr-identify-age-related-conditions/"
# Current working directory of the kernel.
work_path = os.getcwd()

# Raw training features/labels and the accompanying "greeks" metadata table.
df_train_file, df_greek_file = (
    pd.read_csv(f"{input_path}{csv_name}") for csv_name in ('train.csv', 'greeks.csv')
)

In [3]:
def prepare_initial(df, df_greek_f):
    """Turn a raw feature table and the greeks table into model-ready frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw train or test table. Must contain ``Id`` and ``EJ``; ``Class`` is
        optional (it is absent for the test set).
    df_greek_f : pandas.DataFrame
        Greeks metadata table. Must contain ``Id`` and the categorical columns
        ``Alpha``, ``Beta``, ``Gamma`` and ``Delta``.

    Returns
    -------
    df_x : pandas.DataFrame
        Standardised numeric features (missing values filled with the column
        mode first) plus ``EJ_dummy`` (1.0 where ``EJ == 'B'``, else 0.0),
        rows ordered by ``Id`` with a 0..n-1 index.
    df_y : pandas.Series or None
        The ``Class`` column in the same row order, or ``None`` when ``df``
        has no ``Class`` column.
    df_greek_x : pandas.DataFrame
        One-hot encoding of ``Alpha``/``Beta``/``Gamma``/``Delta`` ordered by
        ``Id``. ``Alpha_A`` is dropped because it is equal to ``Class``; ``Id``
        and the ``Epsilon`` date (and anything derived from it) are excluded.

    Notes
    -----
    * Neither input frame is modified; all work happens on sorted copies, so
      the function can be called repeatedly with the same arguments.
    * The scaler is fitted on whatever ``df`` is passed in.
      NOTE(review): calling this separately for train and test therefore
      standardises each with its own statistics - confirm this is intended.
    """
    # Sort a copy by Id and rebuild the index so every derived object below
    # shares the same 0..n-1 labels (the scaled frame gets a fresh RangeIndex,
    # and column assignment aligns on labels).
    df_sorted = df.sort_values(by=['Id'], axis=0, ascending=True).reset_index(drop=True)

    # training set labels
    df_y = df_sorted['Class'] if 'Class' in df_sorted.columns else None

    # training set input
    ej_dummy = (df_sorted['EJ'] == 'B').astype('float64')
    df_input = df_sorted.drop(columns=['Id', 'EJ'])
    # 'Class' is absent for the test set; 'EJ_dummy' may linger on frames that
    # an earlier, mutating version of this function already processed.
    df_input = df_input.drop(columns=['Class', 'EJ_dummy'], errors='ignore')

    # Fill gaps with the most frequent value, assigning the result back
    # instead of relying on an in-place fill of a column selection.
    for col in df_input.columns:
        col_modes = df_input[col].mode()
        if not col_modes.empty:  # an all-NaN column has no mode; leave it as is
            df_input[col] = df_input[col].fillna(col_modes.iloc[0])

    scaler = StandardScaler()
    df_x = pd.DataFrame(scaler.fit_transform(df_input), columns=df_input.columns)
    # Attach positionally so the dummy cannot be misaligned by index labels.
    df_x['EJ_dummy'] = ej_dummy.to_numpy()
    print(df_x.head())

    if df_y is not None:
        print(df_y.head())

    # additional data
    greek_sorted = df_greek_f.sort_values(by=['Id'], axis=0, ascending=True).reset_index(drop=True)
    df_greek_x = pd.get_dummies(greek_sorted, columns=['Alpha', 'Beta', 'Gamma', 'Delta'])
    # dropping Alpha_A because equal to Class
    # dropping Id / Epsilon and any Epsilon-derived columns (Epsilon_date, Age,
    # Age_days): they are not categorical targets. The derived names are listed
    # only so frames touched by an older run of this cell still work.
    df_greek_x = df_greek_x.drop(
        columns=['Age_days', 'Alpha_A', 'Age', 'Epsilon_date', 'Id', 'Epsilon'],
        errors='ignore',
    )
    print(df_greek_x.head())

    return df_x, df_y, df_greek_x

In [4]:
# Build the scaled feature matrix, the Class labels and the one-hot greek
# targets from the training files.
(
    df_ready_x,
    df_ready_y,
    df_ready_greek,
) = prepare_initial(df=df_train_file, df_greek_f=df_greek_file)

         AB        AF        AH        AM        AR        AX        AY  \
0 -0.572153 -0.170975 -0.261669 -0.237889 -0.189295 -1.900558 -0.083417   
1 -0.709105 -1.097801 -0.261669 -0.028701 -0.189295 -0.750457 -0.083417   
2 -0.015212 -0.377169 -0.261669 -0.094845 -0.189295  0.465662 -0.083417   
3 -0.480851  0.138196  0.012347  0.547477 -0.189295 -0.729610 -0.083417   
4 -0.206946  0.100517 -0.261669 -0.356885 -0.189295 -0.628845 -0.013229   

         AZ        BC       BD   ...        FL        FR        FS        GB  \
0 -0.173502 -0.038354 -0.405383  ...  0.163202 -0.035806 -0.249959 -0.940094   
1  0.678919 -0.104787  0.048541  ... -0.457460 -0.060566  0.114085 -1.145070   
2  0.519453 -0.104787 -0.071089  ...  0.199040 -0.051023  0.597743  1.637944   
3  0.112088 -0.104787 -0.391109  ...  0.060759 -0.060566 -0.104341 -0.219883   
4 -1.649292  1.445139  0.125327  ...  0.237673  0.896815 -0.229156 -0.432313   

         GE        GF        GH        GI        GL  EJ_dummy  
0 -0

### Step one: random forest from the train features to the greek labels

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# First stage: a shallow random forest that maps the scaled features to the
# one-hot greek indicator columns (one output per indicator).
clf_1 = RandomForestClassifier(max_depth=3, random_state=0)
clf_1.fit(X=df_ready_x, y=df_ready_greek)

# 5-fold cross-validation of the same estimator configuration.
scores_1 = cross_val_score(estimator=clf_1, X=df_ready_x, y=df_ready_greek, cv=5)
scores_1

array([0.4516129 , 0.5       , 0.43089431, 0.36585366, 0.42276423])

### Step two: gradient boosting from the greek labels to the class

In [6]:
from sklearn.ensemble import GradientBoostingClassifier

# Second stage: boosted decision stumps that map the one-hot greek indicators
# to the binary Class label.
clf_2 = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
clf_2.fit(X=df_ready_greek, y=df_ready_y)

# 5-fold cross-validation with the estimator's default scorer
# (scoring='neg_log_loss' was tried as an alternative and left disabled).
scores_2 = cross_val_score(estimator=clf_2, X=df_ready_greek, y=df_ready_y, cv=5)
scores_2

array([1., 1., 1., 1., 1.])

### Step three: predict through the two steps

In [7]:
# Read the held-out test table and push it through the same preparation
# routine as the training data (the training greeks frame is passed again
# because prepare_initial requires a greeks argument).
# NOTE(review): prepare_initial fits its own StandardScaler on the frame it
# receives, so these test features are standardised with test-set statistics
# rather than the training ones - confirm this is intended.
df_test_file = pd.read_csv(f"{input_path}test.csv")
(
    df_test_x,
    df_test_y,
    df_test_greek,
) = prepare_initial(df=df_test_file, df_greek_f=df_greek_file)

    AB   AF   AH   AM   AR   AX   AY   AZ   BC  BD   ...   FL   FR   FS   GB  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   

    GE   GF   GH   GI   GL  EJ_dummy  
0  0.0  0.0  0.0  0.0  0.0       0.0  
1  0.0  0.0  0.0  0.0  0.0       0.0  
2  0.0  0.0  0.0  0.0  0.0       0.0  
3  0.0  0.0  0.0  0.0  0.0       0.0  
4  0.0  0.0  0.0  0.0  0.0       0.0  

[5 rows x 56 columns]
   Alpha_B  Alpha_D  Alpha_G  Beta_A  Beta_B  Beta_C  Gamma_A  Gamma_B  \
0        1        0        0       0       0       1        0        0   
1        0        0        0       0       0       1        0        0   
2        0        0        0       0    

In [8]:
# Stage-one output: predicted greek indicators for the test rows, with the
# columns named like the greek training targets.
intermed = pd.DataFrame(
    data=clf_1.predict(df_test_x),
    columns=df_ready_greek.columns,
)
intermed

Unnamed: 0,Alpha_B,Alpha_D,Alpha_G,Beta_A,Beta_B,Beta_C,Gamma_A,Gamma_B,Gamma_E,Gamma_F,Gamma_G,Gamma_H,Gamma_M,Gamma_N,Delta_A,Delta_B,Delta_C,Delta_D
0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0


In [9]:
# Stage-two output: class probabilities derived from the predicted greek
# indicators (one column per class, default integer column names).
preds = pd.DataFrame(data=clf_2.predict_proba(intermed))
preds

Unnamed: 0,0,1
0,1.0,2.956926e-09
1,1.0,2.956926e-09
2,1.0,2.956926e-09
3,1.0,2.956926e-09
4,1.0,2.956926e-09


In [10]:
# Pair each test Id with its predicted class probabilities by position.
# `preds` has a fresh 0..n-1 index and follows the Id-sorted row order that
# prepare_initial feeds to the models. pd.concat(axis=1) aligns on index
# labels, so the Ids are sorted the same way and re-indexed first; otherwise
# rows would be mispaired whenever test.csv is not already sorted by Id.
df_test_y = pd.concat(
    [df_test_file['Id'].sort_values().reset_index(drop=True), preds],
    axis=1,
)
df_test_y

Unnamed: 0,Id,0,1
0,00eed32682bb,1.0,2.956926e-09
1,010ebe33f668,1.0,2.956926e-09
2,02fa521e1838,1.0,2.956926e-09
3,040e15f562a2,1.0,2.956926e-09
4,046e85c7cc7f,1.0,2.956926e-09


In [11]:
df_test_y.to_csv('/kaggle/working/submission.csv', header=['Id', 'class_0', 'class_1'], index=False)
!head /kaggle/working/submission.csv

Id,class_0,class_1
00eed32682bb,0.9999999970430744,2.956925575738693e-09
010ebe33f668,0.9999999970430744,2.956925575738693e-09
02fa521e1838,0.9999999970430744,2.956925575738693e-09
040e15f562a2,0.9999999970430744,2.956925575738693e-09
046e85c7cc7f,0.9999999970430744,2.956925575738693e-09
