ICR - Identify Age_Related Conditions
=====================================

[Kaggle Competition](https://www.kaggle.com/competitions/icr-identify-age-related-conditions)

Sample submission
Based on: [gusthema/identifying-age-related-conditions-w-tfdf](https://www.kaggle.com/code/gusthema/identifying-age-related-conditions-w-tfdf)

# Data Exploration

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# read in data
df = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
test_df = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
sample = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')
greeks = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')

df = pd.read_csv('../data/train.csv')
df.head()

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
2,013f2bd269f5,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1


# Data Cleaning

# Model

## Data splitting

In [12]:
from sklearn.model_selection import KFold
import tensorflow as tf
import tensorflow_decision_forests as tfdf

In [13]:
# Creates a GroupKFold with 5 splits
kf = KFold(n_splits=5, shuffle=True)

# Create list of ids for the creation of oof dataframe.
ID_LIST = df.index

# Create a dataframe of required size with zero values.
oof = pd.DataFrame(data=np.zeros((len(ID_LIST),1)), index=ID_LIST)

# Create an empty dictionary to store the models trained for each fold.
models = {}

# Create empty dict to save metrics for the models trained for each fold.
accuracy = {}
cross_entropy = {}

# Save the name of the label column to a variable.
label = "Class"

# Calculate the number of samples for each label.
neg, pos = np.bincount(df['Class'])

# Calculate total samples.
total = neg + pos

# Calculate the weight for each label.
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)

class_weight = {0: weight_for_0, 1: weight_for_1}

FEATURE_COLUMNS = [i for i in df.columns if i not in ["Id"]]

## Model training

In [14]:
# Loop through each fold
for i, (train_index, valid_index) in enumerate(kf.split(X=df)):
        print('##### Fold',i+1)

        # Fetch values corresponding to the index 
        train_df = df.iloc[train_index]
        valid_df = df.iloc[valid_index]
        valid_ids = valid_df.index.values
        
        # Select only feature columns for training.
        train_df = train_df[FEATURE_COLUMNS]
        valid_df = valid_df[FEATURE_COLUMNS]
        
        # There's one more step required before we can train the model. 
        # We need to convert the datatset from Pandas format (pd.DataFrame)
        # into TensorFlow Datasets format (tf.data.Dataset).
        # TensorFlow Datasets is a high performance data loading library 
        # which is helpful when training neural networks with accelerators like GPUs and TPUs.
        # Note: Some column names contains white spaces at the end of their name, 
        # which is non-comaptible with SavedModels save format. 
        # By default, `pd_dataframe_to_tf_dataset` function will convert 
        # this column names into a compatible format. 
        # So you can safely ignore the warnings related to this.
        train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label=label)
        valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_df, label=label)

        # Define the model and metrics
        rf = tfdf.keras.RandomForestModel()
        rf.compile(metrics=["accuracy", "binary_crossentropy"]) 
        
        # Train the model
        # We will train the model using a one-liner.
        # Note: you may see a warning about Autograph. 
        # You can safely ignore this, it will be fixed in the next release.
        # Previously calculated class weights is used to handle imbalance.
        rf.fit(x=train_ds, class_weight=class_weight)
        
        # Store the model
        models[f"fold_{i+1}"] = rf
        
        
        # Predict OOF value for validation data
        predict = rf.predict(x=valid_ds)
        
        # Store the predictions in oof dataframe
        oof.loc[valid_ids, 0] = predict.flatten() 
        
        # Evaluate and store the metrics in respective dicts
        evaluation = rf.evaluate(x=valid_ds,return_dict=True)
        accuracy[f"fold_{i+1}"] = evaluation["accuracy"]
        cross_entropy[f"fold_{i+1}"]= evaluation["binary_crossentropy"]

##### Fold 1








Use /tmp/tmpvzqdaqvn as temporary training directory


Reading training dataset...


2023-06-28 17:20:13.525105: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_48' with dtype double and shape [493]
	 [[{{node Placeholder/_48}}]]






Training dataset read in 0:00:01.698775. Found 493 examples.
Training model...


[INFO 23-06-28 17:20:15.2116 BST kernel.cc:1242] Loading model from path /tmp/tmpvzqdaqvn/model/ with prefix 3a7f46ab95e34c93


Model trained in 0:00:00.232975
Compiling model...


[INFO 23-06-28 17:20:15.2535 BST decision_forest.cc:660] Model loaded with 300 root(s), 13900 node(s), and 56 input feature(s).
[INFO 23-06-28 17:20:15.2536 BST abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 23-06-28 17:20:15.2536 BST kernel.cc:1074] Use fast generic engine
2023-06-28 17:20:15.278368: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_30' with dtype double and shape [493]
	 [[{{node Placeholder/_30}}]]


Model compiled.


2023-06-28 17:20:15.998345: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_11' with dtype double and shape [124]
	 [[{{node Placeholder/_11}}]]






##### Fold 2








Use /tmp/tmpnu8bi486 as temporary training directory
Reading training dataset...


2023-06-28 17:20:17.079121: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_10' with dtype double and shape [493]
	 [[{{node Placeholder/_10}}]]


Training dataset read in 0:00:01.459050. Found 493 examples.
Training model...
Model trained in 0:00:00.338936
Compiling model...


[INFO 23-06-28 17:20:18.6828 BST kernel.cc:1242] Loading model from path /tmp/tmpnu8bi486/model/ with prefix 2d716e761fdd4c2b
[INFO 23-06-28 17:20:18.7497 BST decision_forest.cc:660] Model loaded with 300 root(s), 13706 node(s), and 56 input feature(s).
[INFO 23-06-28 17:20:18.7498 BST kernel.cc:1074] Use fast generic engine
2023-06-28 17:20:18.773957: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_48' with dtype double and shape [493]
	 [[{{node Placeholder/_48}}]]


Model compiled.


2023-06-28 17:20:19.607446: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_3' with dtype double and shape [124]
	 [[{{node Placeholder/_3}}]]


##### Fold 3








Use /tmp/tmpha7z35x1 as temporary training directory
Reading training dataset...


2023-06-28 17:20:21.100080: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_39' with dtype string and shape [494]
	 [[{{node Placeholder/_39}}]]


Training dataset read in 0:00:01.476824. Found 494 examples.
Training model...
Model trained in 0:00:00.218188
Compiling model...


[INFO 23-06-28 17:20:22.6315 BST kernel.cc:1242] Loading model from path /tmp/tmpha7z35x1/model/ with prefix 99be69db021b4972
[INFO 23-06-28 17:20:22.6662 BST decision_forest.cc:660] Model loaded with 300 root(s), 13922 node(s), and 56 input feature(s).
[INFO 23-06-28 17:20:22.6663 BST kernel.cc:1074] Use fast generic engine
2023-06-28 17:20:22.688153: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_36' with dtype double and shape [494]
	 [[{{node Placeholder/_36}}]]


Model compiled.


2023-06-28 17:20:23.422780: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_54' with dtype double and shape [123]
	 [[{{node Placeholder/_54}}]]


##### Fold 4








Use /tmp/tmp_kovz9s_ as temporary training directory
Reading training dataset...


2023-06-28 17:20:24.532509: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_55' with dtype double and shape [494]
	 [[{{node Placeholder/_55}}]]


Training dataset read in 0:00:01.952112. Found 494 examples.
Training model...
Model trained in 0:00:00.250130
Compiling model...


[INFO 23-06-28 17:20:26.5748 BST kernel.cc:1242] Loading model from path /tmp/tmp_kovz9s_/model/ with prefix f58eb1b6402e447e
[INFO 23-06-28 17:20:26.6122 BST decision_forest.cc:660] Model loaded with 300 root(s), 13762 node(s), and 56 input feature(s).
[INFO 23-06-28 17:20:26.6122 BST abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 23-06-28 17:20:26.6122 BST kernel.cc:1074] Use fast generic engine
2023-06-28 17:20:26.643327: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_9' with dtype double and shape [494]
	 [[{{node Placeholder/_9}}]]


Model compiled.


2023-06-28 17:20:27.444398: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_55' with dtype double and shape [123]
	 [[{{node Placeholder/_55}}]]


##### Fold 5








Use /tmp/tmpsyyo71dm as temporary training directory
Reading training dataset...


2023-06-28 17:20:28.619394: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_63' with dtype double and shape [2]
	 [[{{node Placeholder/_63}}]]


Training dataset read in 0:00:01.551913. Found 494 examples.
Training model...
Model trained in 0:00:00.221345
Compiling model...


[INFO 23-06-28 17:20:30.1905 BST kernel.cc:1242] Loading model from path /tmp/tmpsyyo71dm/model/ with prefix 9cb2352228764db5
[INFO 23-06-28 17:20:30.2271 BST decision_forest.cc:660] Model loaded with 300 root(s), 13718 node(s), and 56 input feature(s).
[INFO 23-06-28 17:20:30.2271 BST kernel.cc:1074] Use fast generic engine
2023-06-28 17:20:30.252432: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype double and shape [494]
	 [[{{node Placeholder/_0}}]]


Model compiled.


2023-06-28 17:20:31.047344: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_47' with dtype double and shape [123]
	 [[{{node Placeholder/_47}}]]




### check the model

In [15]:
for _model in models:
    inspector = models[_model].make_inspector()
    print(_model, inspector.evaluation())

fold_1 Evaluation(num_examples=493, accuracy=0.8657528687111298, loss=0.3706634126093555, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)
fold_2 Evaluation(num_examples=493, accuracy=0.8315627471876799, loss=0.3926845095516372, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)
fold_3 Evaluation(num_examples=494, accuracy=0.846170714097701, loss=0.4112626668134328, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)
fold_4 Evaluation(num_examples=494, accuracy=0.8972720420322019, loss=0.37219883450948366, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)
fold_5 Evaluation(num_examples=494, accuracy=0.8613401632281109, loss=0.390130241494607, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)


# Submission

In [16]:
test_ds_pd = test_df
test_df_columns = test_ds_pd.columns.tolist()
TEST_FEATURE_COLUMNS = [i for i in FEATURE_COLUMNS \
                        if i in test_df_columns and i != "Class"]
test_ds_pd = test_ds_pd[TEST_FEATURE_COLUMNS]
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_ds_pd)
predictions = models['fold_1'].predict(test_ds)
n_predictions= [[round(abs(i-1), 8), i] for i in predictions.ravel()]
print(n_predictions)





2023-06-28 17:20:31.974745: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_9' with dtype double and shape [5]
	 [[{{node Placeholder/_9}}]]


[[0.65333357, 0.34666643], [0.65333357, 0.34666643], [0.65333357, 0.34666643], [0.65333357, 0.34666643], [0.65333357, 0.34666643]]


In [17]:
sample_submission = sample
sample_submission[['class_0', 'class_1']] = n_predictions
sample_submission.to_csv('../submissions/submission.csv', index=False)