ICR - Identifying Age-Related Conditions
=====================================

[Kaggle Competition](https://www.kaggle.com/competitions/icr-identify-age-related-conditions)

icr_notebook_290623
Based on: [gusthema/identifying-age-related-conditions-w-tfdf](https://www.kaggle.com/code/gusthema/identifying-age-related-conditions-w-tfdf)

In [3]:
!pip install sklearn 
!pip install tensorflow
!pip install tensorflow_decision_forests

Collecting sklearn
  Downloading sklearn-0.0.post5.tar.gz (3.7 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn: filename=sklearn-0.0.post5-py3-none-any.whl size=2950 sha256=86bdf83909fe9daa5d49bf587cf9aa3d8ca8da6fde0acf23a1351648c4faa86a
  Stored in directory: /home/t/.cache/pip/wheels/5f/28/a6/4e4fc2959e4ed9b33bf517703534fd8b19b76a842f74c9ed4c
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0.post5


# Data Exploration

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Directory holding the competition CSV files. The default is the Kaggle input
# mount; for a local run, point this at your own copy (e.g. '../data').
DATA_DIR = '/kaggle/input/icr-identify-age-related-conditions'

# Read the training data, the test data and the submission template.
df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Preview the first rows of the training data.
df.head()

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
2,013f2bd269f5,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1


# Data Cleaning

# Model

## Data splitting

In [5]:
from sklearn.model_selection import KFold
import tensorflow as tf
import tensorflow_decision_forests as tfdf

2023-06-29 20:45:32.446654: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-29 20:45:32.483827: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-29 20:45:32.698319: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-29 20:45:32.700992: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# Seed for the fold shuffle, so that the train/validation split (and therefore
# every per-fold metric below) is identical on each Restart & Run All.
SEED = 42

# Plain K-fold cross-validation with 5 shuffled splits
# (this is KFold, not GroupKFold: no grouping or stratification is applied).
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

# Row index of the training frame; used to address the out-of-fold predictions.
ID_LIST = df.index

# Out-of-fold prediction table: one row per training example, initialised to zero.
oof = pd.DataFrame(data=np.zeros((len(ID_LIST),1)), index=ID_LIST)

# Trained model for each fold, keyed by fold name.
models = {}

# Validation metrics for each fold, keyed by fold name.
accuracy = {}
cross_entropy = {}

# Name of the label column.
label = "Class"

# Number of examples per label value (np.bincount returns counts for 0, then 1;
# the two-way unpacking assumes the label only takes the values 0 and 1).
neg, pos = np.bincount(df[label])

# Total number of examples.
total = neg + pos

# Per-class weights, inversely proportional to class frequency and scaled so
# that each class contributes total / 2 in aggregate.
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)

class_weight = {0: weight_for_0, 1: weight_for_1}

# Every column except the row identifier. Note that this still contains the
# label column, which is required when building the training datasets.
FEATURE_COLUMNS = [i for i in df.columns if i not in ["Id"]]

## Model training

In [7]:
# Train one random forest per cross-validation fold and score it on that
# fold's held-out rows.
for i, (train_index, valid_index) in enumerate(kf.split(X=df)):
    fold_name = f"fold_{i+1}"
    print('##### Fold',i+1)

    # Take this fold's rows and keep only the modelling columns
    # (FEATURE_COLUMNS, which includes the label).
    train_df = df.iloc[train_index][FEATURE_COLUMNS]
    valid_df = df.iloc[valid_index][FEATURE_COLUMNS]
    valid_ids = valid_df.index.values

    # Wrap both frames as TensorFlow datasets with `label` as the target.
    train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label=label)
    valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_df, label=label)

    # Random forest: 500 trees, depth capped at 8, min_examples of 5.
    rf = tfdf.keras.RandomForestModel(num_trees=500, max_depth=8, min_examples=5)
    rf.compile(metrics=["accuracy", "binary_crossentropy"])

    # Fit with the class weights computed earlier and keep the model for
    # the test-time ensemble.
    rf.fit(x=train_ds, class_weight=class_weight)
    models[fold_name] = rf

    # Out-of-fold predictions: written into `oof` at the validation rows.
    predict = rf.predict(x=valid_ds)
    oof.loc[valid_ids, 0] = predict.flatten()

    # Validation metrics for this fold.
    evaluation = rf.evaluate(x=valid_ds, return_dict=True)
    accuracy[fold_name] = evaluation["accuracy"]
    cross_entropy[fold_name] = evaluation["binary_crossentropy"]

##### Fold 1








Use /tmp/tmppfrb2leh as temporary training directory
Reading training dataset...


2023-06-29 20:45:38.918816: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_63' with dtype double and shape [2]
	 [[{{node Placeholder/_63}}]]


Training dataset read in 0:00:09.873712. Found 493 examples.
Training model...
Model trained in 0:00:00.400065
Compiling model...


[INFO 23-06-29 20:45:48.9029 BST kernel.cc:1242] Loading model from path /tmp/tmppfrb2leh/model/ with prefix 3f68cd3c9be64881
[INFO 23-06-29 20:45:48.9681 BST decision_forest.cc:660] Model loaded with 500 root(s), 21354 node(s), and 56 input feature(s).
[INFO 23-06-29 20:45:48.9682 BST abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 23-06-29 20:45:48.9682 BST kernel.cc:1074] Use fast generic engine
2023-06-29 20:45:49.016392: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_6' with dtype double and shape [493]
	 [[{{node Placeholder/_6}}]]


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: could not get source code


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: could not get source code


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: could not get source code
Model compiled.


2023-06-29 20:45:53.504846: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_38' with dtype double and shape [124]
	 [[{{node Placeholder/_38}}]]


##### Fold 2








Use /tmp/tmpf2_r7twy as temporary training directory
Reading training dataset...


2023-06-29 20:45:55.224614: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_7' with dtype double and shape [493]
	 [[{{node Placeholder/_7}}]]


Training dataset read in 0:00:01.626641. Found 493 examples.
Training model...
Model trained in 0:00:00.317812
Compiling model...


[INFO 23-06-29 20:45:56.9437 BST kernel.cc:1242] Loading model from path /tmp/tmpf2_r7twy/model/ with prefix 9ef9b3c9f5a945c6
[INFO 23-06-29 20:45:56.9982 BST decision_forest.cc:660] Model loaded with 500 root(s), 22100 node(s), and 56 input feature(s).
[INFO 23-06-29 20:45:56.9982 BST kernel.cc:1074] Use fast generic engine
2023-06-29 20:45:57.025125: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_31' with dtype double and shape [493]
	 [[{{node Placeholder/_31}}]]


Model compiled.


2023-06-29 20:45:57.849868: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_54' with dtype double and shape [124]
	 [[{{node Placeholder/_54}}]]


##### Fold 3








Use /tmp/tmp56t_3eze as temporary training directory
Reading training dataset...


2023-06-29 20:45:59.125674: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_10' with dtype double and shape [494]
	 [[{{node Placeholder/_10}}]]


Training dataset read in 0:00:01.612320. Found 494 examples.
Training model...
Model trained in 0:00:00.325085
Compiling model...


[INFO 23-06-29 20:46:00.8631 BST kernel.cc:1242] Loading model from path /tmp/tmp56t_3eze/model/ with prefix a075adb8945e4941
[INFO 23-06-29 20:46:00.9227 BST decision_forest.cc:660] Model loaded with 500 root(s), 21368 node(s), and 56 input feature(s).
[INFO 23-06-29 20:46:00.9228 BST abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 23-06-29 20:46:00.9228 BST kernel.cc:1074] Use fast generic engine
2023-06-29 20:46:00.953692: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_27' with dtype double and shape [494]
	 [[{{node Placeholder/_27}}]]






Model compiled.


2023-06-29 20:46:01.833400: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_3' with dtype double and shape [123]
	 [[{{node Placeholder/_3}}]]






##### Fold 4








Use /tmp/tmpnyr8m0l3 as temporary training directory
Reading training dataset...


2023-06-29 20:46:02.990787: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_54' with dtype double and shape [494]
	 [[{{node Placeholder/_54}}]]


Training dataset read in 0:00:01.531640. Found 494 examples.
Training model...
Model trained in 0:00:00.348780
Compiling model...


[INFO 23-06-29 20:46:04.6887 BST kernel.cc:1242] Loading model from path /tmp/tmpnyr8m0l3/model/ with prefix 3b9cf26875184f5c
[INFO 23-06-29 20:46:04.7429 BST decision_forest.cc:660] Model loaded with 500 root(s), 21664 node(s), and 56 input feature(s).
[INFO 23-06-29 20:46:04.7430 BST kernel.cc:1074] Use fast generic engine
2023-06-29 20:46:04.768853: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_53' with dtype double and shape [494]
	 [[{{node Placeholder/_53}}]]


Model compiled.


2023-06-29 20:46:05.590195: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_16' with dtype double and shape [123]
	 [[{{node Placeholder/_16}}]]


##### Fold 5








Use /tmp/tmpj23gwivr as temporary training directory
Reading training dataset...


2023-06-29 20:46:06.662924: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_54' with dtype double and shape [494]
	 [[{{node Placeholder/_54}}]]






Training dataset read in 0:00:01.434024. Found 494 examples.
Training model...
Model trained in 0:00:00.488326
Compiling model...


[INFO 23-06-29 20:46:08.4019 BST kernel.cc:1242] Loading model from path /tmp/tmpj23gwivr/model/ with prefix e74174db25d24de5
[INFO 23-06-29 20:46:08.4569 BST decision_forest.cc:660] Model loaded with 500 root(s), 21514 node(s), and 56 input feature(s).
[INFO 23-06-29 20:46:08.4569 BST kernel.cc:1074] Use fast generic engine
2023-06-29 20:46:08.484702: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_11' with dtype double and shape [494]
	 [[{{node Placeholder/_11}}]]


Model compiled.


2023-06-29 20:46:09.219641: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_9' with dtype double and shape [123]
	 [[{{node Placeholder/_9}}]]








### Check the model

In [8]:
# Print, for every fold, the evaluation reported by the model's own inspector.
# NOTE(review): this is the model's self-evaluation (presumably out-of-bag on
# the training rows for a random forest), not the held-out validation metrics
# stored in `accuracy` / `cross_entropy` -- confirm against the TF-DF docs.
for fold_name, fold_model in models.items():
    print(fold_name, fold_model.make_inspector().evaluation())

fold_1 Evaluation(num_examples=493, accuracy=0.8712442942887775, loss=0.3748308322515354, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)
fold_2 Evaluation(num_examples=493, accuracy=0.8566565663217668, loss=0.392042446409438, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)
fold_3 Evaluation(num_examples=494, accuracy=0.8631426144431145, loss=0.41297781446765913, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)
fold_4 Evaluation(num_examples=494, accuracy=0.8543841141679036, loss=0.38893190349244894, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)
fold_5 Evaluation(num_examples=494, accuracy=0.8701393066836405, loss=0.361885872269077, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)


# Prediction

## Preparing test data

In [9]:
# Columns present in the test file.
test_df_columns = test_df.columns.tolist()

# Training columns that also exist in the test file, excluding the label.
TEST_FEATURE_COLUMNS = [
    column
    for column in FEATURE_COLUMNS
    if column in test_df_columns and column != "Class"
]

# Restrict the test frame to those columns and wrap it as a TensorFlow
# dataset (no label argument, since this dataset is only used for inference).
test_ds_pd = test_df[TEST_FEATURE_COLUMNS]
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_ds_pd)





In [10]:
## Making Predictions

In [11]:
# One column of test predictions per fold model, in a (test rows x models) matrix.
predictions = np.zeros((len(test_df), len(models)))
for column, fold_model in enumerate(models.values()):
    predictions[:, column] = fold_model.predict(test_ds).ravel()

# Mean over the fold models gives a single score per test row.
average_predictions = predictions.mean(axis=1)

# Pair each averaged score p with its complement: [|p - 1| rounded to 8
# decimals, p]. The second entry is left unrounded.
n_predictions = [[round(abs(p - 1), 8), p] for p in average_predictions]
print(n_predictions)

2023-06-29 20:46:10.308383: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_19' with dtype double and shape [5]
	 [[{{node Placeholder/_19}}]]


[[0.66199967, 0.3380003273487091], [0.66199967, 0.3380003273487091], [0.66199967, 0.3380003273487091], [0.66199967, 0.3380003273487091], [0.66199967, 0.3380003273487091]]


# Submission

In [12]:
# Fill the template's two probability columns from n_predictions, row by row,
# and write the result without the pandas index.
# NOTE(review): assumes the template rows are in the same order as test_df --
# TODO confirm.
PROBABILITY_COLUMNS = ['class_0', 'class_1']
sample_submission[PROBABILITY_COLUMNS] = n_predictions
sample_submission.to_csv('submission.csv', index=False)