In [327]:
import tensorflow_decision_forests as tfdf
import pandas as pd
from wurlitzer import sys_pipes
import numpy as np

In [328]:
# Column names handed to read_csv through `names=`. The final "target" column
# is split into a diagnosis code and a patient id in a later cell.
names = [
    "age", "sex", "on_thyroxine", "query_on_thyroxine", "on_antihyroid_meds",
    "sick", "pregnant", "thyroid_surgery", "I131_treatment",
    "query_hypothyroid", "query_hyperthyroid", "lithium", "goitre", "tumor",
    "hypopituitary", "psych",
    "TSH_measured", "TSH",
    "T3_measured", "T3",
    "TT4_measured", "TT4",
    "T4U_measured", "T4U",
    "FTI_measured", "FTI",
    "TBG_measured", "TBG",
    "referral_source", "target",
]
data_path = '/config/workspace/thyroid0387.data'
df = pd.read_csv(data_path, names=names)

In [329]:
# The raw target looks like "<code>[<patient_id>]": keep the part before "["
# as the target and the bracketed part (without "]") as the patient id.
raw_target = df["target"]
df["patient_id"] = raw_target.map(lambda raw: raw.split("[")[1].strip("]"))
df["target"] = raw_target.map(lambda raw: raw.split("[")[0])

In [330]:
# Swap every "?" placeholder for NaN so pandas treats it as missing.
df = df.replace(to_replace="?", value=np.nan)

In [331]:
# Cast the numeric measurement columns from object dtype to float in one pass.
num_cols = ["TSH","T3","TT4","T4U","FTI","TBG"]
df = df.astype({col: float for col in num_cols})

In [332]:
# Ages such as 65526 are impossible, so rows with age above 100 are dropped
# (rows are removed, values are not clipped).
is_plausible_age = df["age"].le(100)
df = df.loc[is_plausible_age]

In [333]:
# Preview the first rows after cleaning (displayed as the cell output).
df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antihyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target,patient_id
0,29,F,f,f,f,f,f,f,f,t,...,,f,,f,,f,,other,-,840801013
1,29,F,f,f,f,f,f,f,f,f,...,128.0,f,,f,,f,,other,-,840801014
2,41,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,11.0,other,-,840801042
3,36,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,26.0,other,-,840803046
4,32,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,36.0,other,S,840803047


In [334]:
# Remove redundant columns: the *_measured flags, the referral source,
# the patient id and TBG.
redundant_cols = [
    'TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
    'FTI_measured', 'TBG_measured',
    'referral_source', 'patient_id', 'TBG',
]
df = df.drop(columns=redundant_cols)

In [335]:
# Keep only the diagnosis codes that can be classified as a hyperthyroid,
# hypothyroid or euthyroid (negative) state, then collapse the raw codes into
# those three classes.
#
# The row filter is derived from the mapping keys so the two cannot drift
# apart: a code that passes the filter without a mapping entry would be turned
# into a NaN label by `.map`. 'H' was such a code (selected but unmapped); it
# is mapped to "Hypothyroid", consistent with 'H|K'.
mapping = {
    '-': "Negative",
    # Hyperthyroid codes
    'A': "Hyperthyroid", 'AK': "Hyperthyroid", 'B': "Hyperthyroid",
    'C': "Hyperthyroid", 'C|I': "Hyperthyroid", 'D': "Hyperthyroid",
    'D|R': "Hyperthyroid",
    # Hypothyroid codes
    'E': "Hypothyroid", 'F': "Hypothyroid", 'FK': "Hypothyroid",
    'G': "Hypothyroid", 'GK': "Hypothyroid", 'GI': "Hypothyroid",
    'GKJ': "Hypothyroid", 'H': "Hypothyroid", 'H|K': "Hypothyroid",
}
# .copy() so the assignment below writes to an independent frame rather than
# to a slice of the previous one.
df = df[df['target'].isin(list(mapping))].copy()
df['target'] = df['target'].map(mapping)

In [336]:
# Work on an independent copy so the label encoding below leaves `df` untouched.
dataset_df = df.copy(deep=True)

In [337]:
# Name of the label column.
label = "target"

# Distinct class names in order of first appearance; a class's position in
# this list becomes its integer label.
classes = dataset_df[label].unique().tolist()
print(f"Label classes: {classes}")

class_to_index = {class_name: position for position, class_name in enumerate(classes)}
dataset_df[label] = dataset_df[label].map(class_to_index)

Label classes: ['Negative', 'Hypothyroid', 'Hyperthyroid']


In [338]:
# Split the dataset into a training and a testing dataset.

def split_dataset(dataset, test_ratio=0.30, seed=None):
  """Randomly splits a pandas DataFrame into a training and a testing part.

  Each row is assigned to the test part independently with probability
  `test_ratio`, so the resulting sizes are only approximately proportional
  to the ratio.

  Args:
    dataset: DataFrame to split.
    test_ratio: Probability that a given row ends up in the test part.
    seed: Optional integer seed. When given, a dedicated generator is used so
      the split is reproducible across runs. When None, NumPy's global random
      state is used.

  Returns:
    A `(train, test)` tuple of DataFrames; every input row appears in exactly
    one of them.
  """
  # A private RandomState keeps a seeded split independent of (and from
  # disturbing) the global NumPy random state.
  rng = np.random if seed is None else np.random.RandomState(seed)
  test_indices = rng.rand(len(dataset)) < test_ratio
  # Reports the total number of rows considered (length of the boolean mask).
  print(len(test_indices))
  return dataset[~test_indices], dataset[test_indices]


# Seed NumPy's global random state right before splitting so that re-running
# this cell (or the whole notebook) yields the same train/test membership and
# therefore comparable metrics downstream.
np.random.seed(42)
train_ds_pd, test_ds_pd = split_dataset(dataset_df)
print("{} examples in training, {} examples for testing.".format(
    len(train_ds_pd), len(test_ds_pd)))

7675
5326 examples in training, 2349 examples for testing.


In [339]:
# Convert both pandas splits into TensorFlow datasets, using `label` as the
# label column.
train_ds, test_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(split_frame, label=label)
    for split_frame in (train_ds_pd, test_ds_pd)
)

In [340]:
# Random-search tuner limited to 20 trials; the only hyper-parameter it
# explores is the tree depth.
max_depth_candidates = [4, 5, 6, 7]
tuner = tfdf.tuner.RandomSearch(num_trials=20)
tuner.choice("max_depth", max_depth_candidates)

# Random forest whose hyper-parameters are chosen by the tuner above.
model_1 = tfdf.keras.RandomForestModel(
    verbose=2,
    tuner=tuner,
    random_seed=42,
)

# Fit on the training split.
model_1.fit(train_ds)





Use /tmp/tmpkm6b1d_o as temporary training directory
Reading training dataset...
Training tensor examples:
Features: {'age': <tf.Tensor 'data:0' shape=(None,) dtype=int64>, 'sex': <tf.Tensor 'data_1:0' shape=(None,) dtype=string>, 'on_thyroxine': <tf.Tensor 'data_2:0' shape=(None,) dtype=string>, 'query_on_thyroxine': <tf.Tensor 'data_3:0' shape=(None,) dtype=string>, 'on_antihyroid_meds': <tf.Tensor 'data_4:0' shape=(None,) dtype=string>, 'sick': <tf.Tensor 'data_5:0' shape=(None,) dtype=string>, 'pregnant': <tf.Tensor 'data_6:0' shape=(None,) dtype=string>, 'thyroid_surgery': <tf.Tensor 'data_7:0' shape=(None,) dtype=string>, 'I131_treatment': <tf.Tensor 'data_8:0' shape=(None,) dtype=string>, 'query_hypothyroid': <tf.Tensor 'data_9:0' shape=(None,) dtype=string>, 'query_hyperthyroid': <tf.Tensor 'data_10:0' shape=(None,) dtype=string>, 'lithium': <tf.Tensor 'data_11:0' shape=(None,) dtype=string>, 'goitre': <tf.Tensor 'data_12:0' shape=(None,) dtype=string>, 'tumor': <tf.Tensor 'dat

[INFO 24-01-17 17:20:35.8482 IST kernel.cc:771] Start Yggdrasil model training
[INFO 24-01-17 17:20:35.8483 IST kernel.cc:772] Collect training examples
[INFO 24-01-17 17:20:35.8483 IST kernel.cc:785] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 24-01-17 17:20:35.8485 IST kernel.cc:391] Number of batches: 6
[INFO 24-01-17 17:20:35.8485 IST kernel.cc:392] Number of examples: 5326
[INFO 24-01-17 17:20:35.8553 IST kernel.cc:792] Training dataset:
Number of records: 5326
Number of columns: 22

Number of columns by type:
	CATEGORICAL: 16 (72.7273%)
	NUMERICAL: 6 (27.2727%)

Columns:

CATEGORICAL: 16 (72.7273%)
	1: "I131_treatment" CATEGORICAL has-dict vocab-size:3

Model trained in 0:00:01.568050
Compiling model...
Model compiled.


<keras.src.callbacks.History at 0x7f6ffc601550>

In [341]:
# Attach the accuracy metric, then score the model on the test split.
model_1.compile(metrics=["accuracy"])
evaluation = model_1.evaluate(test_ds, return_dict=True)
print()

# One "metric: value" line per entry, rounded to four decimals.
for metric_name in evaluation:
  print(f"{metric_name}: {evaluation[metric_name]:.4f}")


loss: 0.0000
accuracy: 0.9851


In [342]:
# Print the trained model's summary (input features, variable importances, ...).
model_1.summary()

Model: "random_forest_model_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1 (1.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 1 (1.00 Byte)
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (21):
	FTI
	I131_treatment
	T3
	T4U
	TSH
	TT4
	age
	goitre
	hypopituitary
	lithium
	on_antihyroid_meds
	on_thyroxine
	pregnant
	psych
	query_hyperthyroid
	query_hypothyroid
	query_on_thyroxine
	sex
	sick
	thyroid_surgery
	tumor

No weights

Variable Importance: INV_MEAN_MIN_DEPTH:
    1.                "TSH"  0.323719 ################
    2.                "FTI"  0.288116 ############
    3.                "TT4"  0.278822 ###########
    4.                 "T3"  0.228731 ######
    5.                "T4U"  0.188294 ##
    6.       "on_thyroxine"  0.179285 #
    7. "query_hyperthyroid"  0.178