<a href="https://colab.research.google.com/github/zabihin/BERT-SymptomDiagnosis/blob/main/Train_huggingface_bert_base_cased_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow transformers

In [None]:
!pip install datasets

In [None]:
!pip install huggingface_hub

In [None]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import classification_report
from huggingface_hub import notebook_login
from datasets import load_dataset

# 1. Dataset Loading:

The dataset contains symptom descriptions and associated diagnoses for various medical conditions. Each text sample in the dataset represents a symptom description provided by an individual, and the corresponding diagnosis label indicates the medical condition or disease associated with that symptom.

In [None]:
# Download the symptom-to-diagnosis corpus from the Hugging Face Hub.
# The returned object is indexed by split name ("train" / "test") in the cells below.
dataset = load_dataset(path="gretelai/symptom_to_diagnosis")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/2.46k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/171k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.5k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

# 2. Data Preprocessing:


The text data is converted to a DataFrame

In [None]:
# Materialise each split as a pandas DataFrame and force the free-text
# column to `str` so the tokenizer always receives plain strings.
df_train, df_test = (
    pd.DataFrame(dataset[split_name]).astype({"input_text": str})
    for split_name in ("train", "test")
)

In [None]:
# Preview the first two training rows.
df_train.iloc[:2]

Unnamed: 0,output_text,input_text
0,cervical spondylosis,I've been having a lot of pain in my neck and ...
1,impetigo,I have a rash on my face that is getting worse...


Label encoding

In [None]:
# Learn the diagnosis -> integer mapping from the training split only,
# then apply that same mapping to both splits.
label_encoder = LabelEncoder()
label_encoder.fit(df_train['output_text'])
for split_frame in (df_train, df_test):
    split_frame['encoded_labels'] = label_encoder.transform(split_frame['output_text'])

In [None]:
# Preview the first two rows, now including the `encoded_labels` column.
df_train.head(2)

Unnamed: 0,output_text,input_text,encoded_labels
0,cervical spondylosis,I've been having a lot of pain in my neck and ...,3
1,impetigo,I have a rash on my face that is getting worse...,12


In [None]:
# Diagnosis names known to the encoder; a name's position in this array is its integer label.
label_encoder.classes_

array(['allergy', 'arthritis', 'bronchial asthma', 'cervical spondylosis',
       'chicken pox', 'common cold', 'dengue', 'diabetes',
       'drug reaction', 'fungal infection',
       'gastroesophageal reflux disease', 'hypertension', 'impetigo',
       'jaundice', 'malaria', 'migraine', 'peptic ulcer disease',
       'pneumonia', 'psoriasis', 'typhoid', 'urinary tract infection',
       'varicose veins'], dtype=object)

In [None]:
# Number of distinct diagnosis classes.
label_encoder.classes_.shape[0]

22

The data is split into training and validation sets

In [None]:
# Hold out 10% of the training data for validation.
# `stratify` keeps the per-diagnosis proportions equal in both parts, and the
# fixed `random_state` makes the split identical on every Restart & Run All.
train_df, val_df = train_test_split(
    df_train,
    test_size=0.1,
    stratify=df_train['output_text'],
    random_state=42,
)

Tokenization of input text is done using the BERT tokenizer

In [None]:
# Tokenize the training and validation texts with the cased BERT vocabulary.
max_length = 150  # upper bound on tokens per example; longer inputs are truncated
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Both splits share the same settings: pad to the longest sequence in the
# call, truncate at `max_length`, and return TensorFlow tensors.
tokenized_train = tokenizer(
    train_df['input_text'].tolist(),
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors="tf",
)
tokenized_val = tokenizer(
    val_df['input_text'].tolist(),
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors="tf",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# First row of the shuffled training split. Positional access is required:
# `train_df` keeps the original index labels, so label 0 may be absent
# (it can land in the validation split) and `[0]` would raise KeyError.
train_df['input_text'].iloc[0]

"I've been having a lot of pain in my neck and back. I've also been having trouble with my balance and coordination. I've been coughing a lot and my limbs feel weak."

In [None]:
# Tokenize the first training row on its own to inspect the raw encoder output.
# `.iloc[0]` selects by position; label 0 is not guaranteed to exist in `train_df`.
tokenizer(train_df['input_text'].iloc[0], padding=True, truncation=True, max_length=max_length, return_tensors="tf")

{'input_ids': <tf.Tensor: shape=(1, 43), dtype=int32, numpy=
array([[  101,   146,   112,  1396,  1151,  1515,   170,  1974,  1104,
         2489,  1107,  1139,  2455,  1105,  1171,   119,   146,   112,
         1396,  1145,  1151,  1515,  3819,  1114,  1139,  5233,  1105,
        14501,   119,   146,   112,  1396,  1151, 24992,   170,  1974,
         1105,  1139, 10765,  1631,  4780,   119,   102]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 43), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 43), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
      dtype=int32)>}

In [None]:
# Token ids of the first encoded training example, including padding.
first_train_ids = tokenized_train['input_ids'][0]
print(first_train_ids)

tf.Tensor(
[  101  1422  6130  1138  1151  4780  1105  1139  2455  1110  3600   119
   146  1138 13930 19365  1115  1294  1122  1662  1106  1815  1105  3179
  1110  8920   119   102     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0], shape=(76,), dtype=int32)


Labels are converted to one-hot encoded format.


In [None]:
# One-hot encode the integer labels; the vector width is the number of
# diagnosis classes known to the label encoder.
Y_train_encoded, Y_val_encoded = (
    tf.keras.utils.to_categorical(
        split_frame['encoded_labels'],
        num_classes=len(label_encoder.classes_),
    )
    for split_frame in (train_df, val_df)
)

In [None]:
# Integer label of the first training row (positional access; label 0 may not be in `train_df`).
train_df['encoded_labels'].iloc[0]

3

In [None]:
# One-hot vector for the first training row (positional access; label 0 may not be in `train_df`).
tf.keras.utils.to_categorical(train_df['encoded_labels'].iloc[0], num_classes=len(label_encoder.classes_))

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.], dtype=float32)

Create TensorFlow datasets

In [None]:
# Create TensorFlow datasets
BATCH_SIZE = 64

# Keras does not shuffle a tf.data.Dataset passed to `fit`, so shuffle here.
# The buffer spans the whole training set and the order is redrawn each epoch;
# the seed keeps runs repeatable.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(tokenized_train), Y_train_encoded))
    .shuffle(buffer_size=len(train_df), seed=42)
    .batch(BATCH_SIZE)
)

# Validation data is only evaluated, so its order is left untouched.
val_dataset = tf.data.Dataset.from_tensor_slices((dict(tokenized_val), Y_val_encoded)).batch(BATCH_SIZE)

# 3. Model Definition:



*   The code defines a sequence classification model using a BERT-based transformer architecture (TFAutoModelForSequenceClassification).
*   The model is compiled with categorical crossentropy loss and the Adam optimizer.

*   The number of output classes is determined by the unique labels in the dataset.





In [None]:
# Define the model
num_classes = len(label_encoder.classes_)

# Two-way mapping between integer ids and diagnosis names, stored in the model
# config so predictions can be reported by name.
int2label = {idx: name for idx, name in enumerate(label_encoder.classes_)}
label2int = {name: idx for idx, name in int2label.items()}

# Pretrained cased BERT encoder with a classification head sized to the
# number of diagnoses; attention weights are requested alongside the logits.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=num_classes,
    id2label=int2label,
    label2id=label2int,
    output_attentions=True,
)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# The model outputs raw logits and the targets are one-hot vectors, hence
# categorical cross-entropy with `from_logits=True`.
loss_fn = keras.losses.CategoricalCrossentropy(from_logits=True)
optimizer = keras.optimizers.Adam(learning_rate=3e-5)

model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

# 4. Model Training:

In [None]:
# Train the model
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=20
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# 5. Model Evaluation:

In [None]:
# Evaluate the model on the held-out test split, prepared the same way as
# the training data (same tokenizer settings, one-hot labels, batch size).
test_texts = df_test['input_text'].tolist()
tokenized_test = tokenizer(
    test_texts,
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors="tf",
)
Y_test_encoded = tf.keras.utils.to_categorical(df_test['encoded_labels'], num_classes=num_classes)

test_features = dict(tokenized_test)
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, Y_test_encoded)).batch(BATCH_SIZE)

# `evaluate` returns the loss followed by the compiled metrics (accuracy).
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

Test Loss: 0.16056963801383972, Test Accuracy: 0.9716981053352356


# 6. Classification Report:

In [None]:
# Per-class precision / recall / F1 on the test split.
predictions = model.predict(test_dataset)

# Highest-scoring class per example, compared against the position of the 1
# in each one-hot target row.
predicted_labels = tf.argmax(predictions.logits, axis=-1)
true_labels = tf.argmax(tf.constant(Y_test_encoded), axis=-1)

report = classification_report(true_labels, predicted_labels, target_names=label_encoder.classes_)
print(report)

                                 precision    recall  f1-score   support

                        allergy       0.91      1.00      0.95        10
                      arthritis       1.00      1.00      1.00        10
               bronchial asthma       1.00      1.00      1.00        10
           cervical spondylosis       0.91      1.00      0.95        10
                    chicken pox       1.00      1.00      1.00        10
                    common cold       1.00      1.00      1.00        10
                         dengue       1.00      0.90      0.95        10
                       diabetes       1.00      0.80      0.89        10
                  drug reaction       0.80      1.00      0.89         8
               fungal infection       1.00      1.00      1.00         9
gastroesophageal reflux disease       1.00      0.90      0.95        10
                   hypertension       0.91      1.00      0.95        10
                       impetigo       1.00      1.

# 7. Hugging Face Hub Integration:

In [None]:
# Authenticate interactively; never paste a token into the notebook source.
# Passing literal strings via `use_auth_token` is what produced the recorded
# 401 "Invalid username or password" error.
# NOTE(review): the login widget appears not to block execution - if the push
# below fails with 401, complete the login form and re-run this cell.
notebook_login()

# Target repository on the Hugging Face Hub for both the model and the tokenizer.
HUB_REPO_ID = "Zabihin/Symptom_to_Diagnosis"

# With no explicit token argument, the credential saved by the login step is used.
model.push_to_hub(HUB_REPO_ID)
tokenizer.push_to_hub(HUB_REPO_ID)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…



HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6597da30-3245f5f55947fb0258a96d80;ec403606-c0a7-4077-88d6-38bb00f5e828)

Invalid username or password.