In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
from tensorflow.keras.metrics import Precision, Recall

## First Pass (Ensemble)

In [3]:
# Load your dataset
# Assuming you have a DataFrame with 'text' column and 'label' column
# where 'label' contains A1, A2, A3, B1, B2, B3, B4 labels
# Replace 'your_dataset.csv' with your actual dataset file
df = pd.read_csv('/content/Problem_Dataset.csv')

df

Unnamed: 0,sID,Obs,Type,A1,A2,A3,B1,B2,B3,B4
0,150668,Observed child fixated on a particular texture...,B3,0,0,0,0,0,1,0
1,150409,"Patient's focus centers on vacuum cleaners, st...",B3,0,0,0,0,0,1,0
2,150264,Displays a strong interest in smelling various...,B4,0,0,0,0,0,0,1
3,150373,Patient's attachment to a specific book is evi...,B3,0,0,0,0,0,1,0
4,150343,"Limited awareness of personal boundaries, inva...",A3,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
1045,150491,Displays a need for consistency in clothing ch...,B2,0,0,0,0,1,0,0
1046,150884,Patient shows a strong attachment to a single ...,B3,0,0,0,0,0,1,0
1047,151017,Child displays a tendency to focus on irreleva...,A2,0,1,0,0,0,0,0
1048,150297,Displays a tendency to monologue about persona...,A3,0,0,1,0,0,0,0


In [8]:
df1 = df[["Obs", "Type"]]
# Split the dataset into training and validation sets
train_data, val_data = train_test_split(df1, test_size=0.2, random_state=101)

In [None]:
# Tokenize your text data
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train_data['Obs'])

vocab_size = len(tokenizer.word_index) + 1
max_len = max(len(sentence.split()) for sentence in df['Obs'])



In [None]:
# Define a simple deep learning model
def create_model(embedding_dim=50):
    model = keras.Sequential([
        layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# integer encode and pad documents
def encode_docs(tokenizer, max_length, docs):
    # integer encode
    encoded = tokenizer.texts_to_sequences(docs)
    # pad sequences
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return padded

In [None]:
encode_docs(tokenizer, max_len, train_data["Obs"])

array([[ 12, 427,  87, ...,   0,   0,   0],
       [ 14,  17,   5, ...,   0,   0,   0],
       [ 12,   5,  32, ...,   0,   0,   0],
       ...,
       [ 14,  45,   2, ...,   0,   0,   0],
       [523, 866,   1, ...,   0,   0,   0],
       [ 11,   5,  48, ...,   0,   0,   0]])

In [14]:
df1['Type'].unique()

array(['B3', 'B4', 'A3', 'B1', 'B2', 'A1', 'A2'], dtype=object)

In [None]:
# Train a model for each label
models = {}
for label in df_['Type'].unique():
    binary_labels = (train_data['Type'] == label).astype(int)
    X_data = encode_docs(tokenizer, max_len, train_data["Obs"])
    y_data = np.asarray(binary_labels)
    model = create_model()
    model.fit(X_data, y_data, epochs=5, batch_size=32, validation_split=0.2)
    models[label] = model



Train on 672 samples, validate on 168 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 672 samples, validate on 168 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 672 samples, validate on 168 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 672 samples, validate on 168 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 672 samples, validate on 168 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 672 samples, validate on 168 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 672 samples, validate on 168 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
models

{'B3': <tensorflow.python.keras.engine.sequential.Sequential at 0x20790227088>,
 'B4': <tensorflow.python.keras.engine.sequential.Sequential at 0x20791ff7e08>,
 'A3': <tensorflow.python.keras.engine.sequential.Sequential at 0x20791470508>,
 'B1': <tensorflow.python.keras.engine.sequential.Sequential at 0x20791259f48>,
 'B2': <tensorflow.python.keras.engine.sequential.Sequential at 0x207914e85c8>,
 'A1': <tensorflow.python.keras.engine.sequential.Sequential at 0x20791aa0308>,
 'A2': <tensorflow.python.keras.engine.sequential.Sequential at 0x207917dc3c8>}

In [None]:
# Load all the models into an ensemble
ensemble_predictions = []
for model_label, model in models.items():
    val_predictions = model.predict(encode_docs(tokenizer, max_len, val_data["Obs"]))
    ensemble_predictions.append(val_predictions)



In [None]:
# Create an array of predictions for each input text
ensemble_predictions = np.array(ensemble_predictions).squeeze().T



In [None]:
# Threshold ensemble predictions for binary classification
binary_ensemble_predictions = (ensemble_predictions > 0.5).astype(int)



In [None]:
# Obtain precision, recall, F1-score, and print the classification report

y_val = val_data["Type"].apply(lambda x: list(models.keys()).index(x)).values
y_hat = np.argmax((ensemble_predictions > 0.5).astype(int), axis=1)

print("Ensemble Classification Report:")
print(classification_report(y_val, y_hat, target_names=models.keys()))

Ensemble Classification Report:
              precision    recall  f1-score   support

          B3       0.27      0.94      0.42        33
          B4       0.89      0.49      0.63        35
          A3       0.67      0.13      0.22        31
          B1       1.00      0.47      0.64        30
          B2       0.96      0.81      0.88        27
          A1       0.89      0.29      0.43        28
          A2       0.71      0.65      0.68        26

    accuracy                           0.54       210
   macro avg       0.77      0.54      0.56       210
weighted avg       0.76      0.54      0.55       210



## Second Pass (Multi-label)

In [None]:

df2 = pd.DataFrame()
df2["Obs"] = df["Obs"]
df2["Type"] = [arr for arr in df[df["Type"].unique()].values]

df2

Unnamed: 0,Obs,Type
0,Observed child fixated on a particular texture...,"[1, 0, 0, 0, 0, 0, 0]"
1,"Patient's focus centers on vacuum cleaners, st...","[1, 0, 0, 0, 0, 0, 0]"
2,Displays a strong interest in smelling various...,"[0, 1, 0, 0, 0, 0, 0]"
3,Patient's attachment to a specific book is evi...,"[1, 0, 0, 0, 0, 0, 0]"
4,"Limited awareness of personal boundaries, inva...","[0, 0, 1, 0, 0, 0, 0]"
...,...,...
1045,Displays a need for consistency in clothing ch...,"[0, 0, 0, 0, 1, 0, 0]"
1046,Patient shows a strong attachment to a single ...,"[1, 0, 0, 0, 0, 0, 0]"
1047,Child displays a tendency to focus on irreleva...,"[0, 0, 0, 0, 0, 0, 1]"
1048,Displays a tendency to monologue about persona...,"[0, 0, 1, 0, 0, 0, 0]"


In [None]:
# Split the dataset into training and validation sets

train_data, val_data = train_test_split(df2, test_size=0.2, random_state=101)


In [None]:
# Tokenize your text data
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df2['Obs'])


vocab_size = len(tokenizer.word_index) + 1
max_len = max(len(sentence.split()) for sentence in df2['Obs'])

def encode_docs(tokenizer, max_length, docs):
    # integer encode
    encoded = tokenizer.texts_to_sequences(docs)
    # pad sequences
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return padded


array([[ 12, 427,  87, ...,   0,   0,   0],
       [ 14,  17,   5, ...,   0,   0,   0],
       [ 12,   5,  32, ...,   0,   0,   0],
       ...,
       [ 14,  45,   2, ...,   0,   0,   0],
       [523, 866,   1, ...,   0,   0,   0],
       [ 11,   5,  48, ...,   0,   0,   0]])

In [None]:
# class_to_index = dict(zip(df_['Type'].unique(), range(len(df_['Type'].unique()))))
class_to_index

{'B3': 0, 'B4': 1, 'A3': 2, 'B1': 3, 'B2': 4, 'A1': 5, 'A2': 6}

In [None]:
binary_labels = np.array([np.asarray(x, dtype = np.float64) for x in train_data["Type"]])
binary_labels.shape

(840, 7)

In [None]:
encoded_data = encode_docs(tokenizer, max_len, train_data["Obs"])
encoded_data.shape

(840, 26)

In [None]:
# Define a multi-label deep learning model

def create_multi_label_model(embedding_dim=50, num_classes=7):  # Set appropriate values for embedding_dim and num_classes
    model = keras.Sequential([
        layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(num_classes, activation='sigmoid')  # Use sigmoid activation for multi-label classification
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Train a multi-label model
num_classes = 7  # Number of unique classes (A1, A2, A3, B1, B2, B3, B4)

multi_label_model = create_multi_label_model()
multi_label_model.fit(encoded_data, binary_labels, epochs=15, batch_size=32, validation_split=0.2)

# Save the model
# model.save('multi_label_model.h5')



Train on 672 samples, validate on 168 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x2079a155508>

In [None]:
# Load the model
# loaded_model = keras.models.load_model('multi_label_model.h5')

# Predictions on validation set
val_binary_labels = np.array([np.asarray(x, dtype = np.float64) for x in val_data["Type"]])

val_predictions = multi_label_model.predict(encode_docs(tokenizer, max_len, val_data['Obs']))


In [None]:
# Threshold predictions for binary classification
binary_val_predictions = (val_predictions > 0.5).astype(int)



array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [None]:
# Obtain precision, recall, F1-score, and print the classification report
print("Classification Report:")
print(classification_report(val_binary_labels, binary_val_predictions, target_names=class_to_index.keys()))

Classification Report:
              precision    recall  f1-score   support

          B3       0.94      0.52      0.67        33
          B4       0.91      0.57      0.70        35
          A3       0.62      0.16      0.26        31
          B1       0.95      0.67      0.78        30
          B2       0.95      0.67      0.78        27
          A1       0.79      0.39      0.52        28
          A2       0.63      0.46      0.53        26

   micro avg       0.85      0.49      0.62       210
   macro avg       0.83      0.49      0.61       210
weighted avg       0.83      0.49      0.61       210
 samples avg       0.49      0.49      0.49       210



  _warn_prf(average, modifier, msg_start, len(result))


## Third Pass (BERT)

In [None]:
!pip install transformers

In [154]:

from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
from torch.utils.data import Dataset


In [121]:
df3 = pd.DataFrame()
df3["Obs"] = df["Obs"]
df3["Type"] = [arr for arr in df[df["Type"].unique()].values]

Unnamed: 0,Obs,Type
0,Observed child fixated on a particular texture...,"[1, 0, 0, 0, 0, 0, 0]"
1,"Patient's focus centers on vacuum cleaners, st...","[1, 0, 0, 0, 0, 0, 0]"
2,Displays a strong interest in smelling various...,"[0, 1, 0, 0, 0, 0, 0]"
3,Patient's attachment to a specific book is evi...,"[1, 0, 0, 0, 0, 0, 0]"
4,"Limited awareness of personal boundaries, inva...","[0, 0, 1, 0, 0, 0, 0]"
...,...,...
1045,Displays a need for consistency in clothing ch...,"[0, 0, 0, 0, 1, 0, 0]"
1046,Patient shows a strong attachment to a single ...,"[1, 0, 0, 0, 0, 0, 0]"
1047,Child displays a tendency to focus on irreleva...,"[0, 0, 0, 0, 0, 0, 1]"
1048,Displays a tendency to monologue about persona...,"[0, 0, 1, 0, 0, 0, 0]"


In [122]:

# Split the dataset into training and validation sets
train_data, val_data = train_test_split(df3, test_size=0.2, random_state=42)


In [218]:


# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'  # You can try other BERT models as well
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=len(df['Type'].unique()))


train_inputs = tokenizer.batch_encode_plus(list(train_data['Obs']), add_special_tokens=True, padding=True, truncation=True, return_tensors='np', max_length=tokenizer.model_max_length)
val_inputs = tokenizer.batch_encode_plus(list(val_data['Obs']), add_special_tokens=True, padding=True, truncation=True, return_tensors='np', max_length=tokenizer.model_max_length)



train_labels = tf.convert_to_tensor([label for label in train_data["Type"]])
val_labels = tf.convert_to_tensor([label for label in val_data["Type"]])




All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [219]:


model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=[Precision(), Recall()])

history = model.fit(dict(train_inputs), train_labels, epochs=10, batch_size=8,
          validation_data=(dict(val_inputs), val_labels)
          )

Epoch 9/10
Epoch 10/10


dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [220]:

# Save the fine-tuned model
model.save_pretrained('/content/fine_tuned_bert_model_tf')

# Load the fine-tuned model
fine_tuned_model_tf = TFBertForSequenceClassification.from_pretrained('/content/fine_tuned_bert_model_tf')



Some layers from the model checkpoint at /content/fine_tuned_bert_model_tf were not used when initializing TFBertForSequenceClassification: ['dropout_455']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at /content/fine_tuned_bert_model_tf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [222]:
# Evaluation on the validation set
val_preds_tf = fine_tuned_model_tf.predict(dict(val_inputs))['logits']
val_preds_tf = tf.math.sigmoid(val_preds_tf)

# Threshold predictions for binary classification
binary_val_preds_tf = (val_preds_tf > 0.5).numpy()




In [224]:


# Obtain precision, recall, F1-score, and print the classification report
print("Fine-tuned BERT Classification Report:")
print(classification_report(val_labels.numpy(), binary_val_preds_tf, target_names=df['Type'].explode().unique()))


Fine-tuned BERT Classification Report:
              precision    recall  f1-score   support

          B3       0.70      0.97      0.81        36
          B4       0.61      1.00      0.76        27
          A3       0.65      0.90      0.76        31
          B1       0.79      0.94      0.86        35
          B2       0.48      1.00      0.65        26
          A1       0.44      0.93      0.60        29
          A2       0.67      0.92      0.77        26

   micro avg       0.61      0.95      0.74       210
   macro avg       0.62      0.95      0.74       210
weighted avg       0.63      0.95      0.75       210
 samples avg       0.72      0.95      0.79       210

