In [323]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
from tensorflow.keras.metrics import Precision, Recall

In [262]:
def read_and_split_data(return_one_hot = True, random_state=101):
    df = pd.read_csv('/content/Problem_Dataset.csv')

    if return_one_hot:
        df_ = pd.DataFrame()
        df_["Obs"] = df["Obs"]
        df_["Type"] = [arr for arr in df[df["Type"].unique()].values]
    else:
        df_ = df[["Obs", "Type"]]

    train_data, val_data = train_test_split(df_, test_size=0.2, random_state=random_state)

    return train_data, val_data

def tokenize_and_fit(docs):
    # Tokenize your text data
    tokenizer = keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(docs)

    vocab_size = len(tokenizer.word_index) + 1
    max_len = max(len(sentence.split()) for sentence in docs)

    return tokenizer, max_len


def encode_docs(tokenizer, max_length, docs):
    # integer encode
    encoded = tokenizer.texts_to_sequences(docs)
    # pad sequences
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')

    return padded

def create_multi_label_model(vocab_size, max_len, class_to_index: dict, embedding_dim=50, ensemble=False):

    if ensemble:
        model = keras.Sequential([
            layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
            layers.Flatten(),
            layers.Dense(128, activation='relu'),
            layers.Dense(1, activation='sigmoid')
            ])
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    else:
        num_classes = len(class_to_index.keys())

        model = keras.Sequential([
            layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
            layers.Flatten(),
            layers.Dense(128, activation='relu'),
            layers.Dense(num_classes, activation='sigmoid')  # Use sigmoid activation for multi-label classification
        ])
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

## First Pass (Ensemble)

In [237]:
df = pd.read_csv('/content/Problem_Dataset.csv')

In [248]:
class_to_index = dict(zip(df['Type'].unique(), range(len(df['Type'].unique()))))

In [238]:
ensemble_train, ensemble_val = read_and_split_data(return_one_hot=False)

In [251]:
# Tokenize your text data
tokenizer, max_len = tokenize_and_fit(ensemble_train['Obs'])
vocab_size = len(tokenizer.word_index) + 1


In [261]:
# Train a model for each label
models = {}
for label in df['Type'].unique():

    X_data = encode_docs(tokenizer, max_len, ensemble_train["Obs"])
    binary_labels = (ensemble_train['Type'] == label).astype(int)
    y_data = np.asarray(binary_labels)

    model = create_multi_label_model(vocab_size=vocab_size,
                                     ensemble=True,
                                     class_to_index=class_to_index,
                                     max_len=max_len)
    model.fit(X_data, y_data, epochs=5, batch_size=32, validation_split=0.2)
    models[label] = model



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [292]:
def save_model_s(models, name_prefix):
    if type(models) == dict:
        for l,m in zip(models.keys(),models.values()):
            m.save(f'{name_prefix}_{l}.h5')
    else:
        models.save(f"{name_prefix}.h5")


In [296]:
def load_model_s(name_prefix, class_to_index, ensemble=True):
    from tensorflow.keras.models import load_model

    if ensemble:
        models={}
        for l in class_to_index.keys():
            m = load_model(f"{name_prefix}_{l}.h5")
            models[l] = m
        return models
    else:
        model = load_model(f"{name_prefix}.h5")
        return model

In [None]:
save_model_s(models, "ensemble_model")

In [320]:
# Load all the models into an ensemble

models = load_model_s(ensemble=True, name_prefix="ensemble_model", class_to_index=class_to_index)

def get_predictions(model, val_data, ensemble=True):

    if ensemble:
        ensemble_predictions = []
        for model_label, model in models.items():
            val_predictions = model.predict(encode_docs(tokenizer, max_len, val_data["Obs"]))
            ensemble_predictions.append(val_predictions)

        # Create an array of predictions for each input text
        ensemble_predictions = np.array(ensemble_predictions).squeeze().T
        # Threshold ensemble predictions for binary classification
        binary_ensemble_predictions = (ensemble_predictions > 0.5).astype(int)

        y_hat = np.argmax((ensemble_predictions > 0.5).astype(int), axis=1)

        return y_hat
    else:

        multiclass_predictions = multi_label_model.predict(encode_docs(tokenizer, max_len, val_data['Obs']))
        y_hat = (multiclass_predictions > 0.5).astype(int)


        return y_hat

In [309]:
def evaluate_model(val_data, y_hat, ensemble=True):

    if ensemble:
        y_val = val_data["Type"].apply(lambda x: list(models.keys()).index(x)).values
    else:
        # Predictions on validation set
        y_val = np.array([np.asarray(x, dtype = np.float64) for x in val_data["Type"]])


    # Obtain precision, recall, F1-score, and print the classification report
    print("Ensemble Classification Report:")
    print(classification_report(y_val, y_hat, target_names=models.keys()))

In [303]:
y_hat = get_predictions(models, ensemble_val)




In [304]:
evaluate_model(ensemble_val, y_hat, ensemble=True)

Ensemble Classification Report:
              precision    recall  f1-score   support

          B3       0.28      0.94      0.44        33
          B4       0.88      0.20      0.33        35
          A3       0.69      0.35      0.47        31
          B1       0.96      0.77      0.85        30
          B2       0.96      0.81      0.88        27
          A1       0.88      0.25      0.39        28
          A2       0.64      0.54      0.58        26

    accuracy                           0.55       210
   macro avg       0.75      0.55      0.56       210
weighted avg       0.75      0.55      0.55       210



## Second Pass (Multi-label)

In [287]:
multiclass_train, multiclass_val = read_and_split_data(return_one_hot=True)

In [None]:
# Tokenize your text data
tokenizer, max_len = tokenize_and_fit(multiclass_train["Obs"])
vocab_size = len(tokenizer.word_index) + 1



In [291]:
# Train a multi-label model
X_data = encode_docs(tokenizer, max_len, multiclass_train["Obs"])
binary_labels = [np.asarray(x, dtype = np.float64) for x in multiclass_train["Type"]]
y_data = np.array(binary_labels)

multi_label_model = create_multi_label_model(vocab_size=vocab_size,
                                             max_len=max_len,
                                             class_to_index=class_to_index,
                                             ensemble=False)
multi_label_model.fit(X_data, y_data, epochs=15, batch_size=32, validation_split=0.2)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x7c9891d4e1d0>

In [293]:
save_model_s(multi_label_model, "multilabel_model")

  saving_api.save_model(


In [321]:
# Load the model
model = load_model_s(ensemble=False, name_prefix='multilabel_model', class_to_index=class_to_index)

y_hat = get_predictions(model, multiclass_val, ensemble=False)



In [322]:
evaluate_model(y_hat=y_hat, val_data=multiclass_val, ensemble=False)

Ensemble Classification Report:
              precision    recall  f1-score   support

          B3       0.94      0.48      0.64        33
          B4       0.84      0.46      0.59        35
          A3       0.73      0.26      0.38        31
          B1       0.95      0.63      0.76        30
          B2       0.94      0.59      0.73        27
          A1       0.67      0.36      0.47        28
          A2       0.68      0.50      0.58        26

   micro avg       0.83      0.47      0.60       210
   macro avg       0.82      0.47      0.59       210
weighted avg       0.83      0.47      0.59       210
 samples avg       0.47      0.47      0.47       210



  _warn_prf(average, modifier, msg_start, len(result))


## Third Pass (BERT)

In [None]:
# !pip install transformers

In [324]:
bert_train, bert_val = read_and_split_data(return_one_hot=True)

In [None]:


# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'  # You can try other BERT models as well
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=len(df['Type'].unique()))


train_inputs = tokenizer.batch_encode_plus(list(train_data['Obs']), add_special_tokens=True, padding=True, truncation=True, return_tensors='np', max_length=tokenizer.model_max_length)
val_inputs = tokenizer.batch_encode_plus(list(val_data['Obs']), add_special_tokens=True, padding=True, truncation=True, return_tensors='np', max_length=tokenizer.model_max_length)


train_labels = tf.convert_to_tensor([label for label in train_data["Type"]])
val_labels = tf.convert_to_tensor([label for label in val_data["Type"]])




In [None]:

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=[Precision(), Recall()])

history = model.fit(dict(train_inputs), train_labels,
                    epochs=10,
                    batch_size=8,
                    validation_data=(dict(val_inputs), val_labels)
          )

In [None]:

# Save the fine-tuned model
model.save_pretrained('/content/fine_tuned_bert_model_tf')

# Load the fine-tuned model
fine_tuned_model_tf = TFBertForSequenceClassification.from_pretrained('/content/fine_tuned_bert_model_tf')



In [None]:
# Evaluation on the validation set
val_preds_tf = fine_tuned_model_tf.predict(dict(val_inputs))['logits']
val_preds_tf = tf.math.sigmoid(val_preds_tf)

# Threshold predictions for binary classification
binary_val_preds_tf = (val_preds_tf > 0.5).numpy()




In [None]:


# Obtain precision, recall, F1-score, and print the classification report
print("Fine-tuned BERT Classification Report:")
print(classification_report(val_labels.numpy(), binary_val_preds_tf, target_names=df['Type'].explode().unique()))


Fine-tuned BERT Classification Report:
              precision    recall  f1-score   support

          B3       0.70      0.97      0.81        36
          B4       0.61      1.00      0.76        27
          A3       0.65      0.90      0.76        31
          B1       0.79      0.94      0.86        35
          B2       0.48      1.00      0.65        26
          A1       0.44      0.93      0.60        29
          A2       0.67      0.92      0.77        26

   micro avg       0.61      0.95      0.74       210
   macro avg       0.62      0.95      0.74       210
weighted avg       0.63      0.95      0.75       210
 samples avg       0.72      0.95      0.79       210

