In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import itertools
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import time
from datetime import datetime
import csv
import re

In [None]:
#Hyperparameters

#Pretrained checkpoint to fine-tune (options noted so far: bert-base-uncased, bert-large-cased)
name_model = "bert-base-uncased"

#Data settings
label_modularity = "NewClass"  #name of the dataset column that holds the class label
validation_size = 0.1          #passed to train_test_split as test_size
crop_categories = True         #not referenced by the other cells of this notebook

#Optimisation settings
learning_rate = 5e-5
batch_size = 16
num_epochs = 1
num_trains = 15                #not referenced by the other cells of this notebook

In [None]:
#Load the labelled documents and hold out a validation split
name_to_save = "nosub"
filename = 'FinalDataset/polished_dataset_nosub_67.csv'

data = pd.read_csv(filename)

#Fixed random_state so the same rows land in each split on every run
train_data, validation_data = train_test_split(
    data,
    test_size=validation_size,
    random_state=42,
)

In [None]:
#Distinct class labels of the training split and how many there are
label_column = train_data[label_modularity]
original_labels = label_column.unique().tolist()  #order of first appearance, not sorted
num_unique_labels = label_column.nunique()
num_labels = num_unique_labels
print(num_labels)

In [None]:
# Number of rows (one text per row) in the training split
num_texts = len(train_data)
print(num_texts)

In [None]:
#Count whitespace-separated words over all training texts
all_text = ' '.join(train_data['Text'].to_numpy())
words = all_text.split()
num_words = len(words)
print(num_words)

In [None]:
#Start the tokenization timer
#Wall-clock timestamp from time.time(); a later cell subtracts it from token_finish
token_start = time.time()

In [None]:
#Tokenize the training texts: sequences longer than 128 tokens are truncated,
#shorter ones are padded
tokenizer = BertTokenizer.from_pretrained(name_model)
train_encodings = tokenizer(list(train_data['Text']), truncation=True, padding=True, max_length=128)

#Fit the label encoder on the training labels.
#The encoded ids are kept in `labels` only; the DataFrame column is left untouched so
#that re-running this cell does not re-fit the encoder on already-encoded integers
#(which would replace the class names in label_encoder.classes_ with numbers).
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(train_data[label_modularity]).tolist()

#Pair every tokenizer output (as a dict of features) with its integer label
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    labels
))

In [None]:
#Tokenize the validation texts with the same settings used for the training texts
val_encodings = tokenizer(list(validation_data['Text']), truncation=True, padding=True, max_length=128)

print(tf.shape(val_encodings['input_ids']))

#Encode validation labels with the encoder that was fitted on the training labels.
#transform() (not fit_transform()) keeps the label -> id mapping identical to training;
#it raises ValueError if the validation split contains a label never seen in training.
#The DataFrame column is not overwritten so the cell can be re-run safely.
val_labels = label_encoder.transform(validation_data[label_modularity]).tolist()

#Keep every tokenizer output (not just input_ids) so that padding positions are
#masked during validation/prediction exactly as they are during training
val_encodings = {key: tf.convert_to_tensor(value) for key, value in val_encodings.items()}
val_labels = tf.convert_to_tensor(val_labels)

In [None]:
#Stop the tokenization timer and report the elapsed seconds
token_finish = time.time()
token_elapsed = token_finish - token_start
print(token_elapsed)

In [None]:
#Start the training timer; a later cell subtracts this from train_finish
train_start = time.time()

In [None]:
#Load the pretrained checkpoint with a classification head sized to the number of labels
model = TFBertForSequenceClassification.from_pretrained(name_model, num_labels=num_labels)

#from_logits=True: the loss is computed on the raw logits
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

#Fine-tune on the batched training set, evaluating on the validation split each epoch
train_batches = train_dataset.batch(batch_size)
history = model.fit(
    train_batches,
    validation_data=(val_encodings, val_labels),
    epochs=num_epochs,
)

In [None]:
#Stop the training timer and report the elapsed seconds
train_finish = time.time()
train_elapsed = train_finish - train_start
print(train_elapsed)

In [None]:
#Start the validation timer; a later cell subtracts this from val_finish
val_start = time.time()

In [None]:
#Predict a class id for every validation input: argmax over the returned logits
val_predictions = model.predict(val_encodings)
val_predictions = np.argmax(val_predictions.logits, axis=1)

#Weighted-average precision / recall / F1 of the predictions against the true ids
precision = precision_score(val_labels, val_predictions, average='weighted')
recall = recall_score(val_labels, val_predictions, average='weighted')
f1 = f1_score(val_labels, val_predictions, average='weighted')
#Validation accuracy recorded by model.fit for the last epoch
acc = history.history['val_accuracy'][-1]

print("Accuracy: "+str(acc))
print("Precision: "+str(precision))
print("Recall: "+str(recall))
print("F1: "+str(f1))

#Append the metrics to the per-model results file.
#The target folder is created first so a fresh checkout does not fail with FileNotFoundError.
results_path = 'Results/results_2Data_'+name_model+'.txt'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with open(results_path, 'a') as file:
    file.write(f'Accuracy: {acc}\n')
    file.write(f'Precision: {precision}\n')
    file.write(f'Recall: {recall}\n')
    file.write(f'F1: {f1}\n')

In [None]:
#Stop the validation timer and report the elapsed seconds
val_finish = time.time()
val_elapsed = val_finish - val_start
print(val_elapsed)

In [None]:
#Save results
#Appends one row per run to a shared CSV; the header is written only when the file is
#new or empty.
csv_file = "Results/ModelResults.csv"
current_date = datetime.now().strftime('%Y-%m-%d')

#Header matching, position by position, the row written below
column_names = [
    "Model", "NumLabels", "NumTexts", "NumWords",
    "Accuracy", "Precision", "F1", "Recall",
    "TrainTime", "ValTime", "Date", "Dataset",
]

#Make sure the output folder exists before opening the file in append mode
os.makedirs(os.path.dirname(csv_file), exist_ok=True)
write_header = not os.path.exists(csv_file) or os.path.getsize(csv_file) == 0

with open(csv_file, mode='a', newline='') as file:
    writer = csv.writer(file)

    if write_header:
        writer.writerow(column_names)

    writer.writerow(["BERT", num_labels, num_texts, num_words, acc, precision, f1, recall, train_finish - train_start, val_finish - val_start, current_date, name_to_save])

In [None]:
#Confusion matrix over ALL class ids (0 .. num_labels-1).
#Passing labels= keeps the matrix num_labels x num_labels even when a class is absent
#from both the validation labels and the predictions, so row/column i always refers to
#encoded class i.
class_ids = np.arange(num_labels)
conf_matrix = confusion_matrix(val_labels, val_predictions, labels=class_ids)
print("Confusion Matrix:")
print(conf_matrix)

#Plot confusion matrix
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix')
plt.colorbar()
tick_marks = class_ids
plt.xticks(tick_marks, class_ids, rotation=45)
plt.yticks(tick_marks, class_ids)

#Write each count inside its cell; white text on dark cells, black on light ones
fmt = 'd'
thresh = conf_matrix.max() / 2.
for i, j in itertools.product(range(conf_matrix.shape[0]), range(conf_matrix.shape[1])):
    plt.text(j, i, format(conf_matrix[i, j], fmt),
             horizontalalignment="center",
             color="white" if conf_matrix[i, j] > thresh else "black")

plt.ylabel('True label')
plt.xlabel('Predicted label')
#Called after the axis labels are set so they are included in the layout computation
plt.tight_layout()
plt.show()

In [None]:
#Plot training loss and accuracy
#Two side-by-side panels, each showing the training and validation curve per epoch
fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 5))

panels = [
    (loss_ax, 'loss', 'Training Loss', 'val_loss', 'Validation Loss',
     'Training and Validation Loss', 'Loss'),
    (accuracy_ax, 'accuracy', 'Training Accuracy', 'val_accuracy', 'Validation Accuracy',
     'Training and Validation Accuracy', 'Accuracy'),
]

for panel_ax, train_key, train_label, val_key, val_label, panel_title, y_label in panels:
    panel_ax.plot(history.history[train_key], label=train_label)
    panel_ax.plot(history.history[val_key], label=val_label)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(y_label)
    panel_ax.legend()

plt.show()

In [None]:
#Get accuracy of every class
#Row i of conf_matrix holds the validation samples whose true class id is i; the
#diagonal entry is how many of them were predicted correctly.
class_accuracy = {}
for class_index, class_name in enumerate(label_encoder.classes_):
    total = conf_matrix[class_index].sum()
    correct = conf_matrix[class_index][class_index]
    #Classes with no validation samples get 0.0 instead of a division by zero
    class_accuracy[class_name] = correct / total if total > 0 else 0.0

#Best-performing classes first
sorted_class_accuracy = sorted(class_accuracy.items(), key=lambda x: x[1], reverse=True)

print("Accuracy by class:")
#The loop variable is deliberately NOT called `acc`: that name holds the overall
#validation accuracy written to the results files and must not be overwritten here
for class_name, class_acc in sorted_class_accuracy:
    print(f"{class_name}: {class_acc}")

# General accuracy
accuracy = accuracy_score(val_labels, val_predictions)
print("General accuracy:", accuracy)

In [None]:
#Confusion matrix with class names on the axes and a thin grid around every cell
cmap=plt.cm.Blues

plt.figure(figsize=(14, 10))
plt.title("")

plt.imshow(conf_matrix, interpolation='nearest', cmap=cmap)
plt.colorbar()
tick_marks = np.arange(num_labels)
#Rows/columns of the matrix follow the encoded class ids, and LabelEncoder assigns ids
#in sorted label order. original_labels is in order of first appearance, so it is
#sorted here to line the names up with the matrix.
tick_names = sorted(original_labels)
plt.xticks(tick_marks, tick_names, rotation=90)
plt.yticks(tick_marks, tick_names)

#Outline every cell (cells are centred on integer coordinates, hence the -0.5 offset)
for i in range(num_labels):
    for j in range(num_labels):
        plt.gca().add_patch(plt.Rectangle((j - 0.5, i - 0.5), 1, 1, color='black', fill=None, linewidth=0.5))

plt.xlabel('Etiqueta predicha')
plt.ylabel('Etiqueta real')
plt.show()

In [None]:
#Persist the fine-tuned model under Models/, named after the checkpoint it started from
model_output_path = f"Models/{name_model}_model"
model.save(model_output_path)