In [1]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix
import itertools
from sklearn.metrics import precision_score, recall_score, f1_score

2024-03-25 13:00:51.794243: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-25 13:00:51.796426: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-25 13:00:51.837275: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-25 13:00:51.838042: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
#Hyperparameters

# Name of the pretrained checkpoint handed to the tokenizer and the model.
# Candidate checkpoints noted for this notebook: bert-base-uncased, bert-large-cased
name_model = "bert-base-uncased"

# Optimiser / fit settings
learning_rate = 5e-5
batch_size = 16
num_epochs = 10

# How many times the training loop is repeated
num_trains = 15

In [3]:
#Get documents saved in new categories and their labels
base_folder = 'new_categories'

files = []      # raw text of every document
filenames = []  # path each document was read from
labels = []     # integer class of each document

#Search all files in folders
# os.listdir() returns entries in arbitrary order; sorting keeps the document
# order (and therefore the seeded train/validation split) stable between runs.
for sub_folder in sorted(os.listdir(base_folder)):
    folder_path = os.path.join(base_folder, sub_folder)
    # Stray files next to the category folders would make os.listdir() raise.
    if not os.path.isdir(folder_path):
        continue
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.txt'):
            continue
        file_path = os.path.join(folder_path, file_name)
        # Read before appending anything so the three lists stay aligned
        # even if opening or reading the file fails.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            content = handle.read()
        filenames.append(file_path)
        # The class label is taken from the last character of the folder name.
        labels.append(int(sub_folder[-1]))
        files.append(content)

In [4]:
#Tokenize
tokenizer = BertTokenizer.from_pretrained(name_model)
token_files = tokenizer(files, truncation=True, padding=True, return_tensors='tf')

# Split token ids, attention masks and labels in one call so their rows stay
# aligned. The attention mask must travel with the ids: the batch is padded,
# and without the mask the model would attend to the padding positions.
(train_ids, val_ids,
 train_mask, val_mask,
 train_labels, val_labels) = train_test_split(
    token_files['input_ids'].numpy(),
    token_files['attention_mask'].numpy(),
    labels,
    test_size=0.2,
    random_state=42,
)

# Dict inputs keyed by the model's argument names; these are passed as-is to
# model.fit / model.predict in the following cells.
train_inputs = {
    'input_ids': tf.convert_to_tensor(train_ids),
    'attention_mask': tf.convert_to_tensor(train_mask),
}
val_inputs = {
    'input_ids': tf.convert_to_tensor(val_ids),
    'attention_mask': tf.convert_to_tensor(val_mask),
}
train_labels = tf.convert_to_tensor(train_labels)
val_labels = tf.convert_to_tensor(val_labels)

2024-03-25 13:01:06.292671: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [5]:
# Fine-tune a fresh model num_trains times and append the validation metrics
# of every run to a per-model results file.
results_path = 'Results/results_'+name_model+'.txt'
# Create the output folder up front: otherwise a missing directory would only
# surface after a complete training run, when the metrics are written.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

for num in range(num_trains):

    # Release the Keras global state held by the previous run's model so
    # memory does not accumulate across repeated trainings.
    tf.keras.backend.clear_session()

    # NOTE(review): num_labels is fixed at 5 — presumably the folder-derived
    # labels are 0..4; confirm against the data folders.
    model = TFBertForSequenceClassification.from_pretrained(name_model, num_labels=5)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    # The model outputs logits, hence from_logits=True.
    model.compile(optimizer=optimizer,
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    history = model.fit(train_inputs, train_labels, validation_data=(val_inputs, val_labels), batch_size=batch_size, epochs=num_epochs)

    # Predicted class = index of the largest logit per validation sample.
    val_predictions = model.predict(val_inputs)
    val_predictions = np.argmax(val_predictions.logits, axis=1)

    precision = precision_score(val_labels, val_predictions, average='weighted')
    recall = recall_score(val_labels, val_predictions, average='weighted')
    f1 = f1_score(val_labels, val_predictions, average='weighted')
    # Validation accuracy reported by Keras for the final epoch.
    acc = history.history['val_accuracy'][-1]

    # Append so the results of all runs end up in the same file.
    with open(results_path, 'a') as results_file:
        results_file.write(f'Accuracy: {acc}\n')
        results_file.write(f'Precision: {precision}\n')
        results_file.write(f'Recall: {recall}\n')
        results_file.write(f'F1: {f1}\n')

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
 25/112 [=====>........................] - ETA: 6:26 - loss: 0.0343 - accuracy: 0.9900

KeyboardInterrupt: 

In [None]:
#Predict on validation set
val_predictions = model.predict(val_inputs)
# Predicted class = index of the largest logit per sample.
val_predictions = np.argmax(val_predictions.logits, axis=1)

# Compute confusion matrix
# Pass the full, sorted class list explicitly so the matrix always has one
# row/column per class found in the data folders, even when a class is absent
# from the validation labels and predictions.
cm = confusion_matrix(val_labels, val_predictions, labels=sorted(set(labels)))

In [None]:
#Plot confusion matrix
# Sort the class ids: a set has no guaranteed iteration order, whereas the
# confusion matrix rows/columns are laid out in sorted label order.
class_names = sorted(set(labels))
tick_marks = np.arange(len(class_names))

plt.figure(figsize=(8, 6))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix')
plt.colorbar()
plt.xticks(tick_marks, class_names, rotation=45)
plt.yticks(tick_marks, class_names)

# Write every count into its cell; use white text on the darker cells
# (counts above half of the maximum) so the numbers stay readable.
fmt = 'd'
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, format(cm[i, j], fmt),
             horizontalalignment="center",
             color="white" if cm[i, j] > thresh else "black")

plt.ylabel('True label')
plt.xlabel('Predicted label')
# Run tight_layout after the axis labels exist so they are part of the layout.
plt.tight_layout()
plt.show()

In [None]:
#Plot training loss and accuracy
# One entry per subplot: position, the two history keys to draw, the legend
# labels for them, the subplot title and the y-axis label.
curve_panels = (
    (1, 'loss', 'val_loss',
     'Training Loss', 'Validation Loss',
     'Training and Validation Loss', 'Loss'),
    (2, 'accuracy', 'val_accuracy',
     'Training Accuracy', 'Validation Accuracy',
     'Training and Validation Accuracy', 'Accuracy'),
)

plt.figure(figsize=(12, 5))

for position, train_key, val_key, train_label, val_label, panel_title, y_label in curve_panels:
    plt.subplot(1, 2, position)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(panel_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()

plt.show()

In [None]:
# Weighted-average precision, recall and F1 of the current validation predictions
precision, recall, f1 = (
    scorer(val_labels, val_predictions, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

In [None]:
# Save the model currently bound to `model` under a folder named after the checkpoint
model_dir = "Models/" + name_model + "_model"
model.save(model_dir)