In [1]:
!pip install transformers
!pip install tensorflow
!pip install scikit-learn
!pip install matplotlib

[0m

In [2]:
from transformers import XLNetTokenizer, TFXLNetForSequenceClassification
import tensorflow as tf
from sklearn.model_selection import train_test_split
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix
import itertools
from sklearn.metrics import precision_score, recall_score, f1_score

2024-04-10 12:03:05.319255: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-10 12:03:05.322302: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-10 12:03:05.375001: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
#Hyperparameters
# Number of epochs passed to model.fit for each training run.
num_epochs = 10
# Learning rate handed to the Adam optimizer.
learning_rate = 5e-5
# Mini-batch size passed to model.fit.
batch_size = 16
# Number of times the training loop builds and fits a fresh model.
num_trains = 15
# Pretrained checkpoint name used for the tokenizer, the model, the results
# file name and the saved-model path.
# Options noted by the author: xlnet-base-cased, xlnet-large-cased
name_model = "xlnet-base-cased"

In [4]:
#Get documents saved in new categories and their labels
base_folder = 'new_categories'

# Three parallel lists: files[i] is the text of filenames[i], labelled labels[i].
files = []
filenames = []
labels = []

#Search all files in folders
# os.listdir returns entries in arbitrary order; sorting makes the document
# order (and therefore the seeded train/validation split) reproducible.
for sub_folder in sorted(os.listdir(base_folder)):
    folder_path = os.path.join(base_folder, sub_folder)
    # Skip stray files sitting next to the category folders; listing them
    # as directories would raise.
    if not os.path.isdir(folder_path):
        continue
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.txt'):
            continue
        file_path = os.path.join(folder_path, file_name)
        # The label is the last character of the folder name, read as a digit.
        label = int(sub_folder[-1])
        # Undecodable bytes are dropped rather than raising.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            content = handle.read()
        # Append only after the read succeeded so the three lists stay aligned.
        filenames.append(file_path)
        labels.append(label)
        files.append(content)

In [5]:
# Sanity check: show how many documents were read, followed by one sample text
print(len(files), files[20], sep='\n')

2225
Europe blames US over weak dollar

European leaders have openly blamed the US for the sharp rise in the value of the euro.

US officials were talking up the dollar, they said, but failing to take action to back up their words. Meeting in Brussels, finance ministers of the 12 eurozone countries voiced their concern that the rise of the european currency was harming exports. The dollar is within touching distance of an all-time low reached earlier in November. At 0619 GMT on Tuesday, the dollar was up slightly at just above $1.29 to the euro, and buying 105.6 yen in Tokyo. It rallied briefly on Monday amid signs that oil prices are easing.

But analysts said the respite was likely to be only temporary. The European ministers' comments, said Junya Tanase of JPMorgan Chase bank in Tokyo, were "generally too weak to produce a market reaction".

Still, by the standards of diplomacy the European ministers were forthright. Nicolas Sarkozy of France said he and his colleagues were unanimou

In [8]:
#Tokenize
tokenizer = XLNetTokenizer.from_pretrained(name_model)
# Maximum sequence length in tokens; longer documents are truncated to it.
max_length = 512
token_files = tokenizer(files, truncation=True, padding=True, max_length=max_length, return_tensors='tf')

# NOTE(review): only 'input_ids' is carried forward below; the tokenizer's
# 'attention_mask' is discarded, so the model presumably attends to padding
# tokens as well — confirm whether that is intended.

#Divide data into test and train
# 80/20 split with a fixed seed so repeated runs use the same partition.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    token_files['input_ids'].numpy(), labels, test_size=0.2, random_state=42)

#Transform them to tensors
train_inputs = tf.convert_to_tensor(train_inputs)
val_inputs = tf.convert_to_tensor(val_inputs)
train_labels = tf.convert_to_tensor(train_labels)
val_labels = tf.convert_to_tensor(val_labels)

tf.Tensor([1780  512], shape=(2,), dtype=int32)


In [7]:
# Size of the classification head. The head needs one output per class, not
# one per document; max id + 1 covers every label whether the folder digits
# start at 0 or at 1.
num_classes = max(labels) + 1

# Metrics are appended to a file under Results/; create the folder up front so
# a finished training run is not lost to a missing directory.
os.makedirs('Results', exist_ok=True)

# Repeat the whole fine-tuning num_trains times, each from the pretrained
# checkpoint, appending the validation metrics of every run to the same file.
for num in range(num_trains):

    # Release graph/state left over from the previous run so memory does not
    # grow across repeats. The model from the final run stays usable afterwards.
    tf.keras.backend.clear_session()

    model = TFXLNetForSequenceClassification.from_pretrained(name_model, num_labels=num_classes)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    
    # The model outputs raw logits, hence from_logits=True.
    model.compile(optimizer=optimizer, 
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
                  metrics=['accuracy'])

    history = model.fit(train_inputs, train_labels, validation_data=(val_inputs, val_labels), batch_size=batch_size, epochs=num_epochs)

    # Predicted class = index of the largest logit.
    val_predictions = model.predict(val_inputs)
    val_predictions = np.argmax(val_predictions.logits, axis=1)

    precision = precision_score(val_labels, val_predictions, average='weighted')
    recall = recall_score(val_labels, val_predictions, average='weighted')
    f1 = f1_score(val_labels, val_predictions, average='weighted')
    # Validation accuracy after the last epoch of this run.
    acc = history.history['val_accuracy'][-1]

    with open('Results/results_'+name_model+'.txt', 'a') as results_file:
        results_file.write(f'Accuracy: {acc}\n')
        results_file.write(f'Precision: {precision}\n')
        results_file.write(f'Recall: {recall}\n')
        results_file.write(f'F1: {f1}\n')

Some layers from the model checkpoint at xlnet-base-cased were not used when initializing TFXLNetForSequenceClassification: ['lm_loss']
- This IS expected if you are initializing TFXLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFXLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary', 'logits_proj']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10


KeyboardInterrupt: 

In [None]:
# Run the current model on the validation inputs and take, for every
# document, the index of the largest logit as the predicted class
val_predictions = np.argmax(model.predict(val_inputs).logits, axis=1)

# Confusion matrix of true vs. predicted validation labels
cm = confusion_matrix(val_labels, val_predictions)

In [None]:
#Plot confusion matrix
# confusion_matrix arranges rows/columns by sorted label value, so the tick
# labels must be sorted too; a bare set has no guaranteed iteration order.
class_names = sorted(set(labels))

plt.figure(figsize=(8, 6))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix')
plt.colorbar()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names, rotation=45)
plt.yticks(tick_marks, class_names)

# Write the count into every cell, switching to white text on dark cells.
fmt = 'd'
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, format(cm[i, j], fmt),
             horizontalalignment="center",
             color="white" if cm[i, j] > thresh else "black")

plt.ylabel('True label')
plt.xlabel('Predicted label')
# Lay out last so the axis labels added above are taken into account.
plt.tight_layout()
plt.show()

In [None]:
# Side-by-side curves from the most recent fit: loss on the left, accuracy on the right
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))
curves = history.history

loss_ax.plot(curves['loss'], label='Training Loss')
loss_ax.plot(curves['val_loss'], label='Validation Loss')
loss_ax.set_title('Training and Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

acc_ax.plot(curves['accuracy'], label='Training Accuracy')
acc_ax.plot(curves['val_accuracy'], label='Validation Accuracy')
acc_ax.set_title('Training and Validation Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

plt.show()

In [None]:
# Weighted-average precision, recall and F1 of the validation predictions
precision, recall, f1 = (
    metric(val_labels, val_predictions, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print(f'Precision: {precision}', f'Recall: {recall}', f'F1 Score: {f1}', sep='\n')

In [None]:
# Persist the current model under Models/, named after the checkpoint
model.save(f"Models/{name_model}_model")