## Imports

In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import nltk
from sklearn.feature_extraction import text
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import numpy as np


nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')


ModuleNotFoundError: No module named 'nltk'

## Load Datasets

In [2]:
# Load your data
data = pd.read_json('../data/data.jsonl', lines=True)
test_data = pd.read_json('../data/test_final.jsonl', lines=True)
train_data = pd.read_json('../data/train_final.jsonl', lines=True)
validation_data = pd.read_json('../data/validation_final.jsonl', lines=True)

# Remove duplicates
test_data = test_data.drop_duplicates(subset=['text'])
train_data = train_data.drop_duplicates(subset=['text'])
validation_data = validation_data.drop_duplicates(subset=['text'])


## Pre-processing

In [3]:
my_stop_words = text.ENGLISH_STOP_WORDS
words_to_keep = frozenset(['no', 'couldnt', 'cry', 'not', 'cant', 'cannot', 'nor', 'except', 'nobody',
                           'off', 'but', 'serious', 'enough', 'nothing', 'alone', 'down', 'only', 'without'])
my_stop_words = my_stop_words - words_to_keep

def pre_process_data(dataset):
    # Tokenize
    #dataset['text'] = dataset['text'].apply(word_tokenize)
    # Remove stop words
    dataset['text'] = dataset['text'].apply(lambda x: [word for word in x if word.lower() not in my_stop_words])
    return dataset


train_data = pre_process_data(train_data)
validation_data = pre_process_data(validation_data)
test_data = pre_process_data(test_data)


## Tokenization Using BERT Tokenizer:
BERT tokenizer requires the data to be in a specific format

???????? MAS ESTÁ CERTO ANTES TAMBEM TOKENIZARMOS ????????


* podemos usar o bert-base-uncased e o roberta-based e comparar dps
* maybe usar embeddings tb depois


In [4]:
# Initialize the BERT tokenizer and model
model_name = "bert-base-uncased"

# SUPOSTAMENTE ESTE TB É BOM
# model_name = "roberta-base"


tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)

# Tokenize using BERT tokenizer
def tokenize_data(texts):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=128)

train_encodings = tokenize_data(train_data['text'].apply(lambda x: ' '.join(x)).tolist())
val_encodings = tokenize_data(validation_data['text'].apply(lambda x: ' '.join(x)).tolist())
test_encodings = tokenize_data(test_data['text'].apply(lambda x: ' '.join(x)).tolist())

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Create Dataset Objects

In [5]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, train_data['label'].tolist())
val_dataset = TextDataset(val_encodings, validation_data['label'].tolist())
test_dataset = TextDataset(test_encodings, test_data['label'].tolist())

## Training

#### Function to compute the training metrics

In [6]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


#### Training hyppertunning

In [7]:
import optuna
from transformers import Trainer, TrainingArguments

def model_init():
    return AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6)

def objective(trial):
    # Hyperparameters to tune
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 5)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    
    # Define TrainingArguments with hyperparameters from the trial
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
    )
    
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    
    trainer.train()
    eval_result = trainer.evaluate()
    
    # Optuna aims to minimize the objective, so if accuracy is the metric, return 1 - accuracy
    return 1 - eval_result["eval_accuracy"]


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

print("Best trial:")
trial_ = study.best_trial

print(f"  Value: {trial_.value}")
print("  Params: ")
for key, value in trial_.params.items():
    print(f"    {key}: {value}")


[I 2024-05-13 01:00:14,613] A new study created in memory with name: no-name-45fc02cc-2e55-44fe-a813-1277121754bd
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 38%|███▊      | 500/1332 [10:19:12<5:43:29, 24.77s/it]    

{'loss': 1.7813, 'grad_norm': 2.827212333679199, 'learning_rate': 2.448815774777256e-05, 'epoch': 0.38}


 75%|███████▌  | 1000/1332 [17:21:11<1:42:51, 18.59s/it]   

{'loss': 1.7559, 'grad_norm': 2.259033203125, 'learning_rate': 9.77171679358232e-06, 'epoch': 0.75}


                                                        
100%|██████████| 1332/1332 [20:19:12<00:00, 23.48s/it]

{'eval_loss': 1.7080078125, 'eval_accuracy': 0.2482400889218229, 'eval_f1': 0.21928267292887021, 'eval_precision': 0.2784160033234338, 'eval_recall': 0.2482400889218229, 'eval_runtime': 1645.0227, 'eval_samples_per_second': 3.281, 'eval_steps_per_second': 0.41, 'epoch': 1.0}


100%|██████████| 1332/1332 [20:19:16<00:00, 54.92s/it]


{'train_runtime': 73156.1427, 'train_samples_per_second': 0.582, 'train_steps_per_second': 0.018, 'train_loss': 1.7605178878830001, 'epoch': 1.0}


100%|██████████| 675/675 [33:26<00:00,  2.97s/it]    
[I 2024-05-13 21:53:01,933] Trial 0 finished with value: 0.7517599110781771 and parameters: {'learning_rate': 3.92045987019628e-05, 'num_train_epochs': 1, 'per_device_train_batch_size': 32}. Best is trial 0 with value: 0.7517599110781771.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 19%|█▉        | 500/2663 [11:33:09<8:52:05, 14.76s/it]     

{'loss': 1.801, 'grad_norm': 2.984633684158325, 'learning_rate': 3.0767038573243956e-05, 'epoch': 0.19}


 38%|███▊      | 1000/2663 [13:41:11<6:39:52, 14.43s/it]

{'loss': 1.7963, 'grad_norm': 3.174362897872925, 'learning_rate': 2.3654916850348915e-05, 'epoch': 0.38}


 45%|████▌     | 1209/2663 [18:20:33<306:16:13, 758.30s/it]  

In [None]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
 # Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch", # run validation at the end of each epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
)

print("Training labels range: ", min(train_data['label']), "to", max(train_data['label']))
print("Validation labels range: ", min(validation_data['label']), "to", max(validation_data['label']))
print("Test labels range: ", min(test_data['label']), "to", max(test_data['label']))

print("Training data NaN values:", train_data.isnull().values.any())
print("Validation data NaN values:", validation_data.isnull().values.any())
print("Test data NaN values:", test_data.isnull().values.any())

 # Initialize Trainer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print(os.getenv('CUDA_LAUNCH_BLOCKING'))
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print(trainer)

Training labels range:  0 to 5
Validation labels range:  0 to 5
Test labels range:  0 to 5
Training data NaN values: False
Validation data NaN values: False
Test data NaN values: False
1
<transformers.trainer.Trainer object at 0x00000264BB079790>


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


## Training Evaluation

In [None]:
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)

  0%|          | 0/7989 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  6%|▋         | 500/7989 [03:25<1:13:50,  1.69it/s]

{'loss': 1.7972, 'learning_rate': 1.8748278883464764e-05, 'epoch': 0.19}


 13%|█▎        | 1000/7989 [08:20<1:11:11,  1.64it/s]

{'loss': 1.7642, 'learning_rate': 1.749655776692953e-05, 'epoch': 0.38}


 19%|█▉        | 1500/7989 [13:28<1:00:43,  1.78it/s]

{'loss': 1.6429, 'learning_rate': 1.6244836650394292e-05, 'epoch': 0.56}


 25%|██▌       | 2000/7989 [18:29<1:06:08,  1.51it/s]

{'loss': 1.4076, 'learning_rate': 1.4993115533859057e-05, 'epoch': 0.75}


 31%|███▏      | 2500/7989 [23:40<55:58,  1.63it/s]  

{'loss': 1.1916, 'learning_rate': 1.3741394417323821e-05, 'epoch': 0.94}


                                                   
 33%|███▎      | 2663/7989 [26:33<53:56,  1.65it/s]

{'eval_loss': 1.1756267547607422, 'eval_accuracy': 0.5679881437569471, 'eval_f1': 0.5669536376467866, 'eval_precision': 0.5850901822610035, 'eval_recall': 0.5679881437569471, 'eval_runtime': 73.395, 'eval_samples_per_second': 73.547, 'eval_steps_per_second': 4.605, 'epoch': 1.0}


 38%|███▊      | 3000/7989 [29:55<49:32,  1.68it/s]   

{'loss': 1.0038, 'learning_rate': 1.2489673300788585e-05, 'epoch': 1.13}


 44%|████▍     | 3500/7989 [35:23<49:49,  1.50it/s]  

{'loss': 0.8031, 'learning_rate': 1.123795218425335e-05, 'epoch': 1.31}


 50%|█████     | 4000/7989 [40:58<43:21,  1.53it/s]

{'loss': 0.7058, 'learning_rate': 9.986231067718113e-06, 'epoch': 1.5}


 56%|█████▋    | 4500/7989 [46:34<39:21,  1.48it/s]

{'loss': 0.6119, 'learning_rate': 8.734509951182877e-06, 'epoch': 1.69}


 63%|██████▎   | 5000/7989 [52:16<33:00,  1.51it/s]

{'loss': 0.5418, 'learning_rate': 7.482788834647641e-06, 'epoch': 1.88}


                                                   
 67%|██████▋   | 5326/7989 [57:11<27:35,  1.61it/s]

{'eval_loss': 0.5322026610374451, 'eval_accuracy': 0.8206743238236384, 'eval_f1': 0.8202737266200173, 'eval_precision': 0.8313848720717878, 'eval_recall': 0.8206743238236384, 'eval_runtime': 76.4421, 'eval_samples_per_second': 70.616, 'eval_steps_per_second': 4.422, 'epoch': 2.0}


 69%|██████▉   | 5500/7989 [59:02<25:44,  1.61it/s]   

{'loss': 0.4673, 'learning_rate': 6.231067718112405e-06, 'epoch': 2.07}


 75%|███████▌  | 6000/7989 [1:04:48<28:37,  1.16it/s]

{'loss': 0.4155, 'learning_rate': 4.979346601577169e-06, 'epoch': 2.25}


 81%|████████▏ | 6500/7989 [1:12:50<26:42,  1.08s/it]

{'loss': 0.3866, 'learning_rate': 3.727625485041933e-06, 'epoch': 2.44}


 88%|████████▊ | 7000/7989 [1:23:21<22:31,  1.37s/it]

{'loss': 0.3762, 'learning_rate': 2.475904368506697e-06, 'epoch': 2.63}


 94%|█████████▍| 7500/7989 [1:34:41<11:11,  1.37s/it]

{'loss': 0.3519, 'learning_rate': 1.2241832519714608e-06, 'epoch': 2.82}


                                                     
100%|██████████| 7989/7989 [1:48:27<00:00,  1.35s/it]

{'eval_loss': 0.4559192657470703, 'eval_accuracy': 0.856428306780289, 'eval_f1': 0.857196278947809, 'eval_precision': 0.8695140701219711, 'eval_recall': 0.856428306780289, 'eval_runtime': 158.0496, 'eval_samples_per_second': 34.154, 'eval_steps_per_second': 2.139, 'epoch': 3.0}


100%|██████████| 7989/7989 [1:48:29<00:00,  1.23it/s]


{'train_runtime': 6509.7128, 'train_samples_per_second': 19.635, 'train_steps_per_second': 1.227, 'train_loss': 0.8646236930118513, 'epoch': 3.0}


100%|██████████| 338/338 [02:35<00:00,  2.18it/s]

{'eval_loss': 0.4559192657470703, 'eval_accuracy': 0.856428306780289, 'eval_f1': 0.857196278947809, 'eval_precision': 0.8695140701219711, 'eval_recall': 0.856428306780289, 'eval_runtime': 155.5011, 'eval_samples_per_second': 34.714, 'eval_steps_per_second': 2.174, 'epoch': 3.0}



