In [None]:
import pandas as pd

# Load the training and test CSV files, dropping every row that has a missing value.
train_df, test_df = (
    pd.read_csv(csv_path).dropna()
    for csv_path in ("train_critic.csv", "test_critic.csv")
)

In [None]:
from transformers import XLNetTokenizer, Trainer, Trainer, TrainingArguments, XLNetForSequenceClassification
import torch

NUM_LABELS = 2  # Number of target classes
SEED = 42  # Seed for PyTorch's global RNG
model_name = "xlnet-base-cased"  # Pretrained checkpoint; pick another one if needed

# Seed PyTorch *before* the model is instantiated: any weights that are not
# present in the checkpoint (e.g. a freshly created classification head) are
# randomly initialised during from_pretrained, so seeding afterwards would leave
# that initialisation non-reproducible.
torch.manual_seed(SEED)

tokenizer = XLNetTokenizer.from_pretrained(model_name)  # Load the tokenizer
model = XLNetForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS)  # Load the classification model

In [None]:
from datasets import Dataset
import tensorflow as tf
import random
import numpy as np
# Fix the TensorFlow, NumPy and stdlib random seeds.
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Wrap the training frame as a `datasets.Dataset` and hold out 20% of it.
train_data_set = Dataset.from_pandas(train_df)
train_val_set = train_data_set.train_test_split(test_size=0.2)

print(train_val_set)

# Pull the text and label columns out of each split.
train_split, val_split = train_val_set["train"], train_val_set["test"]
x_train, y_train = train_split["Review"], train_split["Sentiment"]
x_val, y_val = val_split["Review"], val_split["Sentiment"]

# Test data: the raw review column plus a Dataset view of the full frame.
x_test = test_df["Review"]
test_set = Dataset.from_pandas(test_df)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_loss_accuracy(df):
    """Plot the loss curves and, when available, the validation accuracy curve.

    The left panel shows the ``loss`` and ``eval_loss`` columns; the right panel
    shows ``eval_balanced_accuracy`` if that column exists in ``df`` (otherwise
    the right panel is hidden). The x axis of both panels is the DataFrame index.

    Args:
        df: DataFrame containing at least the ``loss`` and ``eval_loss`` columns.
    """
    fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(16, 6))

    sns.lineplot(data=df[['loss', 'eval_loss']], ax=ax_loss)
    ax_loss.set_title("Training vs. validation loss")
    ax_loss.set_xlabel("Log index")
    ax_loss.set_ylabel("Loss")

    if 'eval_balanced_accuracy' in df.columns:
        sns.lineplot(data=df[['eval_balanced_accuracy']], ax=ax_acc)
        ax_acc.set_title("Validation balanced accuracy")
        ax_acc.set_xlabel("Log index")
        ax_acc.set_ylabel("Balanced accuracy")
    else:
        # Nothing to draw on the right: hide the empty axes instead of showing a blank frame.
        ax_acc.set_visible(False)

    plt.show()

In [None]:
from sklearn import metrics

def plot_confusion_matrix(dataset, predictions, labels=None):
    """Print accuracy metrics and plot the row-normalised confusion matrix.

    Args:
        dataset: Mapping-like object; its ``'Sentiment'`` column supplies the true
            labels when ``labels`` is not given.
        predictions: Iterable of dicts with a ``'label'`` key. ``'LABEL_0'`` is
            mapped to class 0 and any other value to class 1.
        labels: Optional true labels. Defaults to ``dataset['Sentiment']``.
            Assumes labels are encoded as 0/1 to match the mapping above —
            TODO confirm against the CSV files.
    """
    # Honour an explicitly supplied `labels`; only fall back to the dataset column.
    if labels is None:
        labels = dataset['Sentiment']
    preds = [0 if pred['label'] == 'LABEL_0' else 1 for pred in predictions]

    # Use the `metrics` namespace imported in this cell, so the function does not
    # depend on a `balanced_accuracy_score` import from a later cell.
    print('balanced Accuracy:', metrics.balanced_accuracy_score(labels, preds))
    print('accuracy', metrics.accuracy_score(labels, preds))

    # Confusion matrix, normalised per true class (each row sums to 1).
    c = metrics.confusion_matrix(labels, preds)
    r = c / c.sum(axis=1, keepdims=True)

    plt.figure(figsize=(10, 8))
    sns.heatmap(r, annot=True, fmt=".2f", cmap="Blues",  annot_kws={'size': 16})
    plt.xlabel("Classe predita")
    plt.ylabel("Classe verdadeira")
    plt.title("Matriz de confusão relativa")
    plt.show()

In [None]:
# Tokenize the training and validation texts with identical settings:
# truncate to at most 256 tokens and pad each batch of texts to a common length.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=256)
train_encodings = tokenizer(list(x_train), **tokenize_kwargs)
valid_encodings = tokenizer(list(x_val), **tokenize_kwargs)

In [None]:
class DatasetLoader(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable sequence with one
            entry per sample.
        labels: Indexable sequence of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of sample `idx` to a tensor, then attach its label.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
    
# Build the training and validation datasets from their encodings and labels.
train_dataset, valid_dataset = (
    DatasetLoader(encodings, list(targets))
    for encodings, targets in ((train_encodings, y_train), (valid_encodings, y_val))
)

In [None]:
# Training configuration: evaluate and log every 200 steps, constant learning rate.
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./results1',
    logging_dir='./logs1',
    # Optimisation schedule
    num_train_epochs=10,
    learning_rate=1e-5,
    lr_scheduler_type="constant",
    # Batch sizes
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Logging / evaluation cadence
    logging_steps=200,
    evaluation_strategy="steps",
    eval_steps=200,
    # Pass dataset items to the model as-is
    remove_unused_columns=False,
)


In [None]:
def make_model_contiguous(model):
    """Replace the data of every non-contiguous parameter with a contiguous copy.

    Args:
        model: Object exposing ``parameters()``; its parameters are updated in place.
    """
    non_contiguous = (p for p in model.parameters() if not p.is_contiguous())
    for p in non_contiguous:
        p.data = p.data.contiguous()

In [None]:
from sklearn.metrics import balanced_accuracy_score

def compute_metrics(p):
    """Compute the balanced accuracy for an evaluation pass.

    Args:
        p: Evaluation result exposing ``predictions`` (class scores, or a tuple
            whose first element is the class scores) and ``label_ids``.

    Returns:
        dict with a single ``"balanced_accuracy"`` entry.
    """
    # Some models return several arrays as predictions; by convention the first
    # one holds the logits. Calling argmax on the tuple itself would be wrong.
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    predicted_classes = np.argmax(logits, axis=1)
    return {"balanced_accuracy": balanced_accuracy_score(p.label_ids, predicted_classes)}

In [None]:


# Make every parameter tensor contiguous before handing the model to the Trainer.
make_model_contiguous(model)

# Collect the Trainer configuration in one place, then build the Trainer.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Run training; the return value is left as the cell's last expression so the notebook displays it.
trainer.train()

In [None]:
# Split the trainer's log history into training-loss and evaluation records.
# Rows are collected in plain lists and turned into DataFrames once, instead of
# concatenating onto empty frames row by row (quadratic, and yields object dtypes).
train_records = []
eval_records = []

for values in trainer.state.log_history:
    if values.get('loss') is not None:
        # Periodic training-loss log entry.
        train_records.append({'epoch': values['epoch'], 'loss': values['loss']})
    elif values.get('train_loss') is not None:
        # Entry that reports the loss under 'train_loss' instead of 'loss'.
        train_records.append({'epoch': values['epoch'], 'loss': values['train_loss']})
    elif values.get('eval_loss') is not None:
        # Evaluation log entry.
        eval_records.append({
            'epoch': values['epoch'],
            'eval_loss': values['eval_loss'],
            'eval_balanced_accuracy': values.get('eval_balanced_accuracy'),
        })

df_loss = pd.DataFrame(train_records, columns=['epoch', 'loss'])
df_val_loss = pd.DataFrame(eval_records, columns=['epoch', 'eval_loss', 'eval_balanced_accuracy'])

# Inner join on the shared 'epoch' column: keeps only the epochs that have both
# a training-loss entry and an evaluation entry.
mergedDf = pd.merge(df_loss, df_val_loss, on='epoch')

In [None]:
# Display the merged training/evaluation metrics table.
mergedDf

In [None]:
# Plot the curves from the merged metrics table.
plot_loss_accuracy(mergedDf)

In [None]:
from transformers import pipeline
# Wrap the model and tokenizer in a text-classification pipeline.
classifier = pipeline(task='text-classification', model=model, tokenizer=tokenizer)

# Classify the held-out validation reviews and report the confusion matrix.
validation_split = train_val_set["test"]
predictions = classifier(validation_split["Review"], batch_size=16)

plot_confusion_matrix(validation_split, predictions)

In [None]:
# Classify the test-set reviews and report the confusion matrix.
test_reviews = test_set["Review"]
predTest = classifier(test_reviews, batch_size=16)
plot_confusion_matrix(test_set, predTest)