In [1]:
import pandas as pd

# Load the training and test splits; rows containing any missing value are dropped.
train_df, test_df = (
    pd.read_csv(csv_path).dropna()
    for csv_path in ("train_audience.csv", "test_audience.csv")
)

# Optional quick-experiment subsample (100 rows per Rating value), currently disabled.
# NOTE(review): the second line samples from train_df, not test_df -- fix before re-enabling.
# train_df = train_df.groupby('Rating', group_keys=False).apply(lambda x: x.sample(100)).reset_index(drop=True)
# test_df = train_df.groupby('Rating', group_keys=False).apply(lambda x: x.sample(100)).reset_index(drop=True)

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, BatchEncoding
import torch

NUM_LABELS = 10  # Number of target classes
# model_name = "xlnet-large-cased"  # Alternative checkpoint; pick the model that fits your needs
model_name = "bert-large-uncased"

# Seed torch BEFORE the model is instantiated: any weights that are not restored
# from the checkpoint (presumably the new classification head -- TODO confirm)
# are drawn from torch's global RNG, so without this each kernel restart would
# start training from a different initialisation.
torch.manual_seed(42)

tokenizer = AutoTokenizer.from_pretrained(model_name)  # Load the tokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS)  # Load the classification model

  from .autonotebook import tqdm as notebook_tqdm
2024-12-11 21:08:19.566162: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-11 21:08:19.573364: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733962099.582397   33730 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733962099.585022   33730 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-11 21:08:19.594905: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

TypeError: not a string

In [None]:
from datasets import Dataset, DatasetDict
import tensorflow as tf
import random
import numpy as np
# Seed the TensorFlow, NumPy and stdlib RNGs so the cell is repeatable.
tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

# preserve_index=False: train_df/test_df went through dropna(), so their pandas
# index may have gaps; without this flag from_pandas keeps that index as an
# extra `__index_level_0__` column in the Dataset.
train_data_set = Dataset.from_pandas(train_df, preserve_index=False)
test_set = Dataset.from_pandas(test_df, preserve_index=False)

# Hold out 20% of the training rows for validation. The seed is passed
# explicitly so the split does not depend on the global NumPy RNG state.
train_val_set = train_data_set.train_test_split(test_size=0.2, seed=42)

datasets = DatasetDict({
    "train": train_val_set["train"],
    "test": test_set,
    "validation": train_val_set["test"],
})

print(datasets)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_loss_accuracy(df):
    """Plot training/validation loss and, when available, accuracy curves.

    Args:
        df: DataFrame with at least the columns ``loss`` and ``eval_loss``.
            Any column whose name contains ``accuracy`` is drawn in a second
            panel; when there is none, only the loss panel is drawn.

    The x axis is the DataFrame index (one point per row).
    """
    plt.figure(figsize=(16, 6))

    # Left panel: training vs. validation loss.
    plt.subplot(121)
    sns.lineplot(data=df[['loss', 'eval_loss']])
    plt.title("Perda")

    # Right panel: the 1x2 grid reserved this slot but nothing was ever drawn
    # in it; fill it with whatever accuracy columns the frame carries.
    accuracy_columns = [col for col in df.columns if 'accuracy' in str(col)]
    if accuracy_columns:
        plt.subplot(122)
        sns.lineplot(data=df[accuracy_columns])
        plt.title("Acurácia")

    plt.show()

In [None]:
from sklearn import metrics

def plot_confusion_matrix(dataset, predictions, labels=None):
    """Print accuracy metrics and draw a row-normalised confusion matrix.

    Args:
        dataset: mapping-like object; its ``'Rating'`` column supplies the true
            classes when ``labels`` is not given.
        predictions: iterable of dicts with a ``'label'`` entry, as returned by
            a text-classification pipeline.
        labels: optional true classes; overrides ``dataset['Rating']``.

    Raises:
        ValueError: if a predicted label does not end in an integer class id.
    """
    # Honour an explicitly passed `labels`; previously it was always overwritten.
    if labels is None:
        labels = dataset['Rating']
    labels = list(labels)

    # Recover the class index from each predicted label name. This assumes the
    # default 'LABEL_<k>' naming -- TODO confirm the model has no custom id2label.
    # The previous mapping collapsed every class other than LABEL_0 into 1,
    # which is wrong for a multi-class model.
    preds = [int(str(pred['label']).rsplit('_', 1)[-1]) for pred in predictions]

    # Compute balanced and plain accuracy (both via `metrics`, so this function
    # does not depend on an import made in a later cell).
    print('balanced Accuracy:', metrics.balanced_accuracy_score(labels, preds))
    print('accuracy', metrics.accuracy_score(labels, preds))

    # Confusion matrix normalised per true class. normalize='true' leaves rows
    # with no true samples at 0 instead of producing NaN from a 0/0 division.
    class_ids = sorted(set(labels) | set(preds))
    r = metrics.confusion_matrix(labels, preds, labels=class_ids, normalize='true')

    plt.figure(figsize=(10, 8))
    sns.heatmap(
        r,
        annot=True,
        fmt=".2f",
        cmap="Blues",
        annot_kws={'size': 16},
        # Show real class ids on the axes rather than positional indices.
        xticklabels=class_ids,
        yticklabels=class_ids,
    )
    plt.xlabel("Classe predita")
    plt.ylabel("Classe verdadeira")
    plt.title("Matriz de confusão relativa")
    plt.show()

In [None]:


# Preprocessing function
def preprocess_function(examples):
    """Tokenize the ``Review`` texts of a batch to a fixed length of 256 tokens.

    Args:
        examples: batch mapping with a ``"Review"`` entry holding the texts.

    Returns:
        The tokenizer output for the batch.
    """
    # padding="max_length" (not padding=True): with a batched map, padding=True
    # pads only to the longest example of each map batch, so examples from
    # different batches can end up with different lengths and can no longer be
    # stacked into one tensor when training batches are assembled.
    return tokenizer(examples["Review"], truncation=True, padding="max_length", max_length=256)

# Run the preprocessing function over every split
tokenized_dataset = datasets.map(preprocess_function, batched=True)
print(tokenized_dataset)

In [None]:
# Tokenizer output fields copied from each tokenized split into a BatchEncoding.
_ENCODING_FIELDS = ('input_ids', 'token_type_ids', 'attention_mask')

train_encodings = BatchEncoding(
    {field: tokenized_dataset['train'][field] for field in _ENCODING_FIELDS}
)

valid_encodings = BatchEncoding(
    {field: tokenized_dataset['validation'][field] for field in _ENCODING_FIELDS}
)



In [None]:
# Debug check: show the Python type of the validation encodings object.
print(type(valid_encodings))

In [None]:
class DatasetLoader(torch.utils.data.Dataset):
    """Map-style torch dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: mapping from field name to a per-example sequence; only
            ``.items()`` and integer indexing of each value are used.
        labels: sequence with one class id per example. Values are converted
            with ``int()``, so integral floats such as ``7.0`` are accepted.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including ``labels``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Always emit an integer (long) label. Letting torch infer the dtype
        # turns float ratings into float tensors, which a classification loss
        # would presumably not treat as class indices -- TODO confirm upstream
        # dtype of the Rating column.
        item['labels'] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)
    
# Pair each split's encodings with its Rating column (materialised as a list) as labels.
train_dataset = DatasetLoader(train_encodings, list(datasets['train']['Rating']))
valid_dataset = DatasetLoader(valid_encodings, list(datasets['validation']['Rating']))

In [None]:
training_args = TrainingArguments(
    # Where results are written.
    output_dir='./results1',
    # Training length and batch sizes.
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    # Optimisation: fixed learning rate, no decay schedule.
    learning_rate=1e-5,
    lr_scheduler_type="constant",
    # Evaluate every 50 steps; log the training loss every 150 steps.
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=150,
    remove_unused_columns=False,
)


In [None]:
def make_model_contiguous(model):
    """Replace the data of every non-contiguous parameter with a contiguous copy.

    Args:
        model: object exposing ``parameters()`` that yields tensors.

    Parameters that are already contiguous are left untouched. Returns None.
    """
    strided_params = (p for p in model.parameters() if not p.is_contiguous())
    for strided in strided_params:
        strided.data = strided.data.contiguous()

In [None]:
from sklearn.metrics import balanced_accuracy_score

def compute_metrics(p):
    """Compute the balanced accuracy for one evaluation pass.

    Args:
        p: object with ``predictions`` (per-class scores, arg-maxed over
            axis 1) and ``label_ids`` (the true classes).

    Returns:
        Dict with the single key ``"balanced_accuracy"``.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    score = balanced_accuracy_score(p.label_ids, predicted_classes)
    return {"balanced_accuracy": score}

In [None]:
# Make every model parameter contiguous in memory before handing the model to the Trainer.
make_model_contiguous(model)


# Wire the model, arguments, train/validation datasets and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning with the configuration defined in training_args.
trainer.train()

In [None]:
# Split the trainer's log history into training-loss rows and evaluation rows.
# Rows are collected in plain lists and each DataFrame is built once: growing a
# frame with pd.concat inside the loop is quadratic, and concatenating onto an
# empty frame relies on deprecated dtype handling.
loss_records = []
eval_records = []
for values in trainer.state.log_history:
    if values.get('loss') is not None:
        loss_records.append({'epoch': values['epoch'], 'loss': values['loss']})
    elif values.get('eval_loss') is not None:
        eval_records.append({
            'epoch': values['epoch'],
            'eval_loss': values['eval_loss'],
            # .get: an entry without the metric becomes NaN instead of raising KeyError.
            'eval_balanced_accuracy': values.get('eval_balanced_accuracy'),
        })

df_loss = pd.DataFrame(loss_records, columns=['epoch', 'loss'])
df_val_loss = pd.DataFrame(eval_records, columns=['epoch', 'eval_loss', 'eval_balanced_accuracy'])

# Inner join on the shared 'epoch' column (stated explicitly): only epochs that
# have both a training-loss log and an evaluation log survive.
mergedDf = pd.merge(df_loss, df_val_loss, on='epoch')

In [None]:
# Plot the merged training/evaluation history.
plot_loss_accuracy(mergedDf)

In [None]:
from transformers import pipeline
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)

# Truncate to the same 256-token limit used in preprocess_function: without
# truncation, long reviews can exceed the model's maximum sequence length, and
# the inputs would also differ from what the model saw during training.
# list(...) materialises the column so the pipeline receives plain strings.
predictions = classifier(
    list(datasets["validation"]["Review"]),
    batch_size=4,
    truncation=True,
    max_length=256,
)


plot_confusion_matrix(datasets["validation"], predictions)