In [None]:
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score

In [None]:
from google.colab import drive
from IPython.display import Image, display

# Google Drive locations used by this notebook (Colab only).
mount_point = "/content/gdrive"
base_path = mount_point + "/MyDrive/Colab/SDG/data"

# Dataset workbooks live under `base_path`; the first entry is the TEST split
# (historically named `input_data_path`), the second one the TRAIN split.
input_data_path, input_data_path_train = (
    base_path + relative_path
    for relative_path in (
        "/input/sdg_17_labels_classification_dataset_1020_texts_TEST_2023.12.11.xlsx",
        "/input/sdg_17_labels_classification_dataset_4760_texts_TRAIN_2023.12.11.xlsx",
    )
)

# Disabled paths, kept for reference only.
# NOTE(review): `experiment_name` is not defined in the visible cells, so the
# last two lines cannot simply be re-enabled.
#top7_train_data_sim_to_each_test_data_filename = base_path + "/input/top7_train_data_sim_to_each_test_data_but_with_different_classes_with_2023.12.11_input_data.pickle"
# output_data_path = base_path + "/../Mistral/data/output/" + experiment_name
# log_filename = base_path + "/../Mistral/logs/" + experiment_name + ".log"

# Mount Drive last; `force_remount=True` is passed so a re-run of this cell
# always requests a fresh mount.
drive.mount(mount_point, force_remount=True)

Mounted at /content/gdrive


In [None]:
# Read the TRAIN workbook into a DataFrame and preview its first three rows.
train_df = pd.read_excel(io=input_data_path_train)
train_df.head(n=3)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text_id,doi,text,sdg,labels_negative,labels_positive,agreement,text_length
0,0,2065,2944,,This Vitamin Reduces Mental Health Problems By...,0,,,,9
1,1,1325,1911,,'League Of Legends' unveils new Arena game mod...,0,,,,66
2,2,409,581,,Community remembers Maddi Kingsbury at public ...,0,,,,56


In [None]:
# Read the TEST workbook into a DataFrame and preview its first three rows.
test_df = pd.read_excel(io=input_data_path)
test_df.head(n=3)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text_id,doi,text,sdg,labels_negative,labels_positive,agreement,text_length
0,0,0,6668,,L train resumes service after derailing from t...,0,,,,38
1,1,1,6099,,Brooklyn Decker’s SI Swimsuit Cover Photo Shoo...,0,,,,15
2,2,2,6471,,Mumbai: The Bombay High Court has suspended th...,0,,,,125


In [None]:
# Extract the raw texts (`text` column) and their class labels (`sdg` column)
# from each split as plain Python lists.
X_train, y_train = list(train_df["text"].values), list(train_df["sdg"].values)
X_test, y_test = list(test_df["text"].values), list(test_df["sdg"].values)

In [None]:
# Size of the label space; handed to the BERT classification head as `num_labels`.
num_classes: int = 17

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Assumes X_train / X_test (lists of texts) and y_train / y_test (their labels)
# were already built by the earlier data-preparation cells.

# Hyper-parameters gathered in one place so they are easy to find and tune.
SEED = 42
MAX_LENGTH = 128
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
num_epochs = 5


def _collect_predictions(model, data_loader, device):
    """Run ``model`` over ``data_loader`` and gather labels and predictions.

    The model is switched to eval mode and left in it; a caller that keeps
    training afterwards must call ``model.train()`` again.

    Args:
        model: classification model whose forward output exposes ``.logits``.
        data_loader: iterable of ``(input_ids, attention_mask, labels)`` batches.
        device: device the model lives on; the inputs are moved there.

    Returns:
        ``(labels, preds)``: two equally long lists of class ids, in the
        order the loader produced them.
    """
    model.eval()
    gathered_labels, gathered_preds = [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels in data_loader:
            outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device))
            gathered_preds.extend(torch.argmax(outputs.logits, dim=1).tolist())
            # Labels are only compared on the host, so they are not moved to `device`.
            gathered_labels.extend(labels.tolist())
    return gathered_labels, gathered_preds


# Seed before the model is created: the classification head is newly
# initialised and the training loader shuffles, so both depend on the RNG.
# (Best effort only; some GPU kernels may still be non-deterministic.)
torch.manual_seed(SEED)

# Load the pretrained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)

# Tokenize the training and test texts
tokenized_train = tokenizer(X_train, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='pt')
tokenized_test = tokenizer(X_test, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='pt')

# Build the datasets and DataLoaders
train_dataset = TensorDataset(tokenized_train['input_ids'], tokenized_train['attention_mask'], torch.tensor(y_train))
test_dataset = TensorDataset(tokenized_test['input_ids'], tokenized_test['attention_mask'], torch.tensor(y_test))

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Select the execution device (GPU when available, otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optimizer and loss function.
# NOTE: `criterion` is not used by the loop below, which takes the loss the
# model returns in `outputs.loss` when `labels` are passed. It is kept only so
# that any other cell referencing the name keeps working.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()

# Model training
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False, dynamic_ncols=True)

    for step, (input_ids, attention_mask, labels) in enumerate(progress_bar, start=1):
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        # Running mean over the batches seen so far. Dividing by the total
        # number of batches would under-report the loss until the last step.
        progress_bar.set_postfix(loss=epoch_loss / step, refresh=True)

    # Evaluate at the end of every epoch.
    # NOTE(review): this accuracy is measured on the TEST split; using it to
    # pick an epoch or tune hyper-parameters would leak test information.
    # Consider carving a validation split out of the training data instead.
    all_labels, all_preds = _collect_predictions(model, test_loader, device)
    accuracy = accuracy_score(all_labels, all_preds)
    # max(..., 1) only guards against an empty training loader.
    print(f'Epoch {epoch + 1}/{num_epochs} - Loss: {epoch_loss / max(len(train_loader), 1):.4f} - Accuracy: {accuracy:.4f}')

# Final evaluation; leaves `all_labels` / `all_preds` populated for the
# metrics cell (also when `num_epochs` is 0) and the model in eval mode.
all_labels, all_preds = _collect_predictions(model, test_loader, device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5 - Loss: 1.1799 - Accuracy: 0.8745




Epoch 2/5 - Loss: 0.3817 - Accuracy: 0.8775




Epoch 3/5 - Loss: 0.2194 - Accuracy: 0.8902




Epoch 4/5 - Loss: 0.1268 - Accuracy: 0.8892




Epoch 5/5 - Loss: 0.0906 - Accuracy: 0.8725


AttributeError: 'list' object has no attribute 'shape'

In [None]:
# Evaluation metrics: first print how many labels and predictions were
# collected, then the per-class report with four decimal digits.
for collected in (all_labels, all_preds):
    print(len(collected))
print(classification_report(all_labels, all_preds, digits=4))

1020
1020
              precision    recall  f1-score   support

           0     1.0000    0.9000    0.9474        60
           1     0.7500    0.8000    0.7742        60
           2     0.8209    0.9167    0.8661        60
           3     0.9661    0.9500    0.9580        60
           4     0.9310    0.9000    0.9153        60
           5     0.9016    0.9167    0.9091        60
           6     0.9500    0.9500    0.9500        60
           7     0.8082    0.9833    0.8872        60
           8     0.7000    0.7000    0.7000        60
           9     0.7237    0.9167    0.8088        60
          10     0.7708    0.6167    0.6852        60
          11     0.9630    0.8667    0.9123        60
          12     0.8689    0.8833    0.8760        60
          13     0.9091    0.8333    0.8696        60
          14     0.9355    0.9667    0.9508        60
          15     0.9792    0.7833    0.8704        60
          16     0.9500    0.9500    0.9500        60

    accuracy    