In [1]:
import os
import sys
import logging
import time

import pandas as pd
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Use the Auto* classes: they pick the matching tokenizer/model class from the checkpoint name
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

  _torch_pytree._register_pytree_node(


Hugging Face 模型库中对应预训练模型的名称

In [2]:
# Checkpoint identifier handed to `from_pretrained` for both the tokenizer and
# the sequence-classification model used in the cells below.
Roberta = "xlm-roberta-base" 

In [3]:
# Configure root logging once for the notebook and record how the kernel was
# launched.
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)

logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
# Join argv with a space: joining with '' glued the script path and its flags
# together into one unreadable token. Arguments are passed lazily to the logger
# instead of pre-formatting the message.
logger.info("running %s", ' '.join(sys.argv))

2025-05-08 13:15:06,346: INFO: running d:\Anaconda\envs\nlp\lib\site-packages\ipykernel_launcher.py--f=c:\Users\Administrator\AppData\Roaming\jupyter\runtime\kernel-v350ee38b4198a2c8f534781948c623c65f26c7cd0.json


In [4]:
# Locations of the labeled training TSV and the test TSV read by the cells below.
# NOTE(review): these are absolute, machine-specific Windows paths — adjust them
# (or derive them from a configurable data directory) before running elsewhere.
labeledTrainDataPath = r"D:\workplace\NLP_learning\dataset\labeledTrainData.tsv"
testDataPath = r"D:\workplace\NLP_learning\dataset\testData.tsv"

def ReadData(path):
    """Load a tab-separated file whose first row is the header.

    Quote characters are not interpreted (quoting=3 is csv.QUOTE_NONE), so any
    quotes present in the file are kept verbatim in the resulting values.

    Args:
        path: Path of the TSV file to read.

    Returns:
        A pandas DataFrame with one column per header field.
    """
    read_options = {"header": 0, "sep": "\t", "quoting": 3}
    return pd.read_csv(path, **read_options)
# Load both files, then carve a validation split off the labeled data.
train = ReadData(labeledTrainDataPath)
test = ReadData(testDataPath)

# Pull the columns out positionally in one step instead of appending row by
# row and looking labels up by index label.
train_texts = train["review"].tolist()
train_labels = train["sentiment"].tolist()
test_texts = test["review"].tolist()

# Hold out 20% of the labeled reviews for validation. The fixed random_state
# makes the split (and therefore the reported validation metrics) reproducible
# across kernel restarts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42
)

开始训练 XLM-RoBERTa

In [5]:
# Build the tokenizer for the chosen checkpoint and encode every split with the
# same options: truncate over-long inputs and pad within each split.
tokenizer = AutoTokenizer.from_pretrained(Roberta)
tokenize_kwargs = dict(truncation=True, padding=True)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)
test_encodings = tokenizer(test_texts, **tokenize_kwargs)

# Show every encoded field of the first training sample as a sanity check.
first_sample = {name: values[0] for name, values in train_encodings.items()}
print("第一条样本的编码:")
print(first_sample)



第一条样本的编码:
{'input_ids': [0, 44, 8912, 282, 3395, 5154, 41550, 6921, 9494, 25, 7, 21, 12713, 875, 177283, 136, 3514, 208, 93905, 3357, 2481, 621, 4734, 142, 27992, 5, 20413, 903, 1346, 12, 442, 25, 7, 162520, 764, 831, 25, 18, 27992, 100052, 3267, 248, 74047, 3267, 57433, 68551, 272, 6921, 9494, 4, 77064, 669, 6057, 111, 70, 19336, 49119, 23, 1919, 661, 441, 22153, 15663, 133, 134327, 160964, 90, 1639, 4, 661, 441, 22153, 42724, 7, 1639, 136, 28, 5416, 35801, 223, 661, 441, 22153, 15663, 133, 134327, 1639, 36549, 15, 53927, 1957, 10, 147453, 12768, 3688, 4, 707, 83, 442, 1660, 163, 32, 247, 13950, 7831, 1919, 150679, 9, 21732, 9, 7655, 2242, 5132, 47, 70, 6957, 49119, 678, 661, 441, 22153, 15663, 133, 134327, 12, 173883, 6889, 154162, 1639, 15, 53, 2037, 54926, 2685, 25, 7, 10, 71496, 13, 12768, 111, 34153, 86595, 678, 242, 441, 25, 16, 20, 198343, 142, 50094, 111, 661, 441, 22153, 15663, 133, 134327, 1639, 1163, 5252, 25842, 678, 10, 335, 9, 187430, 100052, 3267, 248, 74047, 3267, 5743

In [6]:
class TrainDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from field name (e.g. "input_ids") to a per-sample
            sequence of values; must support ``.items()``.
        labels: Optional per-sample labels. When omitted, items carry no
            "labels" entry and the length is taken from the encodings.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict holding the idx-th value of every field as a tensor."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # `labels` is optional per the constructor signature; indexing None
        # used to raise a TypeError here.
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples."""
        if self.labels is not None:
            return len(self.labels)
        # Without labels, fall back to the length of the first encoded field.
        for _, values in self.encodings.items():
            return len(values)
        return 0
    
class TestDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings without labels.

    Args:
        encodings: Mapping from field name (e.g. "input_ids") to a per-sample
            sequence of values; must support ``.items()``.
        num_samples: Number of samples to expose. When left at the default
            (0), the count is inferred from the first encoded field so the
            dataset does not silently report itself as empty.
    """

    def __init__(self, encodings, num_samples=0):
        self.encodings = encodings
        if not num_samples:
            # Infer the size from the data; stays 0 only if there are no fields.
            num_samples = next((len(values) for _, values in encodings.items()), 0)
        self.num_samples = num_samples

    def __getitem__(self, idx):
        """Return a dict holding the idx-th value of every field as a tensor."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        """Return the number of samples."""
        return self.num_samples
    
# Wrap each split's encodings in a dataset object.
train_dataset = TrainDataset(encodings=train_encodings, labels=train_labels)
val_dataset = TrainDataset(encodings=val_encodings, labels=val_labels)
test_dataset = TestDataset(encodings=test_encodings, num_samples=len(test_texts))

# Batches of 8 for every split; only the training data is reshuffled, so
# validation and test batches keep the order of their source lists.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=8, shuffle=False)

In [7]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained checkpoint with a sequence-classification head and move
# it to the selected device, starting in training mode.
model = AutoModelForSequenceClassification.from_pretrained(Roberta)
model.to(device)
model.train()

# Reference the optimizer class through `torch.optim` rather than the `optim`
# import alias: this assignment rebinds `optim` to the optimizer instance, so
# `optim.AdamW(...)` raised AttributeError whenever the cell was run a second
# time. The variable name is kept because the training cell calls
# `optim.zero_grad()` / `optim.step()`.
optim = torch.optim.AdamW(model.parameters(), lr=1e-5)

  _torch_pytree._register_pytree_node(
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Fine-tune for three epochs, evaluating on the validation split after each.
for epoch in range(3):
    start = time.time()
    # Running totals are plain Python numbers: accumulating loss tensors kept
    # autograd history (train) and device memory (val) alive for the epoch.
    train_loss, val_losses = 0.0, 0.0
    train_correct, val_correct = 0, 0
    train_seen, val_seen = 0, 0
    n, m = 0, 0  # number of training / validation batches processed

    with tqdm(total=len(train_loader), desc="Epoch %d" % epoch) as pbar:
        # Validation below switches the model to eval mode, so training mode
        # has to be restored at the start of every epoch.
        model.train()
        for batch in train_loader:
            n += 1
            optim.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optim.step()

            # Count correct predictions per sample so a smaller final batch
            # does not get the same weight as a full one.
            preds = torch.argmax(outputs.logits.detach(), dim=1)
            train_correct += (preds == labels).sum().item()
            train_seen += labels.size(0)
            train_loss += loss.item()
            train_acc = train_correct / train_seen

            pbar.set_postfix({'epoch': '%d' % (epoch),
                              'train loss': '%.4f' % (train_loss / n),
                              'train acc': '%.2f' % (train_acc)
                              })
            pbar.update(1)

        # Disable dropout for evaluation; without this the validation metrics
        # were computed with the model still in training mode.
        model.eval()
        with torch.no_grad():
            for batch in val_loader:
                m += 1
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                preds = torch.argmax(outputs.logits, dim=1)
                val_correct += (preds == labels).sum().item()
                val_seen += labels.size(0)
                val_losses += outputs.loss.item()
        end = time.time()
        runtime = end - start

        # max(..., 1) guards the averages against an empty loader.
        train_acc = train_correct / max(train_seen, 1)
        val_acc = val_correct / max(val_seen, 1)
        pbar.set_postfix({'epoch': '%d' % (epoch),
                          'train loss': '%.4f' % (train_loss / max(n, 1)),
                          'train acc': '%.2f' % (train_acc),
                          'val loss': '%.4f' % (val_losses / max(m, 1)),
                          'val acc': '%.2f' % (val_acc),
                          'time': '%.2f' % (runtime)})
# Predict a label for every test review and write the submission file.
# Put the model in eval mode explicitly so dropout is disabled during
# inference regardless of what ran before this point.
model.eval()
test_pred = []
with torch.no_grad():
    with tqdm(total=len(test_loader), desc='Predction') as pbar:
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            # Index of the highest logit per sample is the predicted class.
            test_pred.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())

            pbar.update(1)

# Create the output directory first; to_csv does not create missing parents.
os.makedirs("./result", exist_ok=True)
result_output = pd.DataFrame(data={"id": test["id"], "sentiment": test_pred})
# quoting=3 (csv.QUOTE_NONE) writes the values exactly as they are held.
result_output.to_csv("./result/roberta.csv", index=False, quoting=3)
logging.info('result saved!')

Epoch 0: 100%|██████████| 2500/2500 [14:11<00:00,  2.94it/s, epoch=0, train loss=0.3009, train acc=0.87, val loss=0.2062, val acc=0.92, time=851.36]
Epoch 1: 100%|██████████| 2500/2500 [14:16<00:00,  2.92it/s, epoch=1, train loss=0.1807, train acc=0.93, val loss=0.1991, val acc=0.93, time=856.89]
Epoch 2: 100%|██████████| 2500/2500 [14:16<00:00,  2.92it/s, epoch=2, train loss=0.1250, train acc=0.96, val loss=0.2127, val acc=0.92, time=857.09]
Predction: 100%|██████████| 3125/3125 [07:36<00:00,  6.84it/s]
2025-05-08 14:05:40,334: INFO: result saved!
