In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-sentiment-analysis/testData.tsv
/kaggle/input/imdb-sentiment-analysis/labeledTrainData.tsv
/kaggle/input/imdb-sentiment-analysis/sampleSubmission.csv
/kaggle/input/imdb-sentiment-analysis/unlabeledTrainData.tsv


In [2]:
%config Completer.use_jedi = False

In [3]:
import os
import sys
import logging
import time

import pandas as pd
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

In [4]:
train = pd.read_csv("/kaggle/input/imdb-sentiment-analysis/labeledTrainData.tsv",
                header=0, 
                delimiter="\t", 
                quoting=3)
test = pd.read_csv("/kaggle/input/imdb-sentiment-analysis/testData.tsv",
                header=0, 
                delimiter="\t", 
                quoting=3)
test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [5]:
train_texts, train_labels, test_texts = [], [], []
for i, review in enumerate(train["review"]):
    train_texts.append(review)
    train_labels.append(train["sentiment"][i])

for review in test["review"]:
    test_texts.append(review)

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2)

In [6]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

print("第一条样本的编码:")
print({k: v[0] for k, v in train_encodings.items()})

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



第一条样本的编码:
{'input_ids': [1, 307, 5274, 386, 2874, 3107, 281, 942, 310, 354, 791, 271, 84018, 79153, 261, 3481, 364, 293, 262, 713, 272, 9497, 309, 1135, 10493, 508, 1301, 2620, 309, 13840, 268, 349, 287, 2620, 309, 8000, 5491, 5966, 3274, 302, 2620, 309, 699, 264, 791, 285, 260, 9497, 309, 3074, 5782, 262, 37882, 2620, 309, 269, 675, 817, 371, 619, 260, 325, 280, 268, 305, 314, 1452, 261, 262, 924, 261, 263, 262, 47684, 492, 260, 273, 4352, 272, 273, 418, 280, 297, 1221, 758, 292, 278, 261, 304, 278, 490, 266, 397, 688, 7160, 758, 261, 1999, 278, 1721, 304, 6003, 260, 273, 2218, 272, 274, 295, 489, 2795, 277, 1172, 434, 10807, 287, 8910, 18289, 262, 1718, 265, 12199, 262, 37147, 261, 373, 2787, 266, 8845, 422, 285, 260, 4052, 8981, 840, 1504, 4052, 8981, 840, 1504, 21978, 9497, 309, 3074, 5782, 262, 37882, 2620, 309, 1161, 372, 282, 657, 264, 433, 261, 873, 262, 410, 470, 264, 398, 278, 269, 267, 9497, 309, 1474, 368, 358, 56700, 2620, 309, 294, 266, 938, 269, 2106, 278, 438, 266, 9928

In [7]:
class TrainDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        # dict.items()返回字典中所有键值对的视图对象，格式为 (key, value) 元组。
        # item是一个字典，保存了每个键的第idx个样本
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx]) 
        return item
    
    def __len__(self):
        return len(self.labels)
    
class TestDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, num_samples=0):
        self.encodings = encodings
        self.num_samples = num_samples

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return self.num_samples
    
train_dataset = TrainDataset(train_encodings, train_labels)
val_dataset = TrainDataset(val_encodings, val_labels)
test_dataset = TestDataset(test_encodings, num_samples=len(test_texts))

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [8]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model =  AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-base")
model.to(device)
model.train()
optim = optim.AdamW(model.parameters(), lr=1e-5)

2025-05-08 05:56:00.495124: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746683760.678053      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746683760.730833      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
for epoch in range(3):
    start = time.time()
    train_loss, val_losses = 0, 0
    train_acc, val_acc = 0, 0
    n, m = 0, 0

    with tqdm(total=len(train_loader), desc="Epoch %d" % epoch) as pbar:
        for batch in train_loader:
            n += 1
            optim.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optim.step()
            train_acc += accuracy_score(torch.argmax(outputs.logits.cpu().data, dim=1), labels.cpu())
            train_loss += loss.cpu()

            pbar.set_postfix({'epoch': '%d' % (epoch),
                                'train loss': '%.4f' % (train_loss.data / n),
                                'train acc': '%.2f' % (train_acc / n)
                                })
            pbar.update(1)

        with torch.no_grad():
            for batch in val_loader:
                m += 1
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                val_loss = outputs.loss
                val_acc += accuracy_score(torch.argmax(outputs.logits.cpu().data, dim=1), labels.cpu())
                val_losses += val_loss
        end = time.time()
        runtime = end - start
        pbar.set_postfix({'epoch': '%d' % (epoch),
                            'train loss': '%.4f' % (train_loss.data / n),
                            'train acc': '%.2f' % (train_acc / n),
                            'val loss': '%.4f' % (val_losses.data / m),
                            'val acc': '%.2f' % (val_acc / m),
                            'time': '%.2f' % (runtime)})

        # print('epoch: %d, train loss: %.4f, train acc: %.2f, val loss: %.4f, val acc: %.2f, time: %.2f' %
        #       (epoch, train_loss.data / n, train_acc / n, val_losses.data / m, val_acc / m, runtime))
test_pred = []
with torch.no_grad():
    with tqdm(total=len(test_loader), desc='Predction') as pbar:
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            # test_pred.extent
            test_pred.extend(torch.argmax(outputs.logits.cpu().data, dim=1).numpy().tolist())

            pbar.update(1)

result_output = pd.DataFrame(data={"id": test["id"], "sentiment": test_pred})
result_output.to_csv("deberta.csv", index=False, quoting=3)
logging.info('result saved!')

Epoch 0:   0%|          | 0/2500 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Epoch 0: 100%|██████████| 2500/2500 [27:18<00:00,  1.53it/s, epoch=0, train loss=0.1784, train acc=0.93, val loss=0.1304, val acc=0.95, time=1638.49]
Epoch 1: 100%|██████████| 2500/2500 [27:16<00:00,  1.53it/s, epoch=1, train loss=0.1023, train acc=0.97, val loss=0.1361, val acc=0.95, time=1637.72]
Epoch 2: 100%|██████████| 2500/2500 [27:16<00:00,  1.53it/s, epoch=2, train loss=0.0635, train acc=0.98, val loss=0.1630, val acc=0.95, time=1638.26]
Predction: 100%|██████████| 1563/1563 [11:03<00:00,  2.35it/s]
