In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed PyTorch's default generator once, up front, so that weight
# initialisation, dropout masks and DataLoader shuffling are repeatable across
# "Restart & Run All". 42 matches the random_state used for the data split.
# NOTE: cudnn.benchmark below may still pick different kernels between runs, so
# GPU results are repeatable only up to floating-point noise.
SEED = 42
torch.manual_seed(SEED)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Let cuDNN auto-tune its kernels for the input shapes it sees.
torch.backends.cudnn.benchmark = True

Using device: cuda


In [3]:
# Load the preprocessed CSV into a DataFrame.
file_path = "data/noemoticon_preprocessed.csv"
df = pd.read_csv(file_path)
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appended a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1597267 entries, 0 to 1597266
Data columns (total 2 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   polarity  1597267 non-null  int64 
 1   text      1597267 non-null  object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB
None


In [4]:
# Map the original 0/4 polarity labels to 0/1 for binary classification.
label_mapping = {0: 0, 4: 1}


def encode_polarity(polarity, mapping):
    """Return ``polarity`` re-encoded through ``mapping``, safely and idempotently.

    Args:
        polarity: pandas Series of raw (or already encoded) labels.
        mapping: dict from raw label to encoded label.

    Returns:
        The input Series unchanged when every value is already one of the
        encoded labels (so re-running the cell is a no-op); otherwise a new
        int64 Series with ``mapping`` applied.

    Raises:
        ValueError: If some value is neither a key of ``mapping`` nor part of
            an already fully encoded column. A plain ``Series.map`` would
            silently turn such values into NaN.
    """
    encoded_values = set(mapping.values())
    if polarity.isin(encoded_values).all():
        # Already encoded (e.g. the cell was executed twice): nothing to do.
        return polarity

    encoded = polarity.map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = polarity[unmapped].unique().tolist()
        raise ValueError(
            f"Unexpected polarity values {unexpected}; expected one of {sorted(mapping)}"
        )
    # With no NaN left the column can stay an integer column.
    return encoded.astype("int64")


df["polarity"] = encode_polarity(df["polarity"], label_mapping)

In [5]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].to_list(),
    df["polarity"].to_list(),
    random_state=42,
    test_size=0.2,
)

In [6]:
# Load the DistilBERT tokenizer. An earlier revision of this cell loaded the
# "bert-base-uncased" BertTokenizer instead; that variant is no longer used.
tokenizer = DistilBertTokenizer.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-uncased",
)

In [7]:
# Define dataset class
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Every item is a dict with three entries:

    * ``input_ids`` / ``attention_mask`` -- the tokenizer output for the text
      with its leading dimension squeezed away,
    * ``labels`` -- the matching label as a ``torch.long`` tensor.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of labels, parallel to ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=...,
            padding=..., max_length=..., return_tensors="pt")``.
        max_length: Length every sample is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        # The dataset size follows the texts; labels are indexed in parallel.
        return len(self.texts)

    def __getitem__(self, idx):
        # Fixed-length padding lets the default collate function stack samples.
        tokenizer_options = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt",
        )
        encoded = self.tokenizer(self.texts[idx], **tokenizer_options)

        # Drop the leading dimension the tokenizer adds for a single text.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [8]:
# Create DataLoader
batch_size = 128
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
val_dataset = TextDataset(val_texts, val_labels, tokenizer)

# The training loop copies batches with `.to(device, non_blocking=True)`; that
# copy can only overlap with compute when the host tensors are page-locked, so
# pin memory whenever a GPU is present (pointless on CPU-only machines).
pin_memory = torch.cuda.is_available()

# num_workers stays at its default (0). A num_workers=4 variant used to be
# commented out here; the reason it was disabled is not recorded.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=pin_memory)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=pin_memory)

In [9]:
# Define BERT model
class BertClassifier(nn.Module):
    """DistilBERT encoder followed by dropout and a linear classification head.

    An earlier revision wrapped ``BertModel("bert-base-uncased")`` and read its
    ``pooler_output`` (annotated as ">100 hours"); this version uses DistilBERT
    and takes the hidden state at sequence position 0 instead.

    Args:
        num_classes: Number of logits produced by the head.
        model_name: Checkpoint name or path handed to
            ``DistilBertModel.from_pretrained``.
        dropout: Dropout probability applied before the linear head.
    """

    def __init__(self, num_classes=2, model_name="distilbert-base-uncased", dropout=0.3):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Head width follows the encoder's configured hidden size.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch of tokenized inputs."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the first position of the sequence (the [CLS] token) as the
        # sentence representation.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        return self.fc(self.dropout(cls_embedding))

In [10]:
# Initialize model
model = BertClassifier().to(device)
# Optional: the model can be wrapped in torch.compile(...); left disabled here.

# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are pinned to the values transformers.AdamW used by default so
# the update rule stays the same (torch's own defaults are 1e-8 and 0.01).
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
criterion = nn.CrossEntropyLoss()
scaler = torch.amp.GradScaler()  # Enable mixed precision



In [11]:
# Training loop with mixed precision and gradient accumulation
def train(model, train_loader, val_loader, optimizer, criterion, epochs=3, accumulation_steps=2):
    """Train ``model`` with mixed precision and gradient accumulation.

    Relies on three module-level names defined in other cells: ``device``,
    ``scaler`` (the ``GradScaler``) and ``evaluate``.

    Args:
        model: Called as ``model(input_ids, attention_mask)``; its output is
            fed to ``criterion`` and arg-maxed over dim 1 for the accuracy.
        train_loader: Sized iterable of batches, each a mapping holding
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        val_loader: Handed to ``evaluate`` after every epoch.
        optimizer: Optimizer that is stepped through ``scaler``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        epochs: Number of passes over ``train_loader``.
        accumulation_steps: Number of micro-batches whose gradients are
            accumulated before one optimizer step.

    Raises:
        ValueError: If ``accumulation_steps`` is smaller than 1.
    """
    if accumulation_steps < 1:
        raise ValueError(f"accumulation_steps must be >= 1, got {accumulation_steps}")

    num_batches = len(train_loader)

    for epoch in range(epochs):
        model.train()
        total_loss, total_correct = 0, 0
        optimizer.zero_grad()

        for step, batch in enumerate(tqdm(train_loader)):
            input_ids = batch['input_ids'].to(device, non_blocking=True)
            attention_mask = batch['attention_mask'].to(device, non_blocking=True)
            labels = batch['labels'].to(device, non_blocking=True)

            with torch.amp.autocast('cuda'):
                outputs = model(input_ids, attention_mask)
                # Divide so the accumulated gradient is an average, not a sum.
                loss = criterion(outputs, labels) / accumulation_steps

            scaler.scale(loss).backward()

            # Step at the end of every accumulation window AND on the final
            # batch. Without the second condition the gradients of a trailing
            # partial window (batch count not divisible by accumulation_steps)
            # were computed and then thrown away by the next zero_grad().
            # The trailing window holds fewer micro-batches, so its averaged
            # gradient is slightly smaller than that of a full window.
            window_complete = (step + 1) % accumulation_steps == 0
            last_batch = (step + 1) == num_batches
            if window_complete or last_batch:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            # Undo the accumulation scaling so the logged loss is per batch.
            total_loss += loss.item() * accumulation_steps
            total_correct += (outputs.argmax(dim=1) == labels).sum().item()

        print(f"Epoch {epoch+1}: Loss = {total_loss/len(train_loader):.4f}, Accuracy = {total_correct/len(train_loader.dataset):.4f}")
        evaluate(model, val_loader)


In [12]:
# Evaluation function
def evaluate(model, val_loader):
    """Print a classification report for ``model`` over ``val_loader``.

    Puts the model into eval mode, runs every batch without gradient tracking
    on the module-level ``device``, takes the arg-max over dim 1 as the
    prediction and prints ``classification_report(true, predicted)``.
    Returns nothing.
    """
    model.eval()
    y_pred, y_true = [], []

    with torch.no_grad():
        for batch in val_loader:
            logits = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
            )
            # Collect plain Python ints; the report only needs the values.
            y_pred += logits.argmax(dim=1).tolist()
            y_true += batch['labels'].tolist()

    report = classification_report(y_true, y_pred)
    print(report)

In [None]:
# Kick off training: 3 epochs, stepping the optimizer every 2 batches.
train(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    criterion=criterion,
    epochs=3,
    accumulation_steps=2,
)

  4%|▍         | 380/9983 [09:19<11:34:01,  4.34s/it]