# In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaModel  # 使用 RoBERTa
import pandas as pd
from torch.optim.lr_scheduler import ReduceLROnPlateau  # 学习率调度器
from torch.utils.tensorboard import SummaryWriter  # 导入 TensorBoard 相关的库
from tqdm import tqdm  # 用于显示训练进度条

# Configure the compute device. It is hard-coded to the CPU here; the model is
# moved onto this device before training starts further down in the file.
device = torch.device("cpu")

# Custom collate_fn that pads variable-length tokenized inputs into one batch
def collate_fn(batch, pad_token_id=1):
    """Merge ``(text_input, label)`` samples into a single padded batch.

    Args:
        batch: Non-empty sequence of ``(text_input, label)`` pairs. Each
            ``text_input`` must support ``['input_ids']`` and
            ``['attention_mask']`` lookups returning tensors; a leading
            dimension of size 1 is squeezed away before padding.
        pad_token_id: Value used to right-pad ``input_ids``. Defaults to 1,
            the ``<pad>`` id of the RoBERTa vocabulary (id 0 is ``<s>``, so
            zero-padding would append start-of-sequence tokens).

    Returns:
        A tuple ``(inputs, labels)``. ``inputs`` is a dict whose
        ``'input_ids'`` and ``'attention_mask'`` entries are stacked tensors
        padded to the longest sequence in the batch; padded positions carry
        ``pad_token_id`` and an attention mask of 0. ``labels`` is a tensor
        built from the per-sample labels.

    Raises:
        ValueError: If ``batch`` is empty.
    """
    # Local import keeps this function self-contained without touching the
    # file-level import block.
    from torch.nn.utils.rnn import pad_sequence

    if not batch:
        raise ValueError("collate_fn received an empty batch")

    text_inputs, labels = zip(*batch)

    # Drop the leading batch dimension of size 1 carried by each sample
    input_ids = [text_input['input_ids'].squeeze(0) for text_input in text_inputs]
    attention_mask = [text_input['attention_mask'].squeeze(0) for text_input in text_inputs]

    # Right-pad every sequence to the longest one in this batch
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = torch.tensor(labels)

    return {'input_ids': input_ids, 'attention_mask': attention_mask}, labels

# Dataset that serves text-only samples
class TextOnlyDataset(Dataset):
    """Text-only dataset driven by a label CSV and one ``<guid>.txt`` file per sample.

    The CSV at ``label_file`` is read with pandas and must provide a ``guid``
    column and a ``tag`` column. Each sample's text is read from
    ``<data_folder>/<guid>.txt`` when the item is requested.
    """

    # Integer class id assigned to each value of the "tag" column
    _TAG_TO_ID = {"positive": 0, "neutral": 1, "negative": 2}

    def __init__(self, data_folder, label_file, tokenizer, max_length=512):
        """Read the label table and remember how samples are to be tokenized.

        Args:
            data_folder: Directory holding the ``<guid>.txt`` files.
            label_file: Path of the CSV with ``guid`` and ``tag`` columns.
            tokenizer: Callable applied to the raw text of each sample.
            max_length: Forwarded to the tokenizer as ``max_length``.
        """
        self.data_folder = data_folder
        self.tokenizer = tokenizer
        self.max_length = max_length

        label_table = pd.read_csv(label_file)
        self.data = label_table
        self.guid_list = label_table["guid"].tolist()
        self.labels = label_table["tag"].map(self._TAG_TO_ID).tolist()

    def __len__(self):
        """Return the number of rows in the label table."""
        return len(self.guid_list)

    def __getitem__(self, idx):
        """Return ``(tokenizer_output, label)`` for the sample at ``idx``."""
        guid = self.guid_list[idx]
        sample_path = os.path.join(self.data_folder, f"{guid}.txt")

        with open(sample_path, 'r', encoding='ISO-8859-1') as handle:
            raw_text = handle.read()

        encoded = self.tokenizer(
            raw_text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_length,
        )
        return encoded, self.labels[idx]

# Text-only classifier (RoBERTa)
class TextOnlyModel(nn.Module):
    """RoBERTa encoder followed by a linear layer that emits three class scores."""

    def __init__(self):
        """Load the pretrained ``roberta-base`` encoder and build the classifier head."""
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        hidden_dim = self.roberta.config.hidden_size
        # Three outputs, one per class: positive, neutral, negative
        self.fc = nn.Linear(hidden_dim, 3)

    def forward(self, text_input):
        """Encode ``text_input`` and classify the encoder's pooled output.

        Args:
            text_input: Mapping of keyword arguments passed straight to the
                encoder (unpacked with ``**``).

        Returns:
            The linear layer's output for the encoder's ``pooler_output``.
        """
        pooled = self.roberta(**text_input).pooler_output
        return self.fc(pooled)

def _run_epoch(model, loader, criterion, device, desc, optimizer=None):
    """Run one pass over ``loader``: train if ``optimizer`` is given, else evaluate.

    Args:
        model: Module called with a dict of input tensors.
        loader: Iterable of ``(data, labels)`` batches, ``data`` being a dict
            of tensors.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batch tensors are moved to.
        desc: Text shown on the progress bar.
        optimizer: When provided, gradients are computed and a step is taken
            for every batch; when ``None`` the pass runs without gradients.

    Returns:
        ``(mean_loss, accuracy)``: the loss averaged over batches and the
        percentage of correctly classified samples.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    correct = 0
    seen = 0

    with torch.set_grad_enabled(training):
        for data, labels in tqdm(loader, desc=desc, unit="batch"):
            # Only drop an extra singleton axis from (batch, 1, seq) tensors.
            # Squeezing a 2-D (batch, seq) tensor unconditionally would
            # collapse the sequence axis whenever seq == 1.
            data = {
                key: (val.squeeze(1) if val.dim() == 3 else val).to(device)
                for key, val in data.items()
            }
            labels = labels.to(device)

            if training:
                optimizer.zero_grad()

            outputs = model(data)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            seen += labels.size(0)

    # max(..., 1) avoids ZeroDivisionError when the loader yields nothing
    mean_loss = loss_sum / max(len(loader), 1)
    accuracy = correct / max(seen, 1) * 100
    return mean_loss, accuracy


def train_model(model, train_loader, val_loader, epochs=5, learning_rate=1e-5, save_path="model.pth"):
    """Train the text-only model and keep the checkpoint with the lowest validation loss.

    Each epoch runs one training pass and one validation pass, records loss
    and accuracy to TensorBoard (``./runs/text_only_model``), prints a
    summary, and lets ``ReduceLROnPlateau`` adjust the learning rate from the
    validation loss. After the last epoch the collected curves are handed to
    ``plot_training_results``, which is expected to be defined elsewhere in
    the file.

    Args:
        model: Module to optimise; batches are moved to the device of its
            parameters.
        train_loader: Loader yielding ``(data, labels)`` training batches.
        val_loader: Loader yielding ``(data, labels)`` validation batches.
        epochs: Number of epochs to run.
        learning_rate: Initial learning rate for Adam.
        save_path: File that receives ``model.state_dict()`` whenever the
            validation loss improves on the best value seen so far.
    """
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
    criterion = nn.CrossEntropyLoss()
    # The scheduler's `verbose` flag is deprecated in recent PyTorch releases,
    # so learning-rate changes are reported explicitly after each step below.
    scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.1)

    # Follow the model's own placement instead of a module-level global
    model_device = next(model.parameters()).device

    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []
    best_val_loss = float("inf")

    writer = SummaryWriter(log_dir='./runs/text_only_model')  # TensorBoard logging
    try:
        for epoch in range(epochs):
            train_loss, train_accuracy = _run_epoch(
                model, train_loader, criterion, model_device,
                desc=f"Training Epoch {epoch+1}/{epochs}", optimizer=optimizer,
            )
            val_loss, val_accuracy = _run_epoch(
                model, val_loader, criterion, model_device,
                desc=f"Validating Epoch {epoch+1}/{epochs}",
            )

            train_losses.append(train_loss)
            train_accuracies.append(train_accuracy)
            val_losses.append(val_loss)
            val_accuracies.append(val_accuracy)

            # Record loss and accuracy in TensorBoard
            writer.add_scalar('Training Loss', train_losses[-1], epoch)
            writer.add_scalar('Validation Loss', val_losses[-1], epoch)
            writer.add_scalar('Training Accuracy', train_accuracies[-1], epoch)
            writer.add_scalar('Validation Accuracy', val_accuracies[-1], epoch)

            print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_losses[-1]}, Train Accuracy: {train_accuracies[-1]:.2f}%")
            print(f"Validation Loss: {val_losses[-1]}, Validation Accuracy: {val_accuracies[-1]:.2f}%")

            # Keep only the best checkpoint rather than overwriting it every epoch
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(model.state_dict(), save_path)
                print(f"Saved new best model to {save_path}")

            # Update the learning rate and report any change
            previous_lr = optimizer.param_groups[0]['lr']
            scheduler.step(val_losses[-1])
            current_lr = optimizer.param_groups[0]['lr']
            if current_lr != previous_lr:
                print(f"Learning rate changed from {previous_lr} to {current_lr}")
    finally:
        # Flush and release the TensorBoard writer even if training fails
        writer.close()

    # Plot the loss and accuracy curves collected during training
    plot_training_results(train_losses, val_losses, train_accuracies, val_accuracies)

# Tokenizer matching the pretrained encoder
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Load the complete text-only dataset
full_dataset = TextOnlyDataset(data_folder='./data', label_file='./train.txt', tokenizer=tokenizer)

# Split sample indices (same test fraction and seed as before) and wrap them in
# Subset views. Passing the Dataset object itself to train_test_split would
# index every sample up front, reading and tokenizing the whole corpus into
# memory before training starts.
train_indices, val_indices = train_test_split(
    list(range(len(full_dataset))), test_size=0.2, random_state=42
)
train_dataset = torch.utils.data.Subset(full_dataset, train_indices)
val_dataset = torch.utils.data.Subset(full_dataset, val_indices)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

# Build the model on the configured device
model = TextOnlyModel().to(device)

# Train the model
train_model(model, train_loader, val_loader, epochs=5, learning_rate=1e-5)