原模型


In [None]:
!pip install datasets
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer
from datasets import load_dataset # The datasets module is now imported after installation.
import math
# 自定義Dataset類
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Indexable collection of raw strings.
        labels: Indexable collection of integer class ids aligned with ``texts``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts the keyword arguments used in ``__getitem__`` and returns
            a mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every example is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            A dict with the raw ``'text'``, the flattened ``'input_ids'`` and
            ``'attention_mask'`` tensors, and the ``'label'`` as a ``long``
            scalar tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        # Call the tokenizer directly: `encode_plus` is deprecated in favour
        # of `__call__`, which takes the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            # The tokenizer returns a leading batch axis of size 1; drop it so
            # the DataLoader can stack examples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor.

    The table is computed once for ``max_len`` positions and stored in the
    non-trainable buffer ``pe`` with shape ``(max_len, 1, d_model)``.

    Args:
        d_model: Size of the feature dimension (even or odd).
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim the cosine terms to fit instead of raising a shape error.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Insert a size-1 axis at dim 1 so the table broadcasts over it.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(0)`` positions.

        Positions are indexed along dim 0 of ``x``, i.e. ``x`` is expected to
        be laid out as ``(seq_len, batch, d_model)``.
        """
        return x + self.pe[:x.size(0), :]

class TransformerModel(nn.Module):
    """Transformer-encoder text classifier.

    Token ids are embedded, given sinusoidal position encodings, passed
    through a stack of self-attention encoder layers, mean-pooled over the
    sequence and projected to class logits.

    Args:
        input_dim: Vocabulary size of the embedding table.
        embed_dim: Embedding / model width (must be divisible by ``nhead``).
        nhead: Number of attention heads per encoder layer.
        nhid: Width of the feed-forward sublayer in each encoder layer.
        nlayers: Number of encoder layers.
        output_dim: Number of output classes.
        dropout: Dropout probability used inside the encoder layers.
    """

    def __init__(self, input_dim, embed_dim, nhead, nhid, nlayers, output_dim, dropout=0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(embed_dim)
        self.encoder = nn.Embedding(input_dim, embed_dim)
        # Classification needs only the encoder half. Keyword arguments make
        # sure `nhid` is the feed-forward width; passed positionally to
        # nn.Transformer it was taken as the number of decoder layers.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=nhead,
            dim_feedforward=nhid,
            dropout=dropout,
        )
        self.transformer = nn.TransformerEncoder(
            encoder_layer,
            num_layers=nlayers,
            enable_nested_tensor=False,
        )
        self.decoder = nn.Linear(embed_dim, output_dim)
        self.init_weights()

    def init_weights(self):
        """Initialise embedding and classifier weights uniformly in [-0.1, 0.1]."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, src_mask=None, attention_mask=None):
        """Compute class logits for a batch of token-id sequences.

        Args:
            src: Integer tensor of token ids, shape ``(batch, seq_len)``.
            src_mask: Optional ``(seq_len, seq_len)`` attention mask forwarded
                to the encoder stack.
            attention_mask: Optional ``(batch, seq_len)`` tensor whose
                non-zero entries mark real tokens; zero entries are treated
                as padding and ignored by attention and pooling.

        Returns:
            Tensor of shape ``(batch, output_dim)`` with unnormalised logits.
        """
        embedded = self.encoder(src) * math.sqrt(self.encoder.embedding_dim)
        # PositionalEncoding and the encoder stack index positions along
        # dim 0, so switch from (batch, seq, embed) to (seq, batch, embed).
        embedded = self.pos_encoder(embedded.transpose(0, 1))

        padding_mask = None if attention_mask is None else attention_mask == 0
        hidden = self.transformer(
            embedded, mask=src_mask, src_key_padding_mask=padding_mask
        )

        # Pool to one vector per example so the logits line up with one
        # label per example.
        if attention_mask is None:
            pooled = hidden.mean(dim=0)
        else:
            keep = attention_mask.transpose(0, 1).unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * keep).sum(dim=0) / keep.sum(dim=0).clamp(min=1.0)
        return self.decoder(pooled)

# Load the IMDb dataset
dataset = load_dataset('imdb')
train_texts = dataset['train']['text']
train_labels = dataset['train']['label']
test_texts = dataset['test']['text']
test_labels = dataset['test']['label']

# Use the RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
max_length = 128
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
test_dataset = TextClassificationDataset(test_texts, test_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Build the model
input_dim = tokenizer.vocab_size
embed_dim = 512
nhead = 8
nhid = 2048
nlayers = 6
output_dim = 2  # binary classification
dropout = 0.5

# Train on the GPU when one is available; fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = TransformerModel(input_dim, embed_dim, nhead, nhid, nlayers, output_dim, dropout)
model.to(device)

# Train the model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        # Batches are created on the CPU; move them next to the model.
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs.view(-1, output_dim), labels.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    print(f'Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}')

print("訓練完成！")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


**嵌入維度（Embedding Dimension）：**

增加嵌入維度（從512改為768）：更高的嵌入維度可以讓模型捕捉到更多的特徵和細節，這對於處理複雜的語言任務可能是有利的。這樣可以提高模型的表達能力，但也會增加計算量和訓練時間。

**注意力頭數（Number of Attention Heads）：**

增加注意力頭數（從8改為12）：更多的注意力頭可以讓模型在不同的子空間中學習到更多的關聯性和特徵，這有助於提升模型的性能，但也會增加計算資源的需求。

**隱藏層大小（Hidden Layer Size）：**

增加隱藏層大小（從2048改為4096）：更大的隱藏層可以讓模型學習到更複雜的模式和特徵，這通常會提高模型的性能，但也會增加內存和計算的需求。

**層數（Number of Layers）：**

減少層數（從6改為4）：減少層數可以使模型更輕量化，減少過擬合的風險，並且加快訓練速度。這在數據量較少或模型過於複雜時可能是有利的。

**學習率（Learning Rate）：**

降低學習率（從0.001改為0.0005）：較低的學習率可以讓模型在每一步更新時更加穩定，避免過大的步伐導致錯誤的方向，這通常有助於提高模型的最終性能和穩定性。

**批量大小（Batch Size）：**

增加批量大小（從32改為64）：更大的批量大小可以提高訓練的穩定性和效率，因為每次更新的梯度是基於更多樣本的平均值，這通常會讓訓練過程更加平滑。但需要注意的是，批量大小過大可能會導致內存不足的問題。

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer
from datasets import load_dataset
import math

# 自定義Dataset類
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Indexable collection of raw strings.
        labels: Indexable collection of integer class ids aligned with ``texts``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts the keyword arguments used in ``__getitem__`` and returns
            a mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every example is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            A dict with the raw ``'text'``, the flattened ``'input_ids'`` and
            ``'attention_mask'`` tensors, and the ``'label'`` as a ``long``
            scalar tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        # Call the tokenizer directly: `encode_plus` is deprecated in favour
        # of `__call__`, which takes the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            # The tokenizer returns a leading batch axis of size 1; drop it so
            # the DataLoader can stack examples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor.

    The table is computed once for ``max_len`` positions and stored in the
    non-trainable buffer ``pe`` with shape ``(max_len, 1, d_model)``.

    Args:
        d_model: Size of the feature dimension (even or odd).
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim the cosine terms to fit instead of raising a shape error.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Insert a size-1 axis at dim 1 so the table broadcasts over it.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(0)`` positions.

        Positions are indexed along dim 0 of ``x``, i.e. ``x`` is expected to
        be laid out as ``(seq_len, batch, d_model)``.
        """
        return x + self.pe[:x.size(0), :]

class TransformerModel(nn.Module):
    """Transformer-encoder text classifier.

    Token ids are embedded, given sinusoidal position encodings, passed
    through a stack of self-attention encoder layers, mean-pooled over the
    sequence and projected to class logits.

    Args:
        input_dim: Vocabulary size of the embedding table.
        embed_dim: Embedding / model width (must be divisible by ``nhead``).
        nhead: Number of attention heads per encoder layer.
        nhid: Width of the feed-forward sublayer in each encoder layer.
        nlayers: Number of encoder layers.
        output_dim: Number of output classes.
        dropout: Dropout probability used inside the encoder layers.
    """

    def __init__(self, input_dim, embed_dim, nhead, nhid, nlayers, output_dim, dropout=0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(embed_dim)
        self.encoder = nn.Embedding(input_dim, embed_dim)
        # Classification needs only the encoder half. Keyword arguments make
        # sure `nhid` is the feed-forward width; passed positionally to
        # nn.Transformer it was taken as the number of decoder layers.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=nhead,
            dim_feedforward=nhid,
            dropout=dropout,
        )
        self.transformer = nn.TransformerEncoder(
            encoder_layer,
            num_layers=nlayers,
            enable_nested_tensor=False,
        )
        self.decoder = nn.Linear(embed_dim, output_dim)
        self.init_weights()

    def init_weights(self):
        """Initialise embedding and classifier weights uniformly in [-0.1, 0.1]."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, src_mask=None, attention_mask=None):
        """Compute class logits for a batch of token-id sequences.

        Args:
            src: Integer tensor of token ids, shape ``(batch, seq_len)``.
            src_mask: Optional ``(seq_len, seq_len)`` attention mask forwarded
                to the encoder stack.
            attention_mask: Optional ``(batch, seq_len)`` tensor whose
                non-zero entries mark real tokens; zero entries are treated
                as padding and ignored by attention and pooling.

        Returns:
            Tensor of shape ``(batch, output_dim)`` with unnormalised logits.
        """
        embedded = self.encoder(src) * math.sqrt(self.encoder.embedding_dim)
        # PositionalEncoding and the encoder stack index positions along
        # dim 0, so switch from (batch, seq, embed) to (seq, batch, embed).
        embedded = self.pos_encoder(embedded.transpose(0, 1))

        padding_mask = None if attention_mask is None else attention_mask == 0
        hidden = self.transformer(
            embedded, mask=src_mask, src_key_padding_mask=padding_mask
        )

        # Pool to one vector per example so the logits line up with one
        # label per example.
        if attention_mask is None:
            pooled = hidden.mean(dim=0)
        else:
            keep = attention_mask.transpose(0, 1).unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * keep).sum(dim=0) / keep.sum(dim=0).clamp(min=1.0)
        return self.decoder(pooled)

# Load the IMDb dataset
dataset = load_dataset('imdb')
train_texts = dataset['train']['text']
train_labels = dataset['train']['label']
test_texts = dataset['test']['text']
test_labels = dataset['test']['label']

# Use the RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
max_length = 128
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
test_dataset = TextClassificationDataset(test_texts, test_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)  # batch size changed to 64
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Build the model
input_dim = tokenizer.vocab_size
embed_dim = 768  # embedding dimension changed to 768
nhead = 12  # number of attention heads changed to 12
nhid = 4096  # hidden (feed-forward) size changed to 4096
nlayers = 4  # number of layers changed to 4
output_dim = 2  # binary classification
dropout = 0.5

# Train on the GPU when one is available; fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = TransformerModel(input_dim, embed_dim, nhead, nhid, nlayers, output_dim, dropout)
model.to(device)

# Train the model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)  # learning rate changed to 0.0005

num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        # Batches are created on the CPU; move them next to the model.
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs.view(-1, output_dim), labels.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    print(f'Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}')

print("訓練完成！")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
