In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import math

In [2]:
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers, dim_feedforward, max_seq_length, device):
        super(TransformerModel, self).__init__()
        self.device = device
        self.embedding = nn.Embedding(vocab_size, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        self.positional_encoding = self._generate_positional_encoding(d_model, max_seq_length).to(device)
        self.output_layer = nn.Linear(d_model, vocab_size)

    def _generate_positional_encoding(self, dim, max_len):
        pe = torch.zeros(max_len, dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, dim, 2).float() * -(math.log(10000.0) / dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        return pe

    def forward(self, src):
        src = self.embedding(src) + self.positional_encoding[:src.size(0), :]
        output = self.transformer_encoder(src)
        output = self.output_layer(output)
        return output

vocab_size = 23  # 21种氨基酸 + 2个用于填充
d_model = 256  # 嵌入维度
nhead = 4  # 注意力机制的头数 #考虑调小
num_encoder_layers = 2  # 编码器层的数量
dim_feedforward = 128  # 前馈网络的维度 #调小
max_seq_length = 22  # 序列的最大长度 #考虑优化

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerModel(vocab_size, d_model, nhead, num_encoder_layers, dim_feedforward, max_seq_length, device)
print(model)



TransformerModel(
  (embedding): Embedding(23, 256)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=128, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (output_layer): Linear(in_features=256, out_features=23, bias=True)
)


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
data = pd.read_csv('/kaggle/input/pmtnetdata/data/tcr_seq.csv')
sequences = data['Amino.Acid'].values

label_encoder = LabelEncoder()
label_encoder.fit(list("ACDEFGHIKLMNPQRSTVWYOX"))  # 所有可能的氨基酸
encoded_sequences = []
for sequence in sequences:
    encoded_sequence = label_encoder.transform(list(sequence))
    encoded_sequences.append(encoded_sequence)

max_seq_length = max([len(seq) for seq in encoded_sequences])
padded_sequences = np.zeros((len(encoded_sequences), max_seq_length), dtype=int)
for i, seq in enumerate(encoded_sequences):
    padded_sequences[i, :len(seq)] = seq[:max_seq_length]

X_train, X_val, _, _ = train_test_split(padded_sequences, np.zeros(len(padded_sequences)), test_size=0.2, random_state=42)

train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.long).to(device))
val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.long).to(device))

train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=512, shuffle=False)

model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001)

num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        inputs = batch[0].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, vocab_size), inputs.view(-1))
        loss.backward()
        optimizer.step()
    if epoch%10 == 0:
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')

torch.save(model.state_dict(), 'transformer_model.pth')

Epoch 1, Loss: 2.1207356452941895
Epoch 11, Loss: 0.0248120054602623
Epoch 21, Loss: 0.00518172699958086
Epoch 31, Loss: 0.0028125355020165443
Epoch 41, Loss: 0.0020934364292770624
Epoch 51, Loss: 0.0018653686856850982
Epoch 61, Loss: 0.002030816162005067
Epoch 71, Loss: 0.001400624169036746
Epoch 81, Loss: 0.0012842409778386354
Epoch 91, Loss: 0.001004404854029417
