In [1]:
import platform
import torch

def showinfo(tip, info):
    """Print a label and its value on one line as ``<tip>:<info>``.

    Args:
        tip: Label shown before the colon.
        info: Value shown after the colon; rendered with its default
            string formatting.
    """
    line = f"{tip}:{info}"
    print(line)

# Report the runtime environment, one labelled line per item.
# Each value is looked up immediately before it is printed.

# Operating system name and version string.
os_description = platform.platform()
showinfo('操作系统及版本信息', os_description)

# Result of platform.architecture() for the running interpreter.
interpreter_arch = platform.architecture()
showinfo("系统位数", interpreter_arch)

# Installed PyTorch version.
torch_release = torch.__version__
showinfo("pytorch版本", torch_release)

# CUDA version reported by torch.version.cuda.
cuda_release = torch.version.cuda
showinfo("cuda版本", cuda_release)

# cuDNN version reported by torch.backends.cudnn.version().
cudnn_release = torch.backends.cudnn.version()
showinfo("cudnn版本", cudnn_release)

操作系统及版本信息:Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
系统位数:('64bit', 'ELF')
pytorch版本:2.3.0
cuda版本:12.1
cudnn版本:8902


In [2]:
from torch import nn
import torch.nn.functional as F


class TransformerEncoderLayer(nn.Module):
    """Pre-norm Transformer encoder layer: self-attention then feed-forward.

    Each sub-layer applies LayerNorm to its input, runs the transformation,
    passes the result through dropout and adds it back to the un-normalized
    input (residual connection). No normalization is applied to the final
    output.

    Args:
        d_model: Size of the feature (embedding) dimension.
        num_heads: Number of attention heads passed to
            ``nn.MultiheadAttention``.
        dim_feedforward: Hidden width of the feed-forward network.
        dropout: Dropout probability used inside the attention module, on
            the output of both sub-layers and after the feed-forward
            activation.
        batch_first: Layout of the input/output tensors. ``False`` (the
            default, matching ``nn.MultiheadAttention``) means
            ``(seq_len, batch, d_model)``; ``True`` means
            ``(batch, seq_len, d_model)``.
    """

    def __init__(self, d_model, num_heads, dim_feedforward=2048, dropout=0.1,
                 batch_first=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(
            d_model, num_heads, dropout=dropout, batch_first=batch_first
        )

        # Feed-forward network: d_model -> dim_feedforward -> d_model.
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        # dropout1 / dropout2 regularize the two residual branches.
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src):
        """Run the layer on ``src`` and return a tensor of the same shape.

        Args:
            src: Input tensor whose last dimension is ``d_model``, laid out
                as ``(seq_len, batch, d_model)``, or
                ``(batch, seq_len, d_model)`` when the layer was built with
                ``batch_first=True``.

        Returns:
            The transformed tensor, same shape as ``src``.
        """
        # --- Self-attention sub-layer (pre-norm) ---
        normed = self.norm1(src)
        # The attention weights are never used, so skip computing them.
        attn_output, _ = self.self_attn(normed, normed, normed, need_weights=False)
        # Dropout on the attention branch before the residual add.
        src = src + self.dropout1(attn_output)

        # --- Feed-forward sub-layer (pre-norm) ---
        normed = self.norm2(src)
        # linear1 expands the last dimension to dim_feedforward; dropout is
        # applied after the activation.
        hidden = self.dropout(F.relu(self.linear1(normed)))
        # linear2 projects back to d_model; dropout before the residual add.
        src = src + self.dropout2(self.linear2(hidden))
        return src