# 单头注意力

In [None]:
import torch
import torch.nn as nn
import math

class Attention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Queries, keys and values are all linear projections of the same input,
    each keeping the feature size ``hidden_dim``.
    """

    def __init__(self, hidden_dim):
        super().__init__()

        self.hidden_dim = hidden_dim
        # One learned projection each for queries, keys and values.
        self.q_proj = nn.Linear(hidden_dim, hidden_dim)
        self.k_proj = nn.Linear(hidden_dim, hidden_dim)
        self.v_proj = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Input of shape [batch, seq_len, hidden_dim].
            mask: Optional tensor broadcastable to the score matrix
                [batch, seq_len, seq_len]; positions equal to 0 (e.g.
                padding) are excluded from attention.

        Returns:
            Tensor of shape [batch, seq_len, hidden_dim].
        """
        # query / key / value: [batch, seq_len, hidden_dim]
        query, key, value = self.q_proj(x), self.k_proj(x), self.v_proj(x)

        # Swap the last two axes of the keys so the product yields pairwise
        # scores of shape [batch, seq_len, seq_len].
        scores = query @ key.transpose(-2, -1)

        if mask is not None:
            # -inf turns into a zero weight after the softmax.
            scores = scores.masked_fill(mask == 0, float("-inf"))

        scale = math.sqrt(self.hidden_dim)
        weights = (scores / scale).softmax(dim=-1)

        # Weighted sum of the values: [batch, seq_len, hidden_dim]
        return weights @ value

# Smoke test: a random batch of 2 sequences, 3 tokens each, 4 features per token.
inputs = torch.randn(2, 3, 4)
# All-ones mask: no position is masked out.
mask = torch.ones(3, 3)
attention = Attention(4)
# As the last expression of the cell, the output tensor is displayed by the notebook.
attention(inputs, mask)

# 多头注意力

In [None]:
import torch
import torch.nn as nn
import math

class MutiHeadAttention(nn.Module):
    """Multi-head self-attention followed by a residual connection and LayerNorm.

    The input is projected to queries, keys and values, split into
    ``num_heads`` heads of size ``hidden_dim // num_heads``, attended
    independently per head, concatenated, passed through an output
    projection, added back to the input and layer-normalised.

    Args:
        hidden_dim: Feature size of the input and of the output.
        num_heads: Number of attention heads; must divide ``hidden_dim``.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, hidden_dim, num_heads, dropout=0.1):
        super().__init__()

        # Validate before deriving head_dim so a bad configuration is
        # reported with a clear message.
        assert hidden_dim % num_heads == 0, (
            f"hidden_dim ({hidden_dim}) must be divisible by num_heads ({num_heads})"
        )

        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        self.head_dim = hidden_dim // num_heads

        self.q_proj = nn.Linear(hidden_dim, hidden_dim)
        self.k_proj = nn.Linear(hidden_dim, hidden_dim)
        self.v_proj = nn.Linear(hidden_dim, hidden_dim)

        self.attention_drop = nn.Dropout(dropout)
        # Multi-head attention needs one extra projection to mix the heads.
        self.out_proj = nn.Linear(hidden_dim, hidden_dim)

        self.layer_norm = nn.LayerNorm(hidden_dim)

    def forward(self, x, mask=None):
        """Apply multi-head self-attention to ``x``.

        Args:
            x: Input of shape [batch, seq_len, hidden_dim].
            mask: Optional tensor whose zeros mark positions to exclude.
                A 3-D mask is interpreted as per-sample
                [batch, seq_len, seq_len] and shared across heads; any other
                rank must broadcast against
                [batch, num_heads, seq_len, seq_len] (e.g. [seq_len, seq_len]).

        Returns:
            Tensor of shape [batch, seq_len, hidden_dim].
        """
        batch_size, seq_len, _ = x.shape

        # Linear projections: [batch, seq_len, hidden_dim]
        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)

        # Split into heads: [batch, num_heads, seq_len, head_dim]
        q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Raw attention scores: [batch, num_heads, seq_len, seq_len]
        attention_score = torch.matmul(q, k.transpose(-1, -2))
        if mask is not None:
            if mask.dim() == 3:
                # Insert a head axis so a per-sample mask is not broadcast
                # along the head dimension by mistake:
                # [batch, seq_len, seq_len] -> [batch, 1, seq_len, seq_len]
                mask = mask.unsqueeze(1)
            attention_score = attention_score.masked_fill(mask == 0, float('-inf'))
        attention_score = torch.softmax(attention_score / math.sqrt(self.head_dim), dim=-1)
        attention_score = self.attention_drop(attention_score)

        # Weighted sum of values: [batch, num_heads, seq_len, head_dim]
        output = torch.matmul(attention_score, v)

        # Concatenate heads: [batch, seq_len, hidden_dim].
        # view() requires contiguous memory, which transpose() does not
        # guarantee, hence the explicit contiguous().
        output = output.transpose(1, 2).contiguous().view(batch_size, seq_len, self.hidden_dim)

        # Output projection.
        output = self.out_proj(output)

        # Residual connection and layer normalisation.
        output = self.layer_norm(x + output)

        return output

# Smoke test: a random batch of 2 sequences, 3 tokens each, 4 features per token.
inputs = torch.randn(2,3,4)
# All-ones mask: no position is masked out.
mask = torch.ones(3, 3)

# hidden_dim=4 split across 2 heads gives head_dim=2.
attention = MutiHeadAttention(4, 2)
# As the last expression of the cell, the output tensor is displayed by the notebook.
attention(inputs, mask)

# DNN

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# 自定义数据集
class CustomDataset(Dataset):
    """Synthetic dataset of three categorical features with a binary label.

    Each sample holds three integer category ids drawn uniformly from
    3, 4 and 5 categories respectively. The label is 1 when the ids sum to
    more than 6 and 0 otherwise.
    """

    def __init__(self, num_samples):
        """Generate ``num_samples`` random samples up front."""
        self.num_samples = num_samples
        self.X, self.Y = [], []

        # Number of categories of each feature, in feature order.
        cardinalities = (3, 4, 5)
        for _ in range(num_samples):
            # One 1-element id tensor per categorical feature.
            feats = [torch.randint(0, size, (1,)) for size in cardinalities]
            sample = torch.cat(feats)

            # Binary target derived from the sum of the category ids.
            is_positive = bool(sample.sum() > 6)
            target = torch.tensor([1 if is_positive else 0])

            self.X.append(sample)
            self.Y.append(target)

    def __len__(self):
        """Return the number of samples."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.X[idx], self.Y[idx]

# Build the dataset
num_samples = 1000  # number of samples to generate
dataset = CustomDataset(num_samples)

# Build the DataLoader: mini-batches of 64, reshuffled every epoch
train_loader = DataLoader(dataset, batch_size=64, shuffle=True)

# 模型搭建
class SimpleDNN(nn.Module):
    """Small feed-forward classifier over three categorical features.

    Each feature is embedded into 50 dimensions; the concatenated
    150-dimensional vector goes through one hidden layer of 25 units and a
    2-way output layer that produces raw logits.
    """

    def __init__(self):
        super().__init__()

        # One embedding table per feature (3, 4 and 5 categories).
        self.emb1 = nn.Embedding(3, 50)
        self.emb2 = nn.Embedding(4, 50)
        self.emb3 = nn.Embedding(5, 50)

        self.fc1 = nn.Linear(150, 25)  # hidden layer
        self.fc2 = nn.Linear(25, 2)  # output layer

    def forward(self, x):
        """Return logits of shape [batch, 2] for ids ``x`` of shape [batch, 3]."""
        first = self.emb1(x[:, 0])
        second = self.emb2(x[:, 1])
        third = self.emb3(x[:, 2])

        # [batch, 150]
        features = torch.cat((first, second, third), dim=1)

        hidden = torch.relu(self.fc1(features))
        return self.fc2(hidden)

model = SimpleDNN()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model, evaluating on the training set after every epoch.
num_epochs = 5
for epoch in range(num_epochs):
    # The evaluation pass at the end of the previous epoch switched the model
    # to eval mode; switch back so training-only layers behave correctly.
    model.train()
    for data, target in train_loader:
        output = model(data)
        # Labels arrive as [batch, 1]; CrossEntropyLoss expects [batch].
        # Squeeze only dim 1 so a final batch of size 1 is not collapsed
        # to a 0-dim tensor.
        target = target.squeeze(1)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    # Reports the loss of the last mini-batch of the epoch.
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

    # Predict on the training set.
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in train_loader:
            output = model(data)
            # keepdim=True keeps shape [batch, 1], matching the unsqueezed target.
            predicted = output.argmax(dim=1, keepdim=True)
            total += target.size(0)
            correct += (predicted == target).sum().item()

    print(f'Accuracy of the network on the training data: {100 * correct / total}%')

# AUC

In [None]:
def AUC(label, pre):
    """Compute ROC AUC by exhaustive positive/negative pair comparison.

    AUC is the fraction of (positive, negative) pairs in which the positive
    sample receives the higher score; a tie counts as half a pair.

    Args:
        label: Sequence of ground-truth labels; 1 marks a positive and 0 a
            negative sample. Entries with any other value are ignored.
        pre: Sequence of predicted scores, indexed in parallel with ``label``.

    Returns:
        The AUC as a float in [0, 1].

    Raises:
        ValueError: If ``label`` contains no positive or no negative sample,
            in which case AUC is undefined.
    """
    # Collect the scores of the positive and of the negative samples.
    pos_scores = [pre[i] for i, y in enumerate(label) if y == 1]
    neg_scores = [pre[i] for i, y in enumerate(label) if y == 0]

    if not pos_scores or not neg_scores:
        raise ValueError(
            "AUC is undefined unless label contains both positive (1) "
            "and negative (0) samples"
        )

    concordant = 0.0
    for pos in pos_scores:
        for neg in neg_scores:
            if pos > neg:
                concordant += 1
            elif pos == neg:
                concordant += 0.5

    return concordant / (len(pos_scores) * len(neg_scores))
 
 
# Sanity check: compare the hand-written AUC against scikit-learn on a tiny example.
if __name__ == '__main__':
    # Ground-truth labels and the corresponding predicted scores.
    label = [1,0,0,0,1,0,1,0]
    pre = [0.9, 0.8, 0.3, 0.1, 0.4, 0.9, 0.66, 0.7]
    print(AUC(label, pre))

    # Reference value: area under the ROC curve computed by scikit-learn.
    from sklearn.metrics import roc_curve, auc
    fpr, tpr, th = roc_curve(label, pre , pos_label=1)
    print('sklearn', auc(fpr, tpr))
