In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
# Load the OTU table file
otu_table = pd.read_csv('genus_rotated_f.csv', header=0, index_col=None)

# Convert to a float NumPy array (presumably rows are samples and columns are
# features -- TODO confirm against the CSV layout).
X_raw = otu_table.to_numpy(dtype=np.float64)

# Per-column statistics. The NaN-aware variants are used on purpose: with plain
# mean()/std() a single missing value turns the whole column's statistics into
# NaN, and the nan_to_num() call below would then silently zero out the entire
# feature instead of only the missing entries.
# NOTE(review): these statistics are fitted on all rows before the train/test
# split, so held-out rows influence the scaling. Fit them on X_train only if
# X_test is later used for an unbiased evaluation.
mean = np.nanmean(X_raw, axis=0)
std = np.nanstd(X_raw, axis=0)
# Constant columns (std == 0) and all-NaN columns (std is NaN) get a divisor of 1
# so the standardisation below never divides by zero.
std[~np.isfinite(std) | (std == 0)] = 1

# Standardise every column to zero mean and unit variance.
X = (X_raw - mean) / std

# Entries that were missing in the raw table are still NaN here; replace them
# with 0, which is the column mean after standardisation.
X = np.nan_to_num(X, nan=0.0)

# Train/test split (80/20, fixed seed so the partition is reproducible).
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)
X

array([[-0.05607722,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.05607722,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.05607722,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.05607722,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.05607722,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.05607722,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [24]:
# 定义Transformer编码器
class TransformerEncoder(nn.Module):
    """Stack of Transformer encoder layers followed by a linear projection.

    Args:
        input_dim: Width of the last input dimension. It is used as ``d_model``
            of every encoder layer and as both the input and output width of
            the final linear layer, so the output has the same shape as the input.
        nhead: Number of attention heads handed to each encoder layer.
        num_layers: Number of encoder layers in the stack.
        dim_feedforward: Hidden width of each layer's feed-forward sub-network.
    """

    def __init__(self, input_dim, nhead, num_layers, dim_feedforward):
        super().__init__()
        # Template layer for the stack; it is also exposed as `self.encoder_layer`.
        template_layer = nn.TransformerEncoderLayer(
            d_model=input_dim,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
        )
        self.encoder_layer = template_layer
        self.transformer_encoder = nn.TransformerEncoder(
            template_layer,
            num_layers=num_layers,
        )
        # Optional projection head; its width decides the embedding dimension
        # (currently kept equal to input_dim).
        self.linear = nn.Linear(in_features=input_dim, out_features=input_dim)

    def forward(self, x):
        """Encode ``x`` and project it; the result has the same shape as ``x``.

        NOTE(review): the layers are built with PyTorch's default layout, so a
        2-D ``(rows, input_dim)`` tensor is presumably treated as one unbatched
        sequence whose rows attend to each other -- confirm this is intended.
        """
        return self.linear(self.transformer_encoder(x))

# 掩码数据函数
def mask_data(x, mask_ratio=0.15, rng=None):
    """Randomly hide a fraction of the entries of ``x`` by setting them to 0.

    Each entry is masked independently when a uniform draw from [0, 1) falls
    below ``mask_ratio``, so the masked fraction is ``mask_ratio`` only in
    expectation.

    Args:
        x: Array to mask. It is copied, never modified in place.
        mask_ratio: Probability with which each entry is masked.
        rng: Optional source of randomness for reproducible masks: a
            ``numpy.random.Generator`` or anything accepted by
            ``numpy.random.default_rng`` (e.g. an int seed). Passing the same
            int seed on every call yields the same mask on every call; pass a
            Generator to get a different but reproducible mask per call.
            When ``None`` (default), NumPy's global random state is used, so
            ``np.random.seed`` still controls the result.

    Returns:
        ``(x_masked, mask)``: the masked copy of ``x`` and a boolean array of
        the same shape that is True where an entry was replaced by 0.
    """
    if rng is None:
        draws = np.random.rand(*x.shape)
    else:
        draws = np.random.default_rng(rng).random(x.shape)
    mask = draws < mask_ratio
    x_masked = x.copy()
    x_masked[mask] = 0  # masked entries are filled with 0
    return x_masked, mask

In [31]:
# Hyperparameters and run configuration
SEED = 42
# Seed both random number generators used further down: PyTorch (weight
# initialisation, dropout) and NumPy's global state (the default source of the
# random masks), so that a fresh run of the notebook is reproducible.
np.random.seed(SEED)
torch.manual_seed(SEED)

input_dim = X_train.shape[1]  # number of feature columns, used as the model width
nhead = 3
num_layers = 2
dim_feedforward = 128
epochs = 50
learning_rate = 0.001

# nhead is hard-coded while input_dim comes from the data; multi-head attention
# needs the model width to split evenly across heads, so fail early and clearly.
assert input_dim % nhead == 0, (
    f'input_dim ({input_dim}) must be divisible by nhead ({nhead})'
)

input_dim

393

In [32]:
# Build the model from the hyperparameters defined above
model = TransformerEncoder(
    input_dim=input_dim,
    nhead=nhead,
    num_layers=num_layers,
    dim_feedforward=dim_feedforward,
)

In [33]:
# Reconstruction objective: mean squared error
criterion = nn.MSELoss()
# Adam updates every parameter of the model with a fixed learning rate
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)


In [34]:
import matplotlib.pyplot as plt

# Train the model with a masked-reconstruction objective: random entries of the
# training matrix are hidden and the model is asked to predict their values.

# The clean targets never change, so convert them to a tensor once instead of
# once per epoch.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)

losses = []
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()

    # Draw a fresh random mask for this epoch
    X_train_masked, mask = mask_data(X_train)
    X_train_masked_tensor = torch.tensor(X_train_masked, dtype=torch.float32)
    # Explicit boolean tensor so torch tensors are indexed with a torch mask
    mask_tensor = torch.tensor(mask, dtype=torch.bool)

    # Forward pass on the masked input
    outputs = model(X_train_masked_tensor)

    # The loss is computed on the masked positions only
    loss = criterion(outputs[mask_tensor], X_train_tensor[mask_tensor])
    loss.backward()
    optimizer.step()

    loss_value = loss.item()
    losses.append(loss_value)
    print(f'Epoch {epoch+1}, Loss: {loss_value}')

# Generate embeddings for the full (unmasked) data set; they are meant to be
# used as input for downstream tasks.
model.eval()
with torch.no_grad():
    embeddings = model(torch.tensor(X, dtype=torch.float32)).numpy()

# Plot the training loss curve
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, epochs + 1), losses, label='Training Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training Loss over Epochs')
ax.legend()
plt.show()

Epoch 1, Loss: 0.6059686541557312
Epoch 2, Loss: 0.5466816425323486
Epoch 3, Loss: 0.5837085247039795
Epoch 4, Loss: 0.47711533308029175
Epoch 5, Loss: 0.4429970681667328
Epoch 6, Loss: 0.37490221858024597
Epoch 7, Loss: 0.2904721200466156
Epoch 8, Loss: 0.26965397596359253
Epoch 9, Loss: 0.25968828797340393
Epoch 10, Loss: 0.3568226993083954
Epoch 11, Loss: 0.3834213316440582
Epoch 12, Loss: 0.365933895111084
Epoch 13, Loss: 0.3818112015724182
Epoch 14, Loss: 0.39016440510749817
Epoch 15, Loss: 0.31721505522727966
Epoch 16, Loss: 0.2872919738292694
Epoch 17, Loss: 0.29132840037345886
Epoch 18, Loss: 0.31863150000572205
Epoch 19, Loss: 0.31089818477630615
Epoch 20, Loss: 0.3327706754207611
Epoch 21, Loss: 0.23491650819778442
Epoch 22, Loss: 0.33862465620040894
Epoch 23, Loss: 0.2721449136734009
Epoch 24, Loss: 0.37342599034309387
Epoch 25, Loss: 0.2526301443576813
Epoch 26, Loss: 0.3040506839752197
Epoch 27, Loss: 0.22153881192207336
Epoch 28, Loss: 0.2894376516342163
Epoch 29, Loss: 0