In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

# 检查是否有GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 加载OTU表格文件
otu_table = pd.read_csv('genus_rotated_f_filtered.csv', header=0, index_col=None)

# # 标准化数据
# scaler = StandardScaler()
# otu_table_scaled = scaler.fit_transform(otu_table)

# 将标准化后的数据转换回DataFrame
otu_table_scaled_df = otu_table

# 创建genus名称到唯一索引的映射
genus_names = otu_table.columns.tolist()
genus_to_idx = {genus: idx for idx, genus in enumerate(genus_names)}

# 创建genus名称到唯一索引的映射
genus_names = otu_table_scaled_df.columns.tolist()
genus_to_idx = {genus: idx for idx, genus in enumerate(genus_names)}

otu_table_scaled_df

Unnamed: 0,Akkermansia,Alistipes,Bacteroides,Blautia,Clostridium,Collinsella,Coprococcus,Dialister,Dorea,Eubacterium,...,Lachnospira,Odoribacter,Oscillospira,Parabacteroides,Phascolarctobacterium,Prevotella,Roseburia,Ruminococcus,Subdoligranulum,Sutterella
0,0,187,3661,10,7,2,0,0,2,0,...,18,20,12,167,4,2,55,38,0,89
1,0,0,3377,99,22,19,61,18,0,131,...,17,0,0,475,0,5,104,35,121,0
2,0,6,3408,19,37,0,0,6,2,6,...,0,12,5,80,0,3,95,54,26,0
3,87,1018,2108,70,88,1,58,27,3,60,...,26,36,635,144,0,0,837,311,353,0
4,2,181,1535,67,68,0,26,0,9,15,...,11,0,183,748,148,0,308,115,25,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314,0,19,1625,14,12,0,3,6,1,2,...,7,0,3,19,21,0,44,21,32,85
315,0,112,6358,20,20,0,3,0,6,1,...,4,0,410,53,21,0,29,9,83,0
316,2,221,1338,73,204,14,39,187,15,7,...,6,10,389,17,355,271,39,324,233,0
317,9,0,727,18,60,18,76,272,3,25,...,24,0,150,211,0,1601,11,84,13,0


In [4]:

# 自定义数据集类
class MicrobiomeDataset(Dataset):
    def __init__(self, data, mask_prob=0.20):
        self.data = data
        self.mask_prob = mask_prob
        self.genus_to_idx = genus_to_idx
        self.vocab_size = len(genus_to_idx)
        self.mask_token_idx = self.vocab_size  # 使用词汇表大小作为mask token索引
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        sample = self.data[idx]
        input_sample = sample.copy()
        target_sample = sample.copy()
        
        # 随机mask一些位点
        mask = np.random.rand(len(sample)) < self.mask_prob
        input_sample[mask] = self.mask_token_idx  # 使用mask token的索引
        
        return torch.tensor(input_sample, dtype=torch.long), torch.tensor(target_sample, dtype=torch.long)

# class TransformerEncoderModel(nn.Module):
#     def __init__(self, vocab_size, d_model, nhead, num_layers):
#         super(TransformerEncoderModel, self).__init__()
#         self.embedding = nn.Embedding(vocab_size + 1, d_model)  # vocab_size + 1用于包含mask token
#         encoder_layers = nn.TransformerEncoderLayer(d_model, nhead)
#         self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
#         self.fc = nn.Linear(d_model, vocab_size)

#     def forward(self, x):
#         x = self.embedding(x)
#         x = self.transformer_encoder(x.permute(1, 0, 2))  # 调整维度以适应Transformer输入 (seq_len, batch_size, d_model)
#         x = x.permute(1, 0, 2)  # 调整维度回 (batch_size, seq_len, d_model)
#         x = self.fc(x)
#         return x
class TransformerEncoderModel(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers):
        super(TransformerEncoderModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size + 1, d_model)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x, return_features=False):
        x = self.embedding(x)
        x = self.transformer_encoder(x.permute(1, 0, 2))
        x = x.permute(1, 0, 2)
        if return_features:
            return x
        x = self.fc(x)
        return x
# 模型参数
vocab_size = len(genus_names)
d_model = 512
nhead = 8
num_layers = 6
print(vocab_size)
# 初始化模型并移动到GPU
model = TransformerEncoderModel(vocab_size, d_model, nhead, num_layers).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=-1)  # 忽略mask token的损失计算
optimizer = optim.Adam(model.parameters(), lr=0.00001)

# 数据加载
dataset = MicrobiomeDataset(otu_table_scaled_df.values)
data_loader = DataLoader(dataset, batch_size=4, shuffle=True)


model

22


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [3]:
def train(model, data_loader, criterion, optimizer, device, epochs=128):
    model.train()
    losses = []
    for epoch in range(epochs):
        total_loss = 0
        for input_data, target_data in data_loader:
            # 移动数据到GPU
            input_data, target_data = input_data.to(device), target_data.to(device)
            
            optimizer.zero_grad()
            output = model(input_data)
            loss = criterion(output.view(-1, vocab_size), target_data.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(data_loader)
        losses.append(avg_loss)
        print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss}')
    return losses

# 开始训练
losses = train(model, data_loader, criterion, optimizer, device)

# 保存模型
torch.save(model.state_dict(), 'pretrained_transformer_encoder_model.pth')

# 绘制损失图
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(losses) + 1), losses, marker='o')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss Over Epochs')
plt.grid(True)
plt.show()

/opt/conda/conda-bld/pytorch_1678411187366/work/aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [23,0,0], thread: [64,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1678411187366/work/aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [23,0,0], thread: [65,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1678411187366/work/aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [23,0,0], thread: [66,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1678411187366/work/aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [23,0,0], thread: [67,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1678411187366/work/aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [23,0,0], thread: [68,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/cond

RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

In [None]:
class EnhancedMLP(nn.Module):
    def __init__(self, pretrained_model, input_size, hidden_size, num_classes):
        super(EnhancedMLP, self).__init__()
        self.pretrained_model = pretrained_model
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        with torch.no_grad():
            x = self.pretrained_model(x.long(), return_features=True)  # 获取编码后的特征
        x = x.mean(dim=1)  # 对序列维度进行平均池化
        x = torch.nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [None]:
# 加载预训练的TransformerEncoderModel
pretrained_model = TransformerEncoderModel(vocab_size, d_model, nhead, num_layers)
pretrained_model.load_state_dict(torch.load('pretrained_transformer_encoder_model.pth'))
pretrained_model.to(device)
pretrained_model.eval()  # 设置为评估模式

# EnhancedMLP模型参数
input_size = vocab_size * d_model  # 根据TransformerEncoderModel的输出尺寸调整
hidden_size = 128
num_classes = 10  # 假设有10个类别进行分类

# 初始化EnhancedMLP模型并移动到GPU
enhanced_mlp = EnhancedMLP(pretrained_model, input_size, hidden_size, num_classes).to(device)

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = Adam(enhanced_mlp.parameters(), lr=0.0001)
enhanced_mlp

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader

# Read the data into a DataFrame
df = pd.read_csv("NSCLC.csv")
# Filter out columns that contain all zeros
# df = df.loc[:, (NSCLC != 0).any(axis=0)]
f1 = pd.read_csv('genus_rotated_f_filtered.csv')
# print(f1.shape)
# Extract genus-level data
df['Genus'] = df['#NAME'].apply(lambda x: x.split(';g__')[1].split(';')[0] if ';g__' in x else 'Unclassified')

# Select the relevant columns
genus_df = df[['Genus'] + df.columns[1:-1].tolist()]

# Filter out rows with "_unclassified" in the Genus column
NSCLC = genus_df[~genus_df['Genus'].str.contains('_unclassified')]
if 'Genus' in NSCLC.columns:
    NSCLC = NSCLC.groupby('Genus').sum().reset_index()
else:
    NSCLC = NSCLC  # If there is no 'genus' column, use the original data
NSCLC = NSCLC[NSCLC['Genus'].notna() & (NSCLC['Genus'] != '')]
NSCLC = NSCLC.loc[:, (NSCLC != 0).any(axis=0)]
NSCLC.set_index(NSCLC.columns[0], inplace=True)
f2 = NSCLC.transpose()
missing_cols = [col for col in f1.columns if col not in f2.columns]
# Add missing columns to f2 with values set to 0 using pd.concat
f2 = pd.concat([f2, pd.DataFrame(0, index=f2.index, columns=missing_cols)], axis=1)
# Drop columns from f2 that are not in f1
f2 = f2[f1.columns]
# Merge f2 to f1, keeping only the column names
f1 = f2
metadata  = pd.read_csv('metadata_response.csv')
metadata.set_index(metadata.columns[0], inplace=True)
# num_columns = len(merged_table.columns) - 1
merged_table = f1.join(metadata, how='inner')
# merged_table.to_csv("merged_table.csv",index=False)
# merged_table = merged_table.drop(columns=['Best response'])
response = merged_table['Best response']
otu_table_merge = merged_table.drop(columns=['Best response'])
# Drop the first column if it contains sample IDs or unnecessary data
otu_table_merge = otu_table_merge.iloc[:, 1:]

# # Normalize OTU counts by total counts per sample
# normalized_otu_counts = otu_table_merge.div(otu_table_merge.sum(axis=1), axis=0)

# # Optionally, convert to percentages
# normalized_otu_counts *= 100
# 标准化OTU数据
# scaler = StandardScaler()
# otu_table_scaled = scaler.fit_transform(otu_table_merge)
normalized_otu_counts = otu_table_merge
# 将标准化后的数据转换回DataFrame
# normalized_otu_counts = pd.DataFrame(otu_table_scaled, columns=otu_table_merge.columns)
# Print to verify
# normalized_otu_counts.to_csv("normalized_otu_counts.csv",index=False)
# Create a dictionary to map genus names to unique indices
genus_names = normalized_otu_counts.columns.tolist()
genus_to_idx = {genus: idx for idx, genus in enumerate(genus_names)}
genus_names
genus_to_idx
from sklearn.preprocessing import LabelEncoder

# Assuming 'merged_table' is your DataFrame containing the response and features
encoder = LabelEncoder()
merged_table['Best response'] = encoder.fit_transform(merged_table['Best response'])

# Separate features and target
# features = merged_table.drop('Best response', axis=1)
features = normalized_otu_counts
targets = merged_table['Best response']
targets

In [None]:
class OTUDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        # Ensure data is returned as tensors
        x = torch.tensor(self.features.iloc[idx].values, dtype=torch.float32)
        y = torch.tensor(self.targets.iloc[idx], dtype=torch.long)  # Use torch.long for classification labels
        return x, y
from sklearn.model_selection import train_test_split
# Encode the 'Best response' column
encoder = LabelEncoder()
merged_table['Best response'] = encoder.fit_transform(merged_table['Best response'])

# Split the features and targets into training and testing sets
features_train, features_test, targets_train, targets_test = train_test_split(
    features, targets, test_size=0.2, random_state=42)
# print(features_test)
train_dataset = OTUDataset(features_train, targets_train)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataset = OTUDataset(features_test, targets_test)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

In [None]:
# Train and evaluate the enhanced model
def train_and_evaluate(model, train_dataloader, test_dataloader, criterion, optimizer, num_epochs=256):
    # Check if GPU is available and move the model to GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    def train_model(model, criterion, optimizer, dataloader, num_epochs):
        model.train()
        for epoch in range(num_epochs):
            total_loss = 0
            for inputs, labels in dataloader:
                # Move inputs and labels to GPU
                inputs, labels = inputs.to(device), labels.to(device)
                
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
            if (epoch + 1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(dataloader):.4f}')

    def evaluate_model(model, dataloader):
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in dataloader:
                # Move inputs and labels to GPU
                inputs, labels = inputs.to(device), labels.to(device)
                
                outputs = model(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = 100 * correct / total
        print(f'Accuracy: {accuracy:.2f}%')

    # Train the model
    train_model(model, criterion, optimizer, train_dataloader, num_epochs)
    
    # Evaluate the model
    evaluate_model(model, test_dataloader)



In [None]:
hidden_size = 128
enhanced_model = EnhancedMLP(pretrained_model, d_model, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(enhanced_model.parameters(), lr=0.001)

# Train and evaluate the enhanced model
train_and_evaluate(enhanced_model, train_dataloader, test_dataloader, criterion, optimizer, num_epochs=64)