In [70]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import numpy as np
import matplotlib.pyplot as plt

# Load the OTU table file
otu_table = pd.read_csv('genus_rotated_f.csv', header=0, index_col=None)
# Drop columns if all their values are 0
# otu_table = otu_table.loc[:, (otu_table != 0).any(axis=0)]
# Normalize the OTU counts (skip the first column if it's the sample IDs)
otu_table_proportions = otu_table.div(otu_table.sum(axis=1), axis=0)
# Convert proportions to percentages
otu_table_percentages = otu_table_proportions * 100
print(otu_table_percentages.shape)
# Create a dictionary to map genus names to unique indices
genus_names = otu_table_proportions.columns.tolist()
genus_to_idx = {genus: idx for idx, genus in enumerate(genus_names)}


(319, 393)


In [71]:
class OTUDataset(Dataset):
    def __init__(self, otu_table_percentages):
        self.otu_table_percentages = otu_table_percentages.values

    def __len__(self):
        return len(self.otu_table_percentages)

    def __getitem__(self, idx):
        sample = self.otu_table_percentages[idx]
        return torch.tensor(sample, dtype=torch.float32)


In [91]:
class MicroTransformerModel(nn.Module):
    def __init__(self, input_dim, embed_dim, num_heads, ff_dim, num_layers, dropout=0.1):
        super(MicroTransformerModel, self).__init__()
        self.embedding = nn.Linear(input_dim, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=ff_dim, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim, input_dim)

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer_encoder(x)
        x = self.fc(x)
        return x


In [92]:
# Hyperparameters
input_dim = otu_table_percentages.shape[1]
# print(input_dim)
embed_dim = 128
num_heads = 4
ff_dim = 128
num_layers = 3
dropout = 0.1
batch_size = 32
num_epochs = 128
learning_rate = 0.001

# Create the dataset and dataloader
dataset = OTUDataset(otu_table_percentages)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialize the model, loss function, and optimizer
model = MicroTransformerModel(input_dim, embed_dim, num_heads, ff_dim, num_layers, dropout)
criterion = nn.MSELoss()
optimizer = Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, data in enumerate(dataloader):
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, data)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    epoch_loss = running_loss / len(dataloader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

# Save the model
torch.save(model.state_dict(), 'transformer_pretrained_otu.pth')


Epoch [1/128], Loss: 9.5154
Epoch [2/128], Loss: 8.6709
Epoch [3/128], Loss: 8.2212
Epoch [4/128], Loss: 7.8054
Epoch [5/128], Loss: 7.4045
Epoch [6/128], Loss: 7.0158
Epoch [7/128], Loss: 6.6268
Epoch [8/128], Loss: 6.2395
Epoch [9/128], Loss: 5.8509
Epoch [10/128], Loss: 5.4657
Epoch [11/128], Loss: 5.0894
Epoch [12/128], Loss: 4.7127
Epoch [13/128], Loss: 4.3545
Epoch [14/128], Loss: 4.0077
Epoch [15/128], Loss: 3.6651
Epoch [16/128], Loss: 3.3481
Epoch [17/128], Loss: 3.0426
Epoch [18/128], Loss: 2.7581
Epoch [19/128], Loss: 2.4932
Epoch [20/128], Loss: 2.2563
Epoch [21/128], Loss: 2.0270
Epoch [22/128], Loss: 1.8222
Epoch [23/128], Loss: 1.6347
Epoch [24/128], Loss: 1.4675
Epoch [25/128], Loss: 1.3092
Epoch [26/128], Loss: 1.1748
Epoch [27/128], Loss: 1.0508
Epoch [28/128], Loss: 0.9398
Epoch [29/128], Loss: 0.8447
Epoch [30/128], Loss: 0.7672
Epoch [31/128], Loss: 0.6872
Epoch [32/128], Loss: 0.6231
Epoch [33/128], Loss: 0.5603
Epoch [34/128], Loss: 0.5086
Epoch [35/128], Loss: 0

In [96]:
# Load the pretrained transformer model
pretrained_model = MicroTransformerModel(input_dim, embed_dim, num_heads, ff_dim, num_layers, dropout)
print(pretrained_model)
pretrained_model.load_state_dict(torch.load('transformer_pretrained_otu.pth'))
# pretrained_model.eval()  # Set the model to evaluation mode


MicroTransformerModel(
  (embedding): Linear(in_features=393, out_features=128, bias=True)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=128, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc): Linear(in_features=128, out_features=393, bias=True)
)


<All keys matched successfully>

In [87]:
import pandas as pd
# Read the data into a DataFrame
df = pd.read_csv("NSCLC.csv")
# Filter out columns that contain all zeros
# df = df.loc[:, (NSCLC != 0).any(axis=0)]
f1 = pd.read_csv('genus_rotated_f.csv')
# print(f1.shape)
# Extract genus-level data
df['Genus'] = df['#NAME'].apply(lambda x: x.split(';g__')[1].split(';')[0] if ';g__' in x else 'Unclassified')

# Select the relevant columns
genus_df = df[['Genus'] + df.columns[1:-1].tolist()]

# Filter out rows with "_unclassified" in the Genus column
NSCLC = genus_df[~genus_df['Genus'].str.contains('_unclassified')]
if 'Genus' in NSCLC.columns:
    NSCLC = NSCLC.groupby('Genus').sum().reset_index()
else:
    NSCLC = NSCLC  # If there is no 'genus' column, use the original data
NSCLC = NSCLC[NSCLC['Genus'].notna() & (NSCLC['Genus'] != '')]
NSCLC = NSCLC.loc[:, (NSCLC != 0).any(axis=0)]
NSCLC.set_index(NSCLC.columns[0], inplace=True)
f2 = NSCLC.transpose()
missing_cols = [col for col in f1.columns if col not in f2.columns]
# Add missing columns to f2 with values set to 0 using pd.concat
f2 = pd.concat([f2, pd.DataFrame(0, index=f2.index, columns=missing_cols)], axis=1)
# Drop columns from f2 that are not in f1
f2 = f2[f1.columns]
# Merge f2 to f1, keeping only the column names
f1 = f2
metadata  = pd.read_csv('metadata_response.csv')
metadata.set_index(metadata.columns[0], inplace=True)
# num_columns = len(merged_table.columns) - 1
merged_table = f1.join(metadata, how='inner')
# merged_table.to_csv("merged_table.csv",index=False)
# merged_table = merged_table.drop(columns=['Best response'])
response = merged_table['Best response']
otu_table_merge = merged_table.drop(columns=['Best response'])
# Drop the first column if it contains sample IDs or unnecessary data
otu_table_merge = otu_table_merge.iloc[:, 1:]

# Normalize OTU counts by total counts per sample
normalized_otu_counts = otu_table_merge.div(otu_table_merge.sum(axis=1), axis=0)

# Optionally, convert to percentages
normalized_otu_counts *= 100

# Print to verify
# normalized_otu_counts.to_csv("normalized_otu_counts.csv",index=False)
# Create a dictionary to map genus names to unique indices
genus_names = normalized_otu_counts.columns.tolist()
genus_to_idx = {genus: idx for idx, genus in enumerate(genus_names)}
genus_names
genus_to_idx
from sklearn.preprocessing import LabelEncoder

# Assuming 'merged_table' is your DataFrame containing the response and features
encoder = LabelEncoder()
merged_table['Best response'] = encoder.fit_transform(merged_table['Best response'])

# Separate features and target
# features = merged_table.drop('Best response', axis=1)
features = normalized_otu_counts
targets = merged_table['Best response']
targets


ERR2213660     1
ERR2213665     1
ERR2213666     1
ERR2213669     0
ERR2213672     0
              ..
SRR15373067    0
SRR15373089    0
SRR15373078    0
SRR15373012    0
SRR15373143    1
Name: Best response, Length: 417, dtype: int64

In [97]:
# Define the MLP model
class MLP(nn.Module):
    def __init__(self, embed_dim, hidden_dim, output_dim, pretrained_model):
        super(MLP, self).__init__()
        self.pretrained_model = pretrained_model
        self.pretrained_model.embedding = nn.Identity()  # Remove the embedding layer
        self.pretrained_model.fc = nn.Identity()  # Remove the final fully connected layer
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        with torch.no_grad():  # Freeze the pretrained transformer model
            x = self.pretrained_model(x)
        # print(x.shape)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Hyperparameters for the MLP
hidden_dim = 128
output_dim = len(np.unique(response))  # Number of unique classes
# Instantiate the transformer model
input_dim = 392
embed_dim = 128
num_heads = 8
ff_dim = 512
num_layers = 6

transformer_model = MicroTransformerModel(input_dim=input_dim, embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim, num_layers=num_layers)

# Instantiate the MLP model
hidden_dim = 256
mlp_model = MLP(embed_dim=embed_dim, hidden_dim=hidden_dim, output_dim=output_dim, pretrained_model=transformer_model)

# Print model summary to confirm dimensions
print(mlp_model)

MLP(
  (pretrained_model): MicroTransformerModel(
    (embedding): Identity()
    (transformer_encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
          )
          (linear1): Linear(in_features=128, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=128, bias=True)
          (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (fc): Identity()
  )
  (fc1): Linear(in_features=128, out_features=256, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=256, out_features=2, bias=True)
)


In [101]:

# Define the dataset and dataloader for MLP training
class OTUDatasetWithLabels(Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        x = torch.tensor(self.features.iloc[idx].values, dtype=torch.float32)
        y = torch.tensor(self.targets.iloc[idx], dtype=torch.long)
        return x, y

In [102]:
# Split the features and targets into training and testing sets
features_train, features_test, targets_train, targets_test = train_test_split(
    normalized_otu_counts, response, test_size=0.2, random_state=42)

train_dataset = OTUDatasetWithLabels(features_train, targets_train)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataset = OTUDatasetWithLabels(features_test, targets_test)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [103]:
# Training settings for the MLP
criterion = nn.CrossEntropyLoss()
optimizer = Adam(mlp_model.parameters(), lr=0.001)
num_epochs = 20

# Training loop for the MLP
for epoch in range(num_epochs):
    mlp_model.train()
    running_loss = 0.0
    for data, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = mlp_model(data)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    epoch_loss = running_loss / len(train_dataloader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

AssertionError: was expecting embedding dimension of 128, but got 392