In [154]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import torch.nn.functional as F

In [183]:
import pytorch_lightning as pl

# Define logistic regression model using PyTorch Lightning
# class LogisticRegressionModel(pl.LightningModule):
#     def __init__(self, input_dim, output_dim):
#         super(LogisticRegressionModel, self).__init__()
#         self.linear = nn.Linear(input_dim, output_dim)
#         self.criterion = nn.CrossEntropyLoss()
    
#     def forward(self, x):
#         return torch.softmax(self.linear(x), dim=1)
    
#     def training_step(self, batch, batch_idx):
#         x, y = batch
#         y_hat = self(x)
#         loss = self.criterion(y_hat, y)
#         self.log('train_loss', loss)
#         return loss

#     def test_step(self, batch, batch_idx):
#         x, y = batch
#         y_hat = self(x)
#         predicted = torch.argmax(y_hat, 1)
#         accuracy = (predicted == y).sum().item() / len(y)
#         self.log('test_accuracy', accuracy, on_epoch=True)
#         return accuracy

#     def configure_optimizers(self):
#         return torch.optim.Adam(self.parameters(), lr=0.01)

class LogisticRegressionModel(pl.LightningModule):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegressionModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, output_dim)
        self.criterion = nn.CrossEntropyLoss()
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.softmax(x, dim=1)
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        self.log('train_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        _, predicted = torch.max(y_hat, 1)
        accuracy = (predicted == y).sum().item() / len(y)
        self.log('test_accuracy', accuracy, on_epoch=True)
        return accuracy

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.01)

In [174]:
feature_dest_folder = Path("/home/icb/alessandro.palma/environment/IMPA/IMPA/project_folder/dino_featurization_project/featurized_anndata/rxrx1")
adata_after_transf = sc.read_h5ad(feature_dest_folder / "rxrx1_adata_after_transf.h5ad")
adata_before_transf = sc.read_h5ad(feature_dest_folder / "rxrx1_adata_before_transf.h5ad")

Add compounds

In [175]:
data_index = pd.read_csv('/home/icb/alessandro.palma/environment/IMPA/IMPA/project_folder/datasets/rxrx1/metadata/rxrx1_df.csv', index_col=1)

compound_names = []

for row in adata_before_transf.obs.iterrows():
    batch = row[1].batch
    plate = row[1].plate
    well = row[1].well
    view = row[1]["view"]
    no = row[1].no
    file_name = f"U2OS-{batch}_{plate}_{well}_{view}_{no}"
    cpd = data_index.loc[file_name].CPD_NAME
    compound_names.append(cpd)

In [176]:
adata_after_transf.obs["compound"] = compound_names
adata_before_transf.obs["compound"] = compound_names

## BEFORE correction

In [177]:
X_before = adata_before_transf.X.copy()
y_before = np.array(compound_names)

In [178]:
label_encoder_before = LabelEncoder()
y_before = label_encoder_before.fit_transform(y_before)

In [179]:
X_train_before, X_test_before, y_train_before, y_test_before = train_test_split(X_before, 
                                                                                y_before, 
                                                                                test_size=0.2,
                                                                                random_state=42)

In [180]:
X_train_tensor_before = torch.tensor(X_train_before, dtype=torch.float32)
y_train_tensor_before = torch.tensor(y_train_before, dtype=torch.long)
X_test_tensor_before = torch.tensor(X_test_before, dtype=torch.float32)
y_test_tensor_before = torch.tensor(y_test_before, dtype=torch.long)

In [181]:
# Create DataLoader for batching
train_data_before = TensorDataset(X_train_tensor_before, y_train_tensor_before)
train_loader_before = DataLoader(train_data_before, batch_size=256, shuffle=True, num_workers=5)

In [184]:
# Initialize the Lightning model
input_dim_before = X_train_before.shape[1]
output_dim_before = len(np.unique(compound_names))
model_before = LogisticRegressionModel(input_dim_before, output_dim_before)

# Train the Lightning model
trainer_before = pl.Trainer(max_epochs=100)
trainer_before.fit(model_before, train_loader_before)

/home/icb/alessandro.palma/miniconda3/envs/IMPA_try/lib/python3.9/site-packages/lightning_fabric/plugins/environments/slurm.py:191: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3.9 /home/icb/alessandro.palma/miniconda3/envs/IMPA_t ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/icb/alessandro.palma/miniconda3/envs/IMPA_try/lib/python3.9/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:630: Checkpoint directory /ictstr01/home/icb/alessandro.palma/environment/IMPA/IMPA/notebooks/cpg0000/lightning_logs/version_18688408/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | fc1       | Linear           | 24.6 K
1 | fc

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=100` reached.


In [185]:
test_data_before = TensorDataset(X_test_tensor_before, y_test_tensor_before)
test_loader_before = DataLoader(test_data_before, batch_size=32)
test_results_before = trainer_before.test(dataloaders=test_loader_before)

Restoring states from the checkpoint path at /ictstr01/home/icb/alessandro.palma/environment/IMPA/IMPA/notebooks/cpg0000/lightning_logs/version_18688408/checkpoints/epoch=99-step=53500-v2.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /ictstr01/home/icb/alessandro.palma/environment/IMPA/IMPA/notebooks/cpg0000/lightning_logs/version_18688408/checkpoints/epoch=99-step=53500-v2.ckpt
/home/icb/alessandro.palma/miniconda3/envs/IMPA_try/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=5` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

## AFTER correction

In [186]:
X_after = adata_after_transf.X.copy()
y_after = np.array(compound_names)

In [187]:
label_encoder_after = LabelEncoder()
y_after = label_encoder_after.fit_transform(y_after)

In [188]:
X_train_after, X_test_after, y_train_after, y_test_after = train_test_split(X_after, 
                                                                            y_after, 
                                                                            test_size=0.2,
                                                                            random_state=42)

In [189]:
X_train_tensor_after = torch.tensor(X_train_after, dtype=torch.float32)
y_train_tensor_after = torch.tensor(y_train_after, dtype=torch.long)
X_test_tensor_after = torch.tensor(X_test_after, dtype=torch.float32)
y_test_tensor_after = torch.tensor(y_test_after, dtype=torch.long)

In [190]:
# Create DataLoader for batching
train_data_after = TensorDataset(X_train_tensor_after, y_train_tensor_after)
train_loader_after = DataLoader(train_data_after, batch_size=512, shuffle=True, num_workers=5)

In [191]:
# Initialize the Lightning model
input_dim_after = X_train_after.shape[1]
output_dim_after = len(np.unique(compound_names))
model_after = LogisticRegressionModel(input_dim_after, output_dim_after)

# Train the Lightning model
trainer_after = pl.Trainer(max_epochs=100)
trainer_after.fit(model_after, train_loader_after)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | fc1       | Linear           | 24.6 K
1 | fc2       | Linear           | 2.1 K 
2 | fc3       | Linear           | 35.3 K
3 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
62.1 K    Trainable params
0         Non-trainable params
62.1 K    Total params
0.248     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=100` reached.


In [192]:
test_data_after = TensorDataset(X_test_tensor_after, y_test_tensor_after)
test_loader_after = DataLoader(test_data_after, batch_size=32)
test_results_after = trainer_after.test(dataloaders=test_loader_after)

Restoring states from the checkpoint path at /ictstr01/home/icb/alessandro.palma/environment/IMPA/IMPA/notebooks/cpg0000/lightning_logs/version_18688408/checkpoints/epoch=99-step=26800-v2.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /ictstr01/home/icb/alessandro.palma/environment/IMPA/IMPA/notebooks/cpg0000/lightning_logs/version_18688408/checkpoints/epoch=99-step=26800-v2.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]