In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from scipy.stats import spearmanr
import scanpy as sc
from torch.utils.data import DataLoader, Dataset

: 

In [None]:
# Locations of the competition inputs and of the artifacts this notebook writes.
data_dir = "/home/vivian.chu/vivian-sandbox/other/xAI-cancer-competition/.data"
out_dir = "/home/vivian.chu/vivian-sandbox/other/xAI-cancer-competition/vivian-models"

# Read each table and cast its identifier column to str in one chained expression.
# The training table's identifier arrives as 'Unnamed: 0' and is renamed to
# 'sample' so it shares a column name with the targets table.
train_data = (
    pd.read_csv(f"{data_dir}/train.csv")
    .rename(columns={'Unnamed: 0': 'sample'})
    .astype({'sample': str})
)
train_targets = pd.read_csv(f"{data_dir}/train_targets.csv").astype({'sample': str})
test_data = pd.read_csv(f"{data_dir}/test.csv").astype({'sampleId': str})

In [130]:
# Report (rows, columns) of the two expression tables.
print(f"Train: {train_data.shape}")
print(f"Test: {test_data.shape}")

Train: (742, 19921)
Test: (304, 19921)


In [None]:
# Attach the target columns to the expression rows, matching on the shared
# "sample" key (inner join: only samples present in both tables are kept).
train = pd.merge(train_data, train_targets, how="inner", on="sample")

In [None]:
# Use Scanpy for highly variable gene selection
adata = sc.AnnData(X=train.iloc[:, 1:-2].values, obs=train[["sample", "AAC", "tissue"]])
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=2000)
selected_genes = adata.var[adata.var['highly_variable']].index
train_X = adata[:, selected_genes].X
train_y = adata.obs["AAC"].values

In [None]:
test_X = test_data[selected_genes].values

In [None]:
# Hold out 20% of the training samples for validation; random_state is fixed
# so the same split is produced on every run.
X_train, X_val, y_train, y_val = train_test_split(
    train_X,
    train_y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Dataset and DataLoader
class ExpressionDataset(Dataset):
    """In-memory dataset of feature rows with optional targets.

    ``X`` (and ``y`` when given) are converted to float32 tensors at
    construction. Indexing returns ``(features, target)`` when targets were
    supplied and just ``features`` otherwise.
    """

    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        if y is None:
            self.y = None
        else:
            self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        # Size of the first axis of the feature tensor.
        return self.X.size(0)

    def __getitem__(self, idx):
        features = self.X[idx]
        if self.y is None:
            return features
        return features, self.y[idx]

In [None]:
# Wrap each split in a dataset and batch it 32 at a time; only the training
# loader shuffles.
train_dataset = ExpressionDataset(X=X_train, y=y_train)
val_dataset = ExpressionDataset(X=X_val, y=y_val)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32)

In [None]:
# Model
class DrugResponseModel(nn.Module):
    """Feed-forward regressor: input_size -> 512 -> 256 -> 1.

    Each of the two hidden linear layers is followed by ReLU and dropout
    (p=0.3); the final linear layer emits one value per input row.
    """

    def __init__(self, input_size):
        super().__init__()
        hidden_1, hidden_2, drop_p = 512, 256, 0.3
        # Layer order is kept fixed so the positions inside `network` (and hence
        # the state_dict keys) stay the same.
        layers = [
            nn.Linear(input_size, hidden_1),
            nn.ReLU(),
            nn.Dropout(drop_p),
            nn.Linear(hidden_1, hidden_2),
            nn.ReLU(),
            nn.Dropout(drop_p),
            nn.Linear(hidden_2, 1),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its output."""
        output = self.network(x)
        return output

In [None]:
# Initialize model
# Seed torch's global RNG first: weight initialisation, dropout masks and the
# shuffling of the training DataLoader all draw from it, so without a seed every
# run trains a different model even though the train/validation split is fixed.
# 42 matches the random_state used for that split.
torch.manual_seed(42)

input_size = X_train.shape[1]  # number of selected features per sample
model = DrugResponseModel(input_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Training loop
def train_model(model, train_loader, val_loader, epochs=50,
                optimizer=None, criterion=None, checkpoint_path=None):
    """Train ``model`` and checkpoint the weights with the best validation Spearman.

    After every epoch the model is evaluated on ``val_loader``; whenever the
    Spearman correlation between predictions and targets improves, the model's
    ``state_dict`` is written to ``checkpoint_path``.

    Args:
        model: Network to train; called as ``model(X_batch)``.
        train_loader: Iterable yielding ``(X_batch, y_batch)`` training pairs.
        val_loader: Iterable yielding ``(X_batch, y_batch)`` validation pairs.
        epochs: Number of passes over ``train_loader``.
        optimizer: Optimizer to step. Defaults to the notebook-level ``optimizer``.
        criterion: Loss callable ``criterion(preds, y_batch)``. Defaults to the
            notebook-level ``criterion``.
        checkpoint_path: Destination of the best ``state_dict``. Defaults to
            ``f"{out_dir}/best_model.pth"`` using the notebook-level ``out_dir``.

    Returns:
        The highest validation Spearman correlation observed. If no epoch yields
        a comparable (non-NaN) correlation, ``-inf`` is returned and no
        checkpoint is written.
    """
    # Fall back to the notebook globals the original implementation relied on,
    # so existing three-argument calls keep working.
    notebook_globals = globals()
    if optimizer is None:
        optimizer = notebook_globals["optimizer"]
    if criterion is None:
        criterion = notebook_globals["criterion"]
    if checkpoint_path is None:
        checkpoint_path = f"{out_dir}/best_model.pth"

    # -inf (not -1) so that any finite correlation, including -1, is recorded.
    best_spearman = float("-inf")
    for epoch in range(epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            # reshape(-1) keeps a 1-D prediction vector even for a batch of one;
            # a bare squeeze() would collapse that case to a 0-d tensor.
            preds = model(X_batch).reshape(-1)
            loss = criterion(preds, y_batch)
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_preds, val_true = [], []
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                # Must stay 1-D: list.extend cannot iterate a 0-d array.
                preds = model(X_batch).reshape(-1)
                val_preds.extend(preds.cpu().numpy())
                val_true.extend(y_batch.cpu().numpy())

        spearman = spearmanr(val_preds, val_true)[0]
        print(f"Epoch {epoch+1}/{epochs}, Spearman: {spearman:.4f}")
        # A NaN correlation compares False here, so it never replaces the best.
        if spearman > best_spearman:
            best_spearman = spearman
            torch.save(model.state_dict(), checkpoint_path)
    return best_spearman

# Run training with the default epoch count and keep the best validation score.
best_spearman = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
)

In [None]:
# Load the best model
# NOTE: this reads whatever checkpoint is at the path. train_model only writes
# one when the validation score improves, so a file left by an earlier run
# would be loaded if this session's training saved nothing.
model.load_state_dict(torch.load(f"{out_dir}/best_model.pth"))

# Predict on test data
test_dataset = ExpressionDataset(test_X)
test_loader = DataLoader(test_dataset, batch_size=32)
model.eval()  # disable dropout for inference
test_preds = []
with torch.no_grad():
    for X_batch in test_loader:
        # reshape(-1) keeps predictions 1-D even when the last batch holds a
        # single sample; squeeze() would give a 0-d tensor that extend() cannot
        # iterate.
        preds = model(X_batch).reshape(-1)
        test_preds.extend(preds.cpu().numpy())

In [None]:
# Pair every test sample id with its predicted AAC and write the table as CSV
# (without the DataFrame index).
test_preds_df = pd.DataFrame(
    dict(
        sampleId=test_data["sampleId"],
        AAC=test_preds,
    )
)
test_preds_df.to_csv(path_or_buf=f"{out_dir}/test_predictions.csv", index=False)

print("Test predictions saved.")