# 🧬 Drug Side Effect Prediction - Google Colab

**Complete training and evaluation pipeline**

---

## Hardware: T4 GPU (12GB RAM minimum)

## 1️⃣ Setup & Check GPU

In [None]:
# Check GPU
!nvidia-smi

import torch
print(f"\nPyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Mount Google Drive
# Exposes the user's Drive under /content/drive.
# NOTE(review): no later cell in this notebook reads from or writes to
# /content/drive — confirm whether the mount is still needed, or whether
# checkpoints/outputs are meant to be copied there for safekeeping.
from google.colab import drive
drive.mount('/content/drive')

# Setup working directory
# `mkdir -p` succeeds even if the directory already exists, so the cell can be
# re-run. `%cd` is a magic (not `!cd`), so it changes the kernel's own working
# directory: every relative path used by later cells resolves under this folder.
!mkdir -p /content/drug_prediction
%cd /content/drug_prediction

## 2️⃣ Install Dependencies

In [None]:
!pip install -q numpy pandas scikit-learn scipy tqdm tensorboard matplotlib seaborn rdkit subword-nmt
print("✓ Installed!")

## 3️⃣ Upload Files

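Upload all `.py` source files together with the data files (`.pkl`, `.csv`, `.txt`). Data files are moved into `data/raw/`; the `.py` files stay in the working directory.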

In [None]:
import shutil
from pathlib import Path

from google.colab import files

# Destination for the raw data files; created up front so the moves below
# cannot fail on a missing directory. Safe to re-run.
raw_dir = Path('data/raw')
raw_dir.mkdir(parents=True, exist_ok=True)

print("Upload: data files + all .py files")
uploaded = files.upload()

# Move data files
# Data files are recognised purely by extension; everything else (the .py
# sources) stays in the working directory so it can be imported / executed.
# shutil.move is used instead of `!mv "{f}"` because interpolating an uploaded
# filename into a shell line breaks (or runs unintended commands) when the
# name contains quotes, `$` or backticks. The full destination path is given
# so a re-upload overwrites the previous copy, as `mv` did.
data_suffixes = ('.pkl', '.csv', '.txt')
for filename in uploaded:
    if filename.endswith(data_suffixes):
        shutil.move(filename, str(raw_dir / filename))

print("✓ Uploaded!")

## 4️⃣ Verify Files

In [None]:
import os


def _print_checklist(title, names, directory=None):
    """Print `title`, then one ✓/✗ line per name depending on whether it exists.

    When `directory` is given, each name is looked up inside it; otherwise the
    name is checked relative to the current working directory.
    """
    print(title)
    for name in names:
        location = name if directory is None else f'{directory}/{name}'
        mark = '✓' if os.path.exists(location) else '✗'
        print(f"{mark} {name}")


# Check data
data_files = ['drug_SMILES_750.csv', 'drug_codes_chembl_freq_1500.txt',
              'subword_units_map_chembl_freq_1500.csv', 'drug_side.pkl']
_print_checklist("Data Files:", data_files, directory='data/raw')

# Check code
code_files = ['config.py', 'dataset.py', 'encoder.py', 'model.py',
              'smiles_encoder.py', 'preprocessing.py', 'preprocess_data.py',
              'trainer.py', 'evaluator.py', 'train.py', 'evaluate.py']
_print_checklist("\nCode Files:", code_files)

## 5️⃣ Preprocess Data (20-30 min)

In [None]:
!python preprocess_data.py \
    --data_dir data/raw \
    --output_dir data/processed \
    --top_k 50 \
    --n_folds 10

print("\n✓ Preprocessing done!")

In [None]:
# View statistics
import json

with open('data/processed/dataset_statistics.json') as stats_file:
    stats = json.load(stats_file)

# All reported counters live under the top-level "dataset" entry.
dataset_stats = stats['dataset']

# (label shown to the reader, key inside the "dataset" entry)
summary_rows = (
    ('Samples', 'num_samples'),
    ('Positive', 'num_positive'),
    ('Drugs', 'num_drugs'),
    ('Side Effects', 'num_side_effects'),
)
for label, key in summary_rows:
    # `:,` adds thousands separators to the count.
    print(f"{label}: {dataset_stats[key]:,}")

## 6️⃣ Train Model

### Quick Test (1 fold, 50 epochs)

In [None]:
!python train.py \
    --config fast \
    --start_fold 0 \
    --end_fold 1 \
    --epochs 50 \
    --batch_size 128 \
    --use_amp \
    --device cuda

print("\n✓ Training done!")

### Full Training (10 folds, 200 epochs) - Takes 20-30 hours

In [None]:
# Uncomment to run full training
# Differences from the quick test above: no --start_fold/--end_fold are
# passed, 200 epochs instead of 50, and --compile_model is added.
# NOTE(review): presumably omitting the fold flags makes train.py run all
# folds, matching the "10 folds" in the heading — confirm against train.py.
# NOTE(review): the heading estimates 20-30 hours; check that the Colab
# session can stay alive that long and that checkpoints are kept somewhere
# that survives a disconnect before starting.
# !python train.py \
#     --config fast \
#     --epochs 200 \
#     --batch_size 128 \
#     --use_amp \
#     --compile_model \
#     --device cuda

### Monitor with TensorBoard

In [None]:
# Load the TensorBoard notebook extension, then open the dashboard inline,
# pointed at the logs/tensorboard directory (relative to the working dir).
# NOTE(review): presumably train.py writes its event files to
# logs/tensorboard — confirm against train.py.
# NOTE(review): the `!python train.py` cell above blocks until training
# finishes, so to watch a run live this cell likely has to be executed
# before starting training — confirm.
%load_ext tensorboard
%tensorboard --logdir logs/tensorboard

## 7️⃣ Evaluate Model

In [None]:
!python evaluate.py \
    --checkpoint_dir checkpoints \
    --save_predictions \
    --device cuda

print("\n✓ Evaluation done!")

In [None]:
# View results
# Imported here so the cell does not depend on the optional statistics cell
# having been run earlier in the session.
import json

with open('outputs/results/test_aggregated_results.json') as results_file:
    results = json.load(results_file)

# Each metric is looked up as "<metric>_mean" / "<metric>_std".
for metric in ['rmse', 'pearson', 'auc_roc', 'f1']:
    mean = results.get(f'{metric}_mean')
    std = results.get(f'{metric}_std')
    if mean is None:
        # A missing metric must not be displayed as 0.0000: for RMSE that
        # would read as a perfect score, for AUC as a useless model.
        print(f"{metric:10s}: n/a (not found in results file)")
    elif std is None:
        print(f"{metric:10s}: {mean:.4f} (std n/a)")
    else:
        print(f"{metric:10s}: {mean:.4f} ± {std:.4f}")

## 8️⃣ Visualize Results

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr
from sklearn.metrics import auc, confusion_matrix, roc_curve

# Load predictions
df = pd.read_csv('outputs/results/fold_0/predictions.csv')
y_true = df['label'].values
y_pred = df['prediction'].values

# Binary views used by the ROC curve and the confusion matrix:
# any non-zero label counts as positive; predictions are cut at 0.5.
y_true_bin = (y_true != 0).astype(int)
y_pred_bin = (y_pred > 0.5).astype(int)

# Create plots: one row, three panels.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
scatter_ax, roc_ax, cm_ax = axes

# Scatter plot of predicted vs. actual values, with the identity line
# spanning the range of the actual labels as a reference.
label_lo, label_hi = y_true.min(), y_true.max()
corr, _ = pearsonr(y_true, y_pred)
scatter_ax.scatter(y_true, y_pred, alpha=0.3, s=10)
scatter_ax.plot([label_lo, label_hi], [label_lo, label_hi], 'r--')
scatter_ax.set(
    title=f'Predictions\nPearson: {corr:.3f}',
    xlabel='Actual',
    ylabel='Predicted',
)

# ROC Curve: raw predictions are used as scores against the binarised labels;
# the dashed diagonal is the chance line.
fpr, tpr, _ = roc_curve(y_true_bin, y_pred)
roc_auc = auc(fpr, tpr)
roc_ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.3f}')
roc_ax.plot([0, 1], [0, 1], 'r--')
roc_ax.set(title='ROC Curve', xlabel='FPR', ylabel='TPR')
roc_ax.legend()

# Confusion Matrix of binarised labels vs. thresholded predictions.
cm = confusion_matrix(y_true_bin, y_pred_bin)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set(title='Confusion Matrix', xlabel='Predicted', ylabel='Actual')

plt.tight_layout()
plt.savefig('results_visualization.png', dpi=300)
plt.show()

## 9️⃣ Download Results

In [None]:
# Zip results
# Two archives are built: the whole outputs/results/ tree, and only the
# fold-0 best checkpoint (checkpoints of other folds are not included).
!zip -r results.zip outputs/results/
!zip -r checkpoints.zip checkpoints/fold_0/best_model.pth

# Download
# Hand both archives to Colab's download helper, one call per file.
from google.colab import files
files.download('results.zip')
files.download('checkpoints.zip')

# Printed once both download calls have returned.
print("✓ Downloaded!")

## 🔟 Make Predictions

In [None]:
# Load model for inference
# numpy and torch are imported here so this cell does not depend on the GPU
# check or the (optional) visualisation cell having been run first.
import numpy as np
import torch

from config import get_default_config
from model import create_model
from smiles_encoder import create_smiles_encoder

# Prefer the GPU, but fall back to the CPU so inference also works on a
# runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Setup
config = get_default_config()
config.device = device

model = create_model(config.model, device=device)
# map_location lets a checkpoint that was saved from the GPU be loaded on
# whichever device was selected above.
# NOTE(review): recent PyTorch releases default torch.load to
# weights_only=True; if the checkpoint stores non-tensor objects this call may
# need weights_only=False (trusted files only) — confirm against trainer.py.
checkpoint = torch.load('checkpoints/fold_0/best_model.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
# Switch to evaluation mode before inference.
model.eval()

smiles_encoder = create_smiles_encoder(
    'data/raw/drug_codes_chembl_freq_1500.txt',
    'data/raw/subword_units_map_chembl_freq_1500.csv',
    50
)

# Precomputed side-effect arrays written by the preprocessing step; they are
# indexed by side-effect id in the prediction cell.
se_index = np.load('data/processed/SE_sub_index_50_0.npy')
se_mask = np.load('data/processed/SE_sub_mask_50_0.npy')

print("✓ Model loaded!")

In [None]:
# Prediction function
@torch.no_grad()
def predict(drug_smiles, se_id):
    """Score one (drug, side effect) pair with the loaded model.

    Relies on the notebook globals ``model``, ``smiles_encoder``, ``se_index``
    and ``se_mask`` created by the model-loading cell.

    Args:
        drug_smiles: SMILES string of the drug; passed to
            ``smiles_encoder.encode``.
        se_id: Row index into ``se_index`` / ``se_mask`` selecting the side
            effect.

    Returns:
        The model's scalar output for the pair, as a Python number.

    Raises:
        IndexError: If ``se_id`` is outside ``[0, len(se_index))``.
    """
    # Reject negative ids explicitly: numpy would silently wrap them around
    # and score a different side effect than the one requested.
    if not 0 <= se_id < len(se_index):
        raise IndexError(
            f"se_id {se_id} is out of range; expected 0 <= se_id < {len(se_index)}"
        )

    drug_enc, drug_m = smiles_encoder.encode(drug_smiles)
    se_enc = se_index[se_id]
    se_m = se_mask[se_id]

    # Put the inputs on whatever device the model lives on instead of
    # hard-coding CUDA, so the function also works with a CPU-loaded model.
    device = next(model.parameters()).device

    # unsqueeze(0) adds a leading batch dimension of size 1 to every input.
    drug_t = torch.from_numpy(drug_enc).unsqueeze(0).to(device)
    se_t = torch.from_numpy(se_enc).unsqueeze(0).to(device)
    drug_m_t = torch.from_numpy(drug_m).unsqueeze(0).to(device)
    se_m_t = torch.from_numpy(se_m).unsqueeze(0).to(device)

    # The model returns three values; only the first is used here.
    output, _, _ = model(drug_t, se_t, drug_m_t, se_m_t)
    return output.item()

# Example
smiles = "CC(C)Cc1ccc(cc1)[C@@H](C)C(=O)O"  # Ibuprofen
se_id = 100

pred = predict(smiles, se_id)
print(f"\nDrug: {smiles}")
print(f"SE ID: {se_id}")
print(f"Prediction: {pred:.4f}")
# 0.5 is the decision threshold used to report a yes/no answer.
print(f"Has SE: {'Yes' if pred > 0.5 else 'No'}")

## ✅ Complete!

**Your model is trained and ready to use!** 🎉