# Notebook for recreation of results from 3 different model types
This notebook is used to recreate our results for our best models for each of the different model types used in the project.

We assess the performance on the same test dataset for all models, namely the gtex dataset which only includes Artery tissue types.




Note:
* The model checkpoints accompany the assignment delivery and are to be placed in the `models` directory so that they can be accessed by the notebook.


In [2]:
import pickle
import torch
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm
import IsoDatasets
from VAE2 import VAE_lf
from FFNN import FeedForwardIsoform_small, FeedForwardIsoform_XL

### Initialize common functionalities

In [16]:
# Test dataset: GTEx data loaded with include='Artery'
gtex_test = IsoDatasets.GtexDataset("/dtu-compute/datasets/iso_02456/hdf5-row-sorted/", include='Artery')

# Pick the first CUDA device when one is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f">> Using device: {device}")

# Mean squared error, used as the test metric by the evaluation loops in this notebook
criterion = torch.nn.MSELoss()

>> Using device: cpu


### Standalone DNN performance
Here we load a checkpoint for our best-performing standalone DNN and check its performance on the artery test dataset.

In [30]:
# Checkpoint of the best-performing standalone DNN
STANDALONE_DNN_MODEL_PATH = "/dtu/blackhole/0b/155947/models/STANDALONE_DENSE_lr0.001_e5_wd5e-07_p10_small_tl0.2647835611185031"

# Evaluation gains nothing from shuffling; a fixed order keeps the batch
# composition (and thus the mean of the per-batch losses) the same between runs.
gtx_test_dataloader = DataLoader(gtex_test, batch_size=10, shuffle=False)

# Grab one batch only to read the per-sample input/output sizes for the DNN class
gene_expr, isoform_expr, _ = next(iter(gtx_test_dataloader))

# Build the network and restore the trained weights.
# The checkpoint is mapped to the CPU on load and the model is moved to `device` afterwards.
dnn = FeedForwardIsoform_small(input_shape=gene_expr[0].size(),
                               output_shape=isoform_expr[0].size())
checkpoint = torch.load(STANDALONE_DNN_MODEL_PATH, map_location=torch.device('cpu'))
dnn.load_state_dict(checkpoint['model_state_dict'])

dnn = dnn.to(device)

In [31]:
# Evaluate the standalone DNN on the test loader, collecting one MSE value per batch.
dnn.eval()  # evaluation mode (affects e.g. dropout / batch-norm layers, if the model has any)
test_loss = []
with torch.no_grad():  # no backward pass is needed, so skip building the autograd graph
    for x, y, _ in tqdm(gtx_test_dataloader):
        x = x.to(device)
        y = y.to(device)

        # Call the module itself rather than .forward so that registered hooks run too
        y_pred = dnn(x)

        loss = criterion(y_pred, y)

        test_loss.append(loss.item())

100%|██████████| 134/134 [00:17<00:00,  7.52it/s]


In [32]:
# Collapse the per-batch losses into a single score for the standalone DNN
mean_test_loss_standaloneDNN = np.asarray(test_loss).mean()
print('Mean test loss of standalone DNN is:', mean_test_loss_standaloneDNN)

Mean test loss of standalone DNN is: 0.2646775093318811


### PCA-DNN performance
Here we load a checkpoint for our best-performing PCA-DNN and check its performance on the artery test dataset.

In [4]:
# Number of PCA components; also the input width of the DNN that follows the PCA
PCA_SIZE = 1024
PCA_MODEL_PATH = f"/dtu/blackhole/0b/155947/models/ipca_model_n{PCA_SIZE}.pkl"
DNN_MODEL_PATH = "/dtu/blackhole/0b/155947/models/PCA_DENSE_l1024_lr0.0001_e100_wd1e-08_p10"

# Evaluation gains nothing from shuffling; a fixed order keeps the batch
# composition (and thus the mean of the per-batch losses) the same between runs.
gtx_test_dataloader = DataLoader(gtex_test, batch_size=10, shuffle=False)

# Loading the fitted PCA.
# NOTE(security): pickle.load can execute arbitrary code - only load files from a trusted source.
with open(PCA_MODEL_PATH, 'rb') as file:
    ipca = pickle.load(file)

# Grab one batch only to read the per-sample output size for the DNN class
gene_expr, isoform_expr, _ = next(iter(gtx_test_dataloader))

# Build the network (its input is the PCA-reduced sample) and restore the trained weights.
# The checkpoint is mapped to the CPU on load and the model is moved to `device` afterwards.
dnn = FeedForwardIsoform_XL(input_shape=PCA_SIZE,
                            output_shape=isoform_expr[0].size())
checkpoint = torch.load(DNN_MODEL_PATH, map_location=torch.device('cpu'))
dnn.load_state_dict(checkpoint['model_state_dict'])

dnn = dnn.to(device)

In [5]:
# Evaluate the PCA-DNN pipeline on the test loader, collecting one MSE value per batch.
# A freshly constructed module is in training mode, so switch to evaluation mode
# explicitly (affects e.g. dropout / batch-norm layers, if the model has any).
dnn.eval()
test_loss = []
with torch.no_grad():  # no backward pass is needed, so skip building the autograd graph
    for x, y, _ in tqdm(gtx_test_dataloader):
        # Project the batch with the fitted PCA
        x_pca = ipca.transform(x)

        # The transform result is converted from NumPy to a float32 tensor for the network
        x_pca = torch.from_numpy(x_pca).float()
        x_pca = x_pca.to(device)
        y = y.to(device)

        # Call the module itself rather than .forward so that registered hooks run too
        y_pred = dnn(x_pca)

        loss = criterion(y_pred, y)

        test_loss.append(loss.item())

100%|██████████| 14/14 [01:00<00:00,  4.29s/it]


In [10]:
# Collapse the per-batch losses into a single score for the PCA-DNN
mean_test_loss_PCADNN = np.asarray(test_loss).mean()
print('Mean test loss of PCADNN is:', mean_test_loss_PCADNN)
# Bare last expression: also display the individual per-batch losses
test_loss

Mean test loss of PCADNN is: 0.7727246156760624


[0.7282154560089111,
 0.7362427711486816,
 0.7790459990501404,
 0.7592176198959351,
 0.77659010887146,
 0.782164990901947,
 0.7979474663734436,
 0.7533282041549683,
 0.8233906626701355,
 0.7620619535446167,
 0.7716060280799866,
 0.7683447599411011,
 0.7700207829475403,
 0.8099678158760071]

### Encoder-DNN performance
Here we load a checkpoint for our best-performing Encoder-DNN and check its performance on the artery test dataset.

In [None]:
# Size of the latent vector passed from the encoder to the DNN
LATENT_FEATURES = 2
ENCODER_MODEL_PATH = "Thomas/deepIsoform/models/ENCODER"
DNN_MODEL_PATH = "Thomas/deepIsoform/models/DENSE"

# Evaluation gains nothing from shuffling; a fixed order keeps the batch
# composition the same between runs.
gtx_test_dataloader = DataLoader(gtex_test, batch_size=10, shuffle=False)

# Grab one batch only to read the input size for the encoder and the output size for the DNN class
gene_expr, isoform_expr, _ = next(iter(gtx_test_dataloader))

# Loading VAE checkpoint to be utilized as encoder
vae = VAE_lf(input_shape=gene_expr[0].size(),
             hidden_features=0,                   # Ignore hidden_feature parameter (not used)
             latent_features=LATENT_FEATURES)
checkpoint = torch.load(ENCODER_MODEL_PATH, map_location=torch.device('cpu'))
vae.load_state_dict(checkpoint['model_state_dict'])

# DNN model. It is fed the encoder's latent vector, so its input width must be
# LATENT_FEATURES (not PCA_SIZE, which belongs to the PCA-DNN section).
dnn = FeedForwardIsoform_XL(input_shape=LATENT_FEATURES,
                            output_shape=isoform_expr[0].size())
checkpoint = torch.load(DNN_MODEL_PATH, map_location=torch.device('cpu'))
dnn.load_state_dict(checkpoint['model_state_dict'])

vae = vae.to(device)
dnn = dnn.to(device)

In [None]:
# Evaluate the Encoder-DNN pipeline on the test loader, collecting one MSE value per batch.
# Freshly constructed modules are in training mode, so switch both to evaluation mode
# explicitly (affects e.g. dropout / batch-norm layers, if the models have any).
vae.eval()
dnn.eval()
test_loss = []
with torch.no_grad():  # no backward pass is needed, so skip building the autograd graph
    for x, y, _ in tqdm(gtx_test_dataloader):
        # Send to device
        x = x.to(device)
        y = y.to(device)

        # Encode input to latent space
        # NOTE(review): reparameterize presumably draws a random sample around mu, which
        # would make the test loss vary between runs - consider feeding mu directly if
        # that matches how the DNN was trained; confirm against VAE2.
        mu, logvar = vae.encode_mu_var(x)
        z = vae.reparameterize(mu, logvar)

        # Call the module itself rather than .forward so that registered hooks run too
        y_pred = dnn(z)

        # Calculate the loss (evaluation only, nothing is back-propagated)
        loss = criterion(y_pred, y)

        test_loss.append(loss.item())

In [None]:
# test_loss is a plain Python list of floats, which torch.mean does not accept
# (it requires a Tensor), so average with NumPy and convert to a Python float.
mean_test_loss_encDNN = float(np.mean(test_loss))
print('Mean test loss of Encoder-DNN is:', mean_test_loss_encDNN)

### Comparison of the 3 models