In [1]:
# Import libraries that are required to run your project
# You are allowed to add more libraries as you need

import os
import pandas as pd
import numpy as np
import torch.nn
from scipy.stats import spearmanr

## Work Package 1.1 - Modeling Choices & Data Pre-processing

In [7]:
# TODO:
# Load your feature (bed and/or bigwig and/or fasta) and target files (tsv) here.
# Decide which features to use for training. Feel free to process them however you need.

from utils.dataset import InputDataset

# NOTE:
# bed and bigwig files contain signals of all chromosomes (including sex chromosomes).
# Training and validation split based on chromosomes has been done for you.
# However, you can resplit the data in any way you want.

# Root of the project data. Override with the ML4G_DATA_DIR environment variable
# so the notebook is not tied to one machine; the default is the original path.
path_data = os.environ.get("ML4G_DATA_DIR", "/home/phil/Downloads/ML4G_Project_1_Data")

# Directory holding the gene-info tables (X*_train_info.tsv, X3_test_info.tsv).
path_cage = os.path.join(path_data, 'CAGE-train', 'CAGE-train')

path_test = os.path.join(path_cage, 'X3_test_info.tsv')
test_genes = pd.read_csv(path_test, sep='\t')
# ---------------------------INSERT CODE HERE---------------------------

# Load gene information, one table per training cell line.
# (X2 previously re-read the X1 file by mistake.)
train_info_X1 = pd.read_csv(os.path.join(path_cage, 'X1_train_info.tsv'), sep='\t')
train_info_X2 = pd.read_csv(os.path.join(path_cage, 'X2_train_info.tsv'), sep='\t')
print(f'Training Data Keys: {train_info_X1.keys()}')

# Modalities used as input features.
# NOTE(review): the original comment was cut off ("chosen by looking at the ...");
# the selection rationale should be recorded here.
modalities = ['DNase', 'H3K27ac', 'H3K4me1', 'H3K4me3', 'H3K36me3']
# Passed through to InputDataset as `window_size`.
window_size = 20000


def _load_split(cell_line, objective):
    """Build the InputDataset for one (cell line, split) pair with the shared settings."""
    return InputDataset(
        data_directory=path_data,
        cell_line=cell_line,
        objective=objective,
        modality_names=modalities,
        window_size=window_size,
    )


# Load the train/validation datasets of both training cell lines.
train_data_X1 = _load_split('X1', "train")
val_data_X1 = _load_split('X1', "val")
train_data_X2 = _load_split('X2', "train")
val_data_X2 = _load_split('X2', "val")

# ---------------------------------------------------------------------- 

Index(['gene_name', 'chr', 'gene_start', 'gene_end', 'TSS_start', 'TSS_end',
       'strand'],
      dtype='object')


## Work Package 1.2 - Model Building

In [None]:
# TODO:
# Select the best model to predict gene expression from the obtained features in WP 1.1.
import torch
import pytorch_lightning as pl

from utils.lightning_wrapper import ModelWrapper
from models.neural_nets import ConvolutionalModel

# ---------------------------INSERT CODE HERE---------------------------
# Seed all RNGs: `deterministic=True` alone only selects deterministic kernels,
# it does not make weight initialisation or shuffling repeatable.
SEED = 42
pl.seed_everything(SEED, workers=True)

# Subsetting method to use less data while testing models: keep every
# `stepsize`-th sample. The subsets get their own names so that re-running this
# cell does not subset an already-subsetted dataset.
stepsize = 100
train_subset_X1 = torch.utils.data.Subset(
    train_data_X1, list(range(0, len(train_data_X1), stepsize))
)
val_subset_X1 = torch.utils.data.Subset(
    val_data_X1, list(range(0, len(val_data_X1), stepsize))
)

# Two input channels per modality.
# TODO: in case we use the sequence as additional information, then add 4 to the number of channels c.
model = ConvolutionalModel(c=2 * len(modalities))

lightning_model = ModelWrapper(
    model_architecture=model,
    learning_rate=1e-3,
    loss=torch.nn.L1Loss,
    datasets=[train_subset_X1, val_subset_X1],
    batch_size=1,
)

# The accelerator is a Trainer constructor option; `Trainer.fit` does not accept it,
# so the device choice has to be made here rather than at fit time.
trainer = pl.Trainer(
    max_epochs=1,
    deterministic=True,
    reload_dataloaders_every_n_epochs=5,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,  # single device; multi-device strategies are not usable from a notebook
)

trainer.fit(lightning_model)
# ----------------------------------------------------------------------


## Work Package 1.3 - Prediction on Test Data (Evaluation Metric)

In [None]:
# TODO:
# Using the model trained in WP 1.2, make predictions on the test data (chr 1 of cell line X3).
# Store predictions in a variable called "pred" which is a numpy array.

# Placeholder: `pred` must be replaced with a numpy array in the section below.
# While it is still None, the first assertion at the end of this cell fails.
pred = None
# ---------------------------INSERT CODE HERE---------------------------




# ----------------------------------------------------------------------

# Check if "pred" meets the specified constraints:
#   1. it is a numpy array,
#   2. its dtype is numeric,
#   3. its first dimension has one entry per row of `test_genes`
#      (the table read from X3_test_info.tsv in WP 1.1).
assert isinstance(pred, np.ndarray), 'Prediction array must be a numpy array'
assert np.issubdtype(pred.dtype, np.number), 'Prediction array must be numeric'
assert pred.shape[0] == len(test_genes), 'Each gene should have a unique predicted expression'

#### Store Predictions in the Required Format

In [None]:
# Write the predictions as a CSV wrapped in a ZIP archive.
# - Upload the resulting zip on the project website under "Your submission".
# - Zip this notebook along with the conda environment (and README, optional)
#   and upload that under "Your code".

save_dir = 'path/to/save/output/file'  # TODO
file_name = 'gex_predicted.csv'         # PLEASE DO NOT CHANGE THIS
zip_name = "LastName_FirstName_Project1.zip" # TODO

# Location of the archive, and the name of the CSV stored inside it.
save_path = '/'.join((save_dir, zip_name))
compression_options = {"method": "zip", "archive_name": file_name}

# Attach the predictions to the test-gene table, then export only the
# gene name and its predicted expression.
test_genes['gex_predicted'] = pred.tolist()
submission = test_genes[['gene_name', 'gex_predicted']]
submission.to_csv(save_path, compression=compression_options)