### This Jupyter notebook is meant for testing the code that gets the Enformer output for all the cfDNA coordinate files in a given directory and stores that output in HDF5 (h5py) files. For more details on what each function does and on the overall workflow, refer to the storeEnformerOutput.py file.

In [1]:
import numpy as np
import torch
import pandas as pd

from enformer_pytorch import Enformer
from torch.utils.data import DataLoader

import h5py
import sys

sys.path.insert(0,'/hpc/compgen/projects/fragclass/analysis/mvivekanandan/script/madhu_scripts')

import config
import utils
import sequenceDataset

import importlib   
import os

arguments in file sequenceDataset are {'coordStoreDirectory': '/hpc/compgen/projects/fragclass/analysis/mvivekanandan/output/subsetClassBalancedCoordinateFiles', 'refGenomePath': '/hpc/compgen/projects/fragclass/raw/hg19_genome/hg19_ch1-22_XYM.fa', 'usePaddingForSequenceOutput': None, 'modelInputSequenceSize': 'enformer', 'runWithControls': False, 'usePaddingForCnn': False, 'trainingCoordsDatasetName': 'trainingCoords', 'validationCoordsDatasetName': 'validationCoords', 'testCoordsDatasetName': 'testCoords', 'trainingLabelsDatasetName': 'trainingLabels', 'validationLabelsDatasetName': 'validationLabels', 'testLabelsDatasetName': 'testLabels'}


In [2]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"The device used is : {device}")

# Reload the project modules so that edits made to them on disk are picked up
# without restarting the kernel.
importlib.reload(config)
importlib.reload(utils)
importlib.reload(sequenceDataset)

# Set arguments from config file. The dictionary is read as a global by the
# functions defined in the cells below.
# NOTE: the "Ouput" spelling in the *StoreFile keys is deliberate here - other
# cells build exactly these key names, so they must not be "corrected" in
# isolation.
arguments = {
    # File paths
    "refGenomePath": config.filePaths.get("refGenomePath"),
    "coordStoreDirectory": config.filePaths.get("coordStoreDirectory"),
    "trainingEnformerOuputStoreFile": config.testFilePaths.get("trainingEnformerOutputStoreFile"),
    "validationEnformerOuputStoreFile": config.testFilePaths.get("validationEnformerOutputStoreFile"),
    "testEnformerOuputStoreFile": config.testFilePaths.get("testEnformerOutputStoreFile"),
    "trainingMetadata": config.filePaths.get("trainingMetadata"),
    "validationMetadata": config.filePaths.get("validationMetadata"),
    "testMetadata": config.filePaths.get("testMetadata"),

    # Enformer output model hyperparameters
    "enformerBatchSize": config.modelHyperParameters.get("enformerBatchSize"),
    "enformerNumberOfWorkers": config.modelHyperParameters.get("enformerNumberOfWorkers"),

    # General configs
    "file_sharing_strategy": config.modelGeneralConfigs.get("fileSharingStrategy"),
    "enformerOutputFileCompression": config.modelGeneralConfigs.get("enformerOutputFileCompression"),
    "enformerOutputFileChunkSize": config.modelGeneralConfigs.get("enformerOutputFileChunkSize"),

    # Dataset names
    "trainingLabelsDatasetName": config.datasetNames.get("trainingLabels"),
    "validationLabelsDatasetName": config.datasetNames.get("validationLabels"),
    "testLabelsDatasetName": config.datasetNames.get("testLabels"),
    "trainingEnformerOutputDatasetName": config.datasetNames.get("trainingEnformerOutput"),
    "validationEnformerOutputDatasetName": config.datasetNames.get("validationEnformerOutput"),
    "testEnformerOutputDatasetName": config.datasetNames.get("testEnformerOutput"),
}

print(arguments)
fileIndicesDict = {}

The device used is : cuda
arguments in file sequenceDataset are {'coordStoreDirectory': '/hpc/compgen/projects/fragclass/analysis/mvivekanandan/output/subsetClassBalancedCoordinateFiles', 'refGenomePath': '/hpc/compgen/projects/fragclass/raw/hg19_genome/hg19_ch1-22_XYM.fa', 'usePaddingForSequenceOutput': None, 'modelInputSequenceSize': 'enformer', 'runWithControls': False, 'usePaddingForCnn': False, 'trainingCoordsDatasetName': 'trainingCoords', 'validationCoordsDatasetName': 'validationCoords', 'testCoordsDatasetName': 'testCoords', 'trainingLabelsDatasetName': 'trainingLabels', 'validationLabelsDatasetName': 'validationLabels', 'testLabelsDatasetName': 'testLabels'}
{'refGenomePath': '/hpc/compgen/projects/fragclass/raw/hg19_genome/hg19_ch1-22_XYM.fa', 'coordStoreDirectory': '/hpc/compgen/projects/fragclass/analysis/mvivekanandan/output/subsetClassBalancedCoordinateFiles', 'trainingEnformerOuputStoreFile': '/hpc/compgen/projects/fragclass/analysis/mvivekanandan/output/EnformerOutputs

In [3]:
def getEnformerPredictions(enformer_model, sequence, bins, ntracks, batchNum):
    """
    Run the Enformer model on one batch and keep only the two requested bins per sample.

    Args:
        enformer_model: callable that takes `sequence` and returns a mapping with a
            'human' entry. That entry is indexed as [sample, bin, track].
        sequence: model input for the whole batch (passed to the model unchanged).
        bins: indexable pair; bins[0] holds the start bin of every sample and bins[1]
            the end bin (inclusive). Exactly two bins must be selected per sample
            (end = start + 1), because each slice is reshaped to (1, 2, ntracks).
        ntracks: number of tracks per bin in the model output.
        batchNum: batch number, only used in the assertion message.

    Returns:
        A CPU tensor of shape (batch_size, 2 * ntracks): for every sample the tracks of
        the first selected bin followed by the tracks of the second selected bin.

    Raises:
        AssertionError: if the model output contains no elements.
    """
    startBins = bins[0]
    endBins = bins[1]

    #Inference only, so no autograd graph needs to be built for the forward pass.
    with torch.no_grad():
        full_enformer_output = enformer_model(sequence)['human']

    #Move the prediction to the CPU so that it can later be converted to a numpy array by the caller.
    #detach() drops any link to the autograd graph; only the predicted values are needed here.
    full_enformer_output = full_enformer_output.detach().cpu()
    num_samples = full_enformer_output.shape[0]

    #Collect the per sample slices in a list and concatenate once at the end. Growing a tensor with torch.cat
    #inside the loop copies everything stored so far in every iteration.
    per_sample_outputs = []
    for i in range(num_samples):
        #numel() is the number of elements; it has to be compared against 0 to detect an empty prediction.
        assert full_enformer_output.numel() > 0, f"Something is wrong !! The enformer pedictions is empty for batch {batchNum} and within batch iteration {i}"
        #endBin + 1 because the end index of a slice is exclusive. x[:, 1:3, :] gives the 1st and 2nd index.
        single_sample_output = full_enformer_output[i, int(startBins[i]):int(endBins[i]) + 1, :].reshape(1, 2, ntracks)
        per_sample_outputs.append(single_sample_output)

    #An empty batch yields an empty (0, 2 * ntracks) result; torch.cat does not accept an empty list.
    if not per_sample_outputs:
        return torch.empty(0, 2 * ntracks)

    final_enformer_output = torch.cat(per_sample_outputs, dim=0)

    #Combine enformer outputs from 2 bins into a single long output. Each bin, each track is a feature. So the total
    #number of features for training is num_bins * num_tracks_per_bin. All features can be combined in a
    #single 1D tensor array. The other dimension will be the batch size.
    batch_size, nbins, ntracks = final_enformer_output.shape
    return torch.reshape(final_enformer_output, (batch_size, nbins * ntracks))

In [4]:
#Look into how much h5py content can be compressed. Greater the compression, longer the time needed to read it again.
def storeAsH5pyFile(sampleType, numSamples, numEnformerOuputSingleSample, createDataset = False, h5_file = False,
                    enformerOutputToStore=False, labelsToStore=False, currentIndex = False):
    """
    Create the datasets of the enformer output file, or append one batch to them.

    The dataset names and the file path are looked up in the global `arguments`
    dictionary using `sampleType` as key prefix.

    Two modes, selected by `createDataset`:
      * truthy  - open the output file (unless an open `h5_file` is passed in), create
                  the enformer output dataset of shape (numSamples,
                  numEnformerOuputSingleSample) and the labels dataset of shape
                  (numSamples, 1), both gzip compressed, and return the file handle.
      * falsy   - write `enformerOutputToStore` and `labelsToStore` into rows
                  [currentIndex, currentIndex + len(labelsToStore)) of the two
                  datasets and return the index one past the last written row.
    """
    outputDataset = arguments[sampleType + "EnformerOutputDatasetName"]
    labelDataset = arguments[sampleType + "LabelsDatasetName"]
    storePath = arguments[sampleType + "EnformerOuputStoreFile"]
    print(f"enformer output file path is {storePath}")

    # Append mode: store this batch right behind what has been written so far.
    if not createDataset:
        batchEnd = currentIndex + len(labelsToStore)
        h5_file[outputDataset][currentIndex:batchEnd, :] = enformerOutputToStore
        h5_file[labelDataset][currentIndex:batchEnd, :] = labelsToStore
        return batchEnd

    # Creation mode: this is the first call for the file.
    print("This is the 1st time. Inside createDataset")

    if h5_file == False:
        # "w-" creates the file and fails if it already exists.
        h5_file = h5py.File(storePath, "w-")

    gzipLevel = arguments["enformerOutputFileCompression"]
    rowsPerChunk = arguments["enformerOutputFileChunkSize"]

    # Both datasets have one row per sample; they only differ in their width.
    for datasetName, width in ((outputDataset, numEnformerOuputSingleSample), (labelDataset, 1)):
        h5_file.create_dataset(datasetName, (numSamples, width),
                               compression="gzip", compression_opts=gzipLevel,
                               chunks=(rowsPerChunk, width))
    return h5_file

def storeMetadataAsCsv(sampleType, filepathData, indexData):
    """
    Write the origin of every stored sample to a tab separated file.

    The output path is arguments[sampleType + "Metadata"]. The table has two columns:
    'og_file' (taken from `filepathData`) and 'indexInFile' (taken from `indexData`).
    """
    csvPath = arguments[sampleType + "Metadata"]
    columns = {'og_file': filepathData, 'indexInFile':indexData}
    metadataFrame = pd.DataFrame(columns)
    print(f"Shape of metadata df after all batches are done is {metadataFrame.shape}")
    metadataFrame.to_csv(csvPath, sep='\t', index=False)

In [5]:
def set_worker_sharing_strategy(worker_id: int) -> None:
    """
    Worker initialisation hook for the DataLoader.

    Applies the tensor sharing strategy configured under
    arguments["file_sharing_strategy"]. `worker_id` is part of the hook signature
    and is not used.
    """
    strategy = arguments["file_sharing_strategy"]
    torch.multiprocessing.set_sharing_strategy(strategy)

def storeEnformerOutput(sampleType, h5_file = False):
    """
    Compute the enformer output for every sample of `sampleType` and store it on disk.

    For each batch produced by the DataLoader the enformer prediction (restricted to
    2 bins, see getEnformerPredictions) and the labels are appended to the H5PY file
    configured for `sampleType`. After all batches are done, the original file path and
    the index within that file of every sample are written to a CSV file.

    Args:
        sampleType: prefix used to look up paths and dataset names in `arguments`
            (the calls in this notebook use "training", "validation" or "test").
        h5_file: an already opened H5PY file to write into. With the default (False)
            the file is created by storeAsH5pyFile.

    Returns:
        None. The results are only written to disk.

    The H5PY file is closed when the function exits, also when a batch fails or the
    run is interrupted. The metadata CSV is only written after a complete run.
    """
    nbins = 2
    ntracks = 5313

    #Set the model to eval mode first and then send it to cuda. This prevents the GPU node from running out of memory.
    enformerModel = Enformer.from_pretrained('EleutherAI/enformer-official-rough', use_checkpointing = True).eval()
    enformerModel = enformerModel.to(device)

    enformerInputDataset = sequenceDataset.SequenceDataset(sampleType)
    enformerInputDataloader = DataLoader(enformerInputDataset, batch_size=arguments["enformerBatchSize"],
                                         num_workers=arguments["enformerNumberOfWorkers"],
                                         shuffle=True, worker_init_fn=set_worker_sharing_strategy)

    numSamples = len(enformerInputDataset)

    #Create the datasets for storing enformer output.
    h5_file = storeAsH5pyFile(sampleType, numSamples, nbins * ntracks, True, h5_file)

    filepath_data, index_data = [], []
    currentH5Index = 0

    #try/finally: the file is created with mode "w-" and must not stay open (and locked) if a batch raises
    #or the cell is interrupted half way.
    try:
        for i, data in enumerate(enformerInputDataloader, 0):
            print(f"Processing data for batch {i}", flush = True)

            #Store the filepath and the index within file to a separate CSV file. This is to ensure that we are able to locate the sample
            #so we can access the metadata(from original coordinate bed file) associated with the sample.
            #filepath and index should have all the samples data from this batch.
            encodedSequence, label, bins, filepath, indexWithinFile, _= data

            filepath_data.extend(filepath)
            #If the collated indices arrive as a tensor, store plain Python numbers. Extending the list with a
            #tensor would add 0-d tensors, which end up as "tensor(...)" strings in the CSV.
            #NOTE(review): assumes indexWithinFile is either a tensor or a plain sequence - confirm against SequenceDataset.
            if torch.is_tensor(indexWithinFile):
                index_data.extend(indexWithinFile.tolist())
            else:
                index_data.extend(indexWithinFile)

            encodedSequence = encodedSequence.to(device)

            #Will be of the shape [batch_size * 10626]
            enformerPrediction = getEnformerPredictions(enformerModel, encodedSequence, bins, ntracks, i).detach().cpu().numpy()

            #The data is getting too big to load, round off enformer predictions to 3 decimal places.
            enformerPrediction = np.around(enformerPrediction, decimals=3)
            print(f"Finished processsing batch {i}, enformer prediction shape is {enformerPrediction.shape}")

            #H5 file contents are updated every batch. To ensure that the contents are not overwritten every batch, store with indices.
            #The indices given are ascending order numbers starting from 0, this ensures that the shuffled order is maintained while
            #storing in H5PY file.
            currentH5Index = storeAsH5pyFile(sampleType, numSamples, nbins * ntracks, False, h5_file, enformerPrediction, label, currentH5Index)
            print(f"The number of samples stored in H5PY file so far is {currentH5Index}")
    finally:
        h5_file.close()

    #Store the filename and index within the file for each sample as a CSV file for later use.
    storeMetadataAsCsv(sampleType, filepath_data, index_data)

In [6]:
def verifyStoredEnformerTracks():
    """
    Check the stored enformer output against the coordinate files it was derived from.

    Assertions done for the stored enformer tracks:
    1. The shape of the enformer output dataset is [num_samples_in_coordinate_files, 10626].
    2. The number of positives (label 1) and negatives (label 0) in the enformer output
       file equals the numbers found in the coordinate store directory.

    Only the validation output file is checked at the moment; the training and test
    entries are commented out below.

    Raises:
        AssertionError: if any of the checks above fails.
    """
    coordsDir = arguments["coordStoreDirectory"]

    #Per sample type: [number of positives, number of negatives] summed over all coordinate files.
    sampleCounts = {sampleType: [0, 0] for sampleType in ["training", "validation", "test"]}

    for filename in os.listdir(coordsDir):
        filepath = os.path.join(coordsDir, filename)
        with h5py.File(filepath, 'r') as f:
            for sampleType in ["training", "validation", "test"]:
                labelsDataset = sampleType + "Labels"
                labels = f[labelsDataset][:]
                sampleCounts[sampleType][0] += (labels == 1).sum()
                sampleCounts[sampleType][1] += (labels == 0).sum()

    x = sampleCounts["training"]
    y = sampleCounts["validation"]
    z = sampleCounts["test"]

    print(f"Finished going over the coordinate files, the numbers are {x}, {y} and {z}")

    outputFilesDict = {}
    # outputFilesDict["training"] = arguments["trainingEnformerOuputStoreFile"]
    outputFilesDict["validation"] = arguments["validationEnformerOuputStoreFile"]
    # outputFilesDict["test"] = arguments["testEnformerOuputStoreFile"]

    for sampleType, file in outputFilesDict.items():
        with h5py.File(file, 'r') as f:
            #Use the same configured dataset names that storeAsH5pyFile writes to.
            enformerOutputDataset = arguments[sampleType + "EnformerOutputDatasetName"]
            labelsDataset = arguments[sampleType + "LabelsDatasetName"]

            #Take the shape from the dataset itself. Slicing with [:] first would read the whole
            #enformer output into memory only to look at its shape.
            enformerDataShape = f[enformerOutputDataset].shape
            labels = f[labelsDataset][:]

            expectedSamples = sampleCounts[sampleType][0] + sampleCounts[sampleType][1]

            print(f"Enformer output shape is {enformerDataShape}")
            #Assertion 1 Verify that shape of enformer output is as expected:
            assert enformerDataShape[0] == expectedSamples, (f"The total number of samples in enformer output"+
                                                             f" file({enformerDataShape[0]}) does not match the "+
                                                             "total samples in coordinate store directory"
                                                             f"({expectedSamples})")
            assert enformerDataShape[1] == 10626, (f"The number of enformer tracks in the output file({enformerDataShape[1]}) "+
                                                    "for samples is not 10626 !!")

            #Assertion - 2 Verify that the number of positives and negatives match
            numPositives = (labels == 1).sum()
            numNegatives = (labels == 0).sum()
            print(f"Num positives and negatives in enformer for sample {sampleType} are {numPositives} and {numNegatives}")
            print(f"Num pos and neg in coord for sampleType {sampleType} are {sampleCounts[sampleType][0]} and {sampleCounts[sampleType][1]}")

            assert numPositives == sampleCounts[sampleType][0], (f"The number of positives in enformer file({numPositives}) "+
                                                                 f"does not match the original positives {sampleCounts[sampleType][0]}")
            assert numNegatives == sampleCounts[sampleType][1], (f"The number of negatives in enformer file({numNegatives}) "+
                                                                 f"does not match the original negatives {sampleCounts[sampleType][1]}")

In [8]:
# Driver cell: generate the enformer output file for one sample type and then run the
# verification. The calls for the other sample types are kept commented out so they can
# be switched on one at a time.
if __name__ == '__main__':
    #Get enformer Output for training data
    storeEnformerOutput("training")
    # storeEnformerOutput("validation")
    # storeEnformerOutput("test")
    # NOTE(review): verifyStoredEnformerTracks only opens the validation output file, while
    # this cell only generates the training output - confirm that the validation file
    # already exists before relying on this check.
    verifyStoredEnformerTracks()

Total number of samples in all files combined is 212720
enformer output file path is /hpc/compgen/projects/fragclass/analysis/mvivekanandan/output/EnformerOutputs/training_test.hdf5
This is the 1st time. Inside createDataset
Processing data for batch 0




Finished processsing batch 0, enformer prediction shape is (8, 10626)
enformer output file path is /hpc/compgen/projects/fragclass/analysis/mvivekanandan/output/EnformerOutputs/training_test.hdf5
The number of samples stored in H5PY file so far is 8
Processing data for batch 1
Finished processsing batch 1, enformer prediction shape is (8, 10626)
enformer output file path is /hpc/compgen/projects/fragclass/analysis/mvivekanandan/output/EnformerOutputs/training_test.hdf5
The number of samples stored in H5PY file so far is 16
Processing data for batch 2
Finished processsing batch 2, enformer prediction shape is (8, 10626)
enformer output file path is /hpc/compgen/projects/fragclass/analysis/mvivekanandan/output/EnformerOutputs/training_test.hdf5
The number of samples stored in H5PY file so far is 24
Processing data for batch 3
Finished processsing batch 3, enformer prediction shape is (8, 10626)
enformer output file path is /hpc/compgen/projects/fragclass/analysis/mvivekanandan/output/Enf

KeyboardInterrupt: 

In [4]:
#Check if the getting index within file portion is working properly. 
# Sanity check for the mapping from a global sample index to (file, index within file).
# NOTE(review): this path is hard-coded; it looks like the same directory as
# arguments["coordStoreDirectory"] - confirm and consider reusing the config value.
coordStoreDir = "/hpc/compgen/projects/fragclass/analysis/mvivekanandan/output/subsetClassBalancedCoordinateFiles"
coordDatasetName = "trainingCoords"
# presumably startIndexList[k] is the global index of the first sample of fileNamesList[k];
# verify against utils.createFileNameIndexList.
startIndexList, fileNamesList = utils.createFileNameIndexList(coordStoreDir, coordDatasetName)

# Spot-check a few global indices.
for index in [100, 10000, 20000, 30000, 100000]:
    # Position (in both lists) of the file the global index is attributed to.
    filePosition = utils.getFilePositionFromIndex(startIndexList, index)
    filename = fileNamesList[filePosition]
    # Offset of the sample relative to the start index of its file.
    indexWithinFile = index - startIndexList[filePosition]
    print(f"For index {index}, filename : {filename} and index within file : {indexWithinFile}")

Total number of samples in all files combined is 212720
For index 100, filename : L20-M35.recipient.hdf5 and index within file : 100
For index 10000, filename : L20-M35.recipient.hdf5 and index within file : 10000
For index 20000, filename : L13-M24.donor.hdf5 and index within file : 483
For index 30000, filename : L20-M35.donor.hdf5 and index within file : 3811
For index 100000, filename : L10-M12_75.donor.hdf5 and index within file : 186
