In [1]:
import pandas as pd
import pyBigWig
import numpy as np
from tqdm import tqdm

import os
import time
import multiprocessing as mp

In [2]:
# Histone marks profiled for every biosample, and the chromosomes that are
# counted for each of them (chr1..chr22 plus chrX).
HMs = ["H3K4me3", "H3K27ac", "H3K4me1", "H3K36me3", "H3K9me3", "H3K27me3"]
chrom_names = ["chr" + str(ith) for ith in list(range(1, 23)) + ["X"]]

# One identifier column followed by one "<HM>_<chrom>" column per
# (histone mark, chromosome) pair.
column_name = ["biosample"]
for HM in HMs:
    for chrom in chrom_names:
        column_name.append("{}_{}".format(HM, chrom))

# Read the chromosome lengths once from a single bigWig header and release
# the file handle even if the lookup fails.
bw = pyBigWig.open("dataset/cell line/A549/H3K4me3/ENCFF006GYA.bigWig")
try:
    chrom_sizes = bw.chroms()
finally:
    bw.close()

# The "reference" row repeats the chromosome lengths once per histone mark so
# that it lines up with the "<HM>_<chrom>" columns.
ref_row = ["reference"] + [chrom_sizes[chrom] for chrom in chrom_names] * len(HMs)
df = pd.DataFrame(data=[ref_row], columns=column_name)

In [3]:
class Dataset():
    """Count missing (NaN) signal values per histone mark and chromosome.

    For every histone mark in ``HMs`` a bigWig file is looked up under
    ``<dataset_path>/<HM>/`` and each chromosome in ``CHROMS`` is scanned in
    windows of ``bin_size`` positions. Instantiating the class runs
    ``preprocess`` immediately, after which ``nan_count`` holds one
    ``("<HM>_<chrom>", count)`` tuple per (histone mark, chromosome) pair.
    """

    # Chromosomes scanned for every histone mark: chr1..chr22 plus chrX.
    CHROMS = ["chr" + str(ith) for ith in list(range(1, 23)) + ["X"]]

    def __init__(self, name, dataset_path, bin_size):
        """Store the configuration and start the NaN counting right away.

        Args:
            name: Label of the biosample (stored, not otherwise used here).
            dataset_path: Directory containing one sub-directory per histone mark.
            bin_size: Number of positions read from the bigWig file per request.
        """
        self.HMs = ["H3K4me3", "H3K27ac", "H3K4me1", "H3K36me3", "H3K9me3", "H3K27me3"]

        self.name = name
        self.bin_size = bin_size
        self.path = dataset_path
        # Manager-backed list so that worker processes can append their results.
        self.nan_count = mp.Manager().list()
        # Build a fresh list for every (mark, chromosome) pair; copying a single
        # template dict would make all marks share the same list objects.
        self.signals = {HM: {chrom: [] for chrom in self.CHROMS} for HM in self.HMs}

        ######## Automatically run the processing code
        self.preprocess()

    def getDatasetPath(self, HM):
        """Return the path of the bigWig file stored for histone mark ``HM``.

        The first entry (in sorted order, so the choice is deterministic) of
        ``<self.path>/<HM>/`` is used. ``self.path`` may or may not end with a
        path separator.

        Raises:
            IndexError: If the directory is empty.
        """
        hm_dir = os.path.join(self.path, HM)
        return os.path.join(hm_dir, sorted(os.listdir(hm_dir))[0])

    def ruleOfFill(self, idx, signal):
        """Suggest a replacement for the (NaN) value at ``signal[idx]``.

        Rules:
            * first position: the next value, or 0 when there is no next value
              or the next value is NaN;
            * last position, or next value is NaN: the previous value;
            * otherwise: the mean of the previous and the next value.

        The previous value is not checked, so the result may itself be NaN.
        NOTE(review): no caller of this method is visible in this notebook.
        """
        last_idx = len(signal) - 1
        if idx == 0:
            # A single-element signal has no neighbour to borrow from.
            if last_idx == 0 or np.isnan(signal[1]):
                return 0
            return signal[1]
        # Check if final position or next position is NaN
        if idx == last_idx or np.isnan(signal[idx + 1]):
            return signal[idx - 1]
        # Previous and next position are both usable: average them
        return (signal[idx - 1] + signal[idx + 1]) / 2

    def processMissingValue(self, signal):
        """Return the number of NaN entries in ``signal`` as a Python int."""
        return int(np.count_nonzero(np.isnan(signal)))

    def _countMissingInChrom(self, bw, chr_ith):
        """Sum the NaN counts of all ``bin_size``-wide windows of one chromosome.

        ``bw`` is an already opened bigWig handle. The last window is clipped to
        the chromosome length, so chromosomes shorter than one bin are handled.
        """
        chr_len = bw.chroms()[chr_ith]
        total_count = 0
        for start_idx in range(0, chr_len, self.bin_size):
            end_idx = min(start_idx + self.bin_size, chr_len)
            raw_signal = bw.values(chr_ith, start_idx, end_idx, numpy=True)
            total_count += self.processMissingValue(raw_signal)
        return total_count

    def getBinSignalValuePerChrom(self, HM, chr_ith):
        """Count the NaN values of one chromosome and record them in ``nan_count``.

        Appends ``("<HM>_<chr_ith>", count)`` to ``self.nan_count``.
        """
        # Because a bigWig file object cannot be pickled, the dataset is opened
        # inside each worker process and closed again when the worker is done.
        bw = pyBigWig.open(self.getDatasetPath(HM))
        try:
            total_count = self._countMissingInChrom(bw, chr_ith)
        finally:
            bw.close()

        self.nan_count.append((HM + "_" + chr_ith, total_count))

    def preprocess(self):
        """Run ``getBinSignalValuePerChrom`` for every (mark, chromosome) pair.

        The tasks are distributed over a process pool. ``starmap`` blocks until
        all tasks have finished and re-raises the first exception raised in a
        worker, so a failure cannot silently leave entries out of ``nan_count``.
        """
        args = [(HM, chrom) for HM in self.HMs for chrom in self.CHROMS]
        with mp.Pool() as pool:
            pool.starmap(self.getBinSignalValuePerChrom, args)

In [4]:
# Every biosample is a sub-directory of dataset/tissue. Plain files are skipped
# because this notebook later writes nan_count.xlsx into the same directory,
# which would otherwise be picked up as a biosample on the next run.
# Sorting makes the processing order reproducible.
tissue_dir = "dataset/tissue"
biosamples = sorted(
    entry for entry in os.listdir(tissue_dir)
    if os.path.isdir(os.path.join(tissue_dir, entry))
)
print(len(biosamples))

62


In [5]:
# Number of positions read from a bigWig file per request.
BIN_SIZE = 100000

# Collect one record per biosample and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.
new_rows = []
tqdm_bar = tqdm(biosamples)
for biosample in tqdm_bar:
    tqdm_bar.set_description("Processing {}".format(biosample))
    data_object = Dataset(biosample, "dataset/tissue/{}/".format(biosample), BIN_SIZE)
    # nan_count holds ("<HM>_<chrom>", count) tuples, i.e. column name -> value.
    new_row = dict(data_object.nan_count)
    new_row["biosample"] = biosample
    new_rows.append(new_row)

if new_rows:
    # Replace rows of biosamples processed in an earlier run of this cell so
    # that re-running it does not duplicate them; all other rows are kept.
    processed = [row["biosample"] for row in new_rows]
    kept_rows = df[~df["biosample"].isin(processed)]
    df = pd.concat([kept_rows, pd.DataFrame(new_rows)], ignore_index=True)

Processing esophagus: 100%|██████████| 62/62 [1:18:49<00:00, 76.28s/it]


In [7]:
# Save the table of NaN counts as a spreadsheet: column names are written as
# the header row and the DataFrame index is left out.
df.to_excel(
    excel_writer="dataset/tissue/nan_count.xlsx",
    header=True,
    index=False,
)