In [1]:
# --- Imports: standard library, third-party, project-local -----------------
import os

import numpy as np
import pandas as pd
import pyBigWig
# The alias must be `sns`: every plotting call in this notebook uses `sns.heatmap`.
# (It was previously bound as `snst`, which made those calls fail on a fresh kernel.)
import seaborn as sns
import matplotlib.pyplot as plt

from dataset import Dataset

# --- Configuration ----------------------------------------------------------
# Marks analysed by this notebook; used as lookup keys into `Dataset.signals`
# and as DataFrame column names.
HMs = ["H3K4me3", "H3K27ac", "H3K4me1", "H3K36me3", "H3K9me3", "H3K27me3"]

# Default biosample name. NOTE: the loops further down reuse `biosample` as
# their loop variable and therefore overwrite this value.
biosample = "A549"
# Root directory containing one sub-directory per biosample. The original
# called `.format(biosample)` on this literal, but it has no placeholder, so
# the resulting value is exactly this string.
dataset_dir = "../dataset/new/"
# Bin size handed to `Dataset` (presumably base pairs per bin; confirm in Dataset).
bin_size = 1000

In [2]:
def convertInputFormat(dataset):
    """Flatten a dataset's per-chromosome signals into one column per mark.

    For every entry of the module-level ``HMs`` list, the values stored under
    ``dataset.signals[HM]["chr1"]`` ... ``["chr22"]`` and then ``["chrX"]``
    are concatenated in that order into a single list.

    Parameters
    ----------
    dataset : object
        Must expose ``signals``, indexable as ``signals[HM][chromosome]`` and
        yielding an iterable of values for each chromosome.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``HMs``, holding the concatenated values.
    """
    # Chromosome keys in concatenation order: chr1 .. chr22, then chrX.
    chromosomes = ["chr{}".format(label) for label in [*range(1, 23), "X"]]

    merged = {mark: [] for mark in HMs}
    for mark in HMs:
        column = merged[mark]
        for chromosome in chromosomes:
            column.extend(dataset.signals[mark][chromosome])

    return pd.DataFrame(merged)

def plotAndSave(df, path, bin_size=1000):
    """Draw Pearson/Spearman/Kendall correlation heatmaps of ``df`` and save them.

    The three heatmaps are placed side by side in one figure. Only the first
    panel keeps its y tick labels; the other two hide them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column-wise correlations are plotted.
    path : str
        Destination passed to ``Figure.savefig``.
    bin_size : int, optional
        Value shown in the figure caption. Defaults to 1000, which reproduces
        the previously hard-coded caption text.

    Returns
    -------
    None
    """
    corr_types = ["pearson", "spearman", "kendall"]
    fig, axs = plt.subplots(1, len(corr_types), figsize=(20, 5))

    for ith, (ax, corr_type) in enumerate(zip(axs, corr_types)):
        ax.set_title(corr_type, fontsize=15)
        sns.heatmap(
            df.corr(method=corr_type),
            vmin=-1,
            vmax=1,
            ax=ax,
            annot=True,
            # "auto" is seaborn's default; panels after the first drop the labels.
            yticklabels="auto" if ith == 0 else False,
        )

    # The caption is placed below the figure area (y < 0) ...
    fig.suptitle("Bin size : {}".format(bin_size), fontsize=15, x=0.5, y=-0.1)
    # ... so the saved image needs a tight bounding box, otherwise the caption
    # lies outside the saved canvas and is cropped from the file.
    fig.savefig(path, bbox_inches="tight")

In [22]:
# Build one Dataset per biosample directory, keep its signals in `datasets`,
# and save a per-biosample correlation figure next to the data.
datasets = dict()
# Each biosample is loaded from "<dataset_dir><name>/", so only directory
# entries are usable; stray files in `dataset_dir` are skipped.
biosamples = [
    entry
    for entry in os.listdir(dataset_dir)
    if os.path.isdir(os.path.join(dataset_dir, entry))
]

for biosample in biosamples:
    dataset = Dataset(biosample, dataset_dir + biosample + "/", bin_size)
    dataset.mp_preprocess()
    # Stored before plotting so the entry exists even if the plot step fails.
    datasets[biosample] = dataset.signals

    try:
        # Named `hm_df` rather than `df` so this loop does not clobber the
        # `df` that the biosample-level heatmap cell reads.
        hm_df = convertInputFormat(dataset)
        path = dataset_dir + "{}/correlation_HMs.png".format(biosample)
        plotAndSave(hm_df, path)
        # On success the entry is replaced by the result of `signals.copy()`.
        datasets[biosample] = dataset.signals.copy()
    except Exception as err:
        # Catch `Exception` only (a bare `except:` would also swallow
        # KeyboardInterrupt) and report the cause instead of hiding it.
        print("{} --- ERROR: {!r}".format(biosample, err))

In [7]:
# List the entries of the hard-coded "dataset/primary cell" directory and print them.
# This rebinds `biosamples`, replacing the listing taken from `dataset_dir`.
# NOTE(review): this path differs from `dataset_dir` ("../dataset/new/"), which was
# used to fill `datasets`; the next cell indexes `datasets` by these names, so
# confirm both directories contain the same biosample names.
biosamples = os.listdir("dataset/primary cell")
print(biosamples)

['naive thymus-derived CD4-positive, alpha-beta T cell', 'osteoblast', 'CD8-positive, alpha-beta T cell', 'fibroblast of dermis', 'mammary epithelial cell', 'activated T-cell', 'foreskin melanocyte', 'fibroblast of lung', 'foreskin fibroblast', 'common myeloid progenitor, CD34-positive', 'T-helper 17 cell', 'peripheral blood mononuclear cell', 'neutrophil', 'neurosphere', 'natural killer cell', 'CD4-positive, alpha-beta T cell', 'naive thymus-derived CD8-positive, alpha-beta T cell', 'effector memory CD4-positive, alpha-beta T cell', 'fibroblast of breast', 'B cell', 'CD8-positive, alpha-beta memory T cell', 'CD4-positive, CD25-positive, alpha-beta regulatory T cell', 'immature natural killer cell', 'astrocyte', 'endothelial cell of umbilical vein', 'CD4-positive, alpha-beta memory T cell', 'keratinocyte', 'foreskin keratinocyte', 'skeletal muscle myoblast', 'T-cell', 'CD14-positive monocyte']


In [12]:
# For every biosample, concatenate the signals of all marks (in `HMs` order),
# each over chr1..chr22 and then chrX, into one flat list.
biosample_signal_pair = {name: [] for name in biosamples}
for biosample in biosamples:
    for HM in HMs:
        for ith in [*range(1, 23), "X"]:
            biosample_signal_pair[biosample].extend(
                datasets[biosample][HM]["chr{}".format(ith)]
            )

In [18]:
# Turn the {biosample: flat signal list} mapping into a frame with one column
# per biosample.
df = pd.DataFrame(biosample_signal_pair)
# Drop the name of the intermediate dict now that `df` has been built.
# NOTE: after this `del`, re-running this cell raises NameError unless the
# cell that builds `biosample_signal_pair` is run again first.
del biosample_signal_pair

In [23]:
# Kendall correlation between the columns of `df`, drawn on an explicit
# Axes so the title and label calls do not depend on pyplot's "current axes" state.
fig, ax = plt.subplots(figsize=(30, 20))
sns.heatmap(df.corr(method="kendall"), vmin=-1, vmax=1, ax=ax)
ax.set_title("Kendall", fontsize=40)
# Take the label from the configured `bin_size` so it cannot drift from the
# value actually used to build the datasets.
ax.set_xlabel("Bin size : {}".format(bin_size), fontsize=30)
# fig.savefig("dataset/primary cell/corr_biosample.png")