In [10]:
import pandas as pd
import pickle
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import scipy.stats as stats
from statsmodels.stats.multitest import multipletests

# from BCBio import GFF
# from Bio.Seq import Seq
# from Bio.SeqUtils import GC
# from Bio.SeqRecord import SeqRecord
# from Bio.SeqFeature import SeqFeature, FeatureLocation
# import mygene
# import h5py

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
import matplotlib.patches as patches
from matplotlib.colors import LinearSegmentedColormap

In [11]:
targets = [("H3K27ac", "_narrow"), ("H3K36me3", ""), 
           ("H3K4me1", "_narrow"), ("H3K4me2", "_narrow"), ("H3K4me3", "_narrow"), ("H3K79me2", ""), 
           ("H3K9ac", "_narrow"), ("H3K9me3", ""), ("H4K20me1", ""), ("H3K27me3", ""), ("methylation", "")]

In [12]:
#Строим таблицу, где строки это все возможные lnc, для которых есть хотя бы одно значимое пересечение хотя бы для одной метки
#а столбцы мультииндекс по метке и по типу
def getTableForAllMarks(iMARGI=False, annotation_prefix=""):
    g_annotation_prefix = annotation_prefix + "_" if annotation_prefix else ""
    annotation_prefix = "_" + annotation_prefix if annotation_prefix else ""
    
    df_first = pd.read_csv("../data/all_marks/" + targets[0][0] + "/our_fantom" + annotation_prefix + "_genes_association_pvalues.tsv", sep="\t")
    df_first['lncRNAName'] = ["_".join([name] + i.split('_')[1:]) for i, name in zip(df_first['lncRNAId'], df_first['lncRNAName'])]
    if(iMARGI):
        g = pd.read_csv("../data/all_marks/" + targets[0][0] + "/" + g_annotation_prefix + "genometric_result_all_rnas.tsv", sep="\t")
        df_first = df_first[df_first['lncRNAId'].isin(g['lnc'])][['lncRNAName', 'pm_pvalue', 'mm_pvalue', 'pp_pvalue', 'mp_pvalue']]
    else:
        df_first = df_first[['lncRNAName', 'pm_pvalue', 'mm_pvalue', 'pp_pvalue', 'mp_pvalue']]
        
    for i in range(1, len(targets)):
        df_second = pd.read_csv("../data/all_marks/" + targets[i][0] + "/our_fantom" + annotation_prefix + "_genes_association_pvalues.tsv", sep="\t")
        df_second['lncRNAName'] = ["_".join([name] + i.split('_')[1:]) for i, name in zip(df_second['lncRNAId'], df_second['lncRNAName'])]
        if(iMARGI):
            g = pd.read_csv("../data/all_marks/" + targets[i][0] + "/" + g_annotation_prefix + "genometric_result_all_rnas.tsv", sep="\t")
            df_second = df_second[df_second['lncRNAId'].isin(g['lnc'])][['lncRNAName', 'pm_pvalue', 'mm_pvalue', 'pp_pvalue', 'mp_pvalue']]
        else:
            df_second = df_second[['lncRNAName', 'pm_pvalue', 'mm_pvalue', 'pp_pvalue', 'mp_pvalue']]
            
        df_first = df_first.merge(df_second, how="outer", left_on='lncRNAName', right_on='lncRNAName')
    
    df_first = df_first.set_index('lncRNAName')
    # del df_first.index.name
    df_first = df_first.applymap(lambda x: 0 if np.isnan(x) else -np.log10(x))
    columns_index_1 = ["H3K27ac", "H3K36me3", "H3K4me1", "H3K4me2", "H3K4me3", "H3K79me2", "H3K9ac", "H3K9me3", "H4K20me1", "H3K27me3", "Methylation"]
    columns_index_2 = ['wa', 'ea', 'wr', 'er']
    df_first.columns = pd.MultiIndex.from_product([columns_index_1, columns_index_2], names=['target', 'type'])
    
    np.unravel_index(np.argmax(df_first.values, axis=None), df_first.values.shape)
    
    #df_first = df_first.drop('EMX2OS')
    
    return df_first

In [13]:
def getDataWithoutASOs(iMARGI=False, clusters=False):
    df = getTableForAllMarks(iMARGI=iMARGI, annotation_prefix="fantom_aso")
    df = df.mask(df < 1.3, 0)
    df['geneName'] = [i.split('_')[0] for i in df.index]
    b = df.groupby(['geneName']).apply(check_asos)
    b = b.drop(['geneName'], axis=1)
    b = b[b.astype(bool).sum(axis=1) > 0]
    if clusters:
        return b.apply(add_cluster, axis = 1, result_type="expand")
    else:
        return b

In [14]:
def check_asos(df):
    count = pd.DataFrame({"nonzero_count" : df.astype(bool).sum(axis=0)})['nonzero_count']
    if df.shape[0] == 1:  # Нет нескольких ASO - 1
        return count
    else:
        s = pd.Series([2 if b else 0 for b in count > df.shape[0]/2], index=count.index)
        return s

In [15]:
data = getDataWithoutASOs(iMARGI=False)
# https://stackoverflow.com/a/55757002/310453
data.columns = ["-".join(a) for a in data.columns.to_flat_index()]
data.to_csv('../data/heatmap_data.genes.tsv', sep="\t")

  df_first = df_first.merge(df_second, how="outer", left_on='lncRNAName', right_on='lncRNAName')
  df_first = df_first.merge(df_second, how="outer", left_on='lncRNAName', right_on='lncRNAName')
  df_first = df_first.merge(df_second, how="outer", left_on='lncRNAName', right_on='lncRNAName')
  df_first = df_first.merge(df_second, how="outer", left_on='lncRNAName', right_on='lncRNAName')
  b = b.drop(['geneName'], axis=1)


In [16]:
data = getDataWithoutASOs(iMARGI=True)
data.columns = ["-".join(a) for a in data.columns.to_flat_index()]
data.to_csv('../data/heatmap_data.genes_imargi.tsv', sep="\t")

  df_first = df_first.merge(df_second, how="outer", left_on='lncRNAName', right_on='lncRNAName')
  df_first = df_first.merge(df_second, how="outer", left_on='lncRNAName', right_on='lncRNAName')
  df_first = df_first.merge(df_second, how="outer", left_on='lncRNAName', right_on='lncRNAName')
  df_first = df_first.merge(df_second, how="outer", left_on='lncRNAName', right_on='lncRNAName')
  b = b.drop(['geneName'], axis=1)


In [17]:
data = getTableForAllMarks(annotation_prefix="fantom_aso").sort_index()
data.columns = ["-".join(a) for a in data.columns.to_flat_index()]
data.to_csv('../data/heatmap_data.asos.tsv', sep="\t")

  df_first = df_first.merge(df_second, how="outer", left_on='lncRNAName', right_on='lncRNAName')
  df_first = df_first.merge(df_second, how="outer", left_on='lncRNAName', right_on='lncRNAName')
  df_first = df_first.merge(df_second, how="outer", left_on='lncRNAName', right_on='lncRNAName')
  df_first = df_first.merge(df_second, how="outer", left_on='lncRNAName', right_on='lncRNAName')


In [18]:
data = getTableForAllMarks(iMARGI=True, annotation_prefix="fantom_aso").sort_index()
data.columns = ["-".join(a) for a in data.columns.to_flat_index()]
data.to_csv('../data/heatmap_data.asos_imargi.tsv', sep="\t")

  df_first = df_first.merge(df_second, how="outer", left_on='lncRNAName', right_on='lncRNAName')
  df_first = df_first.merge(df_second, how="outer", left_on='lncRNAName', right_on='lncRNAName')
  df_first = df_first.merge(df_second, how="outer", left_on='lncRNAName', right_on='lncRNAName')
  df_first = df_first.merge(df_second, how="outer", left_on='lncRNAName', right_on='lncRNAName')
