In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


import seaborn as sns

import os, sys, shutil, importlib, glob
from tqdm.notebook import tqdm

In [2]:
from celloracle import motif_analysis as ma
from celloracle.utility import save_as_pickled_object

In [3]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

plt.rcParams['figure.figsize'] = (15,7)
plt.rcParams["savefig.dpi"] = 600

In [4]:
# Working directory that holds the Cicero output files read and written below.
# Set the CICERO_OUTPUT_DIR environment variable to run the notebook on another
# machine; the original cluster path is kept as the default.
cicero_output_dir = os.environ.get(
    "CICERO_OUTPUT_DIR", '/fast/AG_Ohler/CheWei/Root_scATAC/cicero_output'
)
os.chdir(cicero_output_dir)

In [5]:
# PLEASE make sure that you are setting the correct reference genome.
# This name is reused by the peak format check and by the TFinfo object below.
ref_genome = "TAIR10"

# Ask celloracle whether the genome is installed and print the answer.
genome_installation = ma.is_genome_installed(ref_genome=ref_genome)
print(ref_genome, "installation: ", genome_installation)

TAIR10 installation:  True


In [6]:
import genomepy

In [16]:
# Install the reference genome from Ensembl only when celloracle reports it as
# missing, so the notebook can be run top to bottom without a needless install
# step. Uses `ref_genome` rather than repeating the literal genome name.
if not ma.is_genome_installed(ref_genome=ref_genome):
    genomepy.install_genome(ref_genome, "Ensembl")

In [7]:
# Read the annotated peak table; the first CSV column is used as the row index.
peak_file = "processed_peak_file.csv"
peaks = pd.read_csv(peak_file, index_col=0)
peaks.head()

Unnamed: 0,peak_id,gene_short_name
0,1_10009064_10010347,AT1G28470
1,1_10011402_10011934,AT1G06163
2,1_10012652_10016219,AT1G06163
3,1_10012652_10016219,AT1G28480
4,1_10012652_10016219,AT1G28490


In [8]:
# Define function for quality check
def decompose_chrstr(peak_str):
    """
    Split a peak identifier into chromosome name, start and end.

    The last two underscore-separated fields are taken as start and end;
    every field before them is rejoined with "_", so chromosome names that
    themselves contain underscores are kept intact.

    Args:
        peak_str (str): peak identifier, e.g. 'chr1_3094484_3095479'

    Returns:
        tuple: chromosome name, start position, end position (all strings)
    """
    fields = peak_str.split("_")
    *chrom_fields, start, end = fields
    return "_".join(chrom_fields), start, end

from genomepy import Genome

def check_peak_foamat(peaks_df, ref_genome):
    """
    Check peak format and drop the peaks that fail the checks.
     (1) Check chromosome name: it must be a sequence name of the reference genome.
     (2) Check peak size (length) and remove short DNAs (<5bp)

    A short summary of the filtering is printed.

    Args:
        peaks_df (pandas.DataFrame): peak table with a "peak_id" column holding
            strings such as 'chr1_3094484_3095479'. It is not modified.
        ref_genome (str): genome name handed to genomepy's ``Genome``.

    Returns:
        pandas.DataFrame: copy of the rows of ``peaks_df`` that passed both
        checks, keeping their original index labels.
    """
    n_threshold = 5  # minimum accepted peak length in bp
    n_peaks_before = peaks_df.shape[0]

    # Decompose peak ids into chromosome / start / end columns.
    # Passing `columns=` directly keeps this working for an empty peak table.
    decomposed = [decompose_chrstr(peak_str) for peak_str in peaks_df["peak_id"]]
    df_decomposed = pd.DataFrame(decomposed, columns=["chr", "start", "end"])
    # `np.int` was removed in NumPy 1.24; the builtin `int` is its replacement.
    df_decomposed["start"] = df_decomposed["start"].astype(int)
    df_decomposed["end"] = df_decomposed["end"].astype(int)

    # Load genome data
    genome_data = Genome(ref_genome)
    all_chr_list = list(genome_data.keys())

    # Per-row validity flags. They are converted to plain arrays so that the
    # filtering below is positional and does not depend on the index labels
    # of peaks_df (df_decomposed always has a fresh 0..n-1 index).
    lengths = np.abs(df_decomposed["end"] - df_decomposed["start"])
    has_valid_length = (lengths >= n_threshold).to_numpy()
    has_valid_chr = df_decomposed["chr"].isin(all_chr_list).to_numpy()

    # Keep peaks that are long enough and lie on a known chromosome.
    df = peaks_df[has_valid_length & has_valid_chr].copy()

    # Data counting
    n_invalid_length = int((~has_valid_length).sum())
    n_peaks_invalid_chr = int((~has_valid_chr).sum())
    n_peaks_after = df.shape[0]

    # Report
    print("Peaks before filtering: ", n_peaks_before)
    print("Peaks with invalid chr_name: ", n_peaks_invalid_chr)
    print("Peaks with invalid length: ", n_invalid_length)
    print("Peaks after filtering: ", n_peaks_after)

    return df

In [9]:
# Drop peaks that fail the chromosome-name or minimum-length check; the
# function prints how many peaks were removed for each reason.
peaks = check_peak_foamat(peaks, ref_genome)

Peaks before filtering:  47596
Peaks with invalid chr_name:  0
Peaks with invalid length:  7
Peaks after filtering:  47589


In [10]:
# Build the TFinfo object from the filtered peaks and the chosen reference genome
tfi = ma.TFinfo(peak_data_frame=peaks, ref_genome=ref_genome)

In [11]:
# Display the reference genome name currently in use.
ref_genome

'TAIR10'

In [12]:
%%time
# Scan motifs. !!CAUTION!! This step may take several hours if you have many peaks!
tfi.scan(fpr=0.02,
         motifs=None,  # If you enter None, default motifs will be loaded.
         verbose=True)

# Save tfinfo object
tfi.to_hdf5(file_path="Arabidopsis.celloracle.tfinfo")

No motif data entered. Loading default motifs for your species ...
 Default motif for Arabidopsis: CisBP_ver2_Arabidopsis_thaliana.pfm. 
 For more information, please see celloracle documentation. 

Initiating scanner... 

Calculating FPR-based threshold. This step may take substantial time when you load a new ref-genome. It will be done quicker on the second time. 

Convert peak info into DNA sequences ... 

Scanning motifs ... It may take several hours if you proccess many peaks. 



0it [00:00, ?it/s]

CPU times: user 1h 30min 39s, sys: 45.2 s, total: 1h 31min 24s
Wall time: 1h 32min 13s


In [13]:
# Check motif scan results: preview the first rows of the scanned table.
tfi.scanned_df.head()

Unnamed: 0,seqname,motif_id,factors_direct,factors_indirect,score,pos,strand
0,1_10009064_10010347,M00855_2.00,AT1G42990,,5.412471,973,1
1,1_10009064_10010347,M00859_2.00,AT4G35610,,7.832737,170,1
2,1_10009064_10010347,M00862_2.00,AT5G65590,"AT2G28510, AT3G61850, AT4G21050, AT1G21340, AT...",4.937159,198,1
3,1_10009064_10010347,M00866_2.00,AT4G35550,"AT3G18010, AT2G28610, AT1G20710, AT1G20700, AT...",9.886514,559,-1
4,1_10009064_10010347,M00869_2.00,AT5G62020,"AT3G51910, AT1G32330, AT4G36990, AT1G67970, AT...",5.94414,25,1


In [14]:
## Motif filtering
# Clear any filtering applied earlier
tfi.reset_filtering()

# Filter the motif hits by score with the threshold below
score_threshold = 10
tfi.filter_motifs_by_score(threshold=score_threshold)

# Post-filtering step: convert the results into dataframe and dictionary formats.
tfi.make_TFinfo_dataframe_and_dictionary(verbose=True)

Filtering finished: 10879522 -> 3146247
1. Converting scanned results into one-hot encoded dataframe.


  0%|          | 0/17257 [00:00<?, ?it/s]

2. Converting results into dictionaries.


  0%|          | 0/27434 [00:00<?, ?it/s]

  0%|          | 0/1065 [00:00<?, ?it/s]

In [15]:
# Export the TFinfo result as a dataframe and preview its first rows.
df = tfi.to_dataframe()
df.head()

Unnamed: 0,peak_id,gene_short_name,AT1G01010,AT1G01060,AT1G01250,AT1G01260,AT1G01520,AT1G01720,AT1G02065,AT1G02230,...,AT5G66730,AT5G66870,AT5G66940,AT5G67000,AT5G67110,AT5G67180,AT5G67190,AT5G67300,AT5G67450,AT5G67580
0,1_10009064_10010347,AT1G28470,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1_10011402_10011934,AT1G06163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1_10012652_10016219,AT1G06163,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,1_10012652_10016219,AT1G28480,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,1_10012652_10016219,AT1G28490,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [16]:
# Export the result as a dataframe and store it in parquet format
base_grn_path = "base_GRN_dataframe.parquet"
df = tfi.to_dataframe()
df.to_parquet(base_grn_path)

# Optional: the result can also be saved as a dictionary as follows
# (`folder` has to be defined before these lines are enabled).
#td = tfi.to_dictionary(dictionary_type="targetgene2TFs")
#save_as_pickled_object(td, os.path.join(folder, "TFinfo_targetgene2TFs.pickled"))

In [17]:
# Also write the same dataframe as a CSV file.
df.to_csv("scATAC_base_GRN.csv")