In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns


import os, sys, shutil, importlib, glob
from tqdm.notebook import tqdm

from celloracle import motif_analysis as ma

In [2]:
# Record the pandas version this notebook was run with (for reproducibility).
print(pd.__version__)

1.1.5


In [3]:
# Record the numpy version this notebook was run with (for reproducibility).
print(np.__version__)

1.19.5


In [4]:
%config InlineBackend.figure_format = 'retina'

plt.rcParams['figure.figsize'] = [6, 4.5]
plt.rcParams["savefig.dpi"] = 300

In [5]:
# Working directory for every relative path used in this notebook
# (all_peaks.csv, cicero_connections.csv, processed_peak_file.csv).
# Defaults to the original cluster location; set the CICERO_OUTPUT_DIR
# environment variable to run the notebook on another machine without
# editing this cell.
CICERO_OUTPUT_DIR = os.environ.get(
    "CICERO_OUTPUT_DIR",
    '/fast/AG_Ohler/CheWei/Root_scATAC/cicero_output',
)
os.chdir(CICERO_OUTPUT_DIR)

In [6]:
# Load the scATAC-seq peak list: read the CSV (first column is the row index)
# and keep only the values of its "x" column as an array.
peaks = pd.read_csv("./all_peaks.csv", index_col=0)["x"].values
peaks

array(['1_335_7056', '1_8406_9951', '1_12063_12890', ...,
       '5_26956798_26956907', '5_26957455_26961494',
       '5_26962568_26975462'], dtype=object)

In [7]:
# Load cicero coaccess score.
# The first CSV column is used as the row index (index_col=0); the last line
# previews the first rows of the table.
cicero_connections = pd.read_csv("cicero_connections.csv", index_col=0)
cicero_connections.head()

  mask |= (ar1 == a)


Unnamed: 0,Peak1,Peak2,coaccess
1,1_10009064_10010347,1_9750080_9750307,0.07966
2,1_10009064_10010347,1_9751793_9754234,0.058712
3,1_10009064_10010347,1_9756523_9756940,0.196817
4,1_10009064_10010347,1_9757841_9758508,0.000257
5,1_10009064_10010347,1_9761163_9761218,-0.01287


In [8]:
## Make TSS annotation
# Display the reference genomes listed by celloracle's motif_analysis module.
# NOTE(review): presumably the ref_genome passed to get_tss_info must be one of these names — confirm.
ma.SUPPORTED_REF_GENOME

{'Human': ['hg38', 'hg19'],
 'Mouse': ['mm10', 'mm9'],
 'S.cerevisiae': ['sacCer2', 'sacCer3'],
 'Zebrafish': ['danRer7', 'danRer10', 'danRer11'],
 'Xenopus': ['xenTro2', 'xenTro3'],
 'Rat': ['rn4', 'rn5', 'rn6'],
 'Drosophila': ['dm3', 'dm6'],
 'C.elegans': ['ce6', 'ce10'],
 'Arabidopsis': ['TAIR10'],
 'Chicken': ['galGal4', 'galGal5', 'galGal6']}

In [9]:
# !! Set the reference genome here.
ref_genome = "TAIR10"

# Build the TSS annotation table for the loaded peak list.
tss_annotated = ma.get_tss_info(
    peak_str_list=peaks,
    ref_genome=ref_genome,
)

# Check results: preview the last rows of the annotation table.
tss_annotated.tail()

que bed peaks: 25377
tss peaks in que: 36153


Unnamed: 0,chr,start,end,gene_short_name,strand
36148,5,26962568,26975462,AT5G67620,-
36149,5,26962568,26975462,AT5G67630,-
36150,5,26962568,26975462,AT5G09955,+
36151,5,26962568,26975462,AT5G67640,-
36152,5,26962568,26975462,AT5G09965,-


In [10]:
# Store the chromosome labels as strings (overwrites the 'chr' column in place).
# NOTE(review): presumably required because the chromosome names here are bare
# numbers and would otherwise not match the string peak ids downstream — confirm.
tss_annotated['chr'] = tss_annotated['chr'].astype(str)

In [11]:
## Integrate TSS info into cicero connection
integrated = ma.integrate_tss_peak_with_cicero(
    cicero_connections=cicero_connections,
    tss_peak=tss_annotated,
)

# Report the table size, then preview the first rows.
print(integrated.shape)
integrated.head()

(2359340, 3)


Unnamed: 0,peak_id,gene_short_name,coaccess
0,1_10009064_10010347,AT1G06127,0.039057
1,1_10009064_10010347,AT1G06133,0.312275
2,1_10009064_10010347,AT1G06153,0.085787
3,1_10009064_10010347,AT1G06163,0.35977
4,1_10009064_10010347,AT1G06173,0.224941


In [12]:
## Filter peaks
# Keep only rows whose coaccess score is at least 0.8, retain just the
# peak id and gene name columns, and renumber the rows from 0.
peak = (
    integrated
    .loc[integrated["coaccess"] >= 0.8, ["peak_id", "gene_short_name"]]
    .reset_index(drop=True)
)

In [13]:
# Sanity check of the filtered table: print its (rows, columns) shape,
# then preview the first rows.
print(peak.shape)
peak.head()

(47596, 2)


Unnamed: 0,peak_id,gene_short_name
0,1_10009064_10010347,AT1G28470
1,1_10011402_10011934,AT1G06163
2,1_10012652_10016219,AT1G06163
3,1_10012652_10016219,AT1G28480
4,1_10012652_10016219,AT1G28490


In [14]:
# Save the filtered peak/gene table as CSV in the current working directory.
# to_csv's default index=True also writes the row index as the first column.
peak.to_csv("processed_peak_file.csv")