# Gene selection for GRN inference

02/12/2022

Here we select the set of genes for the GRN inference in the main text.

We use the Smart-seq2, 10x and SCAN-seq2 (9CL) data for HepG2, and the CEL-seq, STORM-seq (1M reads), Smart-seq3, SCAN-seq2 (9CL) and SCAN-seq2 (UMI200) data for K562.

For each dataset, we consider only the genes present in the FASTA file of the canonical isoforms used for the annotation of the eCLIP data.

We select the following sets of genes:

- HVTFs + top 500/1000 HVGs
- eCLIP RBPs + top 500/1000 HVGs

In [None]:
%matplotlib inline

In [None]:
import scanpy as sc
import anndata as ad
import numpy as np
import pandas as pd
import os

In [None]:
# Root folder of the analysis and location of the pre-processed .h5ad files.
input_folder='./'
proc_folder=input_folder+'processed_data/'

# Output folder for the per-dataset exports written by the "Save the data"
# cells, which reference `input_folder2` without it being defined anywhere.
# NOTE(review): the location is inferred from the ARACNe_INPUT and gene_names
# output folders defined further down, which both live under
# GENE_SELECTION_MAIN/ -- confirm this is the intended destination.
input_folder2=input_folder+'GENE_SELECTION_MAIN/'
os.makedirs(input_folder2, exist_ok=True)

## Load the fasta file with the canonical isoforms

In [None]:
from Bio import SeqIO

# Gene names and gene IDs of the canonical isoforms, in FASTA record order.
gname=[]
gid=[]
# The legacy "rU" mode is rejected by open() on Python >= 3.11; plain text
# mode already uses universal newlines. The context manager closes the file
# once parsing is done.
with open("/Users/jonathan/Desktop/IIT/INTERACTomics/ENCODE_eCLIP_DATA/transcriptomes/hsapiens_gene_ensembl_107_canonical_new.fa", "r") as f_open:
    for rec in SeqIO.parse(f_open, "fasta"):
        myid = rec.id
        # The record ID is '|'-separated: field 0 goes to `gid`, field 4 to `gname`.
        fields = myid.split('|')
        gname.append(fields[4])
        gid.append(fields[0])

## Loading scRNA-seq pre-processed data

In [None]:
# Pre-processed datasets. Objects with the `_ARACNe` suffix are loaded from
# the matching `*_ARACNe.h5ad` file and are the ones exported as ARACNe input.
HepG2_SMART=ad.read_h5ad(proc_folder+'processed_HepG2_Smartseq2.h5ad')
HepG2_10x=ad.read_h5ad(proc_folder+'processed_HepG2_10x.h5ad')

HepG2_9CL_SCAN=ad.read_h5ad(proc_folder+'processed_HepG2_9CL_SCANseq2.h5ad')
HepG2_9CL_SCAN_ARACNe=ad.read_h5ad(proc_folder+'processed_HepG2_9CL_SCANseq2_ARACNe.h5ad')


K562_CEL=ad.read_h5ad(proc_folder+'processed_K562_CELseq.h5ad')
K562_CEL_ARACNe=ad.read_h5ad(proc_folder+'processed_K562_CELseq_ARACNe.h5ad')

K562_STORM=ad.read_h5ad(proc_folder+'processed_K562_STORMseq1M.h5ad')

K562_SMART3=ad.read_h5ad(proc_folder+'processed_K562_Smartseq3.h5ad')
# Fixed copy-paste slip: this used to load 'processed_K562_Smartseq3.h5ad',
# i.e. the same file as K562_SMART3, unlike every other *_ARACNe object.
# NOTE(review): confirm that 'processed_K562_Smartseq3_ARACNe.h5ad' is the
# file produced by the pre-processing step.
K562_SMART3_ARACNe=ad.read_h5ad(proc_folder+'processed_K562_Smartseq3_ARACNe.h5ad')

K562_9CL_SCAN=ad.read_h5ad(proc_folder+'processed_K562_9CL_SCANseq2.h5ad')
K562_9CL_SCAN_ARACNe=ad.read_h5ad(proc_folder+'processed_K562_9CL_SCANseq2_ARACNe.h5ad')

K562_UMI200_SCAN=ad.read_h5ad(proc_folder+'processed_K562_UMI200_SCANseq2.h5ad')
K562_UMI200_SCAN_ARACNe=ad.read_h5ad(proc_folder+'processed_K562_UMI200_SCANseq2_ARACNe.h5ad')

In [None]:
# Consider only genes present in the fasta file
def Genes_in_fasta(adata,gnames):
    """Restrict ``adata`` to the genes whose name appears in ``gnames``.

    The object is printed before and after filtering so the change in the
    number of genes is visible in the notebook output.

    Parameters
    ----------
    adata
        Annotated data matrix; ``adata.var_names`` holds the gene names.
    gnames
        Iterable of gene names to keep.

    Returns
    -------
    A copy of ``adata`` containing only the matching genes, in their
    original column order.
    """
    print(adata)
    # A boolean mask keeps the genes in their original order. Going through
    # list(set(...)) made the column order depend on the string hash seed,
    # i.e. it changed from one Python session to the next.
    keep=adata.var_names.isin(set(gnames))
    adata=adata[:,keep].copy()
    print(adata)
    return adata

In [None]:
# Restrict every dataset to the genes present in the canonical-isoform fasta.
# Each name is rebound to the filtered copy returned by Genes_in_fasta, so the
# unfiltered object is no longer referenced under that name afterwards.

# HepG2 datasets
HepG2_SMART=Genes_in_fasta(HepG2_SMART,gname)
HepG2_10x=Genes_in_fasta(HepG2_10x,gname)
# HepG2_10x_ARACNe=Genes_in_fasta(HepG2_10x_ARACNe,gname)
HepG2_9CL_SCAN=Genes_in_fasta(HepG2_9CL_SCAN,gname)
HepG2_9CL_SCAN_ARACNe=Genes_in_fasta(HepG2_9CL_SCAN_ARACNe,gname)


# K562 datasets
K562_CEL=Genes_in_fasta(K562_CEL,gname)
K562_CEL_ARACNe=Genes_in_fasta(K562_CEL_ARACNe,gname)

K562_STORM=Genes_in_fasta(K562_STORM,gname)
K562_SMART3=Genes_in_fasta(K562_SMART3,gname)
K562_SMART3_ARACNe=Genes_in_fasta(K562_SMART3_ARACNe,gname)

K562_9CL_SCAN=Genes_in_fasta(K562_9CL_SCAN,gname)
K562_9CL_SCAN_ARACNe=Genes_in_fasta(K562_9CL_SCAN_ARACNe,gname)

K562_UMI200_SCAN=Genes_in_fasta(K562_UMI200_SCAN,gname)
K562_UMI200_SCAN_ARACNe=Genes_in_fasta(K562_UMI200_SCAN_ARACNe,gname)

## Load TFs and RBPs

In [None]:
# Transcription factors from Beeline: unique entries of the 'TF' column
TFs=list(set(pd.read_csv('human-tfs.csv')['TF']))
len(TFs)

In [None]:
# RNA-binding proteins, one name per line. ndmin=1 keeps the result
# one-dimensional even when the file holds a single name; otherwise loadtxt
# returns a 0-d array and list() raises TypeError.
RBPs=list(np.loadtxt("eCLIP_RBPs.txt",dtype=str,ndmin=1))

## Gene sets

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def jaccard_similarity(list1, list2):
    """Return the Jaccard index of two collections.

    Both inputs are converted to sets, so duplicates are ignored. The index
    is ``len(intersection) / len(union)``.

    Two empty collections are treated as identical and give ``1.0``;
    previously this case raised ``ZeroDivisionError``.
    """
    set1, set2 = set(list1), set(list2)
    union = len(set1 | set2)
    if union == 0:
        return 1.0
    return len(set1 & set2) / union

def jaccard_heatmap(adatas,protocols,ct):
    """Show the pairwise Jaccard similarity of several gene collections.

    Parameters
    ----------
    adatas
        Sequence of gene collections; each one is converted with ``list()``
        before being compared.
    protocols
        Labels used for the heatmap ticks, paired with ``adatas`` by position.
    ct
        Title of the plot.
    """
    n_sets=len(adatas)
    jaccard=np.zeros((n_sets,n_sets))
    # zip() pairs collections with labels, so the loops stop at the shorter
    # of the two sequences.
    for row,(genes_a,_) in enumerate(zip(adatas,protocols)):
        for col,(genes_b,_) in enumerate(zip(adatas,protocols)):
            jaccard[row,col]=jaccard_similarity(list(genes_a),list(genes_b))

    # Upper triangle (diagonal included) of the similarity matrix; its
    # entries serve as the mask of cells that are not drawn.
    upper=np.triu(jaccard)

    fig,ax=plt.subplots()
    ax.set_title(ct)
    sns.heatmap(jaccard,annot=True,mask=upper,ax=ax,
                xticklabels=protocols,yticklabels=protocols)
    plt.show()
    plt.close()

### Highly variable genes

In [None]:
from gtfparse import read_gtf

# gtf from GENCODE corresponding to ENSEMBL 107
gtf_df = read_gtf("/Users/jonathan/Desktop/IIT/INTERACTomics/ENCODE_eCLIP_DATA/transcriptomes_gencode_V41/gencode.v41.primary_assembly.annotation.gtf")
# Keep only the annotation rows whose gene_type is 'protein_coding'
gtf_df_pc=gtf_df[gtf_df.gene_type=='protein_coding']
# Unique protein-coding gene names
gnames_pc=set(gtf_df_pc.gene_name)
# Separate GENCODE file holding the long non-coding RNA annotation
gtf_df_lnc = read_gtf("/Users/jonathan/Desktop/IIT/INTERACTomics/ENCODE_eCLIP_DATA/transcriptomes_gencode_V41/gencode.v41.long_noncoding_RNAs.gtf")
# Unique gene names found in the lncRNA annotation
gnames_nc=set(gtf_df_lnc.gene_name)
    

def HVgenes_noother(adata,eclip,ct,n):
    """Return the top ``n`` highly variable protein-coding/lncRNA genes.

    The dataset is first restricted to the genes listed in the module-level
    sets ``gnames_pc`` and ``gnames_nc``; highly variable genes are then
    computed on that subset with ``max_mean=10`` and ``n_top_genes=n``.

    Parameters
    ----------
    adata
        Annotated data matrix; it is not modified (a filtered copy is used).
    eclip, ct
        Unused; kept so existing calls keep working.
    n
        Number of highly variable genes to select.

    Returns
    -------
    Index of the selected gene names, ordered by decreasing
    ``dispersions_norm``.
    """
    allowed=set(gnames_pc)|set(gnames_nc)
    # A boolean mask keeps the genes in their original order; going through
    # list(set(...)) made the column order depend on the string hash seed.
    keep=adata.var_names.isin(allowed)
    print('Total nr genes',int(keep.sum()))

    adata_all=adata[:,keep].copy()
    sc.pp.highly_variable_genes(adata_all,max_mean=10,n_top_genes=n)

    # Only the per-gene table is needed from here on, so the selected rows
    # of .var are sorted directly without copying the AnnData a second time.
    hvg_table=adata_all.var[adata_all.var['highly_variable']]
    ordered_HVGs=hvg_table.sort_values('dispersions_norm',ascending=False).index

    return ordered_HVGs

In [None]:
# Top 500 / 1000 highly variable genes for each HepG2 dataset.
# The second argument of HVgenes_noother is not used by the function; the
# `eclip_HepG2` variable previously passed here is never defined in this
# notebook (NameError), so None is passed instead.
HVGs_noother_HepG2_Smartseq2_500=HVgenes_noother(HepG2_SMART,None,'HepG2_Smartseq2',500)
HVGs_noother_HepG2_Smartseq2_1000=HVgenes_noother(HepG2_SMART,None,'HepG2_Smartseq2',1000)

HVGs_noother_HepG2_10x_500=HVgenes_noother(HepG2_10x,None,'HepG2_10x',500)
HVGs_noother_HepG2_10x_1000=HVgenes_noother(HepG2_10x,None,'HepG2_10x',1000)

HVGs_noother_HepG2_9CL_SCAN_500=HVgenes_noother(HepG2_9CL_SCAN,None,'HepG2_9CL_SCAN',500)
HVGs_noother_HepG2_9CL_SCAN_1000=HVgenes_noother(HepG2_9CL_SCAN,None,'HepG2_9CL_SCAN',1000)

In [None]:
# Overlap of the top-500 HVG lists across the HepG2 protocols
jaccard_heatmap(
    adatas=[
        HVGs_noother_HepG2_Smartseq2_500,
        HVGs_noother_HepG2_10x_500,
        HVGs_noother_HepG2_9CL_SCAN_500,
    ],
    protocols=['Smart-seq2','10x', 'SCAN-seq2'],
    ct='HepG2',
)

In [None]:
# Overlap of the top-1000 HVG lists across the HepG2 protocols
jaccard_heatmap(
    adatas=[
        HVGs_noother_HepG2_Smartseq2_1000,
        HVGs_noother_HepG2_10x_1000,
        HVGs_noother_HepG2_9CL_SCAN_1000,
    ],
    protocols=['Smart-seq2','10x','SCAN-seq2'],
    ct='HepG2',
)

In [None]:
# Top 500 / 1000 highly variable genes for each K562 dataset.
# The second argument of HVgenes_noother is not used by the function; the
# `eclip_K562` variable previously passed here is never defined in this
# notebook (NameError), so None is passed instead.
HVGs_noother_K562_CELseq_500=HVgenes_noother(K562_CEL,None,'K562_CELseq',500)
HVGs_noother_K562_CELseq_1000=HVgenes_noother(K562_CEL,None,'K562_CELseq',1000)


HVGs_noother_K562_STORMseq_500=HVgenes_noother(K562_STORM,None,'K562_STORMseq',500)
HVGs_noother_K562_STORMseq_1000=HVgenes_noother(K562_STORM,None,'K562_STORMseq',1000)


HVGs_noother_K562_Smartseq3_500=HVgenes_noother(K562_SMART3,None,'K562_Smartseq3',500)
HVGs_noother_K562_Smartseq3_1000=HVgenes_noother(K562_SMART3,None,'K562_Smartseq3',1000)

HVGs_noother_K562_9CL_SCAN_500=HVgenes_noother(K562_9CL_SCAN,None,'K562_9CL_SCAN',500)
HVGs_noother_K562_9CL_SCAN_1000=HVgenes_noother(K562_9CL_SCAN,None,'K562_9CL_SCAN',1000)

HVGs_noother_K562_UMI200_SCAN_500=HVgenes_noother(K562_UMI200_SCAN,None,'K562_UMI200_SCAN',500)
HVGs_noother_K562_UMI200_SCAN_1000=HVgenes_noother(K562_UMI200_SCAN,None,'K562_UMI200_SCAN',1000)

In [None]:
# Overlap of the top-500 HVG lists across the K562 protocols
jaccard_heatmap(
    adatas=[
        HVGs_noother_K562_CELseq_500,
        HVGs_noother_K562_STORMseq_500,
        HVGs_noother_K562_Smartseq3_500,
        HVGs_noother_K562_9CL_SCAN_500,
        HVGs_noother_K562_UMI200_SCAN_500,
    ],
    protocols=['CEL-seq','STORM-seq','Smart-seq3','9CL_SCAN-seq2','UMI200_SCAN-seq2'],
    ct='K562',
)

In [None]:
# Overlap of the top-1000 HVG lists across the K562 protocols
jaccard_heatmap(
    adatas=[
        HVGs_noother_K562_CELseq_1000,
        HVGs_noother_K562_STORMseq_1000,
        HVGs_noother_K562_Smartseq3_1000,
        HVGs_noother_K562_9CL_SCAN_1000,
        HVGs_noother_K562_UMI200_SCAN_1000,
    ],
    protocols=['CEL-seq','STORM-seq','Smart-seq3','9CL_SCAN-seq2','UMI200_SCAN-seq2'],
    ct='K562',
)

## TFs and RBP selection

In [None]:
def RBP_TF_selection(adata,all_TFs,all_RBPs):
    """Select the RBPs present in a dataset and its highly variable TFs.

    ``sc.pp.highly_variable_genes`` is run on ``adata`` itself with
    ``max_mean=10``, so ``adata.var`` gains the highly-variable annotation.

    Parameters
    ----------
    adata
        Annotated data matrix; ``adata.var_names`` holds the gene names.
    all_TFs
        Iterable of transcription-factor names.
    all_RBPs
        Iterable of RNA-binding-protein names.

    Returns
    -------
    myRBPs
        Sorted list of the RBPs found among all genes of ``adata``.
    myHVTFs
        Sorted list of the TFs found among the highly variable genes.
    """
    sc.pp.highly_variable_genes(adata,max_mean=10)
    # Only the names of the highly variable genes are needed, so they are read
    # from .var instead of copying the whole AnnData subset.
    hv_names=adata.var_names[adata.var['highly_variable'].to_numpy()]

    # sorted() gives a reproducible order; list(set(...)) depended on the
    # string hash seed of the session.
    myRBPs=sorted(set(all_RBPs).intersection(adata.var_names))
    myHVTFs=sorted(set(all_TFs).intersection(hv_names))

    print(len(myRBPs),len(myHVTFs))

    return myRBPs, myHVTFs

In [None]:
# RBPs and highly variable TFs of the HepG2 datasets
(
    (RBP_HepG2_Smartseq2, HVTFs_HepG2_Smartseq2),
    (RBP_HepG2_10x, HVTFs_HepG2_10x),
    (RBP_HepG2_9CL_SCAN, HVTFs_HepG2_9CL_SCAN),
) = [
    RBP_TF_selection(dataset, TFs, RBPs)
    for dataset in (HepG2_SMART, HepG2_10x, HepG2_9CL_SCAN)
]

# RBPs and highly variable TFs of the K562 datasets
(
    (RBP_K562_CELseq, HVTFs_K562_CELseq),
    (RBP_K562_STORMseq, HVTFs_K562_STORMseq),
    (RBP_K562_Smartseq3, HVTFs_K562_Smartseq3),
    (RBP_K562_9CL_SCAN, HVTFs_K562_9CL_SCAN),
    (RBP_K562_UMI200_SCAN, HVTFs_K562_UMI200_SCAN),
) = [
    RBP_TF_selection(dataset, TFs, RBPs)
    for dataset in (K562_CEL, K562_STORM, K562_SMART3, K562_9CL_SCAN, K562_UMI200_SCAN)
]

In [None]:
## Define the two sets of genes (RBPs + HVGs, HVTFs + HVGs) for HepG2 Smart-seq2
# sorted() fixes the gene order; list(set(...)) produced an order that
# depended on the string hash seed of the session.
HepG2_Smartseq2_RBP_RNA_500 = sorted(set(HVGs_noother_HepG2_Smartseq2_500) | set(RBP_HepG2_Smartseq2))
HepG2_Smartseq2_TF_RNA_500 = sorted(set(HVGs_noother_HepG2_Smartseq2_500) | set(HVTFs_HepG2_Smartseq2))

HepG2_Smartseq2_RBP_RNA_1000 = sorted(set(HVGs_noother_HepG2_Smartseq2_1000) | set(RBP_HepG2_Smartseq2))
HepG2_Smartseq2_TF_RNA_1000 = sorted(set(HVGs_noother_HepG2_Smartseq2_1000) | set(HVTFs_HepG2_Smartseq2))

print(len(HepG2_Smartseq2_RBP_RNA_500),
     len(HepG2_Smartseq2_TF_RNA_500))

print(len(HepG2_Smartseq2_RBP_RNA_1000),
     len(HepG2_Smartseq2_TF_RNA_1000))

In [None]:
## Define the two sets of genes (RBPs + HVGs, HVTFs + HVGs) for HepG2 10x
# sorted() fixes the gene order; list(set(...)) produced an order that
# depended on the string hash seed of the session.
HepG2_10x_RBP_RNA_500 = sorted(set(HVGs_noother_HepG2_10x_500) | set(RBP_HepG2_10x))
HepG2_10x_TF_RNA_500 = sorted(set(HVGs_noother_HepG2_10x_500) | set(HVTFs_HepG2_10x))

HepG2_10x_RBP_RNA_1000 = sorted(set(HVGs_noother_HepG2_10x_1000) | set(RBP_HepG2_10x))
HepG2_10x_TF_RNA_1000 = sorted(set(HVGs_noother_HepG2_10x_1000) | set(HVTFs_HepG2_10x))

print(len(HepG2_10x_RBP_RNA_500),
     len(HepG2_10x_TF_RNA_500))

print(len(HepG2_10x_RBP_RNA_1000),
     len(HepG2_10x_TF_RNA_1000))

In [None]:
## Define the two sets of genes (RBPs + HVGs, HVTFs + HVGs) for HepG2 SCAN-seq2 9CL
# sorted() fixes the gene order; list(set(...)) produced an order that
# depended on the string hash seed of the session.
HepG2_9CL_SCAN_RBP_RNA_500 = sorted(set(HVGs_noother_HepG2_9CL_SCAN_500) | set(RBP_HepG2_9CL_SCAN))
HepG2_9CL_SCAN_TF_RNA_500 = sorted(set(HVGs_noother_HepG2_9CL_SCAN_500) | set(HVTFs_HepG2_9CL_SCAN))

HepG2_9CL_SCAN_RBP_RNA_1000 = sorted(set(HVGs_noother_HepG2_9CL_SCAN_1000) | set(RBP_HepG2_9CL_SCAN))
HepG2_9CL_SCAN_TF_RNA_1000 = sorted(set(HVGs_noother_HepG2_9CL_SCAN_1000) | set(HVTFs_HepG2_9CL_SCAN))

print(len(HepG2_9CL_SCAN_RBP_RNA_500),
     len(HepG2_9CL_SCAN_TF_RNA_500))

print(len(HepG2_9CL_SCAN_RBP_RNA_1000),
     len(HepG2_9CL_SCAN_TF_RNA_1000))

In [None]:
## Define the two sets of genes (RBPs + HVGs, HVTFs + HVGs) for K562 CEL-seq
# sorted() fixes the gene order; list(set(...)) produced an order that
# depended on the string hash seed of the session.
K562_CELseq_RBP_RNA_500 = sorted(set(HVGs_noother_K562_CELseq_500) | set(RBP_K562_CELseq))
K562_CELseq_TF_RNA_500 = sorted(set(HVGs_noother_K562_CELseq_500) | set(HVTFs_K562_CELseq))

K562_CELseq_RBP_RNA_1000 = sorted(set(HVGs_noother_K562_CELseq_1000) | set(RBP_K562_CELseq))
K562_CELseq_TF_RNA_1000 = sorted(set(HVGs_noother_K562_CELseq_1000) | set(HVTFs_K562_CELseq))

print(len(K562_CELseq_RBP_RNA_500),
     len(K562_CELseq_TF_RNA_500))

print(len(K562_CELseq_RBP_RNA_1000),
     len(K562_CELseq_TF_RNA_1000))

In [None]:
## Define the two sets of genes (RBPs + HVGs, HVTFs + HVGs) for K562 STORM-seq
# sorted() fixes the gene order; list(set(...)) produced an order that
# depended on the string hash seed of the session.
K562_STORMseq_RBP_RNA_500 = sorted(set(HVGs_noother_K562_STORMseq_500) | set(RBP_K562_STORMseq))
K562_STORMseq_TF_RNA_500 = sorted(set(HVGs_noother_K562_STORMseq_500) | set(HVTFs_K562_STORMseq))

K562_STORMseq_RBP_RNA_1000 = sorted(set(HVGs_noother_K562_STORMseq_1000) | set(RBP_K562_STORMseq))
K562_STORMseq_TF_RNA_1000 = sorted(set(HVGs_noother_K562_STORMseq_1000) | set(HVTFs_K562_STORMseq))

print(len(K562_STORMseq_RBP_RNA_500),
     len(K562_STORMseq_TF_RNA_500))

print(len(K562_STORMseq_RBP_RNA_1000),
     len(K562_STORMseq_TF_RNA_1000))

In [None]:
## Define the two sets of genes (RBPs + HVGs, HVTFs + HVGs) for K562 Smart-seq3
# sorted() fixes the gene order; list(set(...)) produced an order that
# depended on the string hash seed of the session.
K562_Smartseq3_RBP_RNA_500 = sorted(set(HVGs_noother_K562_Smartseq3_500) | set(RBP_K562_Smartseq3))
K562_Smartseq3_TF_RNA_500 = sorted(set(HVGs_noother_K562_Smartseq3_500) | set(HVTFs_K562_Smartseq3))

K562_Smartseq3_RBP_RNA_1000 = sorted(set(HVGs_noother_K562_Smartseq3_1000) | set(RBP_K562_Smartseq3))
K562_Smartseq3_TF_RNA_1000 = sorted(set(HVGs_noother_K562_Smartseq3_1000) | set(HVTFs_K562_Smartseq3))

print(len(K562_Smartseq3_RBP_RNA_500),
     len(K562_Smartseq3_TF_RNA_500))

print(len(K562_Smartseq3_RBP_RNA_1000),
     len(K562_Smartseq3_TF_RNA_1000))

In [None]:
## Define the two sets of genes (RBPs + HVGs, HVTFs + HVGs) for K562 SCAN-seq2 9CL
# sorted() fixes the gene order; list(set(...)) produced an order that
# depended on the string hash seed of the session.
K562_9CL_SCAN_RBP_RNA_500 = sorted(set(HVGs_noother_K562_9CL_SCAN_500) | set(RBP_K562_9CL_SCAN))
K562_9CL_SCAN_TF_RNA_500 = sorted(set(HVGs_noother_K562_9CL_SCAN_500) | set(HVTFs_K562_9CL_SCAN))

K562_9CL_SCAN_RBP_RNA_1000 = sorted(set(HVGs_noother_K562_9CL_SCAN_1000) | set(RBP_K562_9CL_SCAN))
K562_9CL_SCAN_TF_RNA_1000 = sorted(set(HVGs_noother_K562_9CL_SCAN_1000) | set(HVTFs_K562_9CL_SCAN))

print(len(K562_9CL_SCAN_RBP_RNA_500),
     len(K562_9CL_SCAN_TF_RNA_500))

print(len(K562_9CL_SCAN_RBP_RNA_1000),
     len(K562_9CL_SCAN_TF_RNA_1000))

In [None]:
## Define the two sets of genes (RBPs + HVGs, HVTFs + HVGs) for K562 SCAN-seq2 UMI200
# sorted() fixes the gene order; list(set(...)) produced an order that
# depended on the string hash seed of the session.
K562_UMI200_SCAN_RBP_RNA_500 = sorted(set(HVGs_noother_K562_UMI200_SCAN_500) | set(RBP_K562_UMI200_SCAN))
K562_UMI200_SCAN_TF_RNA_500 = sorted(set(HVGs_noother_K562_UMI200_SCAN_500) | set(HVTFs_K562_UMI200_SCAN))

K562_UMI200_SCAN_RBP_RNA_1000 = sorted(set(HVGs_noother_K562_UMI200_SCAN_1000) | set(RBP_K562_UMI200_SCAN))
K562_UMI200_SCAN_TF_RNA_1000 = sorted(set(HVGs_noother_K562_UMI200_SCAN_1000) | set(HVTFs_K562_UMI200_SCAN))

print(len(K562_UMI200_SCAN_RBP_RNA_500),
     len(K562_UMI200_SCAN_TF_RNA_500))

print(len(K562_UMI200_SCAN_RBP_RNA_1000),
     len(K562_UMI200_SCAN_TF_RNA_1000))

# Save the data

In [None]:
def SaveData(folder, adata, geneset, label1, label2):
    """Write the normalized and raw expression of ``geneset`` to CSV files.

    Two files are written, with genes as rows and observations as columns:
    ``folder + label1 + 'NormalizedData_' + label2 + '.csv'`` from ``adata.X``
    and ``folder + label1 + 'RawData_' + label2 + '.csv'`` from
    ``adata.raw.X``.

    Parameters
    ----------
    folder
        Path prefix of the output files; it is concatenated as is, so it
        should end with a path separator.
    adata
        Annotated data matrix providing both ``.X`` and ``.raw``.
    geneset
        Gene names to export.
    label1, label2
        Dataset and gene-set labels used to build the file names.
    """
    def _dense(matrix):
        # Sparse matrices expose toarray(); dense arrays are passed through.
        return matrix.toarray() if hasattr(matrix, 'toarray') else matrix

    # Slice once instead of re-indexing the object for every attribute.
    subset=adata[:,geneset]
    raw_subset=adata.raw[:,geneset]

    # Save the normalized data in a csv file
    tmp_df=pd.DataFrame(data=_dense(subset.X).T, index=subset.var_names,
                        columns=subset.obs_names)
    tmp_df.to_csv(folder+label1+'NormalizedData_'+label2+'.csv')

    # Save the raw data in a csv file (columns reuse the observation names of
    # the normalized subset)
    tmp_df=pd.DataFrame(data=_dense(raw_subset.X).T, index=raw_subset.var_names,
                        columns=subset.obs_names)
    tmp_df.to_csv(folder+label1+'RawData_'+label2+'.csv')

In [None]:
# Export the normalized and raw matrices of every HepG2 dataset / gene set.
# Each entry: (dataset label, AnnData object, {gene-set label: gene list}).
hepg2_exports = [
    ('HepG2_Smartseq2', HepG2_SMART, {
        'TF_RNA500': HepG2_Smartseq2_TF_RNA_500,
        'RBP_RNA500': HepG2_Smartseq2_RBP_RNA_500,
        'TF_RNA1000': HepG2_Smartseq2_TF_RNA_1000,
        'RBP_RNA1000': HepG2_Smartseq2_RBP_RNA_1000,
    }),
    ('HepG2_10x', HepG2_10x, {
        'TF_RNA500': HepG2_10x_TF_RNA_500,
        'RBP_RNA500': HepG2_10x_RBP_RNA_500,
        'TF_RNA1000': HepG2_10x_TF_RNA_1000,
        'RBP_RNA1000': HepG2_10x_RBP_RNA_1000,
    }),
    ('HepG2_9CL_SCAN', HepG2_9CL_SCAN, {
        'TF_RNA500': HepG2_9CL_SCAN_TF_RNA_500,
        'RBP_RNA500': HepG2_9CL_SCAN_RBP_RNA_500,
        'TF_RNA1000': HepG2_9CL_SCAN_TF_RNA_1000,
        'RBP_RNA1000': HepG2_9CL_SCAN_RBP_RNA_1000,
    }),
]

for dataset_label, dataset, gene_sets in hepg2_exports:
    for set_label, genes in gene_sets.items():
        out_dir = input_folder2+dataset_label+'_'+set_label
        # exist_ok=True lets the cell be re-run; os.mkdir raised
        # FileExistsError when the folder was already there.
        os.makedirs(out_dir, exist_ok=True)
        SaveData(out_dir+'/', dataset, genes, dataset_label, set_label)

In [None]:
# Export the normalized and raw matrices of every K562 dataset / gene set.
# Each entry: (dataset label, AnnData object, {gene-set label: gene list}).
k562_exports = [
    ('K562_CELseq', K562_CEL, {
        'TF_RNA500': K562_CELseq_TF_RNA_500,
        'RBP_RNA500': K562_CELseq_RBP_RNA_500,
        'TF_RNA1000': K562_CELseq_TF_RNA_1000,
        'RBP_RNA1000': K562_CELseq_RBP_RNA_1000,
    }),
    ('K562_STORMseq', K562_STORM, {
        'TF_RNA500': K562_STORMseq_TF_RNA_500,
        'RBP_RNA500': K562_STORMseq_RBP_RNA_500,
        'TF_RNA1000': K562_STORMseq_TF_RNA_1000,
        'RBP_RNA1000': K562_STORMseq_RBP_RNA_1000,
    }),
    ('K562_Smartseq3', K562_SMART3, {
        'TF_RNA500': K562_Smartseq3_TF_RNA_500,
        'RBP_RNA500': K562_Smartseq3_RBP_RNA_500,
        'TF_RNA1000': K562_Smartseq3_TF_RNA_1000,
        'RBP_RNA1000': K562_Smartseq3_RBP_RNA_1000,
    }),
    ('K562_9CL_SCAN', K562_9CL_SCAN, {
        'TF_RNA500': K562_9CL_SCAN_TF_RNA_500,
        'RBP_RNA500': K562_9CL_SCAN_RBP_RNA_500,
        'TF_RNA1000': K562_9CL_SCAN_TF_RNA_1000,
        'RBP_RNA1000': K562_9CL_SCAN_RBP_RNA_1000,
    }),
    ('K562_UMI200_SCAN', K562_UMI200_SCAN, {
        'TF_RNA500': K562_UMI200_SCAN_TF_RNA_500,
        'RBP_RNA500': K562_UMI200_SCAN_RBP_RNA_500,
        'TF_RNA1000': K562_UMI200_SCAN_TF_RNA_1000,
        'RBP_RNA1000': K562_UMI200_SCAN_RBP_RNA_1000,
    }),
]

# These two folders were created by the original cell although nothing is
# written into them here; they are still created so the output tree is
# unchanged.
for legacy_dir in ('K562_9CLMix_SCAN_TF_RNA500', 'K562_9CLMix_SCAN_RBP_RNA500'):
    os.makedirs(input_folder2+legacy_dir, exist_ok=True)

for dataset_label, dataset, gene_sets in k562_exports:
    for set_label, genes in gene_sets.items():
        out_dir = input_folder2+dataset_label+'_'+set_label
        # exist_ok=True lets the cell be re-run; os.mkdir raised
        # FileExistsError when the folder was already there.
        os.makedirs(out_dir, exist_ok=True)
        SaveData(out_dir+'/', dataset, genes, dataset_label, set_label)

In [None]:
# Folder receiving the .h5ad files used as ARACNe input.
input_folder3=input_folder+'GENE_SELECTION_MAIN/ARACNe_INPUT/'

# makedirs also creates the parent GENE_SELECTION_MAIN/ folder when it is
# missing (os.mkdir would fail) and does nothing if the folder already exists.
os.makedirs(input_folder3, exist_ok=True)

In [None]:
def SaveDataforARACNe(folder, adata, geneset, label1, label2):
    """Write the expression of ``geneset`` to an .h5ad file for ARACNe.

    A fresh AnnData is built from the ``X`` matrix of ``adata[:, geneset]``
    with the same observation and gene names, plus a ``batch`` column in
    ``.obs`` set to ``label1``. The output path is
    ``folder + 'processed_' + label1 + '_' + label2 + '.h5ad'``; ``folder``
    is concatenated as is, so without a trailing separator it acts as a
    file-name prefix.
    """
    selected=adata[:,geneset]
    out_path=folder+'processed_'+label1+'_'+label2+'.h5ad'

    adata_temp=ad.AnnData(X=selected.X)
    adata_temp.obs_names=selected.obs_names
    adata_temp.var_names=selected.var_names
    adata_temp.obs['batch']=label1
    adata_temp.write(out_path)

For the HepG2 10x dataset, we select metacells prior to GRN inference with ARACNe, so it is not exported in the cells below. See the details and code in the Inference folder.

In [None]:
# ARACNe input for the HepG2 datasets.
# Each entry: (dataset label, data source, {gene-set label: gene list}).
# Smart-seq2 is exported from the .raw slot of the processed object; the
# SCAN-seq2 9CL data come from the dedicated *_ARACNe object.
aracne_hepg2 = [
    ('HepG2_Smartseq2', HepG2_SMART.raw, {
        'TF_RNA500': HepG2_Smartseq2_TF_RNA_500,
        'RBP_RNA500': HepG2_Smartseq2_RBP_RNA_500,
        'TF_RNA1000': HepG2_Smartseq2_TF_RNA_1000,
        'RBP_RNA1000': HepG2_Smartseq2_RBP_RNA_1000,
    }),
    ('HepG2_9CL_SCAN', HepG2_9CL_SCAN_ARACNe, {
        'TF_RNA500': HepG2_9CL_SCAN_TF_RNA_500,
        'RBP_RNA500': HepG2_9CL_SCAN_RBP_RNA_500,
        'TF_RNA1000': HepG2_9CL_SCAN_TF_RNA_1000,
        'RBP_RNA1000': HepG2_9CL_SCAN_RBP_RNA_1000,
    }),
]

for dataset_label, source, gene_sets in aracne_hepg2:
    for set_label, genes in gene_sets.items():
        SaveDataforARACNe(input_folder3+dataset_label+'_'+set_label, source, genes,
                          dataset_label, set_label)

In [None]:
# ARACNe input for the K562 datasets.
# Each entry: (dataset label, data source, {gene-set label: gene list}).
# STORM-seq is exported from the .raw slot of the processed object; the other
# datasets come from their dedicated *_ARACNe object.
aracne_k562 = [
    ('K562_CELseq', K562_CEL_ARACNe, {
        'TF_RNA500': K562_CELseq_TF_RNA_500,
        'RBP_RNA500': K562_CELseq_RBP_RNA_500,
        'TF_RNA1000': K562_CELseq_TF_RNA_1000,
        'RBP_RNA1000': K562_CELseq_RBP_RNA_1000,
    }),
    ('K562_STORMseq', K562_STORM.raw, {
        'TF_RNA500': K562_STORMseq_TF_RNA_500,
        'RBP_RNA500': K562_STORMseq_RBP_RNA_500,
        'TF_RNA1000': K562_STORMseq_TF_RNA_1000,
        'RBP_RNA1000': K562_STORMseq_RBP_RNA_1000,
    }),
    ('K562_Smartseq3', K562_SMART3_ARACNe, {
        'TF_RNA500': K562_Smartseq3_TF_RNA_500,
        'RBP_RNA500': K562_Smartseq3_RBP_RNA_500,
        'TF_RNA1000': K562_Smartseq3_TF_RNA_1000,
        'RBP_RNA1000': K562_Smartseq3_RBP_RNA_1000,
    }),
    ('K562_9CL_SCAN', K562_9CL_SCAN_ARACNe, {
        'TF_RNA500': K562_9CL_SCAN_TF_RNA_500,
        'RBP_RNA500': K562_9CL_SCAN_RBP_RNA_500,
        'TF_RNA1000': K562_9CL_SCAN_TF_RNA_1000,
        'RBP_RNA1000': K562_9CL_SCAN_RBP_RNA_1000,
    }),
    ('K562_UMI200_SCAN', K562_UMI200_SCAN_ARACNe, {
        'TF_RNA500': K562_UMI200_SCAN_TF_RNA_500,
        'RBP_RNA500': K562_UMI200_SCAN_RBP_RNA_500,
        'TF_RNA1000': K562_UMI200_SCAN_TF_RNA_1000,
        'RBP_RNA1000': K562_UMI200_SCAN_RBP_RNA_1000,
    }),
]

for dataset_label, source, gene_sets in aracne_k562:
    for set_label, genes in gene_sets.items():
        SaveDataforARACNe(input_folder3+dataset_label+'_'+set_label, source, genes,
                          dataset_label, set_label)

In [None]:
# Folder receiving the text files with the selected gene names.
gname_folder='./GENE_SELECTION_MAIN/gene_names/'

# makedirs also creates the parent GENE_SELECTION_MAIN/ folder when it is
# missing (os.mkdir would fail) and does nothing if the folder already exists.
os.makedirs(gname_folder, exist_ok=True)

In [None]:
# Save the gene names of each dataset / gene set for running catRAPID:
# one text file per entry, one gene name per line.
gene_name_exports = {
    'HepG2_Smartseq2_TF_RNA500': HepG2_Smartseq2_TF_RNA_500,
    'HepG2_Smartseq2_RBP_RNA500': HepG2_Smartseq2_RBP_RNA_500,
    'HepG2_Smartseq2_TF_RNA1000': HepG2_Smartseq2_TF_RNA_1000,
    'HepG2_Smartseq2_RBP_RNA1000': HepG2_Smartseq2_RBP_RNA_1000,
    'HepG2_10x_TF_RNA500': HepG2_10x_TF_RNA_500,
    'HepG2_10x_RBP_RNA500': HepG2_10x_RBP_RNA_500,
    'HepG2_10x_TF_RNA1000': HepG2_10x_TF_RNA_1000,
    'HepG2_10x_RBP_RNA1000': HepG2_10x_RBP_RNA_1000,
    'HepG2_9CL_SCAN_TF_RNA500': HepG2_9CL_SCAN_TF_RNA_500,
    'HepG2_9CL_SCAN_RBP_RNA500': HepG2_9CL_SCAN_RBP_RNA_500,
    'HepG2_9CL_SCAN_TF_RNA1000': HepG2_9CL_SCAN_TF_RNA_1000,
    'HepG2_9CL_SCAN_RBP_RNA1000': HepG2_9CL_SCAN_RBP_RNA_1000,
    'K562_CELseq_TF_RNA500': K562_CELseq_TF_RNA_500,
    'K562_CELseq_RBP_RNA500': K562_CELseq_RBP_RNA_500,
    'K562_CELseq_TF_RNA1000': K562_CELseq_TF_RNA_1000,
    'K562_CELseq_RBP_RNA1000': K562_CELseq_RBP_RNA_1000,
    'K562_STORMseq_TF_RNA500': K562_STORMseq_TF_RNA_500,
    'K562_STORMseq_RBP_RNA500': K562_STORMseq_RBP_RNA_500,
    'K562_STORMseq_TF_RNA1000': K562_STORMseq_TF_RNA_1000,
    'K562_STORMseq_RBP_RNA1000': K562_STORMseq_RBP_RNA_1000,
    'K562_Smartseq3_TF_RNA500': K562_Smartseq3_TF_RNA_500,
    'K562_Smartseq3_RBP_RNA500': K562_Smartseq3_RBP_RNA_500,
    'K562_Smartseq3_TF_RNA1000': K562_Smartseq3_TF_RNA_1000,
    'K562_Smartseq3_RBP_RNA1000': K562_Smartseq3_RBP_RNA_1000,
    'K562_9CL_SCAN_TF_RNA500': K562_9CL_SCAN_TF_RNA_500,
    'K562_9CL_SCAN_RBP_RNA500': K562_9CL_SCAN_RBP_RNA_500,
    'K562_9CL_SCAN_TF_RNA1000': K562_9CL_SCAN_TF_RNA_1000,
    'K562_9CL_SCAN_RBP_RNA1000': K562_9CL_SCAN_RBP_RNA_1000,
    'K562_UMI200_SCAN_TF_RNA500': K562_UMI200_SCAN_TF_RNA_500,
    'K562_UMI200_SCAN_RBP_RNA500': K562_UMI200_SCAN_RBP_RNA_500,
    'K562_UMI200_SCAN_TF_RNA1000': K562_UMI200_SCAN_TF_RNA_1000,
    'K562_UMI200_SCAN_RBP_RNA1000': K562_UMI200_SCAN_RBP_RNA_1000,
}

for export_label, genes in gene_name_exports.items():
    # np.c_ turns the list into a single column, i.e. one gene per line
    np.savetxt(gname_folder+'gnames'+export_label+'.txt',np.c_[genes],fmt='%s')