# scRAPID tutorial

Rome, 14/06/2023. Jonathan Fiorentino

In this notebook we provide a tutorial showing how to run the scRAPID pipeline on a gene regulatory network (GRN) inferred from single-cell transcriptomic data.

In [1]:
import numpy as np
import pandas as pd
import scRAPID as scr
# from gtfparse import read_gtf
# import subprocess

In [2]:
# Analysis settings used throughout the notebook

# Dataset identifier: here the C2C12 cell line
# (short-read SpLIT-seq scRNA-seq of myoblasts)
datasetID = "C2C12_SR9kMBSC"

# set_type encodes the regulators and the number of HVGs selected
# at the gene selection step before GRN inference;
# here: RNA binding proteins + 500 HVGs
set_type = "RBP_RNA500"

# Location of the normalized count matrix
norm_data_file = f"./inputs/{datasetID}_{set_type}/{datasetID}NormalizedData_{set_type}.csv"

# Organism. Mouse and Human are currently supported with pre-computed
# catRAPID scores; for other organisms users can run catRAPID or
# catRAPID omics v2.0 themselves
# (visit http://s.tartaglialab.com/page/catrapid_group for more info)
organism = "Mouse"

# Algorithm that was used to infer the GRN (DeePSEM in this example);
# the pipeline itself is agnostic to the upstream inference method
GRN_algo = "DeePSEM"

In [3]:
# Load the list of RBPs and the list of lncRNA gene names for the chosen organism.
# The original notes point to the GENCODE lncRNA gtf annotations:
#   Human (GENCODE v41):
#   https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.long_noncoding_RNAs.gtf.gz
#   Mouse (GENCODE M30):
#   https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M30/gencode.vM30.long_noncoding_RNAs.gtf.gz
if organism not in ('Human', 'Mouse'):
    # Fail here with a clear message instead of leaving RBPs / gnames_lncRNA
    # undefined (or stale from a previous run) and crashing in a later cell
    raise ValueError(
        "Unsupported organism %r: expected 'Human' or 'Mouse'" % (organism,)
    )

# The input files are named '<organism>_RBPs.txt' and '<organism>_lncRNAs.txt'.
# ndmin=1 keeps the result one-dimensional even if a file holds a single name,
# so that list(...) and set(...) on the arrays keep working.
RBPs=np.loadtxt(organism+'_RBPs.txt',dtype=str,ndmin=1)
gnames_lncRNA=list(np.loadtxt(organism+"_lncRNAs.txt",dtype=str,ndmin=1))

In [4]:
# Folder where the results are written
# (also passed to scr.preprocess to read the ranked edges)
res_folder = f"./outputs/{datasetID}_{set_type}/{GRN_algo}/"

In [5]:
# Load the genes x cells matrix with the normalized counts
# (first row is the header, first column holds the gene names)
norm_data=pd.read_csv(norm_data_file, header=0, index_col=0)

# All the genes in the dataset, de-duplicated while keeping the row order
# of the matrix. list(set(...)) would return the genes in an arbitrary order
# that can change between kernel sessions (string hash randomization), which
# makes the files written from these lists non-reproducible.
all_genes=list(dict.fromkeys(norm_data.index))

# Possible edges excluding self loops: n*(n-1)
possibleEdges=len(all_genes)*(len(all_genes)-1)

# RBPs in the dataset (same order as all_genes)
RBP_set=set(RBPs)
RBPs_in_data=[gene for gene in all_genes if gene in RBP_set]

# lncRNAs in the dataset (same order as all_genes)
lncRNA_set=set(gnames_lncRNA)
lncRNAs_in_data=[gene for gene in all_genes if gene in lncRNA_set]

In [6]:
# Write the RBPs found in the dataset to a text file, one name per row
np.savetxt(
    "RBPs_C2C12SR9kMBSC.txt",
    np.c_[RBPs_in_data],
    fmt="%s",
)

In [7]:
# Write all the genes of the dataset to a text file, one name per row
np.savetxt(
    "RNAs_C2C12SR9kMBSC.txt",
    np.c_[all_genes],
    fmt="%s",
)

In [8]:
# Read the catRAPID table from its CSV file
# (see how it can be obtained from our database)
catRAPIDDF = pd.read_csv(
    "catRAPID_table_C2C12SR9KMBSC.csv",
)

In [9]:
# Label every catRAPID row with its "protein|gene" edge name and use that
# label as the index of the table
catRAPIDDF = (
    catRAPIDDF
    .assign(Edges=catRAPIDDF['protein_name'] + '|' + catRAPIDDF['gene_name'])
    .set_index("Edges")
)

In [10]:
# Read the ranked edges
# (formatted as in BEELINE, three columns: Gene1, Gene2, EdgeWeight. Delimiter is \t)
predDF = scr.preprocess(res_folder)

# Significance cut-off: for methods returning more than 5% of the
# possible edges, keep only the first rows of the ranking up to that number
maxEdges = int(0.05*possibleEdges)
if len(predDF.index) > maxEdges:
    predDF = predDF.iloc[:maxEdges]

# Keep only RBP-RNA interactions (the regulator, Gene1, must be an RBP)
predDF = predDF[predDF.Gene1.isin(RBPs)]

# Run scRAPID
# 1. Predict RBP co-interactions based on shared RNA targets
# Returns a dataframe with the RBP pairs and the Jaccard coefficient
# NOTE: We suggest predicting RBP co-interactions in datasets with at least 2000 HVGs
CoInterDF = scr.CoInter(predDF)

# 2. catRAPID-based filter of the inferred interactions
# (the default threshold on interaction propensity is 30)
predDF_catRAPID = scr.FilterRanking(predDF, catRAPIDDF, threshold=30)

# 3. RBP interactions with long non-coding RNAs
targetIsLncRNA = predDF_catRAPID['Gene2'].isin(gnames_lncRNA)
predDF_catRAPID_lncRNA = predDF_catRAPID.loc[targetIsLncRNA].copy()

# 4. Predict hub RBPs
# This function returns a dataframe with RBPs and their out-degree centrality
# In the manuscript we considered as hubs the top 10%
# but different thresholds can be considered
HubRBPs = scr.HubReg(predDF_catRAPID, RBPs_in_data, GRN_algo)

# 5. Predict hub target RNAs (specify if you are interested only in lncRNAs or any RNA)
# This function returns a dataframe with RNAs and their in-degree centrality
# In the manuscript we considered as hubs the top 10%
# but different thresholds can be considered
HubRNAs = scr.HubTarget(predDF_catRAPID, all_genes, GRN_algo)

# 6. Predict hub target lncRNAs
HublncRNAs = scr.HubTarget(predDF_catRAPID_lncRNA, lncRNAs_in_data, GRN_algo)

In [11]:
# Display the ranked edges kept after the 5% cut-off and the RBP (Gene1) filter
predDF

Unnamed: 0,Gene1,Gene2,EdgeWeight,Edges
0,Cmss1,Neb,0.013355,Cmss1|Neb
1,Cmss1,Gm29237,0.012816,Cmss1|Gm29237
2,Cmss1,Mir133a-1hg,0.012693,Cmss1|Mir133a-1hg
3,Cmss1,Syce2,0.012496,Cmss1|Syce2
4,Cmss1,A430103D13Rik,0.012493,Cmss1|A430103D13Rik
...,...,...,...,...
25496,Mrpl53,Pdlim5,0.003617,Mrpl53|Pdlim5
25515,Sinhcaf,Cacnb2,0.003616,Sinhcaf|Cacnb2
25519,Atp5pb,Selenoo,0.003616,Atp5pb|Selenoo
25521,Junb,Cdkn1a,0.003616,Junb|Cdkn1a


In [12]:
# Display the edges returned by scr.FilterRanking (catRAPID filter, threshold = 30)
predDF_catRAPID

Unnamed: 0_level_0,Gene1,Gene2,EdgeWeight
Edges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cmss1|Neb,Cmss1,Neb,0.013355
Cmss1|Grip1,Cmss1,Grip1,0.012371
Cmss1|Hmcn2,Cmss1,Hmcn2,0.012080
Cmss1|Creg1,Cmss1,Creg1,0.012051
Cmss1|Zfr2,Cmss1,Zfr2,0.011973
...,...,...,...
Eif3j2|Stau1,Eif3j2,Stau1,0.003617
Rrp15|Eya1,Rrp15,Eya1,0.003617
Nav3|Egr3,Nav3,Egr3,0.003617
Sinhcaf|Cacnb2,Sinhcaf,Cacnb2,0.003616


In [13]:
# Display the catRAPID-filtered edges whose target (Gene2) is in the lncRNA list
predDF_catRAPID_lncRNA

Unnamed: 0_level_0,Gene1,Gene2,EdgeWeight
Edges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cmss1|H19,Cmss1,H19,0.011723
Cmss1|Gm16638,Cmss1,Gm16638,0.011447
Cmss1|6530402F18Rik,Cmss1,6530402F18Rik,0.011421
Cmss1|Gm5784,Cmss1,Gm5784,0.011177
Cmss1|Gm26811,Cmss1,Gm26811,0.011077
...,...,...,...
Rps8|6530402F18Rik,Rps8,6530402F18Rik,0.003638
Sec63|6530409C15Rik,Sec63,6530409C15Rik,0.003638
Tet2|Gm37644,Tet2,Gm37644,0.003633
Nav3|Gm5784,Nav3,Gm5784,0.003628


In [14]:
# Display the table returned by scr.HubReg (RBPs with their out-degree centrality)
HubRBPs

Unnamed: 0,RBP,outdeg_centr
0,Mki67,0.690685
1,Hspa5,0.340949
2,Fn1,0.339192
3,Cmss1,0.293497
4,Nasp,0.223199
...,...,...
197,Nusap1,0.001757
198,Utp6,0.001757
199,Rps29,0.001757
200,Rps3,0.001757


In [15]:
# Display the table returned by scr.HubTarget for all genes (RNAs with their in-degree centrality)
HubRNAs

Unnamed: 0,RNA,indeg_centr
0,Adam12,0.043937
1,Peli3,0.042179
2,Ano3,0.035149
3,Nova2,0.035149
4,Neb,0.033392
...,...,...
565,Rps13,0.000000
566,Apobec3,0.000000
567,Rps29,0.000000
568,Rps3,0.000000


In [16]:
# Display the table returned by scr.HubTarget restricted to lncRNA targets
HublncRNAs

Unnamed: 0,RNA,indeg_centr
0,6530402F18Rik,0.169643
1,Gm16638,0.151786
2,Gm5784,0.142857
3,9330162G02Rik,0.142857
4,H19,0.133929
...,...,...
64,4930589O11Rik,0.008929
65,B230362B09Rik,0.008929
66,Mir22hg,0.008929
67,Tsix,0.008929
