# scRAPID tutorial

Rome, 14/06/2023. Jonathan Fiorentino

In this notebook we provide a tutorial showing how to run the scRAPID pipeline on a gene regulatory network (GRN) inferred from single-cell transcriptomic data.

In [1]:
import numpy as np
import pandas as pd
import scRAPID as scr
# from gtfparse import read_gtf
# import subprocess

In [2]:
# --- Analysis configuration -------------------------------------------------

# Dataset identifier. Here: the C2C12 cell line, a short-read SPLiT-seq
# scRNA-seq dataset of myoblasts.
datasetID = "C2C12_SR9kMBSC"

# Gene-selection setting used before GRN inference: it identifies the
# regulators and the number of HVGs (here RNA binding proteins + 500 HVGs).
set_type = "RBP_RNA500"

# Path to the file with the normalized count matrix.
norm_data_file = f"./inputs/{datasetID}_{set_type}/{datasetID}NormalizedData_{set_type}.csv"

# Organism. Mouse and Human are currently supported with pre-computed
# catRAPID scores; for other organisms users can run catRAPID or
# catRAPID omics v2.0 themselves
# (see http://s.tartaglialab.com/page/catrapid_group for more info).
organism = "Mouse"

# Algorithm used upstream to infer the GRN (DeePSEM in this tutorial).
# The pipeline itself is agnostic to this choice.
GRN_algo = "DeePSEM"

In [3]:
# Load the list of RBPs and the lncRNA gene names for the selected organism.
# The files are named "<organism>_RBPs.txt" and "<organism>_lncRNAs.txt".
#
# The lncRNA annotation comes from the GENCODE gtf files:
#   Human (GENCODE v41):
#   https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.long_noncoding_RNAs.gtf.gz
#   Mouse (GENCODE M30):
#   https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M30/gencode.vM30.long_noncoding_RNAs.gtf.gz

# Fail here with a clear message instead of leaving RBPs / gnames_lncRNA
# undefined, which would only surface as a NameError in a later cell.
if organism not in ("Human", "Mouse"):
    raise ValueError(
        "Unsupported organism %r: expected 'Human' or 'Mouse'" % (organism,)
    )

# ndmin=1 guarantees a 1-D array even when a file holds a single entry;
# otherwise np.loadtxt returns a 0-d array that list()/set() cannot iterate.
RBPs = np.loadtxt(f"{organism}_RBPs.txt", dtype=str, ndmin=1)
gnames_lncRNA = list(np.loadtxt(f"{organism}_lncRNAs.txt", dtype=str, ndmin=1))

In [4]:
# Results folder for this dataset / gene set / GRN algorithm
# (this is also the folder handed to scr.preprocess further down)
res_folder = f"./outputs/{datasetID}_{set_type}/{GRN_algo}/"

In [5]:
# Load the genes x cells matrix with the normalized counts
norm_data = pd.read_csv(norm_data_file, header=0, index_col=0)

# All the genes in the dataset: unique names in their order of appearance.
# Unlike list(set(...)), whose order changes between kernel runs because of
# string hash randomization, this keeps the gene list (and every file or
# result derived from it) reproducible.
all_genes = list(norm_data.index.unique())

# Possible edges excluding self loops: n * (n - 1)
possibleEdges = len(all_genes) * (len(all_genes) - 1)

# RBPs in the dataset (same order as all_genes)
rbp_name_set = set(RBPs)
RBPs_in_data = [gene for gene in all_genes if gene in rbp_name_set]

# lncRNAs in the dataset (same order as all_genes)
lncRNA_name_set = set(gnames_lncRNA)
lncRNAs_in_data = [gene for gene in all_genes if gene in lncRNA_name_set]

In [8]:
# Tab-separated UniProt table, used to convert gene names to UniProt IDs.
# NOTE(review): the mouse table is loaded regardless of `organism`; confirm
# that the matching table is used when running on human data.
uniprot = pd.read_csv("./uniprot_mouse.txt", sep="\t")

# protein_name -> accession_number lookup
uniprot_dict = dict(zip(uniprot["protein_name"], uniprot["accession_number"]))

In [9]:
# Convert the RBPs found in the dataset to UniProt IDs; RBPs without an
# entry in uniprot_dict map to NaN and are dropped.
RBPs_Uniprot_IDs = (
    pd.Series(RBPs_in_data)
    .map(uniprot_dict)
    .dropna()
    .tolist()
)

# Write the IDs as a single column, one per line
np.savetxt(fname="RBPs_C2C12SR9kMBSC.txt", X=np.c_[RBPs_Uniprot_IDs], fmt="%s")

In [10]:
# Write every gene name of the dataset as a single column, one per line
np.savetxt(fname="RNAs_C2C12SR9kMBSC.txt", X=np.c_[all_genes], fmt="%s")

In [12]:
# Read the catRAPID table for this dataset (a CSV obtained from our database).
# NOTE(review): presumably generated from the RBP / RNA lists saved above;
# confirm against the database instructions.
catRAPIDDF = pd.read_csv(filepath_or_buffer="catRAPID_table_C2C12SR9KMBSC.csv")

In [13]:
# accession_number -> protein_name lookup (the inverse of uniprot_dict)
uniprot_dict2 = dict(zip(uniprot["accession_number"], uniprot["protein_name"]))

# Add the protein name for each Uniprot ID, build a "protein|gene" edge label
# and use that label as the index of the table.
catRAPIDDF = (
    catRAPIDDF
    .assign(protein_name=lambda table: table["Uniprot_ID"].map(uniprot_dict2))
    .assign(Edges=lambda table: table["protein_name"] + "|" + table["gene_name"])
    .set_index("Edges")
)

In [14]:
# Load the ranked edge list
# (BEELINE format: three tab-delimited columns Gene1, Gene2, EdgeWeight)
predDF = scr.preprocess(res_folder)

# Significant edges: for methods returning more than 5% of the possible
# edges, keep only the first 5% of the ranking.
maxSignificantEdges = int(0.05 * possibleEdges)
if len(predDF) > maxSignificantEdges:
    predDF = predDF.head(maxSignificantEdges)

# Restrict to RBP-RNA interactions (the regulator must be an RBP)
predDF = predDF[predDF["Gene1"].isin(RBPs)]

# ---- scRAPID ----

# Step 1: RBP co-interactions predicted from shared RNA targets.
# The result is a dataframe of RBP pairs with their Jaccard coefficient.
CoInterDF = scr.CoInter(predDF)

# Step 2: catRAPID-based filter of the inferred interactions
# (30 is the default threshold on the interaction propensity).
predDF_catRAPID = scr.FilterRanking(predDF, catRAPIDDF, threshold=30)

# Step 3: RBP interactions whose target is a long non-coding RNA
lncRNA_target_mask = predDF_catRAPID["Gene2"].isin(gnames_lncRNA)
predDF_catRAPID_lncRNA = predDF_catRAPID.loc[lncRNA_target_mask].copy()

# Step 4: hub RBPs.
# Returns a dataframe with the RBPs and their out-degree centrality.
# The manuscript takes the top 10% as hubs; other thresholds are possible.
HubRBPs = scr.HubReg(predDF_catRAPID, RBPs_in_data, GRN_algo)

# Step 5: hub target RNAs (any RNA here; see step 6 for lncRNAs only).
# Returns a dataframe with the RNAs and their in-degree centrality.
# The manuscript takes the top 10% as hubs; other thresholds are possible.
HubRNAs = scr.HubTarget(predDF_catRAPID, all_genes, GRN_algo)

# Step 6: hub target lncRNAs
HublncRNAs = scr.HubTarget(predDF_catRAPID_lncRNA, lncRNAs_in_data, GRN_algo)