# Analysis of an scRNA-seq dataset of retinoic acid-induced differentiation of mESCs

Here we analyze a SCRB-seq scRNA-seq dataset of mouse embryonic stem cell (mESC) differentiation driven by retinoic acid (RA) [https://pubmed.ncbi.nlm.nih.gov/29061959/](https://pubmed.ncbi.nlm.nih.gov/29061959/). 

mESCs were sequenced at 9 different time points (0h, 6h, 12h, 24h, 36h, 48h, 60h, 72h, 96h). The UMI count matrices for all the time points are available on GEO with accession number [GSE79578](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE79578). 

In this case the authors provided all the cells, including low-quality ones; we therefore removed cells with fewer than 2000 total UMI counts, as done in the original publication. Then, we computed a diffusion map as described above, which showed a clear trajectory starting from the undifferentiated mESCs, and we selected as the root cell for the computation of the diffusion pseudotime the cell at 0h with the smallest value of diffusion component 1.

Next, we retained the 282 undifferentiated mESCs (0h) and the 137 cells obtained after 4 days of RA-induced differentiation (96h), and we performed the gene selection step for GRN inference as described in the previous paragraph.

The interactions for Caprin1 were obtained from a RIP-seq experiment performed in undifferentiated mESCs and RA-differentiated cells at 96h [https://pubmed.ncbi.nlm.nih.gov/36495875/](https://pubmed.ncbi.nlm.nih.gov/36495875/), matching the time points of the scRNA-seq dataset. From the RIP-seq, the authors identified 1178 and 2116 Caprin1 RNA targets at 0h and 96h, respectively.

In [None]:
%matplotlib inline

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad

# SCRB-seq

In [None]:
# Load the per-timepoint count tables and label each one with its timepoint.
def _read_timepoint(path, timepoint):
    """Read one count table, transpose it, and tag every row with `timepoint`.

    Parameters
    ----------
    path : str
        Text file handed to ``ad.read_text``.
    timepoint : str
        Label written to ``.obs['Timepoint']``.

    Returns
    -------
    The transposed AnnData object.
    """
    # The table is transposed right after reading -- presumably the files are
    # stored genes x cells; confirm against the raw files.
    counts = ad.read_text(path).T
    counts.obs['Timepoint'] = timepoint
    return counts


adata_0h = _read_timepoint("SCRB-seq_d0_d4_Smartseq2_d0_d2/GSE79578_RAW/GSM2098545_scrbseq_2i.txt.gz", '0h')
adata_6h = _read_timepoint("SCRB-seq_d0_d4_Smartseq2_d0_d2/GSE79578_RAW/GSM2098546_scrbseq_6h.txt.gz", '6h')
adata_12h = _read_timepoint("SCRB-seq_d0_d4_Smartseq2_d0_d2/GSE79578_RAW/GSM2098547_scrbseq_12h.txt.gz", '12h')
adata_24h = _read_timepoint("SCRB-seq_d0_d4_Smartseq2_d0_d2/GSE79578_RAW/GSM2098548_scrbseq_24h.txt.gz", '24h')
adata_36h = _read_timepoint("SCRB-seq_d0_d4_Smartseq2_d0_d2/GSE79578_RAW/GSM2098549_scrbseq_36h.txt.gz", '36h')
adata_48h = _read_timepoint("SCRB-seq_d0_d4_Smartseq2_d0_d2/GSE79578_RAW/GSM2098550_scrbseq_48h.txt.gz", '48h')
adata_60h = _read_timepoint("SCRB-seq_d0_d4_Smartseq2_d0_d2/GSE79578_RAW/GSM2098551_scrbseq_60h.txt.gz", '60h')
adata_72h = _read_timepoint("SCRB-seq_d0_d4_Smartseq2_d0_d2/GSE79578_RAW/GSM2098552_scrbseq_72h.txt.gz", '72h')
adata_96h = _read_timepoint("SCRB-seq_d0_d4_Smartseq2_d0_d2/GSE79578_RAW/GSM2098553_scrbseq_96h.txt.gz", '96h')

In [None]:
# Combine the nine timepoints, in chronological order, into one AnnData object.
timepoint_adatas = [
    adata_0h,
    adata_6h,
    adata_12h,
    adata_24h,
    adata_36h,
    adata_48h,
    adata_60h,
    adata_72h,
    adata_96h,
]
adata_SCRBseq = ad.concat(timepoint_adatas)

In [None]:
# De-duplicate cell names after concatenation
# (presumably cell names can repeat across the per-timepoint files -- confirm).
adata_SCRBseq.obs_names_make_unique()

In [None]:
# Compute QC metrics; inplace=True stores them on adata_SCRBseq instead of returning them.
sc.pp.calculate_qc_metrics(adata_SCRBseq,inplace=True)

In [None]:
# Number of cells per timepoint before any cell filtering.
adata_SCRBseq.obs.Timepoint.value_counts()

In [None]:
# Remove cells with fewer than 2000 total UMI counts
sc.pp.filter_cells(adata_SCRBseq,min_counts=2000)
# Snapshot of the filtered data before normalization (subset per timepoint and exported as "raw" data below).
adata_SCRBseq_UMI=adata_SCRBseq.copy()
sc.pp.normalize_total(adata_SCRBseq)
# Snapshot after total-count normalization but before log1p (this is what is written for ARACNe below).
adata_SCRBseq_aracne=adata_SCRBseq.copy()
sc.pp.log1p(adata_SCRBseq)
# Gene filtering and HVG flagging happen on the log-normalized object only; the two snapshots keep all genes.
sc.pp.filter_genes(adata_SCRBseq,min_cells=10)
sc.pp.highly_variable_genes(adata_SCRBseq,n_top_genes=3000)

# Work on a copy restricted to the 3000 HVGs so that scaling does not touch adata_SCRBseq.
adata_SCRBseq_high_var=adata_SCRBseq[:,adata_SCRBseq.var.highly_variable].copy()
sc.pp.scale(adata_SCRBseq_high_var,max_value=10)
sc.tl.pca(adata_SCRBseq_high_var,svd_solver='arpack')
sc.pp.neighbors(adata_SCRBseq_high_var)
sc.tl.umap(adata_SCRBseq_high_var)
sc.tl.diffmap(adata_SCRBseq_high_var)
# Visual check of both embeddings, colored by sampling time.
sc.pl.umap(adata_SCRBseq_high_var,color='Timepoint')
sc.pl.diffmap(adata_SCRBseq_high_var,color='Timepoint')

In [None]:
# Root cell for diffusion pseudotime: the 0h cell with the smallest value of
# diffusion component 1 (column 1 of X_diffmap, as in the original code).
# The previous version took the argmin over ALL cells, which is only the
# intended root when the global minimum happens to be a 0h cell.
dc1_values = adata_SCRBseq_high_var.obsm['X_diffmap'][:, 1]
root_candidates = (adata_SCRBseq_high_var.obs['Timepoint'] == '0h').to_numpy()
if not root_candidates.any():
    raise ValueError("no 0h cells available to choose the pseudotime root from")
# 'iroot' is a positional index into the full object, so non-0h cells are
# masked with +inf instead of subsetting (which would shift the positions).
adata_SCRBseq_high_var.uns['iroot'] = int(np.argmin(np.where(root_candidates, dc1_values, np.inf)))
sc.tl.dpt(adata_SCRBseq_high_var)

In [None]:
# Diffusion map colored by timepoint and by diffusion pseudotime; the figure is also written to disk via `save`.
sc.pl.diffmap(adata_SCRBseq_high_var,color=['Timepoint','dpt_pseudotime'],save='caprin1_diffmap.pdf')

In [None]:
# Distribution of diffusion pseudotime within each timepoint.
sc.pl.violin(adata_SCRBseq_high_var,keys=['dpt_pseudotime'],groupby='Timepoint')

In [None]:
# Copy the pseudotime onto the full-gene object. The values are assigned by position
# (list(...) drops the index); adata_SCRBseq_high_var was created as a column subset
# of adata_SCRBseq, so both hold the same cells in the same order.
adata_SCRBseq.obs['dpt_pseudotime']=list(adata_SCRBseq_high_var.obs.dpt_pseudotime)

In [None]:
# Number of cells per timepoint after the UMI-count filter.
adata_SCRBseq.obs.Timepoint.value_counts()

In [None]:
# Split the log-normalized data into independent 0h and 96h objects.
lognorm_timepoint = adata_SCRBseq.obs['Timepoint']
adata_SCRBseq_0h = adata_SCRBseq[lognorm_timepoint == '0h'].copy()
adata_SCRBseq_96h = adata_SCRBseq[lognorm_timepoint == '96h'].copy()

In [None]:
# Same split for the un-normalized UMI snapshot.
umi_timepoint = adata_SCRBseq_UMI.obs['Timepoint']
adata_SCRBseq_0h_UMI = adata_SCRBseq_UMI[umi_timepoint == '0h'].copy()
adata_SCRBseq_96h_UMI = adata_SCRBseq_UMI[umi_timepoint == '96h'].copy()

In [None]:
# Same split for the normalized (not log-transformed) snapshot used for ARACNe.
aracne_timepoint = adata_SCRBseq_aracne.obs['Timepoint']
adata_SCRBseq_0h_aracne = adata_SCRBseq_aracne[aracne_timepoint == '0h'].copy()
adata_SCRBseq_96h_aracne = adata_SCRBseq_aracne[aracne_timepoint == '96h'].copy()

In [None]:
# Within each timepoint, drop genes detected in fewer than 10 cells (in place).
for timepoint_adata in (adata_SCRBseq_0h, adata_SCRBseq_96h):
    sc.pp.filter_genes(timepoint_adata, min_cells=10)

In [None]:
# Flag the top 1000 highly variable genes separately in each timepoint ...
for timepoint_adata in (adata_SCRBseq_0h, adata_SCRBseq_96h):
    sc.pp.highly_variable_genes(timepoint_adata, n_top_genes=1000)

# ... and keep HVG-only copies of both objects.
adata_SCRBseq_0h_high_var = adata_SCRBseq_0h[:, adata_SCRBseq_0h.var['highly_variable']].copy()
adata_SCRBseq_96h_high_var = adata_SCRBseq_96h[:, adata_SCRBseq_96h.var['highly_variable']].copy()

In [None]:
# Caprin1 expression (log-normalized values in adata_SCRBseq) across timepoints.
sc.pl.violin(adata_SCRBseq,keys='Caprin1',groupby='Timepoint')

In [None]:
# Load Caprin1 interactors from sheet "Table S1 B" of the supplementary workbook;
# the first two rows of the sheet are skipped (presumably title rows -- confirm).
caprin1_inter=pd.read_excel("./Caprin1_interactors/mmc5.xlsx",sheet_name="Table S1 B",skiprows=2)

In [None]:
# Unique Caprin1 target names per condition ('Convs2i' -> 0h, 'ConvsRA' -> 96h).
# The two columns live in the same sheet, so the shorter one is padded with NaN.
# The previous filter `i != np.nan` is always True (NaN compares unequal to
# everything, itself included), so the padding was never removed and inflated
# the reported list lengths. dropna() removes it reliably.
caprin1_inter_0h = caprin1_inter['Convs2i'].dropna().unique().tolist()
caprin1_inter_96h = caprin1_inter['ConvsRA'].dropna().unique().tolist()

In [None]:
# Is Caprin1 itself among the 0h highly variable genes?
'Caprin1' in set(adata_SCRBseq_0h_high_var.var_names)

In [None]:
# Number of 0h Caprin1 targets, and how many of them are 0h HVGs.
hvg_names_0h = set(adata_SCRBseq_0h_high_var.var_names)
print(len(caprin1_inter_0h), len(hvg_names_0h.intersection(caprin1_inter_0h)))

In [None]:
# Is Caprin1 itself among the 96h highly variable genes?
'Caprin1' in set(adata_SCRBseq_96h_high_var.var_names)

In [None]:
# Number of 96h Caprin1 targets, and how many of them are 96h HVGs.
hvg_names_96h = set(adata_SCRBseq_96h_high_var.var_names)
print(len(caprin1_inter_96h), len(hvg_names_96h.intersection(caprin1_inter_96h)))

In [None]:
# DE genes between 0 and 96 hours.
# The comparison group must be passed as `reference`. The previous `ref='96h'`
# is not a parameter of rank_genes_groups: it fell into **kwds and was ignored,
# so 0h was tested against the default reference ('rest') instead of 96h.
sc.tl.rank_genes_groups(adata_SCRBseq,groups=['0h'],groupby='Timepoint',reference='96h')

In [None]:
# Collect the test results for group "0h" into a DataFrame.
dedf_SCRBseq = sc.get.rank_genes_groups_df(adata_SCRBseq, group="0h")

In [None]:
# Keep only genes with adjusted p-value below 0.05, then display the table.
is_significant = dedf_SCRBseq['pvals_adj'] < 0.05
dedf_SCRBseq = dedf_SCRBseq.loc[is_significant]
dedf_SCRBseq

In [None]:
# Split by the sign of the log fold change of group '0h':
# positive values go to the 0h table, negative values to the 96h table.
log_fold_change = dedf_SCRBseq['logfoldchanges']
dedf_SCRBseq_0h = dedf_SCRBseq.loc[log_fold_change > 0].copy()
dedf_SCRBseq_96h = dedf_SCRBseq.loc[log_fold_change < 0].copy()

In [None]:
# Order both tables by ascending adjusted p-value so that .iloc[:1000] selects the top genes.
dedf_SCRBseq_0h = dedf_SCRBseq_0h.sort_values('pvals_adj')
dedf_SCRBseq_96h = dedf_SCRBseq_96h.sort_values('pvals_adj')

In [None]:
# Is Caprin1 among the first 1000 rows of the 0h DE table?
'Caprin1' in set(dedf_SCRBseq_0h.iloc[:1000]['names'])

In [None]:
# Number of 0h Caprin1 targets, and how many are among the first 1000 rows of the 0h DE table.
top_deg_names_0h = set(dedf_SCRBseq_0h.iloc[:1000]['names'])
print(len(caprin1_inter_0h), len(top_deg_names_0h.intersection(caprin1_inter_0h)))

In [None]:
# Number of 96h Caprin1 targets, and how many are among the first 1000 rows of the 96h DE table.
top_deg_names_96h = set(dedf_SCRBseq_96h.iloc[:1000]['names'])
print(len(caprin1_inter_96h), len(top_deg_names_96h.intersection(caprin1_inter_96h)))

In [None]:
# Is Caprin1 among the first 1000 rows of the 96h DE table?
'Caprin1' in set(dedf_SCRBseq_96h.iloc[:1000]['names'])

In [None]:
# Union (duplicates removed) of the first 1000 DE genes and the HVGs, per timepoint.
genes_0h = list({*dedf_SCRBseq_0h.iloc[:1000]['names'], *adata_SCRBseq_0h_high_var.var_names})
genes_96h = list({*dedf_SCRBseq_96h.iloc[:1000]['names'], *adata_SCRBseq_96h_high_var.var_names})

In [None]:
# Write each gene list to a text file, one gene name per line.
gene_list_files = (
    ('SCRB_seq_0h_top1000_HVGs.txt', adata_SCRBseq_0h_high_var.var_names),
    ('SCRB_seq_96h_top1000_HVGs.txt', adata_SCRBseq_96h_high_var.var_names),
    ('SCRB_seq_0h_top1000_DEGs.txt', dedf_SCRBseq_0h.iloc[:1000].names),
    ('SCRB_seq_96h_top1000_DEGs.txt', dedf_SCRBseq_96h.iloc[:1000].names),
)
for file_name, gene_names in gene_list_files:
    # np.c_ turns the flat list into a single column.
    np.savetxt(file_name, np.c_[list(gene_names)], fmt="%s")

In [None]:
# Size of the combined DEG + HVG gene set at 0h.
len(genes_0h)

In [None]:
# Print every 0h gene whose name contains 'mt-'.
for gene_name in genes_0h:
    if 'mt-' not in gene_name:
        continue
    print(gene_name)

In [None]:
# Print every 96h gene whose name contains 'mt-'.
for gene_name in genes_96h:
    if 'mt-' not in gene_name:
        continue
    print(gene_name)

## Gene Selection

In [None]:
from Bio import SeqIO

# Collect, for every FASTA record, field 0 (-> gid) and field 4 (-> gname)
# of the '|'-separated record id.
gname=[]
gid=[]
# Mode "rU" was removed in Python 3.11 (open() raises ValueError); plain "r"
# already gives universal-newline text mode. The `with` block also closes the
# file, which the original never did.
with open("/Users/jonathan/Desktop/IIT/INTERACTomics/scRNA-seq_data/Datasets_Applications/catRAPID_full_FEB2023/mmusculus_gene_ensembl_107_canonical.fa", "r") as f_open:
    for rec in SeqIO.parse(f_open, "fasta"):
        id_fields = rec.id.split('|')
        gname.append(id_fields[4])
        gid.append(id_fields[0])


In [None]:
# Keep only the genes whose name also appears in the FASTA-derived `gname` list;
# each object is printed before and after the restriction.
fasta_gene_names = set(gname)

inters0h = list(fasta_gene_names & set(adata_SCRBseq_0h.var_names))
print(adata_SCRBseq_0h)
adata_SCRBseq_0h = adata_SCRBseq_0h[:, inters0h].copy()
print(adata_SCRBseq_0h)

inters96h = list(fasta_gene_names & set(adata_SCRBseq_96h.var_names))
print(adata_SCRBseq_96h)
adata_SCRBseq_96h = adata_SCRBseq_96h[:, inters96h].copy()
print(adata_SCRBseq_96h)

In [None]:
# Create the output folder for the gene-selection files (nothing is saved in this cell).
import os

input_data='./GENE_SELECTION/'

# exist_ok=True replaces the `isdir(...)==False` check-then-create pattern:
# it is a no-op when the folder already exists.
os.makedirs(input_data, exist_ok=True)

In [None]:
# Export the diffusion pseudotime of the 0h cells (one row per cell).
pseudotime_0h = adata_SCRBseq_0h.obs['dpt_pseudotime']
pseudo_df_0h = pd.DataFrame(data=pseudotime_0h, index=adata_SCRBseq_0h.obs_names)
pseudo_df_0h.to_csv(input_data + 'SCRBseq_0h_PseudoTime.csv')

# Export the diffusion pseudotime of the 96h cells (one row per cell).
pseudotime_96h = adata_SCRBseq_96h.obs['dpt_pseudotime']
pseudo_df_96h = pd.DataFrame(data=pseudotime_96h, index=adata_SCRBseq_96h.obs_names)
pseudo_df_96h.to_csv(input_data + 'SCRBseq_96h_PseudoTime.csv')

In [None]:
# Remove mitochondrial genes (names starting with 'mt-') from the 0h dataset.
# The object is printed before and after so the change in gene count is visible.
print(adata_SCRBseq_0h)
mito_genes = adata_SCRBseq_0h.var_names.str.startswith('mt-')
adata_SCRBseq_0h=adata_SCRBseq_0h[:,~mito_genes].copy()
print(adata_SCRBseq_0h)
# print(len(adata_SCRBseq_0h.raw.var_names))



In [None]:
# Compute the highly variable genes of the 0h dataset with the default settings
# (no n_top_genes); this overwrites the HVG flags computed earlier on this object.
sc.pp.highly_variable_genes(adata_SCRBseq_0h)

In [None]:
# Remove mitochondrial genes (names starting with 'mt-') from the 96h dataset.
# The object is printed before and after so the change in gene count is visible.
print(adata_SCRBseq_96h)
mito_genes = adata_SCRBseq_96h.var_names.str.startswith('mt-')
adata_SCRBseq_96h=adata_SCRBseq_96h[:,~mito_genes].copy()
print(adata_SCRBseq_96h)

# Compute the highly variable genes of the 96h dataset with the default settings
# (no n_top_genes); this overwrites the HVG flags computed earlier on this object.
sc.pp.highly_variable_genes(adata_SCRBseq_96h)

## HVRBPs

In [None]:
# Load the list of mouse RBPs from RBP2GO database (tab-separated, first 4 rows skipped).
mouse_RBPs=pd.read_csv("Table_MM_RBP.txt",delimiter='\t',skiprows=4)
# Score range of the full table. The original referenced `human_RBPs`, which is
# never defined in this notebook and raised a NameError.
print(mouse_RBPs.RBP2GO_Score.min(),mouse_RBPs.RBP2GO_Score.max())
# Number of RBPs before filtering (the bare `len(...)` here was never displayed
# because it was not the last expression of the cell).
print(len(mouse_RBPs))
# Keep only RBPs with an RBP2GO score of at least 10.
mouse_RBPs=mouse_RBPs[mouse_RBPs.RBP2GO_Score>=10]

In [None]:
# Highly variable RBPs per timepoint: RBP gene names that are flagged as HVGs.
# 'Caprin1' is always prepended, so it appears twice in the list (and in the
# printed length) whenever it is also part of the intersection.
rbp_gene_names = set(mouse_RBPs.Gene_Name)
hvg_index_0h = adata_SCRBseq_0h.var.index[adata_SCRBseq_0h.var.highly_variable]
hvg_index_96h = adata_SCRBseq_96h.var.index[adata_SCRBseq_96h.var.highly_variable]

HVRBPs_0h = ['Caprin1'] + list(rbp_gene_names.intersection(set(hvg_index_0h)))
HVRBPs_96h = ['Caprin1'] + list(rbp_gene_names.intersection(set(hvg_index_96h)))
print(len(HVRBPs_0h), len(HVRBPs_96h))

In [None]:
## Top 500 HVGs, merged with the highly variable RBPs
# 0h: re-flag the top 500 HVGs, report how many were flagged, then the size of the merged set.
sc.pp.highly_variable_genes(adata_SCRBseq_0h, n_top_genes=500)
top500_hvgs_0h = list(adata_SCRBseq_0h.var.index[adata_SCRBseq_0h.var.highly_variable])
print(len(top500_hvgs_0h))

SCRBseq_0h_RBP_RNA500 = list(set(HVRBPs_0h + top500_hvgs_0h))
print(len(SCRBseq_0h_RBP_RNA500))

# 96h: same procedure.
sc.pp.highly_variable_genes(adata_SCRBseq_96h, n_top_genes=500)
top500_hvgs_96h = list(adata_SCRBseq_96h.var.index[adata_SCRBseq_96h.var.highly_variable])
print(len(top500_hvgs_96h))

SCRBseq_96h_RBP_RNA500 = list(set(HVRBPs_96h + top500_hvgs_96h))
print(len(SCRBseq_96h_RBP_RNA500))

In [None]:
## Top 1000 HVGs, merged with the highly variable RBPs
# 0h: re-flag the top 1000 HVGs, report how many were flagged, then the size of the merged set.
sc.pp.highly_variable_genes(adata_SCRBseq_0h, n_top_genes=1000)
top1000_hvgs_0h = list(adata_SCRBseq_0h.var.index[adata_SCRBseq_0h.var.highly_variable])
print(len(top1000_hvgs_0h))

SCRBseq_0h_RBP_RNA1000 = list(set(HVRBPs_0h + top1000_hvgs_0h))
print(len(SCRBseq_0h_RBP_RNA1000))

# 96h: same procedure.
sc.pp.highly_variable_genes(adata_SCRBseq_96h, n_top_genes=1000)
top1000_hvgs_96h = list(adata_SCRBseq_96h.var.index[adata_SCRBseq_96h.var.highly_variable])
print(len(top1000_hvgs_96h))

SCRBseq_96h_RBP_RNA1000 = list(set(HVRBPs_96h + top1000_hvgs_96h))
print(len(SCRBseq_96h_RBP_RNA1000))

In [None]:
def _export_genes_by_cells(adata, genes, file_name):
    """Write the selected genes of `adata` as a genes x cells CSV under `input_data`.

    Parameters
    ----------
    adata : AnnData
        Object to export from.
    genes : list of str
        Gene names selecting the columns of `adata`.
    file_name : str
        File name appended to the `input_data` folder string.

    Returns
    -------
    The DataFrame that was written (rows = genes, columns = cells).
    """
    selected = adata[:, genes]
    # X is transposed so that genes become rows and cells become columns.
    table = pd.DataFrame(data=selected.X.T,
                         index=selected.var_names,
                         columns=selected.obs_names)
    table.to_csv(input_data + file_name)
    return table


# 0h: log-normalized and raw-count tables for the RBP + top-500 / top-1000 HVG sets.
tmp_df = _export_genes_by_cells(adata_SCRBseq_0h, SCRBseq_0h_RBP_RNA500, 'SCRBseq_0hNormalizedData_RBP_RNA500.csv')
tmp_df = _export_genes_by_cells(adata_SCRBseq_0h_UMI, SCRBseq_0h_RBP_RNA500, 'SCRBseq_0hRawData_RBP_RNA500.csv')
tmp_df = _export_genes_by_cells(adata_SCRBseq_0h, SCRBseq_0h_RBP_RNA1000, 'SCRBseq_0hNormalizedData_RBP_RNA1000.csv')
tmp_df = _export_genes_by_cells(adata_SCRBseq_0h_UMI, SCRBseq_0h_RBP_RNA1000, 'SCRBseq_0hRawData_RBP_RNA1000.csv')

# 96h: the same four tables.
tmp_df = _export_genes_by_cells(adata_SCRBseq_96h, SCRBseq_96h_RBP_RNA500, 'SCRBseq_96hNormalizedData_RBP_RNA500.csv')
tmp_df = _export_genes_by_cells(adata_SCRBseq_96h_UMI, SCRBseq_96h_RBP_RNA500, 'SCRBseq_96hRawData_RBP_RNA500.csv')
tmp_df = _export_genes_by_cells(adata_SCRBseq_96h, SCRBseq_96h_RBP_RNA1000, 'SCRBseq_96hNormalizedData_RBP_RNA1000.csv')
tmp_df = _export_genes_by_cells(adata_SCRBseq_96h_UMI, SCRBseq_96h_RBP_RNA1000, 'SCRBseq_96hRawData_RBP_RNA1000.csv')

In [None]:
# Folder that receives the ARACNe input files.
input_folder_aracne=input_data+'ARACNe_INPUT/'

# exist_ok=True replaces the `isdir(...)==False` check-then-create pattern;
# makedirs also creates the parent folder if it is missing.
os.makedirs(input_folder_aracne, exist_ok=True)

In [None]:
def SaveDataforARACNe(folder, adata, geneset, label1, label2):
    """Write the `geneset` columns of `adata` to an .h5ad file.

    A new AnnData object is built from the selected expression matrix with
    the original cell and gene names, and `label1` is stored for every cell
    in ``.obs['batch']``.

    Parameters
    ----------
    folder : str
        String prefix of the output path. It is concatenated as-is with
        ``'processed_<label1>_<label2>.h5ad'``; no path separator is added.
        NOTE(review): the calls in this notebook pass a prefix without a
        trailing '/', so it becomes part of the file name -- confirm that
        this is what the downstream step expects.
    adata : AnnData
        Source object.
    geneset : list of str
        Gene names selecting the columns to keep.
    label1, label2 : str
        Labels used in the output file name.
    """
    selected = adata[:, geneset]
    adata_temp = ad.AnnData(X=selected.X)
    adata_temp.obs_names = selected.obs_names
    adata_temp.var_names = selected.var_names
    adata_temp.obs['batch'] = label1
    output_path = folder + 'processed_' + label1 + '_' + label2 + '.h5ad'
    adata_temp.write(output_path)

In [None]:
# Write the four ARACNe inputs: (0h, 96h) x (RBP + top-500 HVGs, RBP + top-1000 HVGs).
aracne_exports = [
    (input_folder_aracne + 'SCRBseq_0h_RBP_RNA500', adata_SCRBseq_0h_aracne,
     SCRBseq_0h_RBP_RNA500, 'SCRBseq_0h', 'RBP_RNA500'),
    (input_folder_aracne + 'SCRBseq_0h_RBP_RNA1000', adata_SCRBseq_0h_aracne,
     SCRBseq_0h_RBP_RNA1000, 'SCRBseq_0h', 'RBP_RNA1000'),
    (input_folder_aracne + 'SCRBseq_96h_RBP_RNA500', adata_SCRBseq_96h_aracne,
     SCRBseq_96h_RBP_RNA500, 'SCRBseq_96h', 'RBP_RNA500'),
    (input_folder_aracne + 'SCRBseq_96h_RBP_RNA1000', adata_SCRBseq_96h_aracne,
     SCRBseq_96h_RBP_RNA1000, 'SCRBseq_96h', 'RBP_RNA1000'),
]
for export_args in aracne_exports:
    SaveDataforARACNe(*export_args)