# Pre-processing and gene selection for HEK293T and HCT116 single-cell RNA-seq data

Rome, 04/03/2022 Jonathan Fiorentino

Here we analyze two HEK293T and one HCT116 scRNA-seq datasets, which we use for the prediction of RBP co-interactions. 

Description of the datasets and of the analysis:

HEK293T

We selected two scRNA-seq datasets for the HEK293T cell line. The 10x dataset is available from the website of 10x Genomics [https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/293t](https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/293t) and it contains 2885 cells. 

The [Smart-seq3 dataset](https://pubmed.ncbi.nlm.nih.gov/32518404/) is available in ArrayExpress with accession [E-MTAB-8735](https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-8735/) and the single cell identifier column in the sample information table is “HEK293T Smart-seq3”; it contains 117 cells. We followed the same pre-processing steps used for the HepG2 and K562 datasets.

HCT116
The Drop-seq scRNA-seq dataset for the HCT116 cell line [https://pubmed.ncbi.nlm.nih.gov/32846134/](https://pubmed.ncbi.nlm.nih.gov/32846134/) is available on GEO with accession number [GSE149224](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE149224). The dataset includes three different cell lines (RKO, HCT116 and SW480) treated with different doses of 5-fluorouracil to study the DNA-damage response of the transcriptome.

We selected only the 3011 HCT116 cells and we exploited the presence of treated cells to compute the diffusion pseudotime. We followed the pre-processing steps explained above for the HepG2 and K562 datasets, then we computed a diffusion map of HCT116 cells. We computed the diffusion pseudotime choosing as the root cell the control cell farthest from the treated ones. Next, we kept only the 2161 control cells for downstream analysis, following the analyses done for the HepG2 and K562 cell lines for gene selection and GRN inference.


In [None]:
%matplotlib inline

In [None]:
import scanpy as sc
import anndata as ad
import scprep as scp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

# HEK293T

## 10x

In [None]:
data_folder='./HEK293T/10x/'
input_folder="./ANALYSIS_FEB_2023_RIBO/COINTER_RBPs/HEK293T/"

In [None]:
# Create the HEK293T output root. makedirs also creates missing parent
# directories and is a no-op when the folder already exists.
os.makedirs(input_folder, exist_ok=True)

In [None]:
# Output folders for the gene-selection results and the ARACNe input files.
out_folder="./ANALYSIS_FEB_2023_RIBO/COINTER_RBPs/HEK293T/GENE_SELECTION/"
aracne_folder="./ANALYSIS_FEB_2023_RIBO/COINTER_RBPs/HEK293T/GENE_SELECTION/ARACNe_INPUT"

# makedirs creates missing parents and tolerates existing folders.
os.makedirs(out_folder, exist_ok=True)
os.makedirs(aracne_folder, exist_ok=True)

### Load the data

In [None]:
# Read the 10x matrix directory into an AnnData object.
adata_hek293t=sc.read_10x_mtx(data_folder)
# NOTE: a bare expression in the middle of a cell is not displayed.
adata_hek293t
# Filter genes expressed in less than 1% of the cells
sc.pp.filter_genes(adata_hek293t, min_cells=int(0.01*adata_hek293t.n_obs))
print(adata_hek293t)
adata_hek293t.obs['batch']='HEK293'
# Keep an untouched copy of the filtered counts (taken before normalization);
# it is the source of the "RawData" CSV exports.
adata_hek293t_UMI=adata_hek293t.copy()
# Store the UMIs
adata_hek293t.raw=adata_hek293t

# Normalize the data
sc.pp.normalize_total(adata_hek293t,inplace=True)
# Snapshot taken after normalization but before the log transform.
adata_hek293t_for_ARACNe=adata_hek293t.copy()
sc.pp.log1p(adata_hek293t)

In [None]:
def FilterMito(adata):
    """Return a copy of ``adata`` without the genes whose name starts with 'MT-'.

    The object is printed before and after filtering so the number of removed
    genes can be read from the output.
    """
    is_mito = adata.var_names.str.startswith('MT-')
    print('before',adata)
    filtered = adata[:, ~is_mito].copy()
    print('after',filtered)
    return filtered

In [None]:
# Remove mitochondrial genes (names starting with 'MT-'); FilterMito does not
# remove ribosomal genes.
adata_hek293t=FilterMito(adata_hek293t)

### Compute diffusion pseudotime

In [None]:
# Flag the 2000 most variable genes on the log-normalized data.
sc.pp.highly_variable_genes(adata_hek293t,max_mean=10,n_top_genes=2000)
# Take an explicit copy of the HVG subset: scaling modifies X in place, and
# doing that on a view makes AnnData copy implicitly with a warning.
adata_hek293t_high_var = adata_hek293t[:,adata_hek293t.var['highly_variable']].copy()
sc.pp.scale(adata_hek293t_high_var,max_value=10)
sc.tl.pca(adata_hek293t_high_var,svd_solver='arpack')
sc.pl.pca_overview(adata_hek293t_high_var)

In [None]:
# Neighbourhood graph with scanpy's default settings (an earlier variant of
# this call passed n_neighbors=10, n_pcs=10), then UMAP embedding and plot.
sc.pp.neighbors(adata_hek293t_high_var)
sc.tl.umap(adata_hek293t_high_var)
sc.pl.umap(adata_hek293t_high_var)

In [None]:
sc.tl.leiden(adata_hek293t_high_var)
sc.pl.umap(adata_hek293t_high_var,color='leiden')

In [None]:
# Create the diffusion map
sc.tl.diffmap(adata_hek293t_high_var)
sc.pl.diffmap(adata_hek293t_high_var,color=['leiden'])

In [None]:
# Root cell for the pseudotime: the cell with the smallest value in column 1
# of the diffusion map.
adata_hek293t_high_var.uns['iroot'] = np.argmin(adata_hek293t_high_var.obsm['X_diffmap'][:,1])

# Run Diffusion Pseudotime with default arguments (no n_branchings is passed,
# so no 'dpt_groups' branching column is requested here).
sc.tl.dpt(adata_hek293t_high_var)

# Grab the output and store in our metadata DataFrame
adata_hek293t_high_var.obs['dpt'] = adata_hek293t_high_var.obs['dpt_pseudotime']
adata_hek293t_high_var.obs.head()

In [None]:
# One-column table (cell barcode -> diffusion pseudotime) written to CSV.
pseudo_df = adata_hek293t_high_var.obs['dpt'].to_frame()
pseudo_df.to_csv(out_folder+'HEK293TPseudoTime.csv')

### Gene Selection

In [None]:
from Bio import SeqIO

# Collect two '|'-separated header fields from every FASTA record:
# field 4 goes to gname and field 0 to gid (presumably gene symbol and gene
# identifier -- confirm against the FASTA header layout).
gname=[]
gid=[]
fasta_path = "/Users/jonathan/Desktop/IIT/INTERACTomics/ENCODE_eCLIP_DATA/transcriptomes/hsapiens_gene_ensembl_107_canonical_new.fa"
# Mode "rU" was removed in Python 3.11 (open() raises ValueError); plain "r"
# already uses universal newlines. The with-block also closes the handle.
with open(fasta_path, "r") as f_open:
    for rec in SeqIO.parse(f_open, "fasta"):
        fields = rec.id.split('|')
        gname.append(fields[4])
        gid.append(fields[0])

In [None]:
# Consider only genes present in the fasta file
def Genes_in_fasta(adata,gnames):
    """Return a copy of ``adata`` restricted to the genes listed in ``gnames``.

    Genes are kept in their original ``adata.var_names`` order. Selecting with
    ``list(set(...))`` would give an order that changes from run to run
    because of string-hash randomisation, making the exported tables
    non-reproducible. The object is printed before and after the selection.
    """
    print(adata)
    in_fasta = adata.var_names.isin(set(gnames))
    adata=adata[:,in_fasta].copy()
    print(adata)
    return adata

In [None]:
adata_hek293t=Genes_in_fasta(adata_hek293t,gname)

In [None]:
# Build the HVG gene lists for several set sizes. None means "no n_top_genes",
# i.e. scanpy's threshold-based selection.
# NOTE(review): per the scanpy docs the mean/dispersion cutoffs (max_mean) are
# ignored when n_top_genes is given -- confirm this is intended.
_hvg_by_size = {}
for _n_top in (500, 1000, 2000, 3000, None):
    tmp_adata = adata_hek293t.copy()  # fresh copy so the flags never leak into adata_hek293t
    sc.pp.highly_variable_genes(tmp_adata, max_mean=10, n_top_genes=_n_top)
    adata_HVGs = tmp_adata[:, tmp_adata.var['highly_variable']==True].copy()
    _hvg_by_size[_n_top] = list(adata_HVGs.var_names)
    print(len(_hvg_by_size[_n_top]))

HEK293T_RBP_RNA500 = _hvg_by_size[500]
HEK293T_RBP_RNA1000 = _hvg_by_size[1000]
HEK293T_RBP_RNA2000 = _hvg_by_size[2000]
HEK293T_RBP_RNA3000 = _hvg_by_size[3000]
HEK293T_RBP_RNAHVG = _hvg_by_size[None]

In [None]:
# Bioplex data for HEK293T: keep only the two gene-symbol columns of the
# interaction table (one row per interacting pair).
Bioplex_HEK293T=pd.read_csv("/Users/jonathan/Desktop/IIT/INTERACTomics/scRNA-seq_data/ANALYSIS_FEB_2023_RIBO/Evaluation/COMPLEXES/BioPlex_293T_Network_10K_Dec_2019.tsv",delimiter="\t")
Bioplex_HEK293T=Bioplex_HEK293T.loc[:,['SymbolA','SymbolB']].copy()

# Load RBPs (the first 4 lines of the table are skipped) and keep those with
# RBP2GO_Score >= 10.
human_RBPs=pd.read_csv("/Users/jonathan/Desktop/IIT/INTERACTomics/scRNA-seq_data/Datasets_Applications/RBPs/Table_HS_RBP.txt",delimiter='\t',skiprows=4)
human_RBPs=human_RBPs[human_RBPs.RBP2GO_Score>=10]
print(len(human_RBPs))

# All proteins appearing on either side of an interaction, and the RBPs among them.
Bioplex_HEK293T_proteins=list(set(list(Bioplex_HEK293T.SymbolA)+list(Bioplex_HEK293T.SymbolB)))
Bioplex_HEK293T_RBPs=list(set(human_RBPs.Gene_Name).intersection(set(Bioplex_HEK293T_proteins)))
len(Bioplex_HEK293T_RBPs)

In [None]:
Bioplex_HEK293T[(Bioplex_HEK293T.SymbolA.isin(Bioplex_HEK293T_RBPs)) & (Bioplex_HEK293T.SymbolB.isin(Bioplex_HEK293T_RBPs))]


In [None]:
# For each gene set: number of unique genes, and how many are BioPlex RBPs.
for _genes in (HEK293T_RBP_RNA500, HEK293T_RBP_RNA1000, HEK293T_RBP_RNA2000,
               HEK293T_RBP_RNA3000, HEK293T_RBP_RNAHVG):
    _unique = set(_genes)
    print(len(_unique), len(_unique.intersection(Bioplex_HEK293T_RBPs)))

In [None]:
def SaveData(folder, adata,adata_UMI, geneset, label1, label2):
    """Write the normalized and raw expression of ``geneset`` to two CSV files.

    Files are ``folder + label1 + 'NormalizedData_' + label2 + '.csv'`` and
    ``folder + label1 + 'RawData_' + label2 + '.csv'``, with genes as rows and
    cells as columns. ``folder`` is concatenated as-is, so it must end with a
    path separator.

    Each object is subsetted only once, and ``.X`` may be either a sparse
    matrix or a dense array.
    """
    def _genes_by_cells(source):
        # Subset to the requested genes and return a genes x cells DataFrame.
        subset = source[:, geneset]
        matrix = subset.X.T
        if hasattr(matrix, "toarray"):  # sparse matrix -> dense ndarray
            matrix = matrix.toarray()
        return pd.DataFrame(data=matrix, index=subset.var_names,
                            columns=subset.obs_names)

    # Save the normalized data in a csv file
    _genes_by_cells(adata).to_csv(folder+label1+'NormalizedData_'+label2+'.csv')

    # Save the raw data in a csv file
    _genes_by_cells(adata_UMI).to_csv(folder+label1+'RawData_'+label2+'.csv')

In [None]:
# One output sub-folder per gene set. makedirs creates missing parents and is
# a no-op for folders that already exist.
for _subdir in ('HEK293T_RBP_RNA500', 'HEK293T_RBP_RNA1000', 'HEK293T_RBP_RNA2000',
                'HEK293T_RBP_RNA3000', 'HEK293T_RBP_RNAHVG'):
    os.makedirs(out_folder+_subdir, exist_ok=True)

In [None]:
# Export the normalized and raw matrices of every gene set into its own folder.
for _label, _genes in (('RBP_RNA500', HEK293T_RBP_RNA500),
                       ('RBP_RNA1000', HEK293T_RBP_RNA1000),
                       ('RBP_RNA2000', HEK293T_RBP_RNA2000),
                       ('RBP_RNA3000', HEK293T_RBP_RNA3000),
                       ('RBP_RNAHVG', HEK293T_RBP_RNAHVG)):
    SaveData(out_folder+'HEK293T_'+_label+'/', adata_hek293t, adata_hek293t_UMI,
             _genes, 'HEK293T', _label)

In [None]:
# Folder for the plain-text gene-name lists.
gname_folder='./ANALYSIS_FEB_2023_RIBO/COINTER_RBPs/HEK293T/GENE_SELECTION/gene_names/'

# makedirs creates missing parents and tolerates an existing folder.
os.makedirs(gname_folder, exist_ok=True)

In [None]:
# Write each gene list as a one-gene-per-line text file.
for _label, _genes in (('RBP_RNA500', HEK293T_RBP_RNA500),
                       ('RBP_RNA1000', HEK293T_RBP_RNA1000),
                       ('RBP_RNA2000', HEK293T_RBP_RNA2000),
                       ('RBP_RNA3000', HEK293T_RBP_RNA3000),
                       ('RBP_RNAHVG', HEK293T_RBP_RNAHVG)):
    np.savetxt(gname_folder+'gnamesHEK293T_'+_label+'.txt', np.c_[_genes], fmt='%s')

## Smart-seq3

In [None]:
data_folder='./HEK293T/Smart-seq3/Normal/'
input_folder="./ANALYSIS_FEB_2023_RIBO/COINTER_RBPs/HEK293T_smartseq3/"

In [None]:
# Create the Smart-seq3 output root. makedirs also creates missing parent
# directories and is a no-op when the folder already exists.
os.makedirs(input_folder, exist_ok=True)

In [None]:
# Output folders for the Smart-seq3 gene-selection results and ARACNe input.
# NOTE(review): aracne_folder has no trailing separator, and later paths are
# built by plain string concatenation -- confirm the resulting file names.
out_folder="./ANALYSIS_FEB_2023_RIBO/COINTER_RBPs/HEK293T_smartseq3/GENE_SELECTION/"
aracne_folder="./ANALYSIS_FEB_2023_RIBO/COINTER_RBPs/HEK293T_smartseq3/GENE_SELECTION/ARACNe_INPUT"

# makedirs creates missing parents and tolerates existing folders.
os.makedirs(out_folder, exist_ok=True)
os.makedirs(aracne_folder, exist_ok=True)

### Load the data

In [None]:
adata_hek293t_smart=ad.read_text(data_folder+"Smartseq3.HEK.fwdprimer.UMIcounts_gnames.txt")
adata_hek293t_smart=adata_hek293t_smart.T

In [None]:
adata_hek293t_smart_metadata=pd.read_csv(data_folder+"Smartseq3.HEK.fwdprimer.sample_annotation.txt",
                                        delimiter="\t")
adata_hek293t_smart_metadata=adata_hek293t_smart_metadata.set_index("BC")

In [None]:
adata_hek293t_smart.obs_names

In [None]:
len(set(adata_hek293t_smart_metadata.index).intersection(adata_hek293t_smart.obs_names))

In [None]:
adata_hek293t_smart_metadata=adata_hek293t_smart_metadata.reindex(adata_hek293t_smart.obs_names)
(adata_hek293t_smart_metadata.index==adata_hek293t_smart.obs_names).all()

In [None]:
adata_hek293t_smart.obs=adata_hek293t_smart_metadata

In [None]:
# Filter genes expressed in less than 10 cells
sc.pp.filter_genes(adata_hek293t_smart, min_cells=10)
print(adata_hek293t_smart)
adata_hek293t_smart.obs['batch']='HEK293'
# Keep an untouched copy of the filtered counts (taken before normalization);
# it is the source of the "RawData" CSV exports.
adata_hek293t_smart_UMI=adata_hek293t_smart.copy()
# Store the UMIs
adata_hek293t_smart.raw=adata_hek293t_smart

# Normalize the data
sc.pp.normalize_total(adata_hek293t_smart,inplace=True)
# Snapshot taken after normalization but before the log transform.
adata_hek293t_smart_for_ARACNe=adata_hek293t_smart.copy()
sc.pp.log1p(adata_hek293t_smart)

In [None]:
def FilterMito(adata):
    """Drop genes whose name starts with 'MT-' and return the result as a copy.

    Prints the object before and after the filtering step.
    """
    keep = ~adata.var_names.str.startswith('MT-')
    print('before',adata)
    without_mito = adata[:, keep].copy()
    print('after',without_mito)
    return without_mito

In [None]:
# Remove mitochondrial genes (names starting with 'MT-'); FilterMito does not
# remove ribosomal genes.
adata_hek293t_smart=FilterMito(adata_hek293t_smart)

### Compute diffusion pseudotime

In [None]:
# Flag the 2000 most variable genes on the log-normalized data.
sc.pp.highly_variable_genes(adata_hek293t_smart,max_mean=10,n_top_genes=2000)
# Take an explicit copy of the HVG subset: scaling modifies X in place, and
# doing that on a view makes AnnData copy implicitly with a warning.
adata_hek293t_smart_high_var = adata_hek293t_smart[:,adata_hek293t_smart.var['highly_variable']].copy()
sc.pp.scale(adata_hek293t_smart_high_var,max_value=10)
sc.tl.pca(adata_hek293t_smart_high_var,svd_solver='arpack')
sc.pl.pca_overview(adata_hek293t_smart_high_var)

In [None]:
adata_hek293t_smart

In [None]:
# Neighbourhood graph with scanpy's default settings (an earlier variant of
# this call passed n_neighbors=10, n_pcs=10), then the UMAP embedding.
sc.pp.neighbors(adata_hek293t_smart_high_var)
sc.tl.umap(adata_hek293t_smart_high_var)


In [None]:
sc.pl.umap(adata_hek293t_smart_high_var)

In [None]:
# Create the diffusion map
sc.tl.diffmap(adata_hek293t_smart_high_var)
sc.pl.diffmap(adata_hek293t_smart_high_var)

In [None]:
# Root cell for the pseudotime: the cell with the smallest value in column 1
# of the diffusion map.
adata_hek293t_smart_high_var.uns['iroot'] = np.argmin(adata_hek293t_smart_high_var.obsm['X_diffmap'][:,1])

# Run Diffusion Pseudotime with default arguments (no n_branchings is passed,
# so no 'dpt_groups' branching column is requested here).
sc.tl.dpt(adata_hek293t_smart_high_var)

# Grab the output and store in our metadata DataFrame
adata_hek293t_smart_high_var.obs['dpt'] = adata_hek293t_smart_high_var.obs['dpt_pseudotime']
adata_hek293t_smart_high_var.obs.head()

In [None]:
adata_hek293t_smart_high_var.obs_names

In [None]:
adata_hek293t_smart.obs_names

In [None]:
# One-column table (cell barcode -> diffusion pseudotime) written to CSV.
pseudo_df = adata_hek293t_smart_high_var.obs['dpt'].to_frame()
pseudo_df.to_csv(out_folder+'HEK293T_smartseq3PseudoTime.csv')

In [None]:
# adata_hek293t_smart.var_names_make_unique()

In [None]:
from Bio import SeqIO

# Collect two '|'-separated header fields from every FASTA record:
# field 4 goes to gname and field 0 to gid (presumably gene symbol and gene
# identifier -- confirm against the FASTA header layout).
gname=[]
gid=[]
fasta_path = "/Users/jonathan/Desktop/IIT/INTERACTomics/ENCODE_eCLIP_DATA/transcriptomes/hsapiens_gene_ensembl_107_canonical_new.fa"
# Mode "rU" was removed in Python 3.11 (open() raises ValueError); plain "r"
# already uses universal newlines. The with-block also closes the handle.
with open(fasta_path, "r") as f_open:
    for rec in SeqIO.parse(f_open, "fasta"):
        fields = rec.id.split('|')
        gname.append(fields[4])
        gid.append(fields[0])

In [None]:
# Consider only genes present in the fasta file
def Genes_in_fasta(adata,gnames):
    """Return a copy of ``adata`` restricted to the genes listed in ``gnames``.

    The selection uses the ``gnames`` argument (not a module-level variable),
    and genes are kept in their original ``adata.var_names`` order so the
    result is identical from run to run. The object is printed before and
    after the selection.
    """
    print(adata)
    in_fasta = adata.var_names.isin(set(gnames))
    adata=adata[:,in_fasta].copy()
    print(adata)
    return adata

In [None]:
adata_hek293t_smart=Genes_in_fasta(adata_hek293t_smart,gname)

In [None]:
# Diagnostic: how many of the Smart-seq3 top-500 HVGs are in the filtered object.
# NOTE(review): HEK293T_SMART_RBP_RNA500 is first assigned in the following
# cell, so this raises NameError on a fresh top-to-bottom run.
len(set(HEK293T_SMART_RBP_RNA500).intersection(set(adata_hek293t_smart.var_names)))

In [None]:
# Build the HVG gene lists for several set sizes. None means "no n_top_genes",
# i.e. scanpy's threshold-based selection.
# NOTE(review): per the scanpy docs the mean/dispersion cutoffs (max_mean) are
# ignored when n_top_genes is given -- confirm this is intended.
_hvg_by_size = {}
for _n_top in (500, 1000, 2000, 3000, None):
    tmp_adata = adata_hek293t_smart.copy()  # fresh copy so the flags never leak into the source
    sc.pp.highly_variable_genes(tmp_adata, max_mean=10, n_top_genes=_n_top)
    adata_HVGs = tmp_adata[:, tmp_adata.var['highly_variable']==True].copy()
    _hvg_by_size[_n_top] = list(adata_HVGs.var_names)
    print(len(_hvg_by_size[_n_top]))

HEK293T_SMART_RBP_RNA500 = _hvg_by_size[500]
HEK293T_SMART_RBP_RNA1000 = _hvg_by_size[1000]
HEK293T_SMART_RBP_RNA2000 = _hvg_by_size[2000]
HEK293T_SMART_RBP_RNA3000 = _hvg_by_size[3000]
HEK293T_SMART_RBP_RNAHVG = _hvg_by_size[None]

In [None]:
# Bioplex data for HEK293T (same files as in the 10x section, reloaded so
# this section can run on its own): keep only the two gene-symbol columns.
Bioplex_HEK293T=pd.read_csv("/Users/jonathan/Desktop/IIT/INTERACTomics/scRNA-seq_data/ANALYSIS_FEB_2023_RIBO/Evaluation/COMPLEXES/BioPlex_293T_Network_10K_Dec_2019.tsv",delimiter="\t")
Bioplex_HEK293T=Bioplex_HEK293T.loc[:,['SymbolA','SymbolB']].copy()

# Load RBPs (the first 4 lines of the table are skipped) and keep those with
# RBP2GO_Score >= 10.
human_RBPs=pd.read_csv("/Users/jonathan/Desktop/IIT/INTERACTomics/scRNA-seq_data/Datasets_Applications/RBPs/Table_HS_RBP.txt",delimiter='\t',skiprows=4)
human_RBPs=human_RBPs[human_RBPs.RBP2GO_Score>=10]
print(len(human_RBPs))

# All proteins appearing on either side of an interaction, and the RBPs among them.
Bioplex_HEK293T_proteins=list(set(list(Bioplex_HEK293T.SymbolA)+list(Bioplex_HEK293T.SymbolB)))
Bioplex_HEK293T_RBPs=list(set(human_RBPs.Gene_Name).intersection(set(Bioplex_HEK293T_proteins)))

In [None]:
# For each gene set: number of unique genes, and how many are BioPlex RBPs.
for _genes in (HEK293T_SMART_RBP_RNA500, HEK293T_SMART_RBP_RNA1000,
               HEK293T_SMART_RBP_RNA2000, HEK293T_SMART_RBP_RNA3000,
               HEK293T_SMART_RBP_RNAHVG):
    _unique = set(_genes)
    print(len(_unique), len(_unique.intersection(Bioplex_HEK293T_RBPs)))

In [None]:
def SaveData(folder, adata,adata_UMI, geneset, label1, label2):
    """Write the normalized and raw expression of ``geneset`` to two CSV files.

    Files are ``folder + label1 + 'NormalizedData_' + label2 + '.csv'`` and
    ``folder + label1 + 'RawData_' + label2 + '.csv'``, with genes as rows and
    cells as columns. ``.X`` is passed to pandas as-is.
    """
    # (source object, file-name tag) pairs: normalized first, then raw.
    for source, tag in ((adata, 'NormalizedData_'), (adata_UMI, 'RawData_')):
        selected = source[:, geneset]
        frame = pd.DataFrame(data=selected.X.T, index=selected.var_names,
                             columns=selected.obs_names)
        frame.to_csv(folder+label1+tag+label2+'.csv')

In [None]:
# One output sub-folder per gene set. makedirs creates missing parents and is
# a no-op for folders that already exist.
for _subdir in ('HEK293T_smartseq3_RBP_RNA500', 'HEK293T_smartseq3_RBP_RNA1000',
                'HEK293T_smartseq3_RBP_RNA2000', 'HEK293T_smartseq3_RBP_RNA3000',
                'HEK293T_smartseq3_RBP_RNAHVG'):
    os.makedirs(out_folder+_subdir, exist_ok=True)

In [None]:
# Export the normalized and raw matrices of every gene set into its own folder.
for _label, _genes in (('RBP_RNA500', HEK293T_SMART_RBP_RNA500),
                       ('RBP_RNA1000', HEK293T_SMART_RBP_RNA1000),
                       ('RBP_RNA2000', HEK293T_SMART_RBP_RNA2000),
                       ('RBP_RNA3000', HEK293T_SMART_RBP_RNA3000),
                       ('RBP_RNAHVG', HEK293T_SMART_RBP_RNAHVG)):
    SaveData(out_folder+'HEK293T_smartseq3_'+_label+'/', adata_hek293t_smart,
             adata_hek293t_smart_UMI, _genes, 'HEK293T_smartseq3', _label)

In [None]:
# Folder for the plain-text gene-name lists.
# NOTE(review): this points at the HEK293T (10x) tree rather than
# HEK293T_smartseq3 -- confirm whether that is intended.
gname_folder='./ANALYSIS_FEB_2023_RIBO/COINTER_RBPs/HEK293T/GENE_SELECTION/gene_names/'

# makedirs creates missing parents and tolerates an existing folder.
os.makedirs(gname_folder, exist_ok=True)

In [None]:
# Write each gene list as a one-gene-per-line text file.
for _label, _genes in (('RBP_RNA500', HEK293T_SMART_RBP_RNA500),
                       ('RBP_RNA1000', HEK293T_SMART_RBP_RNA1000),
                       ('RBP_RNA2000', HEK293T_SMART_RBP_RNA2000),
                       ('RBP_RNA3000', HEK293T_SMART_RBP_RNA3000),
                       ('RBP_RNAHVG', HEK293T_SMART_RBP_RNAHVG)):
    np.savetxt(gname_folder+'gnamesHEK293T_smartseq3_'+_label+'.txt', np.c_[_genes], fmt='%s')

In [None]:
# Save data for ARACNe
def SaveDataforARACNe(folder, adata, geneset, label1, label2):
    """Write the ``geneset`` columns of ``adata`` to an .h5ad file.

    A new AnnData is built from the subset's ``X`` with the same cell and gene
    names, ``obs['batch']`` is set to ``label1``, and the file is written to
    ``folder + 'processed_' + label1 + '_' + label2 + '.h5ad'``. ``folder`` is
    concatenated as-is, with no path separator added.
    """
    selected = adata[:, geneset]
    adata_temp = ad.AnnData(X=selected.X)
    adata_temp.obs_names = selected.obs_names
    adata_temp.var_names = selected.var_names
    adata_temp.obs['batch'] = label1
    adata_temp.write(folder+'processed_'+label1+'_'+label2+'.h5ad')

In [None]:
# Note (not executable code): sc.pp.log1p stored Y = log(X + 1).

In [None]:
# Note (not executable code): inverting the log1p transform, exp(Y) = X + 1,
# hence X = exp(Y) - 1.

In [None]:
# Undo the log1p transform (exp(X) - 1) on the filtered Smart-seq3 data to
# get normalized, non-log values for ARACNe.
# NOTE(review): this re-binds adata_hek293t_for_ARACNe, the name the 10x
# section also assigns; np.expm1 would be the numerically exact inverse of
# log1p -- consider using it.
adata_hek293t_for_ARACNe=ad.AnnData(X=np.exp(adata_hek293t_smart.X)-1)
adata_hek293t_for_ARACNe.obs_names=adata_hek293t_smart.obs_names
adata_hek293t_for_ARACNe.var_names=adata_hek293t_smart.var_names
adata_hek293t_for_ARACNe.obs=adata_hek293t_smart.obs

In [None]:
# Write one ARACNe input file per gene set.
# NOTE(review): aracne_folder, the gene-set name and the 'processed_' file
# prefix are concatenated without any path separator, so the files end up
# next to (not inside) the ARACNe_INPUT folder -- confirm this is intended.
for _label, _genes in (('RBP_RNA500', HEK293T_SMART_RBP_RNA500),
                       ('RBP_RNA1000', HEK293T_SMART_RBP_RNA1000),
                       ('RBP_RNA2000', HEK293T_SMART_RBP_RNA2000),
                       ('RBP_RNA3000', HEK293T_SMART_RBP_RNA3000),
                       ('RBP_RNAHVG', HEK293T_SMART_RBP_RNAHVG)):
    SaveDataforARACNe(aracne_folder+'HEK293T_smartseq3_'+_label, adata_hek293t_for_ARACNe,
                      _genes, 'HEK293T_smartseq3', _label)



# HCT116

In [None]:
data_folder='./HCT116/'
input_folder="./ANALYSIS_FEB_2023_RIBO/COINTER_RBPs/HCT116/"

In [None]:
# Create the HCT116 output root. makedirs also creates missing parent
# directories and is a no-op when the folder already exists.
os.makedirs(input_folder, exist_ok=True)

In [None]:
# Output folders for the HCT116 gene-selection results and ARACNe input.
out_folder="./ANALYSIS_FEB_2023_RIBO/COINTER_RBPs/HCT116/GENE_SELECTION/"
aracne_folder="./ANALYSIS_FEB_2023_RIBO/COINTER_RBPs/HCT116/GENE_SELECTION/ARACNe_INPUT/"

# makedirs creates missing parents and tolerates existing folders.
os.makedirs(out_folder, exist_ok=True)
os.makedirs(aracne_folder, exist_ok=True)

## Load the data

In [None]:
adata_HCT116=ad.read_csv(data_folder+"GSE149224_RSH.all.counts.txt.gz",delimiter=' ')
adata_HCT116=adata_HCT116.T

In [None]:
metadata_HCT116=pd.read_csv(data_folder+"GSE149224_meta.information.csv.gz",index_col=0)

In [None]:
metadata_HCT116=metadata_HCT116[metadata_HCT116["df.gid"]=="HCT116"].copy()

In [None]:
# Keep only the HCT116 cells, in metadata order. The explicit copy avoids
# working on a view: later cells assign .obs and filter genes on this object.
adata_HCT116=adata_HCT116[metadata_HCT116.index,:].copy()

In [None]:
metadata_HCT116

In [None]:
adata_HCT116

In [None]:
(adata_HCT116.obs_names==metadata_HCT116.index).all()

In [None]:
adata_HCT116.obs=metadata_HCT116

In [None]:
adata_HCT116.obs

In [None]:
metadata_HCT116[metadata_HCT116.dose==0]

In [None]:
# Filter genes expressed in less than 1% of the cells
sc.pp.filter_genes(adata_HCT116, min_cells=int(0.01*adata_HCT116.n_obs))
print(adata_HCT116)
adata_HCT116.obs['batch']='HCT116'
# Keep an untouched copy of the filtered counts (taken before normalization);
# it is the source of the "RawData" CSV exports.
adata_HCT116_UMI=adata_HCT116.copy()
# Store the UMIs
adata_HCT116.raw=adata_HCT116

# Normalize the data, then log-transform
sc.pp.normalize_total(adata_HCT116,inplace=True)
sc.pp.log1p(adata_HCT116)

In [None]:
def FilterMito(adata):
    """Remove the genes named 'MT-*' from ``adata`` and return a new copy.

    The input and the filtered object are both printed.
    """
    mito_mask = adata.var_names.str.startswith('MT-')
    print('before',adata)
    result = adata[:, ~mito_mask].copy()
    print('after',result)
    return result

In [None]:
# Remove mitochondrial genes (names starting with 'MT-'); FilterMito does not
# remove ribosomal genes.
adata_HCT116=FilterMito(adata_HCT116)

## Compute diffusion pseudotime

In [None]:
# Flag the 2000 most variable genes on the log-normalized data.
sc.pp.highly_variable_genes(adata_HCT116,max_mean=10,n_top_genes=2000)
# Take an explicit copy of the HVG subset: scaling modifies X in place, and
# doing that on a view makes AnnData copy implicitly with a warning.
adata_HCT116_high_var = adata_HCT116[:,adata_HCT116.var['highly_variable']].copy()
sc.pp.scale(adata_HCT116_high_var,max_value=10)
sc.tl.pca(adata_HCT116_high_var,svd_solver='arpack')
sc.pl.pca_overview(adata_HCT116_high_var)

In [None]:
adata_HCT116_high_var.obs['dose']=adata_HCT116_high_var.obs['dose'].astype("category")

In [None]:
# Neighbourhood graph with scanpy's default settings (an earlier variant of
# this call passed n_neighbors=10, n_pcs=10), then UMAP coloured by dose.
sc.pp.neighbors(adata_HCT116_high_var)
sc.tl.umap(adata_HCT116_high_var)
sc.pl.umap(adata_HCT116_high_var,color=['dose'])

In [None]:
# Create the diffusion map
sc.tl.diffmap(adata_HCT116_high_var)
sc.pl.diffmap(adata_HCT116_high_var,color=['dose'])

In [None]:
# Root cell for the pseudotime: the cell with the LARGEST value in column 1
# of the diffusion map (argmax over all cells, treated and control alike).
# NOTE(review): the notebook text says the root is the control cell farthest
# from the treated ones -- confirm that this cell is a dose-0 cell.
adata_HCT116_high_var.uns['iroot'] = np.argmax(adata_HCT116_high_var.obsm['X_diffmap'][:,1])

# Run Diffusion Pseudotime with default arguments (no n_branchings is passed,
# so no 'dpt_groups' branching column is requested here).
sc.tl.dpt(adata_HCT116_high_var)

# Grab the output and store in our metadata DataFrame
adata_HCT116_high_var.obs['dpt'] = adata_HCT116_high_var.obs['dpt_pseudotime']
adata_HCT116_high_var.obs.head()

In [None]:
sc.pl.diffmap(adata_HCT116_high_var,color=['dose','dpt'])

In [None]:
# Select only cells with dose 0 (untreated controls) in all three objects:
# the HVG/pseudotime object, the log-normalized object and the raw counts.
# The pseudotime was computed above on all cells, before this subsetting.
adata_HCT116_high_var=adata_HCT116_high_var[adata_HCT116_high_var.obs.dose==0,:].copy()
adata_HCT116=adata_HCT116[adata_HCT116.obs.dose==0,:].copy()
adata_HCT116_UMI=adata_HCT116_UMI[adata_HCT116_UMI.obs.dose==0,:].copy()
# Raw counts of the control cells are written to the ARACNe input folder.
adata_HCT116_UMI.write_h5ad(aracne_folder+'processed_HCT116.h5ad')
print(adata_HCT116_high_var)
print(adata_HCT116)
print(adata_HCT116_UMI)

In [None]:
sc.pp.filter_genes(adata_HCT116,min_cells=int(0.01*adata_HCT116.n_obs))

In [None]:
# One-column table (cell barcode -> diffusion pseudotime) written to CSV.
pseudo_df = adata_HCT116_high_var.obs['dpt'].to_frame()
pseudo_df.to_csv(out_folder+'HCT116PseudoTime.csv')

## Gene Selection

In [None]:
from Bio import SeqIO

# Collect two '|'-separated header fields from every FASTA record:
# field 4 goes to gname and field 0 to gid (presumably gene symbol and gene
# identifier -- confirm against the FASTA header layout).
gname=[]
gid=[]
fasta_path = "/Users/jonathan/Desktop/IIT/INTERACTomics/ENCODE_eCLIP_DATA/transcriptomes/hsapiens_gene_ensembl_107_canonical_new.fa"
# Mode "rU" was removed in Python 3.11 (open() raises ValueError); plain "r"
# already uses universal newlines. The with-block also closes the handle.
with open(fasta_path, "r") as f_open:
    for rec in SeqIO.parse(f_open, "fasta"):
        fields = rec.id.split('|')
        gname.append(fields[4])
        gid.append(fields[0])

In [None]:
# Consider only genes present in the fasta file
def Genes_in_fasta(adata,gnames):
    """Return a copy of ``adata`` restricted to the genes listed in ``gnames``.

    Genes are kept in their original ``adata.var_names`` order. Selecting with
    ``list(set(...))`` would give an order that changes from run to run
    because of string-hash randomisation, making the exported tables
    non-reproducible. The object is printed before and after the selection.
    """
    print(adata)
    in_fasta = adata.var_names.isin(set(gnames))
    adata=adata[:,in_fasta].copy()
    print(adata)
    return adata

In [None]:
adata_HCT116=Genes_in_fasta(adata_HCT116,gname)

In [None]:
# Build the HVG gene lists for several set sizes. None means "no n_top_genes",
# i.e. scanpy's threshold-based selection.
# NOTE(review): per the scanpy docs the mean/dispersion cutoffs (max_mean) are
# ignored when n_top_genes is given -- confirm this is intended.
_hvg_by_size = {}
for _n_top in (500, 1000, 2000, 3000, None):
    tmp_adata = adata_HCT116.copy()  # fresh copy so the flags never leak into adata_HCT116
    sc.pp.highly_variable_genes(tmp_adata, max_mean=10, n_top_genes=_n_top)
    adata_HVGs = tmp_adata[:, tmp_adata.var['highly_variable']==True].copy()
    _hvg_by_size[_n_top] = list(adata_HVGs.var_names)
    print(len(_hvg_by_size[_n_top]))

HCT116_RBP_RNA500 = _hvg_by_size[500]
HCT116_RBP_RNA1000 = _hvg_by_size[1000]
HCT116_RBP_RNA2000 = _hvg_by_size[2000]
HCT116_RBP_RNA3000 = _hvg_by_size[3000]
HCT116_RBP_RNAHVG = _hvg_by_size[None]

In [None]:
# Bioplex data for HCT116: keep only the two gene-symbol columns of the
# interaction table (one row per interacting pair).
Bioplex_HCT116=pd.read_csv("/Users/jonathan/Desktop/IIT/INTERACTomics/scRNA-seq_data/ANALYSIS_FEB_2023_RIBO/Evaluation/COMPLEXES/BioPlex_HCT116_Network_5.5K_Dec_2019.tsv",delimiter="\t")
Bioplex_HCT116=Bioplex_HCT116.loc[:,['SymbolA','SymbolB']].copy()

# Load RBPs (the first 4 lines of the table are skipped) and keep those with
# RBP2GO_Score >= 10.
human_RBPs=pd.read_csv("/Users/jonathan/Desktop/IIT/INTERACTomics/scRNA-seq_data/Datasets_Applications/RBPs/Table_HS_RBP.txt",delimiter='\t',skiprows=4)
human_RBPs=human_RBPs[human_RBPs.RBP2GO_Score>=10]
print(len(human_RBPs))

# All proteins appearing on either side of an interaction, and the RBPs among them.
Bioplex_HCT116_proteins=list(set(list(Bioplex_HCT116.SymbolA)+list(Bioplex_HCT116.SymbolB)))
Bioplex_HCT116_RBPs=list(set(human_RBPs.Gene_Name).intersection(set(Bioplex_HCT116_proteins)))
len(Bioplex_HCT116_RBPs)

In [None]:
Bioplex_HCT116[(Bioplex_HCT116.SymbolA.isin(Bioplex_HCT116_RBPs))]

In [None]:
Bioplex_HCT116[(Bioplex_HCT116.SymbolA.isin(Bioplex_HCT116_RBPs)) & (Bioplex_HCT116.SymbolB.isin(Bioplex_HCT116_RBPs))]

In [None]:
# For each gene set: number of unique genes, and how many are BioPlex RBPs.
for _genes in (HCT116_RBP_RNA500, HCT116_RBP_RNA1000, HCT116_RBP_RNA2000,
               HCT116_RBP_RNA3000, HCT116_RBP_RNAHVG):
    _unique = set(_genes)
    print(len(_unique), len(_unique.intersection(Bioplex_HCT116_RBPs)))

In [None]:
def SaveData(folder, adata,adata_UMI, geneset, label1, label2):
    """Export ``geneset`` from the normalized and the raw object as CSV files.

    Output paths are ``folder + label1 + 'NormalizedData_' + label2 + '.csv'``
    and ``folder + label1 + 'RawData_' + label2 + '.csv'``; rows are genes and
    columns are cells. ``.X`` is passed to pandas as-is.
    """
    def _write(source, tag):
        # Subset once, transpose to genes x cells, and write the table.
        picked = source[:, geneset]
        table = pd.DataFrame(data=picked.X.T, index=picked.var_names,
                             columns=picked.obs_names)
        table.to_csv(folder+label1+tag+label2+'.csv')

    _write(adata, 'NormalizedData_')
    _write(adata_UMI, 'RawData_')

In [None]:
# One output sub-folder per gene set. makedirs creates missing parents and is
# a no-op for folders that already exist.
for _subdir in ('HCT116_RBP_RNA500', 'HCT116_RBP_RNA1000', 'HCT116_RBP_RNA2000',
                'HCT116_RBP_RNA3000', 'HCT116_RBP_RNAHVG'):
    os.makedirs(out_folder+_subdir, exist_ok=True)

In [None]:
# Export the normalized and raw matrices of every gene set into its own folder.
for _label, _genes in (('RBP_RNA500', HCT116_RBP_RNA500),
                       ('RBP_RNA1000', HCT116_RBP_RNA1000),
                       ('RBP_RNA2000', HCT116_RBP_RNA2000),
                       ('RBP_RNA3000', HCT116_RBP_RNA3000),
                       ('RBP_RNAHVG', HCT116_RBP_RNAHVG)):
    SaveData(out_folder+'HCT116_'+_label+'/', adata_HCT116, adata_HCT116_UMI,
             _genes, 'HCT116', _label)

In [None]:
# Folder for the plain-text gene-name lists.
gname_folder='./ANALYSIS_FEB_2023_RIBO/COINTER_RBPs/HCT116/GENE_SELECTION/gene_names/'

# makedirs creates missing parents and tolerates an existing folder.
os.makedirs(gname_folder, exist_ok=True)

In [None]:
# Write each gene list as a one-gene-per-line text file.
for _label, _genes in (('RBP_RNA500', HCT116_RBP_RNA500),
                       ('RBP_RNA1000', HCT116_RBP_RNA1000),
                       ('RBP_RNA2000', HCT116_RBP_RNA2000),
                       ('RBP_RNA3000', HCT116_RBP_RNA3000),
                       ('RBP_RNAHVG', HCT116_RBP_RNAHVG)):
    np.savetxt(gname_folder+'gnamesHCT116_'+_label+'.txt', np.c_[_genes], fmt='%s')

In [None]:
np.savetxt("./ANALYSIS_FEB_2023_RIBO/COINTER_RBPs/EvaluationScripts/Bioplex_HCT116_RBPs.txt",np.c_[Bioplex_HCT116_RBPs],fmt="%s")
np.savetxt("./ANALYSIS_FEB_2023_RIBO/COINTER_RBPs/EvaluationScripts/Bioplex_HEK293T_RBPs.txt",np.c_[Bioplex_HEK293T_RBPs],fmt="%s")

In [None]:
# Build one order-independent key per interaction: the two symbols of each
# row are sorted and concatenated.
# NOTE(review): the symbols are joined with no separator, so different pairs
# can produce the same key (e.g. 'AB'+'C' and 'A'+'BC'); filter(None, ...)
# drops empty strings but not NaN -- confirm the tables have no missing symbols.
Bioplex_HCT116['Edges']=[''.join(sorted(filter(None, x))) for x in Bioplex_HCT116.to_numpy()]
Bioplex_HEK293T['Edges']=[''.join(sorted(filter(None, x))) for x in Bioplex_HEK293T.to_numpy()]
# Each file holds the set name on the first line, then one edge key per line,
# written to the current working directory.
np.savetxt('Bioplex_HCT116.gmt',np.c_[['Bioplex_HCT116']+list(Bioplex_HCT116.Edges)],fmt="%s")
np.savetxt('Bioplex_HEK293T.gmt',np.c_[['Bioplex_HEK293T']+list(Bioplex_HEK293T.Edges)],fmt="%s")