In [None]:
# Name of the project; used as the last path component of PROJECT_HOME.
PROJECT_NAME = "reverse-gene-finder"

In [None]:
import os
# Root folder of the project on the mounted Google Drive; raw_data/ and data/
# are resolved relative to it.
PROJECT_HOME = os.path.join("/content/drive/My Drive/Projects", PROJECT_NAME)

In [None]:
# Google Drive storage setup.
# Mounts Drive at /content/drive (the prefix PROJECT_HOME is built on);
# force_remount=True remounts even if the mount point is already in use.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Install the third-party packages imported in the next cell.
# NOTE(review): both stdout and stderr are redirected to /dev/null, so a failed
# install is not reported here, and no package versions are pinned.
%pip install rpy2 anndata pyensembl > /dev/null 2> /dev/null

In [None]:
import scipy
import anndata
import numpy as np
import pandas as pd

from pyensembl import EnsemblRelease

import rpy2.robjects as ro
from rpy2.robjects.packages import importr

In [None]:
# Working directories:
#   raw_data_dir - downloaded .rds inputs (read only by this notebook)
#   data_dir     - converted outputs; created on first run
#   tmp_dir      - scratch space for intermediate exports
raw_data_dir, data_dir = (
    os.path.join(PROJECT_HOME, subdir) for subdir in ("raw_data", "data")
)
os.makedirs(data_dir, exist_ok=True)
tmp_dir = "/tmp"

In [None]:
# Load R's Matrix package first so that writeMM can be looked up, then bind
# Python names to the R functions used by the conversion cells.
importr('Matrix')
(
    readRDS,
    rownames,
    write_csv,
    write_table,
    writeMM,
    data_frame,
) = (
    ro.r[r_name]
    for r_name in (
        'readRDS',
        'rownames',
        'write.csv',
        'write.table',
        'writeMM',
        'data.frame',
    )
)

In [None]:
# Convert the cell metadata from RDS to CSV so it can be loaded with pandas.
# Data available at https://compbio.mit.edu/microglia_states/
metadata_rds_path = os.path.join(raw_data_dir, 'ROSMAP.ImmuneCells.6regions.snRNAseq.meta.rds')
metadata_csv_path = os.path.join(data_dir, 'metadata.csv')
metadata = readRDS(metadata_rds_path)
write_csv(metadata, file=metadata_csv_path, quote=False)

<rpy2.rinterface_lib.sexp.NULLType object at 0x7f2f22451780> [RTYPES.NILSXP]

In [None]:
# Export the count matrix from RDS into two scratch files: the row (gene)
# names as plain text, one per line, and the matrix itself via Matrix::writeMM.
# Data available at https://compbio.mit.edu/microglia_states/
counts_rds_path = os.path.join(raw_data_dir, 'ROSMAP.ImmuneCells.6regions.snRNAseq.counts.rds')
gene_names_path = os.path.join(tmp_dir, 'gene_names.csv')
counts_mtx_path = os.path.join(tmp_dir, 'counts.txt')

count_data = readRDS(counts_rds_path)
gene_frame = data_frame(gene=rownames(count_data))
# No header, no row index, no quoting: the file is just the gene names.
write_table(gene_frame, file=gene_names_path, quote=False, row_names=False, col_names=False)
writeMM(count_data, file=counts_mtx_path)

<rpy2.rinterface_lib.sexp.NULLType object at 0x7f2f22451780> [RTYPES.NILSXP]

In [None]:
# Read back, on the Python side, the three files exported from R above:
# the sparse count matrix, the cell metadata table and the gene name list.
counts_mtx_path = os.path.join(tmp_dir, "counts.txt")
metadata_csv_path = os.path.join(data_dir, "metadata.csv")
gene_names_path = os.path.join(tmp_dir, "gene_names.csv")

X = scipy.io.mmread(counts_mtx_path)
cell_meta = pd.read_csv(metadata_csv_path)
with open(gene_names_path, 'r') as f:
    # One gene name per line, in the row order of the exported matrix.
    gene_names = f.read().splitlines()

In [None]:
# Run pyensembl's installer for Ensembl release 75 (human), the release that
# EnsemblRelease(75) refers to in the next cell.
# NOTE(review): stdout and stderr are both discarded, so a failed download is
# not reported here.
!pyensembl install --release 75 --species human > /dev/null 2> /dev/null

In [None]:
# Map every gene symbol to an Ensembl gene ID using Ensembl release 75.
#
# Produces, for use by later cells:
#   gene_ids                  - one entry per element of gene_names, in the same
#                               order; '' where no ID was found
#   exclude_genes             - the symbols that could not be mapped
#   selected_gene_ids_counter - number of symbols that were mapped
#
# When a symbol matches several Ensembl genes, the first returned ID is kept.
ensembl_data = EnsemblRelease(75)
exclude_genes = []
gene_ids = []
for gene_name in gene_names:
    try:
        results = ensembl_data.gene_ids_of_gene_name(gene_name)
    except ValueError:
        # pyensembl signals an unknown gene name with ValueError. Only that
        # case is treated as "unmapped"; any other error (e.g. the annotation
        # data not being installed) propagates instead of silently turning
        # every gene into an excluded one.
        results = []
    if results:
        gene_ids.append(results[0])
    else:
        gene_ids.append('')
        exclude_genes.append(gene_name)

selected_gene_ids_counter = len(gene_ids) - len(exclude_genes)

In [None]:
# Report how many gene names were successfully mapped to an Ensembl gene ID.
print("# of gene ids: %d" % selected_gene_ids_counter)

# of gene ids: 15549


In [None]:
# The exported matrix has genes as rows (its row names were written out as the
# gene names), so transpose it to put genes on the AnnData variable axis, and
# convert the result to CSR.
adata = anndata.AnnData(X=X.transpose().tocsr())

In [None]:
# Per-gene annotations, assigned positionally: gene_names and gene_ids are
# parallel lists in matrix row order; ensembl_id is '' for unmapped genes.
adata.var["gene_name"] = gene_names
adata.var["ensembl_id"] = gene_ids

In [None]:
# Per-cell annotations, assigned positionally.
# NOTE(review): this assumes the rows of metadata.csv are in the same order as
# the columns of the exported count matrix - confirm against the source data.
adata.obs["disease"] = pd.Categorical(cell_meta["ADdiag3types"])
# Summing a scipy sparse matrix along an axis yields a 2-D (n_obs, 1) result;
# flatten it so the column holds one scalar total per cell.
adata.obs["n_counts"] = np.asarray(adata.X.sum(axis=1)).ravel()
# Sequential integer identifier for each cell (0 .. n_obs - 1).
adata.obs["joinid"] = list(range(adata.n_obs))

In [None]:
# Write the gene name -> Ensembl ID table as a two-column CSV in data_dir.
# data_dir already includes PROJECT_HOME, so it is not prepended again.
# Unmapped genes are written with an empty gene_id field.
with open(os.path.join(data_dir, "gene_info.csv"), "w") as gene_f:
    gene_f.write("gene_name,gene_id\n")
    for gene_name, gene_id in zip(gene_names, gene_ids):
        gene_f.write("%s,%s\n" % (gene_name, gene_id))

In [None]:
# Save the assembled AnnData object under data_dir/h5ad/.
# Only data_dir itself is created earlier in the notebook, so make sure the
# h5ad subdirectory exists before writing into it.
h5ad_dir = os.path.join(data_dir, 'h5ad')
os.makedirs(h5ad_dir, exist_ok=True)
adata.write(os.path.join(h5ad_dir, 'adata.h5ad'))