In [1]:
# general imports
import warnings
import numpy as np
import os
import pandas as pd
import scipy as sp
from scipy.sparse import coo_matrix
import collections
import scanpy as sc


# Images, plots, display, and visualization
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.manifold import TSNE
import sklearn as sk

# matplotlib settings for Jupyter notebooks only
%matplotlib inline

import pickle
import gzip
from pathlib import Path

aug_data_path = f"{os.getcwd()}/../data/single_cell_data/augmented_pbmc_data/"



# Get the common variable genes

In [2]:
""" # 68k process

# read in the data

data_path = f"{os.getcwd()}/../data/single_cell_data/pbmc68k/hg19/"
scpred_path = f"{os.getcwd()}/../results/single_cell_data/pbmc_cell_labels/"


adata = sc.read_10x_mtx(
    data_path,                               # the directory with the `.mtx` file
    var_names='gene_symbols',                # use gene symbols for the variable names (variables-axis index)
    cache=True)                              # write a cache file for faster subsequent reading

adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`


# add metadata
meta_data = pd.read_csv(f"{scpred_path}/pbmc68k_scpred.tsv", sep="\t", index_col='code')
barcodes = pd.read_csv(f"{data_path}/barcodes.tsv", header=None, names=['code'])
meta_df = barcodes.join(other=meta_data, on=['code'], how='left', sort=False)


adata.obs['CellType'] = meta_df['cell_type'].tolist()
adata.obs['scpred_CellType'] = meta_df['scpred_prediction'].tolist()

# filter out cells with less than 200 genes and genes expressed in less than 3 cells
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

# remove genes with high mitochondrial content
adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# slice the data based on the plots from above
# remove cells with more than 3500 genes
# remove cells with more than 10% MTgenes
adata = adata[adata.obs.n_genes_by_counts < 2000, :]
adata = adata[adata.obs.pct_counts_mt < 10, :]


# normalize to 10K counts per cell
sc.pp.normalize_total(adata, target_sum=1e4)

# log data
sc.pp.log1p(adata)

# get high variance genes
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
pbmc_68k_gene_ids = adata.var['gene_ids']

# and filter
adata = adata[:, adata.var.highly_variable]

pbmc_68k_gene_ids_variable = adata.var['gene_ids']
 """

' # 68k process\n\n# read in the data\n\ndata_path = f"{os.getcwd()}/../data/single_cell_data/pbmc68k/hg19/"\nscpred_path = f"{os.getcwd()}/../results/single_cell_data/pbmc_cell_labels/"\n\n\nadata = sc.read_10x_mtx(\n    data_path,                               # the directory with the `.mtx` file\n    var_names=\'gene_symbols\',                # use gene symbols for the variable names (variables-axis index)\n    cache=True)                              # write a cache file for faster subsequent reading\n\nadata.var_names_make_unique()  # this is unnecessary if using `var_names=\'gene_ids\'` in `sc.read_10x_mtx`\n\n\n# add metadata\nmeta_data = pd.read_csv(f"{scpred_path}/pbmc68k_scpred.tsv", sep="\t", index_col=\'code\')\nbarcodes = pd.read_csv(f"{data_path}/barcodes.tsv", header=None, names=[\'code\'])\nmeta_df = barcodes.join(other=meta_data, on=[\'code\'], how=\'left\', sort=False)\n\n\nadata.obs[\'CellType\'] = meta_df[\'cell_type\'].tolist()\nadata.obs[\'scpred_CellType\'] = met

In [3]:
# 6k process

# read in the data

data_path = f"{os.getcwd()}/../data/single_cell_data/pbmc6k/hg19/"
scpred_path = f"{os.getcwd()}/../results/single_cell_data/pbmc_cell_labels/"


adata = sc.read_10x_mtx(
    data_path,                               # the directory with the `.mtx` file
    var_names='gene_symbols',                # use gene symbols for the variable names (variables-axis index)
    cache=True)                              # write a cache file for faster subsequent reading

adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`


# add metadata
meta_data = pd.read_csv(f"{scpred_path}/pbmc6k_scpred.tsv", sep="\t", index_col='code')
barcodes = pd.read_csv(f"{data_path}/barcodes.tsv", header=None, names=['code'])
meta_df = barcodes.join(other=meta_data, on=['code'], how='left', sort=False)


adata.obs['CellType'] = meta_df['cell_type'].tolist()
adata.obs['scpred_CellType'] = meta_df['scpred_prediction'].tolist()

# filter out cells with less than 200 genes and genes expressed in less than 3 cells
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

# remove genes with high mitochondrial content
adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# slice the data based on the plots from above
# remove cells with more than 3500 genes
# remove cells with more than 10% MTgenes
adata = adata[adata.obs.n_genes_by_counts < 2000, :]
adata = adata[adata.obs.pct_counts_mt < 10, :]


# normalize to 10K counts per cell
sc.pp.normalize_total(adata, target_sum=1e4)

# log data
sc.pp.log1p(adata)

# get high variance genes
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
pbmc_6k_gene_ids = adata.var['gene_ids']

# and filter
adata = adata[:, adata.var.highly_variable]

pbmc_6k_gene_ids_variable = adata.var['gene_ids']


  view_to_actual(adata)


In [4]:
# 3k process

# read in the data

data_path = f"{os.getcwd()}/../data/single_cell_data/pbmc3k/hg19/"
scpred_path = f"{os.getcwd()}/../results/single_cell_data/pbmc_cell_labels/"


adata = sc.read_10x_mtx(
    data_path,                               # the directory with the `.mtx` file
    var_names='gene_symbols',                # use gene symbols for the variable names (variables-axis index)
    cache=True)                              # write a cache file for faster subsequent reading

adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`


# add metadata
meta_data = pd.read_csv(f"{scpred_path}/pbmc3k_scpred.tsv", sep="\t", index_col='code')
barcodes = pd.read_csv(f"{data_path}/barcodes.tsv", header=None, names=['code'])
meta_df = barcodes.join(other=meta_data, on=['code'], how='left', sort=False)


adata.obs['scpred_CellType'] = meta_df['scpred_prediction'].tolist()

# filter out cells with less than 200 genes and genes expressed in less than 3 cells
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

# remove genes with high mitochondrial content
adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# slice the data based on the plots from above
# remove cells with more than 3500 genes
# remove cells with more than 10% MTgenes
adata = adata[adata.obs.n_genes_by_counts < 2000, :]
adata = adata[adata.obs.pct_counts_mt < 7, :]


# normalize to 10K counts per cell
sc.pp.normalize_total(adata, target_sum=1e4)

# log data
sc.pp.log1p(adata)

# get high variance genes
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
pbmc_3k_gene_ids = adata.var['gene_ids']

# and filter
adata = adata[:, adata.var.highly_variable]

pbmc_3k_gene_ids_variable = adata.var['gene_ids']


In [5]:
# pbmc1_sm2

# read in the data

data_path = f"{os.getcwd()}/../data/single_cell_data/GSE132044/sm2_hg19/"
scpred_path = f"{os.getcwd()}/../results/single_cell_data/pbmc_cell_labels/"


adata = sc.read_10x_mtx(
    data_path,                               # the directory with the `.mtx` file
    var_names='gene_symbols',                # use gene symbols for the variable names (variables-axis index)
    cache=True)                              # write a cache file for faster subsequent reading

adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`


# add metadata
meta_data = pd.read_csv(f"{scpred_path}/pbmc_rep1_sm2.tsv", sep="\t", index_col='code')
barcodes = pd.read_csv(f"{data_path}/barcodes.tsv", header=None, names=['code'])
meta_df = barcodes.join(other=meta_data, on=['code'], how='left', sort=False)


adata.obs['CellType'] = meta_df['CellType'].tolist()
adata.obs['scpred_CellType'] = meta_df['scpred_prediction'].tolist()
adata.obs['Experiment'] = meta_df['Experiment'].tolist()
adata.obs['Method'] = meta_df['Method'].tolist()

# filter it for only our method and experiment
adata = adata[adata.obs["Experiment"] == 'pbmc1', :]
adata = adata[adata.obs["Method"] == 'Smart-seq2', :]


# filter out cells with less than 200 genes and genes expressed in less than 3 cells
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

# remove genes with high mitochondrial content
adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# slice the data based on the plots from above
# remove cells with more than 3500 genes
# remove cells with more than 10% MTgenes
adata = adata[adata.obs.n_genes_by_counts < 4000, :]
adata = adata[adata.obs.pct_counts_mt < 10, :]


# normalize to 10K counts per cell
sc.pp.normalize_total(adata, target_sum=1e4)

# log data
sc.pp.log1p(adata)

# get high variance genes
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
pbmc1_sm2_gene_ids = adata.var['gene_ids']

# and filter
adata = adata[:, adata.var.highly_variable]

pbmc1_sm2_gene_ids_variable = adata.var['gene_ids']


Trying to set attribute `.obs` of view, copying.


In [6]:
# pbmc1_10x

# read in the data

data_path = f"{os.getcwd()}/../data/single_cell_data/GSE132044/10x_hg19/"
scpred_path = f"{os.getcwd()}/../results/single_cell_data/pbmc_cell_labels/"


adata = sc.read_10x_mtx(
    data_path,                               # the directory with the `.mtx` file
    var_names='gene_symbols',                # use gene symbols for the variable names (variables-axis index)
    cache=True)                              # write a cache file for faster subsequent reading

adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`


# add metadata
meta_data = pd.read_csv(f"{scpred_path}/pbmc_rep1_10xV2a.tsv", sep="\t", index_col='code')
barcodes = pd.read_csv(f"{data_path}/barcodes.tsv", header=None, names=['code'])
meta_df = barcodes.join(other=meta_data, on=['code'], how='left', sort=False)


adata.obs['CellType'] = meta_df['CellType'].tolist()
adata.obs['scpred_CellType'] = meta_df['scpred_prediction'].tolist()
adata.obs['Experiment'] = meta_df['Experiment'].tolist()
adata.obs['Method'] = meta_df['Method'].tolist()

# filter it for only our method and experiment
adata = adata[adata.obs["Experiment"] == 'pbmc1', :]
adata = adata[adata.obs["Method"] == '10x Chromium (v2) A', :]


# filter out cells with less than 200 genes and genes expressed in less than 3 cells
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

# remove genes with high mitochondrial content
adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# slice the data based on the plots from above
# remove cells with more than 3500 genes
# remove cells with more than 10% MTgenes
adata = adata[adata.obs.n_genes_by_counts < 3500, :]
adata = adata[adata.obs.pct_counts_mt < 10, :]


# normalize to 10K counts per cell
sc.pp.normalize_total(adata, target_sum=1e4)

# log data
sc.pp.log1p(adata)

# get high variance genes
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
pbmc1_10x_gene_ids = adata.var['gene_ids']

# and filter
adata = adata[:, adata.var.highly_variable]

pbmc1_10x_gene_ids_variable = adata.var['gene_ids']


Trying to set attribute `.obs` of view, copying.


In [7]:
# pbmc2_10x

# read in the data

data_path = f"{os.getcwd()}/../data/single_cell_data/GSE132044/10x_hg19/"
scpred_path = f"{os.getcwd()}/../results/single_cell_data/pbmc_cell_labels/"


adata = sc.read_10x_mtx(
    data_path,                               # the directory with the `.mtx` file
    var_names='gene_symbols',                # use gene symbols for the variable names (variables-axis index)
    cache=True)                              # write a cache file for faster subsequent reading

adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`


# add metadata
meta_data = pd.read_csv(f"{scpred_path}/pbmc_rep2_10xV2.tsv", sep="\t", index_col='code')
barcodes = pd.read_csv(f"{data_path}/barcodes.tsv", header=None, names=['code'])
meta_df = barcodes.join(other=meta_data, on=['code'], how='left', sort=False)


adata.obs['CellType'] = meta_df['CellType'].tolist()
adata.obs['scpred_CellType'] = meta_df['scpred_prediction'].tolist()
adata.obs['Experiment'] = meta_df['Experiment'].tolist()
adata.obs['Method'] = meta_df['Method'].tolist()

# filter it for only our method and experiment
adata = adata[adata.obs["Experiment"] == 'pbmc2', :]
adata = adata[adata.obs["Method"] == '10x Chromium (v2)', :]


# filter out cells with less than 200 genes and genes expressed in less than 3 cells
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

# remove genes with high mitochondrial content
adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# slice the data based on the plots from above
# remove cells with more than 3500 genes
# remove cells with more than 10% MTgenes
adata = adata[adata.obs.n_genes_by_counts < 3500, :]
adata = adata[adata.obs.pct_counts_mt < 10, :]


# normalize to 10K counts per cell
sc.pp.normalize_total(adata, target_sum=1e4)

# log data
sc.pp.log1p(adata)

# get high variance genes
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
pbmc2_10x_gene_ids = adata.var['gene_ids']

# and filter
adata = adata[:, adata.var.highly_variable]

pbmc2_10x_gene_ids_variable = adata.var['gene_ids']


Trying to set attribute `.obs` of view, copying.


In [8]:
genes_all = [pbmc1_sm2_gene_ids,
             pbmc1_10x_gene_ids, pbmc2_10x_gene_ids,
             pbmc_6k_gene_ids, pbmc_3k_gene_ids]

genes_all = list(set.intersection(*[set(x) for x in genes_all]))
len(genes_all)

11667

In [10]:
genes_all_variable = [pbmc1_sm2_gene_ids_variable,
                pbmc1_10x_gene_ids_variable, pbmc2_10x_gene_ids_variable,
                pbmc_6k_gene_ids_variable, pbmc_3k_gene_ids_variable]

genes_all_variable = list(set.intersection(*[set(x) for x in genes_all_variable]))
len(genes_all_variable)



119

In [11]:
# write out the gene ids
gene_out_file = os.path.join(aug_data_path, "intersection_genes.pkl")
gene_out_path = Path(gene_out_file)
pickle.dump( genes_all, open( gene_out_path, "wb" ) )
