In [1]:
import os

import numpy as np
import pandas as pd
import scanpy as sc

In [2]:
# Directory containing the raw GSE172129 matrix / features / barcodes files.
# Set the LU_RAW_DIR environment variable to point somewhere else; when it is
# unset (or empty) the original location is used unchanged.
# os.path.join(..., "") guarantees a trailing separator, because later cells
# build file paths by plain string concatenation (path + file_name).
path = os.path.join(
    os.environ.get("LU_RAW_DIR") or "/Users/anna.schaar/phd/datasets/lu/GSE172129_RAW/",
    "",
)

In [3]:
# Directory the processed .h5ad files are written to.
# Set the LU_OUT_DIR environment variable to redirect the output; when it is
# unset (or empty) the original location is used unchanged.
# os.path.join(..., "") guarantees a trailing separator, because the write
# cells build the target path by plain string concatenation.
out_path = os.path.join(
    os.environ.get("LU_OUT_DIR") or "/Users/anna.schaar/phd/datasets/lu/",
    "",
)

# Sorted HSC

In [4]:
# Input file names for the sorted-HSC sample (GSM5242403), relative to `path`:
# count matrix, gene/feature table and cell barcode table, in that order.
# NOTE(review): only the barcode file carries a .gz suffix — confirm that the
# matrix and features files really are stored uncompressed on disk.
data_file, gene_file, barcode_file = (
    "GSM5242403_E14.5FL_HSC_matrix.mtx",
    "GSM5242403_E14.5FL_HSC_features.tsv",
    "GSM5242403_E14.5FL_HSC_barcodes.tsv.gz",
)

In [5]:
# Read the count matrix and transpose it so that the barcode table can be
# attached as `obs` and the gene table as `var` below; then replace X by its
# dense array form (the metric cell that follows works on a plain ndarray).
adata = sc.read(path + data_file).transpose()
adata.X = adata.X.toarray()

# Per-observation annotation: column 0 of the barcode file becomes the index.
barcodes = (
    pd.read_csv(path + barcode_file, header=None, sep='\t')
    .rename(columns={0: 'barcode'})
    .set_index('barcode')
)
adata.obs = barcodes

# Per-variable annotation: columns 0/1/2 of the features file are named and
# the gene short name (column 1) becomes the index.
genes = (
    pd.read_csv(path + gene_file, header=None, sep='\t')
    .rename(columns={0: 'chozen_isoform', 1: 'gene_short_name', 2: 'code'})
    .set_index('gene_short_name')
)
adata.var = genes

In [6]:
# The var index was built from gene short names; make its entries unique.
adata.var_names_make_unique()

## Quality control 

In [7]:
# Per-cell QC covariates, computed on the dense count matrix.
# n_counts: row sum of X; log_counts: natural log of n_counts;
# n_genes: number of entries > 0 in each row.
counts_per_cell = adata.X.sum(axis=1)
adata.obs['n_counts'] = counts_per_cell
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(axis=1)

# mt_frac: share of each cell's counts that falls on genes whose name starts
# with 'mt-'.
mt_gene_mask = [name.startswith('mt-') for name in adata.var_names]
mt_counts = adata.X[:, mt_gene_mask].sum(axis=1)
adata.obs['mt_frac'] = mt_counts / adata.obs['n_counts']

In [8]:
# Apply the QC thresholds one after another and report how many cells / genes
# remain after each step.
print('Total number of cells: {:d}'.format(adata.n_obs))

# Drop cells with fewer than 500 detected genes.
sc.pp.filter_cells(adata, min_genes = 500)
print('Number of cells after gene filter: {:d}'.format(adata.n_obs))

# Filter genes:
print('Total number of genes: {:d}'.format(adata.n_vars))

# Keep only genes detected in at least 3 cells.
# NOTE(review): this runs before the mitochondrial cell filter below, so a
# gene can end up present in fewer than 3 of the finally retained cells —
# confirm that this order is intended.
sc.pp.filter_genes(adata, min_cells=3)
print('Number of genes after cell filter: {:d}'.format(adata.n_vars))


# Keep cells whose mitochondrial count fraction is below 0.1.
# Boolean indexing yields a *view* of the AnnData object; .copy() turns it
# into an independent object so that the in-place normalisation that follows
# does not have to convert the view implicitly (the source of the
# `view_to_actual` warning).
adata = adata[adata.obs['mt_frac'] < 0.1].copy()
print('Number of cells after MT filter: {:d}'.format(adata.n_obs))

Total number of cells: 8613
Number of cells after gene filter: 7436
Total number of genes: 28692
Number of genes after cell filter: 15899
Number of cells after MT filter: 6486


## Normalization

In [9]:
# Normalise counts per cell with target_sum=1e5, then apply log1p.
# Neither return value is captured: both calls are used for their effect on
# `adata` itself.
sc.pp.normalize_total(adata, target_sum=1e5)
sc.pp.log1p(adata)

  view_to_actual(adata)


In [10]:
# Show the AnnData summary (dimensions and obs / var / uns keys) as cell output.
adata

AnnData object with n_obs × n_vars = 6486 × 15899
    obs: 'n_counts', 'log_counts', 'n_genes', 'mt_frac'
    var: 'chozen_isoform', 'code', 'n_cells'
    uns: 'log1p'

In [11]:
# Save the filtered, normalised sorted-HSC object as an .h5ad file in out_path.
adata.write(out_path+"sorted_HSC.h5ad")

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'code' as categorical


# Fetal liver

In [12]:
# Input file names for the fetal liver sample (GSM5242402), relative to `path`:
# count matrix, gene/feature table and cell barcode table, in that order.
# NOTE(review): only the barcode file carries a .gz suffix — confirm that the
# matrix and features files really are stored uncompressed on disk.
data_file, gene_file, barcode_file = (
    "GSM5242402_E14.5FL_matrix.mtx",
    "GSM5242402_E14.5FL_features.tsv",
    "GSM5242402_E14.5FL_barcodes.tsv.gz",
)

In [13]:
# Read the count matrix and transpose it so that the barcode table can be
# attached as `obs` and the gene table as `var` below; then replace X by its
# dense array form (the metric cell that follows works on a plain ndarray).
adata = sc.read(path + data_file).transpose()
adata.X = adata.X.toarray()

# Per-observation annotation: column 0 of the barcode file becomes the index.
barcodes = (
    pd.read_csv(path + barcode_file, header=None, sep='\t')
    .rename(columns={0: 'barcode'})
    .set_index('barcode')
)
adata.obs = barcodes

# Per-variable annotation: columns 0/1/2 of the features file are named and
# the gene short name (column 1) becomes the index.
genes = (
    pd.read_csv(path + gene_file, header=None, sep='\t')
    .rename(columns={0: 'chozen_isoform', 1: 'gene_short_name', 2: 'code'})
    .set_index('gene_short_name')
)
adata.var = genes

In [14]:
# The var index was built from gene short names; make its entries unique.
adata.var_names_make_unique()

## Quality control

In [15]:
# Per-cell QC covariates, computed on the dense count matrix.
# n_counts: row sum of X; log_counts: natural log of n_counts;
# n_genes: number of entries > 0 in each row.
counts_per_cell = adata.X.sum(axis=1)
adata.obs['n_counts'] = counts_per_cell
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(axis=1)

# mt_frac: share of each cell's counts that falls on genes whose name starts
# with 'mt-'.
mt_gene_mask = [name.startswith('mt-') for name in adata.var_names]
mt_counts = adata.X[:, mt_gene_mask].sum(axis=1)
adata.obs['mt_frac'] = mt_counts / adata.obs['n_counts']

In [16]:
# Apply the QC thresholds one after another and report how many cells / genes
# remain after each step.
print('Total number of cells: {:d}'.format(adata.n_obs))

# Drop cells with fewer than 500 detected genes.
sc.pp.filter_cells(adata, min_genes = 500)
print('Number of cells after gene filter: {:d}'.format(adata.n_obs))

# Filter genes:
print('Total number of genes: {:d}'.format(adata.n_vars))

# Keep only genes detected in at least 3 cells.
# NOTE(review): this runs before the mitochondrial cell filter below, so a
# gene can end up present in fewer than 3 of the finally retained cells —
# confirm that this order is intended.
sc.pp.filter_genes(adata, min_cells=3)
print('Number of genes after cell filter: {:d}'.format(adata.n_vars))


# Keep cells whose mitochondrial count fraction is below 0.1.
# Boolean indexing yields a *view* of the AnnData object; .copy() turns it
# into an independent object so that the in-place normalisation that follows
# does not have to convert the view implicitly (the source of the
# `view_to_actual` warning).
adata = adata[adata.obs['mt_frac'] < 0.1].copy()
print('Number of cells after MT filter: {:d}'.format(adata.n_obs))

Total number of cells: 9448
Number of cells after gene filter: 8286
Total number of genes: 28692
Number of genes after cell filter: 16187
Number of cells after MT filter: 7788


## Normalization

In [17]:
# Normalise counts per cell with target_sum=1e5, then apply log1p.
# Neither return value is captured: both calls are used for their effect on
# `adata` itself.
sc.pp.normalize_total(adata, target_sum=1e5)
sc.pp.log1p(adata)

  view_to_actual(adata)


In [18]:
# Show the AnnData summary (dimensions and obs / var / uns keys) as cell output.
adata

AnnData object with n_obs × n_vars = 7788 × 16187
    obs: 'n_counts', 'log_counts', 'n_genes', 'mt_frac'
    var: 'chozen_isoform', 'code', 'n_cells'
    uns: 'log1p'

In [19]:
# Save the filtered, normalised fetal liver object as an .h5ad file in out_path.
adata.write(out_path+"fetal_liver.h5ad")

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'code' as categorical
