In [None]:
import scanpy as sc
import os
import pandas as pd
import numpy as np
from anndata import AnnData, read_csv
import seaborn as sns
from matplotlib import pyplot as plt
from glob import glob

In [None]:
import warnings
# Silence every warning issued through the `warnings` module for the rest of the
# session. NOTE(review): this also hides deprecation and implicit-copy warnings
# raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Data directory: the cells below read the raw downloads and the cell-type
# mapping files from here and write the generated h5ad files back to it.
loc = './data/'

# Create Zebrafish AnnData

### Assign annotations from SAMap

Download the raw zebrafish h5ad file and map the cell-type annotations from the [eLife paper](https://elifesciences.org/articles/66747).

In [None]:
try:
    zebrafish = sc.read(os.path.join(loc,'WagnerScience2018.h5ad'))
except FileNotFoundError:
    !wget -O ./data/WagnerScience2018.h5ad https://kleintools.hms.harvard.edu/paper_websites/wagner_zebrafish_timecourse2018/WagnerScience2018.h5ad
    zebrafish = sc.read(os.path.join(loc,'WagnerScience2018.h5ad'))
zebrafish.obs['cluster'] = pd.Categorical([z[6:] if '-' in z else z for z in zebrafish.obs['ClusterName']])

In [None]:
# Read the tab-separated cluster -> cell-type table for zebrafish.
mapping_path = os.path.join(loc,'zebrafish_cell_types_mapping')
with open(mapping_path) as f:
    cell_types_mapping = f.readlines()

# The first line of the file is skipped; every other line contributes
# "first column -> second column" with surrounding whitespace removed.
ct_map = {}
for row in cell_types_mapping[1:]:
    fields = row.split("\t")
    key, value = fields[0].strip(), fields[1].strip()
    ct_map[key] = value

# Manually assigned cell types; these add to or override the entries from the file.
ct_map.update({
    'periderm': 'Periderm',
    'pluripotent': 'Pluripotent',
    'neural - floorplate posterior': 'Notoplate',
    'neural crest - mcamb': 'Neural crest',
    'neural crest - melanoblast': 'Neural crest',
    'neural crest - iridoblast': 'Neural crest',
    'neural crest - xanthophore': 'Neural crest',
    'neural crest - crestin': 'Neural crest',
})

In [None]:
# Translate every cell's cluster label into its mapped cell type. Labels that
# have no entry in `ct_map` receive the placeholder string 'NaN' and are
# reported once (as a set) so the gaps are visible.
cluster_labels = list(zebrafish.obs['cluster'])
not_found = [label for label in cluster_labels if label.strip() not in ct_map]
samap_clusters = [ct_map.get(label.strip(), 'NaN') for label in cluster_labels]
print(set(not_found))

zebrafish.obs['cell_type'] = pd.Categorical(samap_clusters)

# Keep only the cells that received a cell type, then save the annotated subset.
has_cell_type = zebrafish.obs['cell_type']!='NaN'
zebrafish = zebrafish[has_cell_type]
annotated_path = os.path.join(loc,'zebrafish_annot.h5ad')
zebrafish.write(annotated_path)

### Load zebrafish data

In [None]:
# Reload the annotated zebrafish AnnData that the previous section wrote to disk.
zebrafish = sc.read(os.path.join(loc,'zebrafish_annot.h5ad'))

In [None]:
# Inspect the per-cell annotation table.
zebrafish.obs

In [None]:
# Re-assign the cell names, gene names and the cell-type column from plain
# Python lists built from their current values.
zebrafish.obs_names = list(zebrafish.obs_names)
zebrafish.var_names = list(zebrafish.var_names)
zebrafish.obs.cell_type = list(zebrafish.obs.cell_type)

In [None]:
# Quality filtering, applied to `zebrafish` in place:
# keep cells with at least 500 detected genes ...
sc.pp.filter_cells(zebrafish, min_genes=500)
# ... then keep genes that are detected in at least 10 of the remaining cells.
sc.pp.filter_genes(zebrafish, min_cells=10)

In [None]:
# Largest value in the expression matrix. The sparse matrix's own max() is used
# so the matrix is not expanded into a dense array just for this check.
zebrafish.X.max()

In [None]:
# Save the filtered zebrafish AnnData.
zebrafish.write(os.path.join(loc, "zebrafish.h5ad"))

In [None]:
# Show the summary of the filtered zebrafish AnnData.
zebrafish

In [None]:
# Free up memory before the frog data is processed: drop the expression matrix,
# then release the notebook's reference to the AnnData object.
# After this cell `zebrafish` is None.
zebrafish.X = None
zebrafish = None

# Create Frog AnnData
Download the raw frog count table, convert it to an h5ad file, and map the cell-type annotations from the [eLife paper](https://elifesciences.org/articles/66747).

Download the data from GEO (accession GSE113074).

In [None]:
if not os.path.exists(os.path.join('.','data','GSE113074_Raw_combined.annotated_counts.tsv')):
    !wget -O ./data/GSE113074_Raw_combined.annotated_counts.tsv.gz "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE113nnn/GSE113074/suppl/GSE113074_Raw_combined.annotated_counts.tsv.gz"
    !gunzip ./data/GSE113074_Raw_combined.annotated_counts.tsv.gz

In [None]:
def _parse_frog_count_rows(rows, n_cells):
    """Parse the gene rows of the count table into a sparse cells x genes matrix.

    Parameters
    ----------
    rows : collections.deque of str
        One line per gene: the gene name, a tab, then ``n_cells`` tab-separated
        values. The deque is emptied while it is consumed so every line can be
        released as soon as it has been parsed.
    n_cells : int
        Number of values expected on every row.

    Returns
    -------
    genes : list of str
        Gene names in file order.
    matrix : scipy.sparse.csc_matrix
        float32 matrix of shape ``(n_cells, len(genes))``.

    Raises
    ------
    ValueError
        If a row does not contain exactly ``n_cells`` values.
    """
    from scipy import sparse

    n_genes = len(rows)
    genes = []
    # Every gene row is one column of the cells x genes matrix, so the CSC
    # arrays can be filled row by row without building a dense array first.
    indptr = np.zeros(n_genes + 1, dtype=np.int64)
    cell_indices = []
    cell_values = []
    i = 0
    while rows:
        line = rows.popleft().rstrip('\r\n')
        cutoff = line.find('\t')
        genes.append(line[:cutoff])
        values = np.fromstring(line[cutoff+1:], sep='\t', dtype=np.float32)
        if values.size != n_cells:
            raise ValueError(
                f'Row {i} ({genes[-1]!r}) has {values.size} values, expected {n_cells}'
            )
        nonzero = np.flatnonzero(values)
        cell_indices.append(nonzero.astype(np.int32))
        cell_values.append(values[nonzero])
        indptr[i + 1] = indptr[i] + nonzero.size
        if i%10==0:
            print(f'Line {i}', end='\r')
        i += 1

    if cell_indices:
        indices = np.concatenate(cell_indices)
        data = np.concatenate(cell_values)
    else:
        # No gene rows at all: build an empty matrix with the right shape.
        indices = np.zeros(0, dtype=np.int32)
        data = np.zeros(0, dtype=np.float32)
    matrix = sparse.csc_matrix((data, indices, indptr), shape=(n_cells, n_genes))
    return genes, matrix


def generate_frog_h5ad():
    """Build the annotated frog AnnData from the raw count table and save it.

    Reads ``GSE113074_Raw_combined.annotated_counts.tsv`` from ``loc``. In that
    file the line at index 5 holds the cell barcodes, index 7 the cluster
    labels, index 8 the parent-cluster labels, and the gene rows start at
    index 9; the first field of every line is a row label and is not used as a
    cell value. Cluster labels are translated to cell types with the
    tab-separated ``frog_cell_types_mapping`` file (its first line is skipped),
    plus the fixed entry ``'Outlier' -> 'Outlier'``.

    Returns
    -------
    AnnData
        Cells x genes object with ``obs`` columns ``clusters``,
        ``parent_clusters`` and ``cell_type``. It is also written to
        ``GSE113074_Corrected_combined.annotated_counts.h5ad`` in ``loc``.

    Raises
    ------
    ValueError
        If a gene row does not have one value per barcode.
    KeyError
        If a cluster label has no entry in the mapping; the message lists all
        such labels.
    """
    from collections import deque

    filepath_frog = os.path.join(loc,'GSE113074_Raw_combined.annotated_counts.tsv')
    with open(filepath_frog) as f:
        frog_data = f.readlines()
    print(f'Data size: {len(frog_data)}')

    # Remove the line terminator before splitting so the last column does not
    # keep a trailing newline in its barcode / label.
    barcodes = frog_data[5].rstrip('\r\n').split("\t")
    # Labels starting with 'S' lose their first four characters.
    # NOTE(review): presumably a stage prefix -- confirm against the file.
    parent_clusters = [f[4:].strip() if f.startswith('S') else f
                       for f in frog_data[8].rstrip('\r\n').split("\t")]
    clusters = [f[4:].strip() if f.startswith('S') else f
                for f in frog_data[7].rstrip('\r\n').split("\t")]

    n_cells = len(barcodes) - 1
    # A deque gives O(1) removal from the front, so lines can be released one
    # by one while parsing; blank lines are ignored.
    rows = deque(line for line in frog_data[9:] if line.strip())
    del frog_data
    print(f'Data shape: {(len(rows), n_cells)}')
    genes, data = _parse_frog_count_rows(rows, n_cells)

    # create anndata
    adata = AnnData(data)
    adata.var_names = genes
    adata.obs_names = barcodes[1:]
    adata.obs['clusters'] = clusters[1:]
    adata.obs['parent_clusters'] = parent_clusters[1:]

    # add samap cell type annotations
    with open(os.path.join(loc, 'frog_cell_types_mapping')) as f:
        cell_types_mapping = f.readlines()
    ct_map = {}
    for line in cell_types_mapping[1:]:
        if not line.strip():
            continue  # tolerate blank lines, e.g. at the end of the file
        el = line.split("\t")
        ct_map[el[0].strip()] = el[1].strip()
    ct_map['Outlier'] = 'Outlier'

    cluster_keys = [c.strip() for c in adata.obs['clusters']]
    missing = sorted(set(cluster_keys) - set(ct_map))
    if missing:
        # Report every unmapped cluster at once instead of failing on the first.
        raise KeyError(f'Clusters without a cell type mapping: {missing}')
    adata.obs['cell_type'] = [ct_map[c] for c in cluster_keys]

    adata.write(os.path.join(loc,'GSE113074_Corrected_combined.annotated_counts.h5ad'))
    return adata

In [None]:
# Parse the raw count table; this also writes the annotated h5ad file to `loc`.
frog = generate_frog_h5ad()

In [None]:
# Read back the h5ad file that generate_frog_h5ad() wrote.
frog_annot = sc.read_h5ad(os.path.join(loc,'GSE113074_Corrected_combined.annotated_counts.h5ad'))

In [None]:
# Second name for the full frog AnnData. This binds the same object (it is not a
# copy); `frog` itself is rebound to a filtered subset further down.
frog_counts = frog

In [None]:
# Show the summary of the AnnData read back from disk.
frog_annot

In [None]:
# Show the summary of the in-memory frog AnnData for comparison.
frog

In [None]:
# Keep only cells whose cell type is not 'Outlier'.
# NOTE(review): boolean indexing of an AnnData is expected to return a view of
# the parent object rather than an independent copy -- confirm if that matters here.
frog = frog[frog.obs['cell_type']!='Outlier']

In [None]:
# Re-assign the cell names, gene names and the cell-type column from plain
# Python lists built from their current values.
frog.obs_names = list(frog.obs_names)
frog.var_names = list(frog.var_names)
frog.obs.cell_type = list(frog.obs.cell_type)

In [None]:
# Same quality thresholds as for zebrafish, applied to `frog` in place:
# keep cells with at least 500 detected genes ...
sc.pp.filter_cells(frog, min_genes=500)
# ... then keep genes that are detected in at least 10 of the remaining cells.
sc.pp.filter_genes(frog, min_cells=10)

In [None]:
# Inspect the expression matrix after filtering.
frog.X

In [None]:
# Save the filtered frog AnnData.
frog.write(os.path.join(loc, "frog.h5ad"))

In [None]:
# Show the summary of the filtered frog AnnData.
frog