In [2]:
import os
import re
import urllib
import urllib.parse
import urllib.request
from pathlib import Path
from zipfile import ZipFile

import anndata
import numpy as np
import scipy
import scipy.sparse

Download sc-variability-paper scripts written by Norton (2020)

In [17]:
# Module files that make up the upstream ``rp2`` package.
rp2_scripts = (
    "__init__.py",
    "analysis.py",
    "data.py",
    "environment.py",
    "hagai_2018.py",
    "notebooks.py",
    "paths.py",
    "processing.py",
    "regression.py",
    "txburst.py",
    "ui.py",
)

def make_rp2_path(filename):
    """Return the relative path ``rp2/<filename>``, creating ``rp2`` if needed.

    Args:
        filename: Name of a file that will live inside the ``rp2`` directory.

    Returns:
        ``os.path.join("rp2", filename)``, relative to the current working
        directory.
    """
    # The directory is created relative to the working directory, exactly
    # where the returned path points. Building it with a literal backslash
    # only worked on Windows; elsewhere it created a stray "<cwd>\rp2" entry
    # and left the real "rp2" directory missing.
    os.makedirs("rp2", exist_ok=True)

    return os.path.join("rp2", filename)

# Mirror every rp2 module from the upstream repository into the local rp2 directory.
for filename in rp2_scripts:
    url = f"https://raw.githubusercontent.com/dwnorton/sc-variability-paper/master/rp2/{filename}"
    path = make_rp2_path(filename)
    urllib.request.urlretrieve(url, filename=path)

print("sc-variability scripts downloaded")

sc-variability scripts downloaded


In [11]:
# Fetch the upstream burst-model-fitting notebook into the working directory.
# NOTE(review): the upstream file is 002_..., but it is saved locally as 003_...;
# confirm that this renumbering is intentional.
nb_url = "https://raw.githubusercontent.com/dwnorton/sc-variability-paper/master/002_Burst_Model_Fitting.ipynb"
burst_model_fitting_nb = "003_Burst_Model_Fitting.ipynb"

cwd = os.getcwd()
nb_path = os.path.join(cwd, burst_model_fitting_nb)

urllib.request.urlretrieve(nb_url, filename=nb_path)
print("Burst_Model_Fitting notebook downloaded.")

Burst_Model_Fitting notebook downloaded.


In [8]:
from rp2 import fetch_file, notebooks
from rp2.paths import get_data_path, get_scripts_path, get_model_path

# Initialise the rp2 notebook helper under the name "Obtain_Data".
# NOTE(review): what initialise_environment sets up is defined in rp2/notebooks.py — confirm there.
nb_env = notebooks.initialise_environment("Obtain_Data")

In [14]:
# Download the pre-fitted "ppfit" mouse model and give it its decoded file name.
ppfit_model_path = get_model_path("ppfit")
fetch_file(
    "https://raw.githubusercontent.com/dwnorton/sc-variability-paper/master/Models/ppfit/species%3Dmouse-counts%3Dmedian-index%3Dreplicate%2Btreatment%2Btime_point.csv",
    ppfit_model_path,
);

# The download is expected to land under its percent-encoded URL name, so it is
# renamed to the decoded form ("=" and "+").
# os.path.join replaces the hard-coded "\\" separator, which only worked on Windows.
# os.replace overwrites an existing target, so re-running the cell does not fail
# on Windows the way os.rename did.
# NOTE(review): fetch_file also accepts rename_to= (used for the BioMart downloads
# in this notebook); that may remove the need for this rename — confirm its semantics.
os.replace(
    os.path.join(ppfit_model_path, "species%3Dmouse-counts%3Dmedian-index%3Dreplicate%2Btreatment%2Btime_point.csv"),
    os.path.join(ppfit_model_path, "species=mouse-counts=median-index=replicate+treatment+time_point.csv"),
)

In [15]:
# Download the pre-fitted txburst models: one CSV per species/count-type combination.
txburst_model_path = get_model_path("txburst")

txburst_models_url = "https://raw.githubusercontent.com/dwnorton/sc-variability-paper/master/Models/txburst"

# File names exactly as they appear (percent-encoded) in the upstream URLs.
txburst_model_files = (
    "species%3Dmouse-counts%3Dmedian-index%3Dreplicate%2Btreatment%2Btime_point.csv",
    "species%3Dmouse-counts%3Dumi-index%3Dreplicate%2Btreatment%2Btime_point.csv",
    "species%3Dpig-counts%3Dmedian-index%3Dreplicate%2Btreatment%2Btime_point.csv",
    "species%3Drabbit-counts%3Dmedian-index%3Dreplicate%2Btreatment%2Btime_point.csv",
    "species%3Drat-counts%3Dmedian-index%3Dreplicate%2Btreatment%2Btime_point.csv",
)

# One loop replaces five copy-pasted calls; order and URLs are unchanged.
for txburst_model_file in txburst_model_files:
    fetch_file(f"{txburst_models_url}/{txburst_model_file}", txburst_model_path)

In [16]:
# Rename the downloaded txburst model files from their percent-encoded URL
# names to the decoded form ("%3D" -> "=", "%2B" -> "+").
# (species, counts) pairs identify the five files fetched in the previous cell.
for model_species, model_counts in (
    ("mouse", "median"),
    ("mouse", "umi"),
    ("pig", "median"),
    ("rabbit", "median"),
    ("rat", "median"),
):
    encoded_name = f"species%3D{model_species}-counts%3D{model_counts}-index%3Dreplicate%2Btreatment%2Btime_point.csv"
    decoded_name = f"species={model_species}-counts={model_counts}-index=replicate+treatment+time_point.csv"

    # os.path.join replaces the hard-coded "\\" separator, which only worked on Windows.
    # os.replace overwrites an existing target, so re-running the cell does not
    # fail on Windows the way os.rename did.
    os.replace(
        os.path.join(txburst_model_path, encoded_name),
        os.path.join(txburst_model_path, decoded_name),
    )

In [17]:
# Fetch the list of analysis genes into the hagai_2018 data directory.
analysis_genes_url = "https://raw.githubusercontent.com/dwnorton/sc-variability-paper/master/Data/RP2/analysis_genes.csv"
hagai_path = get_data_path("hagai_2018")
fetch_file(analysis_genes_url, hagai_path);

Download supplementary data for Hagai *et al.* (2018)

In [5]:
# Supplementary spreadsheet (MOESM4) published with the paper, stored next to the other hagai_2018 data.
hagai_supplementary_url = "https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-018-0657-2/MediaObjects/41586_2018_657_MOESM4_ESM.xlsx"
fetch_file(hagai_supplementary_url, hagai_path);

Download Hagai *et al.* (2018) datasets from ArrayExpress

In [6]:
# Processed-data archive for accession E-MTAB-6754, saved under the ArrayExpress data directory.
arrayexpress_zip_url = "https://www.ebi.ac.uk/arrayexpress/files/E-MTAB-6754/E-MTAB-6754.processed.2.zip"
ae_path = get_data_path("ArrayExpress")
fetch_file(arrayexpress_zip_url, ae_path);

Extract zipped ArrayExpress datasets

In [7]:
# Unpack each downloaded archive into a sibling directory named after the
# archive (without ".zip"); archives whose directory already exists are skipped.
for zip_path in ae_path.glob("*.zip"):
    extract_path = ae_path.joinpath(zip_path.stem)

    if not extract_path.exists():
        print("Extracting:", zip_path)

        with ZipFile(zip_path, "r") as zip_file:
            zip_file.extractall(extract_path)

Extracting: C:\Users\wolke\Documents\Code\Data\ArrayExpress\E-MTAB-6754.processed.2.zip


Download txburst scripts for Larsson *et al.* (2019) burst modelling

In [8]:
# The three txburst scripts to mirror from the sandberg-lab repository.
txburst_filenames = ("txburstML.py", "txburstPL.py", "txburstTEST.py")

txburst_path = get_scripts_path("txburst")

for filename in txburst_filenames:
    url = f"https://raw.githubusercontent.com/sandberg-lab/txburst/master/{filename}"
    fetch_file(url, txburst_path)

Collate Hagai *et al.* (2018) UMI counts into a single AnnData object per species

In [6]:
def extract_species_and_replicate(token):
    """Split a token such as ``"rat1"`` into ``("rat", "1")``.

    The final character is taken as the replicate; everything before it is
    the species.
    """
    species, replicate = token[:-1], token[-1]
    return species, replicate


def extract_treatment_and_time_point(token):
    """Split a token such as ``"lps2"`` into ``("lps", "2")``.

    The leading run of lowercase letters is the treatment; whatever follows
    is the time point. A token with nothing after the treatment (for example
    ``"unst"``) gets the time point ``"0"``.
    """
    parsed = re.match(r"^([a-z]+)(\d*[^\d]*)$", token)
    treatment = parsed.group(1)
    time_point = parsed.group(2)
    if not time_point:
        time_point = "0"
    return treatment, time_point


def load_umi_count_adata(file_path):
    """Load one UMI count file into an AnnData annotated with its metadata.

    The first two ``_``-separated tokens of the file name are parsed as
    ``<species><replicate>`` and ``<treatment><time_point>``
    (for example ``rat1_lps2_...``).

    Args:
        file_path: Path object (its ``.name`` is used) of a space-delimited
            count table readable by ``anndata.read_csv``.

    Returns:
        The table transposed after reading, with ``X`` stored as an integer
        CSR matrix and ``obs`` columns ``species``, ``replicate``,
        ``treatment``, ``time_point`` and ``barcode`` (a copy of the original
        obs index).
    """
    species_and_replicate, treatment_and_time_point = file_path.name.split("_")[:2]
    species, replicate = extract_species_and_replicate(species_and_replicate)
    treatment, time_point = extract_treatment_and_time_point(treatment_and_time_point)

    umi_ad = anndata.read_csv(file_path, delimiter=" ").T
    # `np.int` was only an alias for the builtin `int` and was removed in
    # NumPy 1.24; the builtin selects the same dtype.
    umi_ad.X = scipy.sparse.csr_matrix(umi_ad.X, dtype=int)

    umi_ad.obs["species"] = species
    umi_ad.obs["replicate"] = replicate
    umi_ad.obs["treatment"] = treatment
    umi_ad.obs["time_point"] = time_point
    # Keep the original index as a column; callers may overwrite the index.
    umi_ad.obs["barcode"] = umi_ad.obs.index

    return umi_ad


def collate_umi_counts(csv_file_paths):
    """Load several UMI count files and concatenate them into one AnnData.

    Each file is loaded with ``load_umi_count_adata``. Observations are
    re-indexed with consecutive integers (as strings) running across all
    files in iteration order, so the index is unique in the result.

    Args:
        csv_file_paths: Iterable of paths to count files (a generator is
            fine; it is consumed once).

    Returns:
        The outer-joined concatenation of all loaded files, without the
        ``batch`` column that concatenation adds to ``obs``.

    Raises:
        ValueError: If ``csv_file_paths`` yields no files.
    """
    total_obs = 0
    all_adata = []

    for file_path in csv_file_paths:
        print("Loading", file_path.name)

        csv_adata = load_umi_count_adata(file_path)
        csv_adata.obs.index = [str(i) for i in range(total_obs, total_obs + csv_adata.n_obs)]

        all_adata.append(csv_adata)

        total_obs += csv_adata.n_obs

    if not all_adata:
        # Typically a glob that matched nothing; raise a clear error instead
        # of an opaque IndexError on all_adata[0].
        raise ValueError("collate_umi_counts: no input files were provided")

    print("Collating")

    adata = all_adata[0].concatenate(all_adata[1:], join="outer", index_unique=None)
    adata.obs.drop(columns=["batch"], inplace=True)
    return adata

In [7]:
# Common species name -> Ensembl species id.
species_id_map = {
    "mouse": "mmusculus",
    "pig": "sscrofa",
    "rabbit": "ocuniculus",
    "rat": "rnorvegicus",
}

species_of_interest = species_id_map.keys()

# Directory the ArrayExpress archive was extracted into.
umi_files_path = ae_path.joinpath("E-MTAB-6754.processed.2")

# Build one collated .h5ad per species next to the extracted directory;
# species whose file already exists are skipped.
for species in species_of_interest:
    umi_file_path = umi_files_path.parent.joinpath(umi_files_path.name + f".{species}.h5ad")

    if not umi_file_path.exists():
        # Sorted so the row order and obs index numbering of the collated
        # file do not depend on the filesystem's enumeration order.
        csv_glob = sorted(umi_files_path.glob(f"{species}*.txt.gz"))
        adata = collate_umi_counts(csv_glob)
        adata.write_h5ad(umi_file_path)

Loading rat1_lps2_filtered_by_cell_cluster0.txt.gz
Loading rat1_lps4_filtered_by_cell_cluster0.txt.gz
Loading rat1_lps6_filtered_by_cell_cluster0.txt.gz
Loading rat1_pic2_filtered_by_cell_cluster0.txt.gz
Loading rat1_pic4_filtered_by_cell_cluster0.txt.gz
Loading rat1_pic6_filtered_by_cell_cluster0.txt.gz
Loading rat1_unst_filtered_by_cell_cluster0.txt.gz
Loading rat2_lps2_filtered_by_cell_cluster0.txt.gz
Loading rat2_lps4_filtered_by_cell_cluster0.txt.gz
Loading rat2_lps6_filtered_by_cell_cluster0.txt.gz
Loading rat2_pic2_filtered_by_cell_cluster0.txt.gz
Loading rat2_pic4_filtered_by_cell_cluster0.txt.gz
Loading rat2_pic6_filtered_by_cell_cluster0.txt.gz
Loading rat2_unst_filtered_by_cell_cluster0.txt.gz
Loading rat3_lps2_filtered_by_cell_cluster0.txt.gz
Loading rat3_lps4_filtered_by_cell_cluster0.txt.gz
Loading rat3_lps6_filtered_by_cell_cluster0.txt.gz
Loading rat3_pic2_filtered_by_cell_cluster0.txt.gz
Loading rat3_pic4_filtered_by_cell_cluster0.txt.gz
Loading rat3_pic6_filtered_by_c

... storing 'species' as categorical
... storing 'replicate' as categorical
... storing 'treatment' as categorical
... storing 'time_point' as categorical
... storing 'barcode' as categorical


Download gene lists (Ensembl ID, symbol and description) and orthologue tables for each species from BioMart

In [19]:
def make_biomart_genes_query_xml(species_id):
    """Build the BioMart XML query listing the genes of one species.

    ``species_id`` is inserted into the dataset name as
    ``<species_id>_gene_ensembl``. The query asks for header-less TSV with
    unique rows and the attributes ``ensembl_gene_id``,
    ``external_gene_name`` and ``description``.
    """
    xml_parts = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<!DOCTYPE Query>',
        '<Query virtualSchemaName="default" formatter="TSV" header="0" uniqueRows="1" count="" datasetConfigVersion="0.6">',
        f'<Dataset name="{species_id}_gene_ensembl" interface="default">',
        '<Attribute name="ensembl_gene_id" />',
        '<Attribute name="external_gene_name" />',
        '<Attribute name="description" />',
        '</Dataset>',
        '</Query>',
    ]
    return "".join(xml_parts)


def make_biomart_query_url(query_xml):
    """Return the martservice URL of the March 2016 Ensembl archive for ``query_xml``.

    The XML is percent-encoded with ``urllib.parse.quote`` and appended as
    the ``query`` parameter.
    """
    encoded_query = urllib.parse.quote(query_xml)
    return "http://mar2016.archive.ensembl.org/biomart/martservice?query=" + encoded_query


def make_biomart_genes_query_url(species_id):
    """Return the BioMart URL that downloads the gene list of ``species_id``.

    The XML comes from ``make_biomart_genes_query_xml`` and is turned into a
    URL by ``make_biomart_query_url``, so the endpoint and the quoting are
    defined in one place only.
    """
    biomart_query = make_biomart_genes_query_xml(species_id)
    return make_biomart_query_url(biomart_query)

def make_biomart_mouse_orthologues_query_url():
    """Return the BioMart URL for the mouse orthologue table.

    The query runs against ``mmusculus_gene_ensembl`` and asks for
    header-less TSV with unique rows: the mouse gene id followed by the pig,
    rabbit and rat homolog gene ids.
    """
    mouse_query_xml = "".join((
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<!DOCTYPE Query>',
        '<Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >',
        '<Dataset name = "mmusculus_gene_ensembl" interface = "default">',
        '<Attribute name = "ensembl_gene_id" />',
        '<Attribute name = "sscrofa_homolog_ensembl_gene" />',
        '<Attribute name = "ocuniculus_homolog_ensembl_gene" />',
        '<Attribute name = "rnorvegicus_homolog_ensembl_gene" />',
        '</Dataset>',
        '</Query>',
    ))
    return make_biomart_query_url(mouse_query_xml)

def make_biomart_rat_orthologues_query_url():
    """Return the BioMart URL for the rat orthologue table.

    The query runs against ``rnorvegicus_gene_ensembl`` and asks for
    header-less TSV with unique rows: the rat gene id followed by the mouse,
    pig and rabbit homolog gene ids.
    """
    rat_query_xml = "".join((
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<!DOCTYPE Query>',
        '<Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >',
        '<Dataset name = "rnorvegicus_gene_ensembl" interface = "default" >',
        '<Attribute name = "ensembl_gene_id" />',
        '<Attribute name = "mmusculus_homolog_ensembl_gene" />',
        '<Attribute name = "sscrofa_homolog_ensembl_gene" />',
        '<Attribute name = "ocuniculus_homolog_ensembl_gene" />',
        '</Dataset>',
        '</Query>',
    ))
    return make_biomart_query_url(rat_query_xml)

def make_biomart_pig_orthologues_query_url():
    """Return the BioMart URL for the pig orthologue table.

    The query runs against ``sscrofa_gene_ensembl`` and asks for header-less
    TSV with unique rows: the pig gene id followed by the mouse, rabbit and
    rat homolog gene ids.
    """
    pig_query_xml = "".join((
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<!DOCTYPE Query>',
        '<Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >',
        '<Dataset name = "sscrofa_gene_ensembl" interface = "default" >',
        '<Attribute name = "ensembl_gene_id" />',
        '<Attribute name = "mmusculus_homolog_ensembl_gene" />',
        '<Attribute name = "ocuniculus_homolog_ensembl_gene" />',
        '<Attribute name = "rnorvegicus_homolog_ensembl_gene" />',
        '</Dataset>',
        '</Query>',
    ))
    return make_biomart_query_url(pig_query_xml)

def make_biomart_rabbit_orthologues_query_url():
    """Return the BioMart URL for the rabbit orthologue table.

    The query runs against ``ocuniculus_gene_ensembl`` and asks for
    header-less TSV with unique rows: the rabbit gene id followed by the
    mouse, pig and rat homolog gene ids.
    """
    rabbit_query_xml = "".join((
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<!DOCTYPE Query>',
        '<Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >',
        '<Dataset name = "ocuniculus_gene_ensembl" interface = "default" >',
        '<Attribute name = "ensembl_gene_id" />',
        '<Attribute name = "mmusculus_homolog_ensembl_gene" />',
        '<Attribute name = "sscrofa_homolog_ensembl_gene" />',
        '<Attribute name = "rnorvegicus_homolog_ensembl_gene" />',
        '</Dataset>',
        '</Query>',
    ))
    return make_biomart_query_url(rabbit_query_xml)


biomart_path = get_data_path("BioMart")

# Per-species gene lists: only requested when the TSV is not on disk yet.
for species, species_id in species_id_map.items():
    genes_tsv_path = biomart_path.joinpath(f"{species}_genes.tsv")

    if genes_tsv_path.exists():
        continue

    biomart_url = make_biomart_genes_query_url(species_id)
    fetch_file(biomart_url, genes_tsv_path.parent, rename_to=genes_tsv_path.name)

# Orthologue tables, requested in the order mouse, rat, pig, rabbit.
orthologue_downloads = (
    (make_biomart_mouse_orthologues_query_url, "mouse_orthologues.tsv"),
    (make_biomart_rat_orthologues_query_url, "rat_orthologues.tsv"),
    (make_biomart_pig_orthologues_query_url, "pig_orthologues.tsv"),
    (make_biomart_rabbit_orthologues_query_url, "rabbit_orthologues.tsv"),
)
for make_orthologues_url, orthologues_tsv_name in orthologue_downloads:
    fetch_file(make_orthologues_url(), biomart_path, rename_to=orthologues_tsv_name)

print("All done")

All done
