In [1]:
from typing import List

import deeptools.getScorePerBigWigBin as score_bw
from deeptools.correlation import Correlation
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import requests

In [2]:
def get_metadata(organism, assay_title, output_type, file_format):
    file_name = f"{''.join(c if c.isalnum() else '-' for c in organism.lower())}_{''.join(c if c.isalnum() else '-' for c in assay_title.lower())}"
    url = (
        "https://www.encodeproject.org/search/?type=Experiment"
        f"&replicates.library.biosample.donor.organism.scientific_name={organism}"
        "&status=released"
        f"&assay_title={assay_title}"
        # '&biosample_ontology.classification=cell+line'
        # '&replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens'
        # '&target.investigated_as=transcription+factor'
        # '&replicates.library.biosample.treatments!=*'
        # '&award.project=ENCODE'
        # '&award.rfa=ENCODE3'
        # '&target.label=CTCF'
        "&frame=embedded"
        "&format=json"
        "&limit=all"
    )
    r = requests.get(url)
    experiments = r.json()["@graph"]
    if not experiments:
        print(f"No experiments found for {file_name}, skipping...")
        return
    else:
        print(f"Found {len(experiments)} experiments for {file_name}")

    biosample_ontologies = []
    for e in experiments:
        # Get the name of the cell line, as well as more
        # general categorical information.
        bo_dict = e.get("biosample_ontology", {})
        del e["biosample_ontology"]
        bo_dict["biosample_id"] = bo_dict.pop("@id")
        bo_dict["biosample_type"] = bo_dict.pop("@type")
        e["experiment_id"] = e.pop("@id")
        e["experiment_type"] = e.pop("@type")
        bo_dict.update(e)
        bo_dict["dataset"] = bo_dict["experiment_id"]
        biosample_ontologies.append(bo_dict)
    biosample_metadata = pd.DataFrame(biosample_ontologies)
    biosample_metadata.to_csv(f"metadata/{file_name}_experiments.csv.gz", index=False)
    # also to parquet
    biosample_metadata.to_pickle(f"metadata/{file_name}_experiments.pkl")

    datasets = ["&dataset={}".format(d) for d in biosample_metadata.dataset]
    batch_size = 100
    files = []
    url_base = (
        "https://www.encodeproject.org/search/?type=File"
        "&status=released"
        "&field=dataset"
        "&field=biological_replicates"
        "&field=cloud_metadata.url"
        "&field=assembly"
        "&field=output_type"
        "&field=file_format"
        "&frame=object"
        "&format=json"
        "&limit=all"
    )
    if output_type is not None:
        if isinstance(output_type, list):
            for o in output_type:
                url_base += f"&output_type={o}"
        else:
            url_base += f"&output_type={output_type}"
    if file_format is not None:
        if isinstance(file_format, list):
            for f in file_format:
                url_base += f"&file_format={f}"
        else:
            url_base += f"&file_format={file_format}"
    for i in range(0, len(datasets), batch_size):
        url = url_base + "".join(datasets[i : i + batch_size])
        r = requests.get(url)
        # Filter the files to only those belonging to multiple replicates.
        files.extend(r.json()["@graph"])
    if not files:
        print(f"No files found for {file_name}, skipping...")
        return
    else:
        print(f"Found {len(files)} files for {file_name}")
    file_metadata = pd.DataFrame(files)
    file_metadata["cloud_metadata"] = file_metadata.cloud_metadata.apply(
        lambda x: x.get("url")
    )
    file_metadata = file_metadata.rename(
        columns={"@id": "file_id", "@type": "file_type"}
    )

    metadata = file_metadata.merge(biosample_metadata, how="left", on="dataset")
    # metadata.to_csv(f"metadata/{file_name}_files.csv", index=False)
    metadata.to_pickle(f"metadata/{file_name}_files.pkl")


## Epigenomics

In [20]:
for organism in [
    "Mus+musculus",
    "Homo+sapiens",
    "Drosophila+melanogaster",
    "Caenorhabditis+elegans",
]:
    # mint-chip-seq only measures histone marks
    for assay_title in [
        "TF+ChIP-seq",
        "Control+ChIP-seq",
        # "Mint-ChIP-seq",
        # "Control+Mint-ChIP-seq",
        "DNase-seq",
        "MNase-seq",
        "ATAC-seq",
    ]:
        get_metadata(organism, assay_title, None, ["bigWig", "bigBed", "bed"])


## RNA-seq

In [3]:
for organism in [
    "Homo+sapiens",  # 1426
    "Mus+musculus",  # 515
    "Drosophila+melanogaster",  # 329
    "Caenorhabditis+elegans",  # 218
    "Drosophila+pseudoobscura",  # 4
    "Drosophila+mojavensis",  # 2
    "Trichechus+manatus",  # 1
]:
    for assay_title in [
        "total+RNA-seq",
        "polyA+plus+RNA-seq",
        "polyA+minus+RNA-seq",
        "CRISPRi+RNA-seq",
        "long+read+RNA-seq",
        "CAGE",
    ]:
        get_metadata(
            organism,
            assay_title,
            None,
            ["bigWig", "bed", "bigBed", "gff", "gtf", "tar", "tsv"],
        )


Found 666 experiments for homo-sapiens_total-rna-seq
Found 12054 files for homo-sapiens_total-rna-seq
Found 458 experiments for homo-sapiens_polya-plus-rna-seq
Found 8904 files for homo-sapiens_polya-plus-rna-seq
Found 32 experiments for homo-sapiens_polya-minus-rna-seq
Found 1200 files for homo-sapiens_polya-minus-rna-seq
Found 77 experiments for homo-sapiens_crispri-rna-seq
Found 3136 files for homo-sapiens_crispri-rna-seq
Found 116 experiments for homo-sapiens_long-read-rna-seq
Found 357 files for homo-sapiens_long-read-rna-seq
Found 77 experiments for homo-sapiens_cage
Found 2130 files for homo-sapiens_cage
Found 220 experiments for mus-musculus_total-rna-seq
Found 3781 files for mus-musculus_total-rna-seq
Found 224 experiments for mus-musculus_polya-plus-rna-seq
Found 3774 files for mus-musculus_polya-plus-rna-seq
No experiments found for mus-musculus_polya-minus-rna-seq, skipping...
No experiments found for mus-musculus_crispri-rna-seq, skipping...
Found 71 experiments for mus-mu

## Hi-C

In [4]:
for organism in ["Homo+sapiens", "Mus+musculus"]:
    for assay in ["in+situ+Hi-C"]:
        # hic is contact matrix and should be the most important
        get_metadata(organism, assay, None, ["bigWig", "bedpe", "bed", "hdf5", "hic", "pairs", "tsv"])

Found 82 experiments for homo-sapiens_in-situ-hi-c
Found 1742 files for homo-sapiens_in-situ-hi-c
Found 15 experiments for mus-musculus_in-situ-hi-c
Found 252 files for mus-musculus_in-situ-hi-c


## Single-cell (scRNA, scATAC, long-read scRNA)

In [7]:
organisms = ["Mus+musculus", "Homo+sapiens"]
assay_titles = ["long+read+scRNA-seq", "scRNA-seq", "snATAC-seq"]
file_formats = ["tsv", "tar", "h5ad", "gtf", "bigWig", "bigBed", "bed"]
for assay_title in assay_titles:
    for organism in organisms:
        get_metadata(organism, assay_title, None, file_formats)

Found 64 experiments for mus-musculus_long-read-scrna-seq
Found 12 files for mus-musculus_long-read-scrna-seq
No experiments found for homo-sapiens_long-read-scrna-seq, skipping...
Found 282 experiments for mus-musculus_scrna-seq
Found 1633 files for mus-musculus_scrna-seq
Found 126 experiments for homo-sapiens_scrna-seq
Found 735 files for homo-sapiens_scrna-seq
Found 51 experiments for mus-musculus_snatac-seq
Found 80 files for mus-musculus_snatac-seq
Found 302 experiments for homo-sapiens_snatac-seq
Found 302 files for homo-sapiens_snatac-seq


In [11]:
organism = "Mus+musculus"
assay_title = "TF+ChIP-seq"
file_name = f"{''.join(c if c.isalnum() else '_' for c in organism.lower())}_{''.join(c if c.isalnum() else '_' for c in assay_title.lower())}"
url = (
    "https://www.encodeproject.org/search/?type=Experiment"
    f"&replicates.library.biosample.donor.organism.scientific_name={organism}"
    "&status=released"
    f"&assay_title={assay_title}"
    # '&biosample_ontology.classification=cell+line'
    # '&replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens'
    # '&target.investigated_as=transcription+factor'
    # '&replicates.library.biosample.treatments!=*'
    # '&award.project=ENCODE'
    # '&award.rfa=ENCODE3'
    # '&target.label=CTCF'
    "&frame=embedded"
    "&format=json"
    "&limit=all"
)
r = requests.get(url)
experiments = r.json()["@graph"]


In [14]:
experiments[0]["@id"], experiments[0]["biosample_ontology"]["@id"]

('/experiments/ENCSR000ETM/', '/biosample-types/cell_line_EFO_0003971/')

In [15]:
experiments[0]["@type"], experiments[0]["biosample_ontology"]["@type"]

(['Experiment', 'Dataset', 'Item'], ['BiosampleType', 'Item'])

In [17]:
experiments[0].keys(), experiments[0]["biosample_ontology"].keys()

(dict_keys(['@id', '@type', 'accession', 'aliases', 'alternate_accessions', 'analyses', 'assay_slims', 'assay_term_id', 'assay_term_name', 'assay_title', 'assembly', 'audit', 'award', 'bio_replicate_count', 'biosample_ontology', 'biosample_summary', 'category_slims', 'contributing_files', 'date_created', 'date_released', 'date_submitted', 'dbxrefs', 'default_analysis', 'description', 'documents', 'doi', 'files', 'hub', 'internal_status', 'internal_tags', 'lab', 'objective_slims', 'original_files', 'perturbed', 'possible_controls', 'references', 'related_annotations', 'related_files', 'related_series', 'replicates', 'replication_type', 'revoked_files', 'schema_version', 'simple_biosample_summary', 'status', 'submitted_by', 'superseded_by', 'supersedes', 'target', 'tech_replicate_count', 'type_slims', 'uuid']),
 dict_keys(['dbxrefs', 'organ_slims', 'system_slims', 'aliases', 'references', '@type', 'synonyms', 'term_id', 'classification', 'uuid', 'schema_version', 'term_name', 'cell_slims

In [20]:
[(k, type(experiments[0][k])) for k in experiments[0]]

[('@id', str),
 ('@type', list),
 ('accession', str),
 ('aliases', list),
 ('alternate_accessions', list),
 ('analyses', list),
 ('assay_slims', list),
 ('assay_term_id', str),
 ('assay_term_name', str),
 ('assay_title', str),
 ('assembly', list),
 ('audit', dict),
 ('award', dict),
 ('bio_replicate_count', int),
 ('biosample_ontology', dict),
 ('biosample_summary', str),
 ('category_slims', list),
 ('contributing_files', list),
 ('date_created', str),
 ('date_released', str),
 ('date_submitted', str),
 ('dbxrefs', list),
 ('default_analysis', str),
 ('description', str),
 ('documents', list),
 ('doi', str),
 ('files', list),
 ('hub', str),
 ('internal_status', str),
 ('internal_tags', list),
 ('lab', dict),
 ('objective_slims', list),
 ('original_files', list),
 ('perturbed', bool),
 ('possible_controls', list),
 ('references', list),
 ('related_annotations', list),
 ('related_files', list),
 ('related_series', list),
 ('replicates', list),
 ('replication_type', str),
 ('revoked_files

In [44]:
url = (
    'https://www.encodeproject.org/search/?type=Experiment'
    # '&replicates.library.biosample.donor.organism.scientific_name=Drosophila+melanogaster'
    '&replicates.library.biosample.donor.organism.scientific_name=Mus+musculus'
    # '&replicates.library.biosample.donor.organism.scientific_name=Mus+musculus'
    '&replicates.library.biosample.donor.organism.scientific_name=Mus+musculus'
    '&status=released'
    '&assay_title=TF+ChIP-seq'
    # '&biosample_ontology.classification=cell+line'
    # '&replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens'
    # '&target.investigated_as=transcription+factor'
    # '&replicates.library.biosample.treatments!=*'
    # '&award.project=ENCODE'
    # '&award.rfa=ENCODE3'
    # '&target.label=CTCF'
    '&frame=embedded'
    '&format=json'
    '&limit=all'
)
r = requests.get(url)

In [45]:
experiments = r.json()['@graph']
len(experiments)

197

In [53]:
biosample_ontologies = []
for e in experiments:
    # Get the name of the cell line, as well as more
    # general categorical information.
    biosample_ontology_condensed = e.get('biosample_ontology', {})
    biosample_ontology_condensed["dataset"] = e["@id"]
    biosample_ontologies.append(biosample_ontology_condensed)
biosample_metadata = pd.DataFrame(biosample_ontologies)
# rename columns
biosample_metadata.rename(columns={"@type": "experiment_type", "@id": "id"}, inplace=True)
biosample_metadata

Unnamed: 0,dbxrefs,organ_slims,system_slims,aliases,references,experiment_type,synonyms,term_id,classification,uuid,schema_version,term_name,cell_slims,developmental_slims,name,id,status,dataset
0,[Cellosaurus:CVCL_2111],"[immune organ, spleen]","[digestive system, immune system]",[],[],"[BiosampleType, Item]","[MEL, Mouse Erythroleukemia cell line]",EFO:0003971,cell line,83b1b448-a4f5-4d31-a8fc-c7ea40f5a177,1,MEL,[cancer cell],[mesoderm],cell_line_EFO_0003971,/biosample-types/cell_line_EFO_0003971/,released,/experiments/ENCSR000ETM/
1,[Cellosaurus:CVCL_2111],"[immune organ, spleen]","[digestive system, immune system]",[],[],"[BiosampleType, Item]","[MEL, Mouse Erythroleukemia cell line]",EFO:0003971,cell line,83b1b448-a4f5-4d31-a8fc-c7ea40f5a177,1,MEL,[cancer cell],[mesoderm],cell_line_EFO_0003971,/biosample-types/cell_line_EFO_0003971/,released,/experiments/ENCSR000ETK/
2,[],[brain],[central nervous system],[],[],"[BiosampleType, Item]",[bulbus olfactorius],UBERON:0002264,tissue,13370fb0-7402-420d-b246-6f6ae32a877d,1,olfactory bulb,[],[ectoderm],tissue_UBERON_0002264,/biosample-types/tissue_UBERON_0002264/,released,/experiments/ENCSR000CDY/
3,[Cellosaurus:CVCL_2111],"[immune organ, spleen]","[digestive system, immune system]",[],[],"[BiosampleType, Item]","[MEL, Mouse Erythroleukemia cell line]",EFO:0003971,cell line,83b1b448-a4f5-4d31-a8fc-c7ea40f5a177,1,MEL,[cancer cell],[mesoderm],cell_line_EFO_0003971,/biosample-types/cell_line_EFO_0003971/,released,/experiments/ENCSR000ETP/
4,[Cellosaurus:CVCL_2111],"[immune organ, spleen]","[digestive system, immune system]",[],[],"[BiosampleType, Item]","[MEL, Mouse Erythroleukemia cell line]",EFO:0003971,cell line,83b1b448-a4f5-4d31-a8fc-c7ea40f5a177,1,MEL,[cancer cell],[mesoderm],cell_line_EFO_0003971,/biosample-types/cell_line_EFO_0003971/,released,/experiments/ENCSR000EUA/
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,[Cellosaurus:CVCL_K037],[embryo],[],[],[],"[BiosampleType, Item]",[],EFO:0005483,cell line,7138a597-261b-4072-bfdc-56f485b54d7a,1,ES-Bruce4,"[stem cell, embryonic cell]",[],cell_line_EFO_0005483,/biosample-types/cell_line_EFO_0005483/,released,/experiments/ENCSR000CCD/
193,[Cellosaurus:CVCL_K037],[embryo],[],[],[],"[BiosampleType, Item]",[],EFO:0005483,cell line,7138a597-261b-4072-bfdc-56f485b54d7a,1,ES-Bruce4,"[stem cell, embryonic cell]",[],cell_line_EFO_0005483,/biosample-types/cell_line_EFO_0005483/,released,/experiments/ENCSR000CCB/
194,[],[heart],[circulatory system],[],[],"[BiosampleType, Item]","[chambered heart, vertebrate heart]",UBERON:0000948,tissue,d796c29f-b97c-4d77-81a4-f36fe2f060c6,1,heart,[],[mesoderm],tissue_UBERON_0000948,/biosample-types/tissue_UBERON_0000948/,released,/experiments/ENCSR491NUM/
195,[Cellosaurus:CVCL_Y494],"[kidney, connective tissue]",[excretory system],[],[],"[BiosampleType, Item]",[],EFO:0005481,cell line,d4b20864-a184-4e72-8b43-5d167c40c3fe,1,Patski,"[connective tissue cell, fibroblast]",[mesoderm],cell_line_EFO_0005481,/biosample-types/cell_line_EFO_0005481/,released,/experiments/ENCSR419OOD/


In [61]:
biosample_metadata.to_csv("metadata/experiments.csv", index=False)

In [54]:
datasets = [
    '&dataset={}'.format(e.get('@id'))
    for e in experiments
]

In [62]:
url = (
    'https://www.encodeproject.org/search/?type=File'
    '&status=released'
    '&output_type=signal p-value'
    '&field=dataset'
    '&field=biological_replicates'
    '&field=cloud_metadata.url'
    '&field=assembly'
    '&field=output_type'
    '&field=file_type'
    '&frame=object'
    '&format=json'
    '&limit=all'
    '{}'.format(''.join(datasets))
)
r = requests.get(url)
# Filter the files to only those belonging to multiple replicates.
files = [
    f
    for f in r.json()['@graph']
    # if len(f['biological_replicates']) == 1
]

In [63]:
len(files)

1013

In [64]:
# Flatten JSON.
file_metadata = pd.DataFrame(files)
file_metadata['cloud_metadata'] = file_metadata.cloud_metadata.apply(lambda x: x.get('url'))
# Rename column.
file_metadata = file_metadata.rename(columns={'@id': 'file', "@type": "file_type"})
file_metadata

Unnamed: 0,file,file_type,assembly,biological_replicates,cloud_metadata,dataset,file_type.1,output_type
0,/files/ENCFF688ZJR/,"[File, Item]",mm10,[1],https://encode-public.s3.amazonaws.com/2020/11...,/experiments/ENCSR000AIV/,bigWig,signal p-value
1,/files/ENCFF768YEP/,"[File, Item]",mm10,[1],https://encode-public.s3.amazonaws.com/2020/11...,/experiments/ENCSR000CEG/,bigWig,signal p-value
2,/files/ENCFF983AFT/,"[File, Item]",mm10,[2],https://encode-public.s3.amazonaws.com/2020/11...,/experiments/ENCSR000EQU/,bigWig,signal p-value
3,/files/ENCFF700ZGR/,"[File, Item]",mm10,[1],https://encode-public.s3.amazonaws.com/2020/11...,/experiments/ENCSR000AIE/,bigWig,signal p-value
4,/files/ENCFF222LPS/,"[File, Item]",mm10,[1],https://encode-public.s3.amazonaws.com/2020/11...,/experiments/ENCSR000DID/,bigWig,signal p-value
...,...,...,...,...,...,...,...,...
1008,/files/ENCFF909UFR/,"[File, Item]",mm10,"[1, 2]",https://encode-public.s3.amazonaws.com/2020/10...,/experiments/ENCSR000DIU/,bigWig,signal p-value
1009,/files/ENCFF282BUP/,"[File, Item]",mm10,[1],https://encode-public.s3.amazonaws.com/2020/10...,/experiments/ENCSR000CEF/,bigWig,signal p-value
1010,/files/ENCFF421PAU/,"[File, Item]",mm10,"[1, 2]",https://encode-public.s3.amazonaws.com/2020/10...,/experiments/ENCSR000CEH/,bigWig,signal p-value
1011,/files/ENCFF865NGS/,"[File, Item]",mm10,[2],https://encode-public.s3.amazonaws.com/2020/10...,/experiments/ENCSR000CFJ/,bigWig,signal p-value


In [65]:
# Join with biosample_metadata.
merged_df = file_metadata.merge(biosample_metadata, how='left', on='dataset')
# merged_df = merged_df[[
#     'file',
#     'dataset',
#     'cell_slims',
#     'developmental_slims',
#     'organ_slims',
#     'system_slims',
#     'term_name',
#     'biological_replicates',
#     'cloud_metadata',
#     'assembly'
# ]]
merged_df

Unnamed: 0,file,file_type,assembly,biological_replicates,cloud_metadata,dataset,file_type.1,output_type,dbxrefs,organ_slims,...,term_id,classification,uuid,schema_version,term_name,cell_slims,developmental_slims,name,id,status
0,/files/ENCFF688ZJR/,"[File, Item]",mm10,[1],https://encode-public.s3.amazonaws.com/2020/11...,/experiments/ENCSR000AIV/,bigWig,signal p-value,[Cellosaurus:CVCL_0188],[musculature of body],...,EFO:0001098,cell line,2946bfa5-f2b0-4845-8dcb-d5c862a84e6f,1,C2C12,"[embryonic cell, myoblast]",[mesoderm],cell_line_EFO_0001098,/biosample-types/cell_line_EFO_0001098/,released
1,/files/ENCFF768YEP/,"[File, Item]",mm10,[1],https://encode-public.s3.amazonaws.com/2020/11...,/experiments/ENCSR000CEG/,bigWig,signal p-value,[],"[gonad, testis]",...,UBERON:0000473,tissue,11e4281a-8bf0-41a1-b9bd-0f34b6489b12,1,testis,[],[],tissue_UBERON_0000473,/biosample-types/tissue_UBERON_0000473/,released
2,/files/ENCFF983AFT/,"[File, Item]",mm10,[2],https://encode-public.s3.amazonaws.com/2020/11...,/experiments/ENCSR000EQU/,bigWig,signal p-value,[Cellosaurus:CVCL_0211],"[bodily fluid, blood, lymph node]",...,EFO:0005233,cell line,afa9fd2a-c66c-4b1c-8db0-3220d665e04a,1,CH12.LX,"[B cell, hematopoietic cell, leukocyte]",[mesoderm],cell_line_EFO_0005233,/biosample-types/cell_line_EFO_0005233/,released
3,/files/ENCFF700ZGR/,"[File, Item]",mm10,[1],https://encode-public.s3.amazonaws.com/2020/11...,/experiments/ENCSR000AIE/,bigWig,signal p-value,[],[musculature of body],...,CL:0000187,in vitro differentiated cells,2ed28987-8499-4b62-b20e-26d7391f42c5,1,myocyte,[],[mesoderm],in_vitro_differentiated_cells_CL_0000187,/biosample-types/in_vitro_differentiated_cells...,released
4,/files/ENCFF222LPS/,"[File, Item]",mm10,[1],https://encode-public.s3.amazonaws.com/2020/11...,/experiments/ENCSR000DID/,bigWig,signal p-value,[Cellosaurus:CVCL_D047],[embryo],...,EFO:0002055,cell line,f18085c4-828f-44d8-9803-2eb89cae4628,1,G1E-ER4,[stem cell],[],cell_line_EFO_0002055,/biosample-types/cell_line_EFO_0002055/,released
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1008,/files/ENCFF909UFR/,"[File, Item]",mm10,"[1, 2]",https://encode-public.s3.amazonaws.com/2020/10...,/experiments/ENCSR000DIU/,bigWig,signal p-value,[Cellosaurus:CVCL_0211],"[bodily fluid, blood, lymph node]",...,EFO:0005233,cell line,afa9fd2a-c66c-4b1c-8db0-3220d665e04a,1,CH12.LX,"[B cell, hematopoietic cell, leukocyte]",[mesoderm],cell_line_EFO_0005233,/biosample-types/cell_line_EFO_0005233/,released
1009,/files/ENCFF282BUP/,"[File, Item]",mm10,[1],https://encode-public.s3.amazonaws.com/2020/10...,/experiments/ENCSR000CEF/,bigWig,signal p-value,[],"[gonad, testis]",...,UBERON:0000473,tissue,11e4281a-8bf0-41a1-b9bd-0f34b6489b12,1,testis,[],[],tissue_UBERON_0000473,/biosample-types/tissue_UBERON_0000473/,released
1010,/files/ENCFF421PAU/,"[File, Item]",mm10,"[1, 2]",https://encode-public.s3.amazonaws.com/2020/10...,/experiments/ENCSR000CEH/,bigWig,signal p-value,[],[brain],...,UBERON:0000955,tissue,1a70b14a-679c-4b29-88bf-2fa08c05fed1,1,brain,[],[ectoderm],tissue_UBERON_0000955,/biosample-types/tissue_UBERON_0000955/,released
1011,/files/ENCFF865NGS/,"[File, Item]",mm10,[2],https://encode-public.s3.amazonaws.com/2020/10...,/experiments/ENCSR000CFJ/,bigWig,signal p-value,[],"[bone element, bone marrow]",...,CL:0002476,primary cell,509a0792-26d2-472f-b6d7-bd5109d22a90,1,bone marrow macrophage,"[myeloid cell, hematopoietic cell, leukocyte]",[mesoderm],primary_cell_CL_0002476,/biosample-types/primary_cell_CL_0002476/,released


In [66]:
merged_df.to_csv("metadata/files.csv", index=False)