In [1]:
%run "../head.py"

In [2]:
import pandas as pd
from hic_basic.binnify import GenomeIdeograph
from hic_basic.sequence import count_CpG
from hic_basic.hicio import read_meta, load_json
from hic_basic.scAB_embedding import calc_color2, color2, s_color2
from hires_utils.hires_io import parse_3dg

### prepare reference track

In [3]:
# Cache switches, also read by later cells of this notebook:
#   Fin   - when True nothing is (re)computed, whatever exists on disk is used as is
#   Force - when True existing output files are recomputed and overwritten
Fin = False
Force = False

binsize = 50000

# Bin sizes at which later cells read the CpG reference track.
ref_binsizes = [20000, 50000, 200000, 500000]
# Genome name -> FASTA file used for CpG counting.
# NOTE(review): absolute cluster paths; move to a config entry if one exists.
ref_genomes = {
    "GRCh38": "/share/Data/ychi/genome/GRCh38/raw/GRCh38.primary_assembly.genome.fa",
    "mm10": "/share/Data/ychi/genome/GRCm38/raw/mm10.fa",
}

for ref_genome, ref_fasta in ref_genomes.items():
    for ref_binsize in ref_binsizes:
        outfile = h.ddir / f"{ref_genome}.CpG.{h.binsize_suffix[ref_binsize]}.tsv.gz"
        if Fin or (outfile.exists() and not Force):
            # Track already cached (or recomputation disabled): skip the
            # binning and the CpG counting.
            continue
        bins = GenomeIdeograph(
            ref_genome
        ).bins(ref_binsize, bed=True, order=True)
        CpG = count_CpG(bins, ref_fasta)
        # Written without header; readers supply the column names themselves.
        CpG.to_csv(
            outfile,
            index=False,
            header=False,
            sep="\t"
        )

In [3]:
# Per-cell metadata table; the file depends on the analysis version `h.version`.
if h.version in ["0","1"]:
    meta = read_meta(h.base / "notebooks/A_meta/meta/All.meta.csv.gz")
elif h.version == "2":
    meta = read_meta(h.base / "notebooks/A_meta/meta/All_formal_noPAR.meta.csv.gz")
    # Alternative tables used previously:
    #meta = read_meta(h.base / "notebooks/A_meta/meta/All_formal.meta.csv.gz")
    #meta = read_meta(h.fig1 / "A_meta/meta/All_sp_formal.meta.csv.gz")
else:
    # Fail here instead of with a NameError on `meta` further down.
    raise ValueError(
        f"Unsupported h.version {h.version!r}; expected '0', '1' or '2'"
    )

# Sample-name lists stored as JSON by other notebooks, one file per version.
Sperm_ga = load_json(
    h.fig1 / f"Fig1S_structure_quality/Sperm_ga.v{h.version}.json"
)
Sperm_hg_ga = load_json(
    h.fig1 / f"Fig1S_structure_quality/Sperm_hg_ga.v{h.version}.json"
)
mESC_gs = load_json(
    h.fig1 / f"Fig1S_tech_analysis/mESC_samples_with_structure.v{h.version}.json"
)
Tan2018_gs = load_json(
    h.fig1 / f"Fig1S_tech_analysis/Tan2018_samples_with_structure.v{h.version}.json"
)
# The RS batch has its own sample table; its index provides the sample names.
RS_meta = read_meta(
    h.base / "ds_pipeline" / "smk" / "config" / "RS.gs.sample_table.csv"
)
RS_gs = RS_meta.index.tolist()

#binsize = 500000
binsizes = [20000, 50000, 200000, 500000]

# One entry per batch, in the order in which they appear in e_scAB_args:
# (batch name, per-cell metadata, genome whose CpG track is the reference).
batch_specs = [
    ("Tan2018", meta.loc[Tan2018_gs], "GRCh38"),
    ("mESC", meta.loc[mESC_gs], "mm10"),
    ("Sperm", meta.loc[Sperm_ga], "mm10"),
    ("Sperm_hg", meta.loc[Sperm_hg_ga], "GRCh38"),
    ("RS", RS_meta, "mm10"),
]

# Maps "<batch>_<binsize suffix>" to the arguments consumed by work_main:
# "meta", "binsize", "ref" (CpG table) and "outfile" (parquet path).
e_scAB_args = {}
for binsize in binsizes:
    suffix = h.binsize_suffix[binsize]
    # Parse each genome's CpG track once per bin size instead of once per batch.
    ref_tracks = {
        genome: pd.read_table(
            h.ddir / f"{genome}.CpG.{suffix}.tsv.gz",
            names = ["chrom","start","end","CpG"]
        )
        for genome in ("GRCh38", "mm10")
    }
    scAB_args = {
        f"{batch}_{suffix}" : {
            "meta" : batch_meta,
            "binsize" : binsize,
            # Every batch still gets its own frame, as when each one was
            # read from disk separately.
            "ref" : ref_tracks[genome].copy(),
            "outfile" : h.ddir / f"{batch}_{suffix}.color2.parquet.gz",
        }
        for batch, batch_meta, genome in batch_specs
    }
    e_scAB_args.update(scAB_args)

### structure calculation

In [5]:
# Small hand-picked subsets, kept under their own names so that they are not
# silently overwritten by the full lists below.
#HuS02_HuSZ032,HuS02_HuSZ028,HuS02_HuSZ025,HuS02_HuSZ022,HuS02_HuSZ018,HuS02_HuSZ015,HuS02_HuSZ012,HuS02_HuSZ010,HuS02_HuSZ009
human_samples_subset = ["HuS02_HuSZ032","HuS02_HuSZ028","HuS02_HuSZ025","HuS02_HuSZ022","HuS02_HuSZ018","HuS02_HuSZ015","HuS02_HuSZ012","HuS02_HuSZ010","HuS02_HuSZ009"]
#BJ8019,BJ8001,BJ8024,BJ8009,BJ8012,BJ8014,BJ8015,BJ8022,BJ8023
mouse_samples_subset = ["BJ8019","BJ8001","BJ8024","BJ8009","BJ8012","BJ8014","BJ8015","BJ8022","BJ8023"]

# Sample lists actually processed by the cells below.
human_samples = Sperm_hg_ga
mouse_samples = Sperm_ga

In [20]:
def worker(sample, ref, force=False):
    """Compute and cache the 20k per-bin `scAB` table of one sample.

    The 3D structure named in the "20k_g_struct1" column of the global `meta`
    table is parsed and passed to `s_color2` together with the CpG reference
    track. The "chr", "pos" and "scAB" columns of the result are written as a
    gzip-compressed, tab-separated file without header (the file name ends in
    ".csv.gz" although the separator is a tab).

    Parameters
    ----------
    sample : str
        Row label of the sample in `meta`; also used in the output file name.
    ref : path-like
        Headerless tab-separated CpG track with the columns
        chrom, start, end, CpG.
    force : bool, default False
        Recompute even if the output file already exists.

    Returns
    -------
    pathlib.Path
        Path of the (existing or newly written) output file.
    """
    outfile = h.dsdir / "scAB" / f"{sample}.20k.scAB.csv.gz"
    if outfile.exists() and not force:
        return outfile
    _3dg = parse_3dg(meta.loc[sample,"20k_g_struct1"])
    color_data = pd.read_table(ref, names = ["chrom","start","end","CpG"])
    res = s_color2(
        _3dg,
        color_data,
        n_jobs=4
    )
    outfile.parent.mkdir(parents=True, exist_ok=True)
    # Write to a temporary sibling and rename afterwards, so that an
    # interrupted run cannot leave a truncated file that the existence check
    # above would later accept as a finished result.
    tmpfile = outfile.with_name(outfile.name + ".tmp")
    res[["chr","pos","scAB"]].to_csv(
        tmpfile,
        index=False,
        header=False,
        sep="\t",
        compression="gzip",  # explicit: the temporary name does not end in .gz
        )
    tmpfile.replace(outfile)
    return outfile

In [None]:
#worker(human_samples[0], h.ddir / "GRCh38.CpG.20k.tsv.gz")

In [None]:
from concurrent.futures import ProcessPoolExecutor

# (sample, CpG reference) pairs: human sperm first, then mouse sperm.
jobs = [(sample, h.ddir / "GRCh38.CpG.20k.tsv.gz") for sample in human_samples]
jobs += [(sample, h.ddir / "mm10.CpG.20k.tsv.gz") for sample in mouse_samples]

with ProcessPoolExecutor(max_workers=4) as executor:
    futures = {
        executor.submit(worker, sample, ref): sample
        for sample, ref in jobs
    }

# Leaving the with-block waits for every job. Inspect the futures so that
# exceptions raised inside the workers are reported instead of being dropped.
failed = {
    futures[future]: future.exception()
    for future in futures
    if future.exception() is not None
}
if failed:
    for sample, error in failed.items():
        print(f"{sample}: {error!r}")
    raise RuntimeError(f"{len(failed)} of {len(futures)} worker jobs failed")

In [11]:
with ProcessPoolExecutor(max_workers=4) as executor:
    futures = {
        executor.submit(worker, sample, h.ddir / "GRCh38.CpG.20k.tsv.gz"): sample
        for sample in Tan2018_gs
    }

# Leaving the with-block waits for every job. Inspect the futures so that
# exceptions raised inside the workers are reported instead of being dropped.
failed = {
    futures[future]: future.exception()
    for future in futures
    if future.exception() is not None
}
if failed:
    for sample, error in failed.items():
        print(f"{sample}: {error!r}")
    raise RuntimeError(f"{len(failed)} of {len(futures)} worker jobs failed")

In [21]:
with ProcessPoolExecutor(max_workers=4) as executor:
    futures = {
        # force=True: existing mESC outputs are recomputed and overwritten.
        executor.submit(worker, sample, h.ddir / "mm10.CpG.20k.tsv.gz", force=True): sample
        for sample in mESC_gs
    }

# Leaving the with-block waits for every job. Inspect the futures so that
# exceptions raised inside the workers are reported instead of being dropped.
failed = {
    futures[future]: future.exception()
    for future in futures
    if future.exception() is not None
}
if failed:
    for sample, error in failed.items():
        print(f"{sample}: {error!r}")
    raise RuntimeError(f"{len(failed)} of {len(futures)} worker jobs failed")

Aggregate the per-sample tables into one HDF5 file per batch

In [13]:
from tqdm import tqdm

In [16]:
# Stack the per-sample 20k scAB tables of `human_samples` into one HDF5 table
# stored under the key "main".
h5_path = h.ddir / "Sperm_hg.scAB.20k.h5"
with pd.HDFStore(h5_path, mode="w", complib="zlib", complevel=3) as store:
    for sample in tqdm(human_samples, desc="Writing samples"):
        sample_table = h.dsdir / "scAB" / f"{sample}.20k.scAB.csv.gz"
        dat = pd.read_table(
            sample_table,
            sep="\t",
            names=["chrom","start","scAB"],
        )
        # Tag every row with the sample it came from.
        dat["sample_name"] = sample
        store.put(
            "main",
            dat,
            format="table",
            append=True,
            index=False,  # the index is built once, after the loop
            data_columns=["chrom","start"],
            min_itemsize={"sample_name": 32},
        )
    print("Indexing...")
    store.create_table_index("main", columns=["chrom","start"], optlevel=6)

Writing samples: 100%|██████████| 988/988 [05:03<00:00,  3.25it/s]


Indexing...


In [17]:
# Stack the per-sample 20k scAB tables of `mouse_samples` into one HDF5 table
# stored under the key "main".
h5_path = h.ddir / "Sperm.scAB.20k.h5"
with pd.HDFStore(h5_path, mode="w", complib="zlib", complevel=3) as store:
    for sample in tqdm(mouse_samples, desc="Writing samples"):
        sample_table = h.dsdir / "scAB" / f"{sample}.20k.scAB.csv.gz"
        dat = pd.read_table(
            sample_table,
            sep="\t",
            names=["chrom","start","scAB"],
        )
        # Tag every row with the sample it came from.
        dat["sample_name"] = sample
        store.put(
            "main",
            dat,
            format="table",
            append=True,
            index=False,  # the index is built once, after the loop
            data_columns=["chrom","start"],
            min_itemsize={"sample_name": 32},
        )
    print("Indexing...")
    store.create_table_index("main", columns=["chrom","start"], optlevel=6)

Writing samples: 100%|██████████| 718/718 [03:19<00:00,  3.60it/s]


Indexing...


In [14]:
# Stack the per-sample 20k scAB tables of `Tan2018_gs` into one HDF5 table
# stored under the key "main".
h5_path = h.ddir / "Tan2018.scAB.20k.h5"
with pd.HDFStore(h5_path, mode="w", complib="zlib", complevel=3) as store:
    for sample in tqdm(Tan2018_gs, desc="Writing samples"):
        sample_table = h.dsdir / "scAB" / f"{sample}.20k.scAB.csv.gz"
        dat = pd.read_table(
            sample_table,
            sep="\t",
            names=["chrom","start","scAB"],
        )
        # Tag every row with the sample it came from.
        dat["sample_name"] = sample
        store.put(
            "main",
            dat,
            format="table",
            append=True,
            index=False,  # the index is built once, after the loop
            data_columns=["chrom","start"],
            min_itemsize={"sample_name": 32},
        )
    print("Indexing...")
    store.create_table_index("main", columns=["chrom","start"], optlevel=6)

Writing samples: 100%|██████████| 14/14 [00:08<00:00,  1.66it/s]


Indexing...


In [26]:
# Stack the per-sample 20k scAB tables of `mESC_gs` into one HDF5 table
# stored under the key "main".
h5_path = h.ddir / "mESC.scAB.20k.h5"
with pd.HDFStore(h5_path, mode="w", complib="zlib", complevel=3) as store:
    for sample in tqdm(mESC_gs, desc="Writing samples"):
        sample_table = h.dsdir / "scAB" / f"{sample}.20k.scAB.csv.gz"
        dat = pd.read_table(
            sample_table,
            sep="\t",
            names=["chrom","start","scAB"],
        )
        # Tag every row with the sample it came from.
        dat["sample_name"] = sample
        store.put(
            "main",
            dat,
            format="table",
            append=True,
            index=False,  # the index is built once, after the loop
            data_columns=["chrom","start"],
            min_itemsize={"sample_name": 32},
        )
    print("Indexing...")
    store.create_table_index("main", columns=["chrom","start"], optlevel=6)

Writing samples: 100%|██████████| 46/46 [00:24<00:00,  1.86it/s]


Indexing...


### calculate using pairs files

In [None]:
def work_main(args, pairs_col="pairs_c12", **kwargs):
    """Run `calc_color2` for one batch and cache the result as parquet.

    The notebook-level switches `Force` and `Fin` control caching: the
    calculation runs only if `Fin` is false and either the output file is
    missing or `Force` is true.

    Parameters
    ----------
    args : dict
        Batch description. "outfile" is always read; "meta", "ref" and
        "binsize" are read only when the calculation actually runs.
    pairs_col : str, default "pairs_c12"
        Passed to `calc_color2` as its second positional argument.
    **kwargs
        Forwarded unchanged to `calc_color2`.

    Returns
    -------
    The value of ``args["outfile"]``, whether or not anything was computed.
    """
    # Local import: `Path` is not imported in the visible notebook cells
    # (it may otherwise only be available through head.py).
    from pathlib import Path

    outfile = args["outfile"]
    already_there = Path(outfile).exists()
    if Fin or (already_there and not Force):
        if already_there:
            print(f"{outfile} exists")
        else:
            # Previously reported as "exists" although nothing is on disk.
            print(f"{outfile} is missing but Fin is set; skipping calculation")
        return outfile

    scAB = calc_color2(
        args["meta"],
        pairs_col,
        args["ref"],
        binsize = args["binsize"],
        **kwargs
    )
    scAB = scAB.T
    # After transposing, the row labels are tuples; turn them into a two-level
    # index whose second level is cast to int.
    labels = pd.MultiIndex.from_tuples(scAB.index)
    scAB.index = pd.MultiIndex.from_arrays(
        [
            labels.get_level_values(0),
            labels.get_level_values(1).astype(int)
        ],
        names = ["chrom","start"]
    )
    scAB.to_parquet(outfile, compression="gzip")
    return outfile

In [None]:
# scAB_files = {
#     batch : work_main(
#         args,
#         col_thresh=0, # 0 to keep all
#         row_thresh=0, # 0 to keep all
#         threads = 16,
#         fill_color = False
#         )
#     for batch, args in scAB_args.items()
# }

In [None]:
Force = False
Fin = False

# Options forwarded through work_main to calc_color2 for the mESC batches.
mESC_options = dict(
    col_thresh=0,  # 0 to keep all
    row_thresh=0,  # 0 to keep all
    threads=16,
    fill_color=False,
    dropXY=False,
    merge_haplotypes=False,
    dupref=True,
)

# Batch name -> output file, for every bin size of the mESC batch.
e_scAB_files = {
    batch: work_main(args, "dip", **mESC_options)
    for batch, args in e_scAB_args.items()
    if batch.startswith("mESC")
}

In [None]:
Force = False
Fin = False

# Add the Tan2018 batches to the results collected so far instead of
# replacing them: later cells look up several batches in e_scAB_files.
if "e_scAB_files" not in globals():
    e_scAB_files = {}
e_scAB_files.update({
    batch : work_main(
        args,
        "dip",
        col_thresh=0, # 0 to keep all
        row_thresh=0, # 0 to keep all
        threads = 16,
        fill_color = False,
        dropXY = False,
        merge_haplotypes = False,
        dupref=True
        )
    for batch, args in e_scAB_args.items()
    if batch.startswith("Tan2018")
})

In [None]:
# Add the 20k sperm and RS batches to the results collected so far instead of
# replacing them: later cells look up several batches in e_scAB_files.
# No pairs column is given, so work_main uses its default ("pairs_c12").
if "e_scAB_files" not in globals():
    e_scAB_files = {}
e_scAB_files.update({
    batch : work_main(
        args,
        col_thresh=0, # 0 to keep all
        row_thresh=0, # 0 to keep all
        threads = 16,
        fill_color = False,
        dropXY = False,
        merge_haplotypes = False
        )
    for batch, args in e_scAB_args.items()
    if batch in ["Sperm_20k","Sperm_hg_20k","RS_20k"]
})

### test results

In [None]:
import plotly.express as px

In [None]:
import plotly.graph_objects as go

# NOTE(review): `scAB_files` is only assigned in a commented-out cell of this
# notebook, so this cell fails on a fresh kernel - confirm which file is meant.
# Select the chr3 rows, transpose, and order the columns by their label.
data = (
    pd.read_parquet(scAB_files["mESC"])
    .loc["chr3"]
    .T
    .sort_index(axis=1)
)

heatmap = go.Heatmap(
    x=data.columns,
    y=data.index,
    z=data.values,
    colorscale=px.colors.diverging.PRGn,  # purple to green
    zmin=0.005,
    zmax=0.012,
)
fig = go.Figure()
fig.add_trace(heatmap)
fig.update_layout(title="mESC chr3 scAB", width=500, height=300)
fig.show(renderer="png")

In [None]:
import plotly.graph_objects as go

# NOTE(review): `scAB_files` is only assigned in a commented-out cell of this
# notebook, so this cell fails on a fresh kernel - confirm which file is meant.
# Select the chr3 rows, transpose, and order the columns by their label.
data = (
    pd.read_parquet(scAB_files["Sperm"])
    .loc["chr3"]
    .T
    .sort_index(axis=1)
)

heatmap = go.Heatmap(
    x=data.columns,
    y=data.index,
    z=data.values,
    colorscale=px.colors.diverging.PRGn,  # purple to green
    zmin=0.005,
    zmax=0.012,
)
fig = go.Figure()
fig.add_trace(heatmap)
fig.update_layout(title="Sperm chr3 scAB", width=500, height=500)
fig.show(renderer="png")

In [None]:
import plotly.graph_objects as go

# NOTE(review): `scAB_files` is only assigned in a commented-out cell of this
# notebook, so this cell fails on a fresh kernel - confirm which file is meant.
# Select the chr3 rows, transpose, and order the columns by their label.
data = (
    pd.read_parquet(scAB_files["Tan2018"])
    .loc["chr3"]
    .T
    .sort_index(axis=1)
)

heatmap = go.Heatmap(
    x=data.columns,
    y=data.index,
    z=data.values,
    colorscale=px.colors.diverging.PRGn,  # purple to green
    zmin=0.005,
    zmax=0.02,
    showscale=False,
)
fig = go.Figure()
fig.add_trace(heatmap)
fig.update_layout(title="GM12878 chr3 scAB", width=500, height=200)
fig.show(renderer="png")

In [None]:
import plotly.graph_objects as go
import plotly.express as px

# Take the path from the batch arguments: it is the value work_main returns
# for this batch, and it does not depend on which calculation cells were run
# (or re-run) in the current session.
data = pd.read_parquet(
    e_scAB_args["Tan2018_50k"]["outfile"]
    )
# Sort the (chrom, start) index so that the range selection below is valid.
data = data.sort_index()
data = data.loc[
    ("chr3",60000000) : ("chr3",65000000)
    ].droplevel(0).T

data = data.sort_index(axis=1)

fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        z = data.values,
        x = data.columns,
        y = data.index,
        # purple to green
        colorscale=px.colors.diverging.PRGn,
        zmax = 0.01,
        zmin = 0.005,
        showscale=False
    )
)
fig.update_layout(
    height = 200,
    width = 500,
    title = "GM12878 chr3 60M-65M scAB"
)
fig.show(renderer="png")

In [None]:
import plotly.graph_objects as go
import plotly.express as px

# Take the path from the batch arguments: no calculation cell of this notebook
# stores "Sperm_hg_50k" in e_scAB_files, while e_scAB_args always holds the
# output path that work_main would return for this batch.
data = pd.read_parquet(
    e_scAB_args["Sperm_hg_50k"]["outfile"]
    )
# Sort the (chrom, start) index so that the range selection below is valid.
data = data.sort_index()
data = data.loc[
    ("chr3",60000000) : ("chr3",65000000)
    ].droplevel(0).T

data = data.sort_index(axis=1)

fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        z = data.values,
        x = data.columns,
        y = data.index,
        # purple to green
        colorscale=px.colors.diverging.PRGn,
        zmax = 0.01,
        zmin = 0.005,
        showscale=False
    )
)
fig.update_layout(
    height = 500,
    width = 500,
    title = "human sperm chr3 60M-65M scAB"
)
fig.show(renderer="png")

In [None]:
import plotly.graph_objects as go
import plotly.express as px

# Read the table, sort its index, keep chr3 60M-80M, drop the chromosome
# level, transpose, and order the columns by their label.
region = slice(("chr3",60000000), ("chr3",80000000))
data = (
    pd.read_parquet(h.ddir / "Sperm_hg.color2.parquet.gz")
    .sort_index()
    .loc[region]
    .droplevel(0)
    .T
    .sort_index(axis=1)
)

heatmap = go.Heatmap(
    x=data.columns,
    y=data.index,
    z=data.values,
    colorscale=px.colors.diverging.PRGn,  # purple to green
    zmin=0.005,
    zmax=0.02,
    showscale=False,
)
fig = go.Figure()
fig.add_trace(heatmap)
fig.update_layout(title="Human Sperm chr3 60M-80M scAB", width=500, height=500)
fig.show(renderer="png")

In [None]:
# NOTE(review): scratch/debugging cells. `scAB` is not assigned at notebook
# scope in any visible cell (it only exists as a local inside work_main), so
# these cells raise NameError on a fresh kernel. Presumably `scAB` was a
# sequence of per-cell mappings in an earlier session - TODO confirm or delete.
# Wrap the first two elements of `scAB` in Series, sorted by index.
s0 = pd.Series(scAB[0])
s0.sort_index(inplace=True)
s1 = pd.Series(scAB[1])
s1.sort_index(inplace=True)

In [None]:
# Slice of s1 starting at the label ("chr1", 4120000.0).
s1[("chr1",4120000.0):]

In [None]:
s0

In [None]:
s1

In [None]:
# Both series side by side, first 50 rows.
pd.concat([s0, s1],axis=1).head(50)

In [None]:
# Same comparison built from the unsorted elements.
pd.concat([pd.Series(scAB[0]), pd.Series(scAB[1])],axis=1)

In [None]:
import numpy as np

In [None]:
# Frame built from `scAB`, labelled with the first two rows of a metadata
# table that is read from a path relative to the working directory.
new_data = pd.DataFrame(scAB, index=read_meta("meta/tillsperm28.meta.csv.gz").head(2).index)

In [None]:
# 40 % of the number of rows, rounded up.
np.ceil(new_data.shape[0]*0.4)

In [None]:
# Keep only the columns with at least one non-NA value.
new_data.dropna(axis=1,thresh=1,)

In [None]:
# NOTE(review): `stack_dict` is neither imported nor defined in the visible
# cells - confirm where it is supposed to come from.
stack_dict([scAB[0],scAB[1]],read_meta("meta/tillsperm28.meta.csv.gz").head(2).index,col_thresh=0.4,row_thresh=0.4) 