In [1]:
%run "subhead.ipynb"

In [2]:
from pathlib import Path

import skimage
from hic_basic.plot.hic import _plot_mat, cool2mat
from hic_basic.coolstuff import cli_expected, cli_balance

In [3]:
# Batches handled in this cell; this order drives loading, balancing and the
# expected computation below.
batches = ["Sperm", "Sperm_hg", "Tan2018", "mESC"]

# Per-batch "IS" table (presumably insulation scores - confirm) stored as TSV
# under h.ddir.
IS_files = {
    name: h.ddir / f"{name}.d3.proximity_map.20k.IS.tsv"
    for name in ("Sperm_hg", "Sperm", "Tan2018", "mESC")
}
# Load every table into a DataFrame keyed by batch name.
ISs = dict(
    (name, pd.read_csv(IS_files[name], sep="\t"))
    for name in batches
)

# Cooler URIs pointing at the 20 kb resolution group of each pileup .mcool file.
coolps = {
    name: h.ddir / f"{name}.pileup.1k.mcool::resolutions/20000"
    for name in ("Sperm_hg", "Sperm", "Tan2018", "mESC")
}

# Balance each cooler. With force=False the helper reports that it skips
# execution when the 'weight' column already exists (see the cell output).
for batch in batches:
    cli_balance(coolps[batch], ignore_diags=0, force=False)

# Compute (or reuse, since force=False) the balanced expected table per batch;
# whatever cli_expected returns is stored under the batch name.
expected = {}
for batch in batches:
    expected[batch] = cli_expected(
        coolps[batch],
        h.ddir / f"{batch}.CM.balanced.expected.20k.tsv",
        ignore_diags=0,
        balanced=True,
        force=False,
    )

Balanced matrix 'weight' already exists. Skipping execution.
Balanced matrix 'weight' already exists. Skipping execution.
Balanced matrix 'weight' already exists. Skipping execution.
Balanced matrix 'weight' already exists. Skipping execution.
File '/shareb/ychi/repo/sperm_struct/notebooks/data2/Sperm.CM.balanced.expected.20k.tsv' already exists. Skipping execution.
File '/shareb/ychi/repo/sperm_struct/notebooks/data2/Sperm_hg.CM.balanced.expected.20k.tsv' already exists. Skipping execution.
File '/shareb/ychi/repo/sperm_struct/notebooks/data2/Tan2018.CM.balanced.expected.20k.tsv' already exists. Skipping execution.
File '/shareb/ychi/repo/sperm_struct/notebooks/data2/mESC.CM.balanced.expected.20k.tsv' already exists. Skipping execution.


In [6]:
outdir = "output_CM_balanced_pileup"
# Create the output directory up front: on a fresh checkout it does not exist
# and writing the first figure would fail.
Path(outdir).mkdir(parents=True, exist_ok=True)

# Parallel lists, one entry per pileup to produce. Each boundary set is used
# twice, paired with two different contact maps:
#   GM12878 borders     <- GM12878 map, Human sperm map
#   mESC borders        <- mESC map,    Mouse sperm map
#   Human sperm borders <- GM12878 map, Human sperm map
#   Mouse sperm borders <- mESC map,    Mouse sperm map
border_dfs = [ISs["Tan2018"]] * 2 + [ISs["mESC"]] * 2 + [ISs["Sperm_hg"]] * 2 + [ISs["Sperm"]] * 2
border_names = ["GM12878"] * 2 + ["mESC"] * 2 + ["Human sperm"] * 2 + ["Mouse sperm"] * 2
# NOTE(review): this rebinds `coolps` (a dict of cooler URIs in the earlier cell)
# to a list of file names; kept as-is because later cells are not visible here.
coolps = ["Tan2018.pileup.1k.mcool", "Sperm_hg.pileup.1k.mcool", "mESC.pileup.1k.mcool", "Sperm.pileup.1k.mcool"] * 2
expected_files = [expected["Tan2018"], expected["Sperm_hg"], expected["mESC"], expected["Sperm"]] * 2
coolp_names = ["GM12878", "Human sperm", "mESC", "Mouse sperm"] * 2

for border_df, border_name, coolp, coolp_name, expected_file in zip(border_dfs, border_names, coolps, coolp_names, expected_files):
    outfile = Path(f"{outdir}/{coolp_name}_on_{border_name}.pdf")
    # The PDF acts as the cache marker: if it is already there, skip this pair.
    if outfile.exists():
        print(f"{outfile} exists, skip.")
        continue
    print("Processing", outfile)

    # --- get tad list --- #

    # Keep only rows flagged in the `is_boundary_200000` column.
    border_df = border_df.loc[border_df["is_boundary_200000"]].reset_index(drop=True)
    # IS2blocks presumably turns consecutive boundaries into blocks with
    # start1/start2 columns (the query below relies on them) - confirm.
    tads = IS2blocks(border_df)
    # pick TADs size from 100kb to 1Mb
    tads = tads.query('((start2 - start1) < 1000000) and ((start2 - start1) > 100000)')

    # --- pileup --- #
    # Build the cooler URI for the 20 kb resolution group of this .mcool file.
    coolp = str(h.ddir / coolp) + "::/resolutions/20000"
    expected_df = pd.read_table(expected_file)
    mat = block_pileup(coolp, tads, expected=expected_df, power=0.25, balance=True)

    # --- plot --- #
    strength = TAD_pileup_strength(mat)
    # Clip the colour scale to the 1%-99% quantile range of the pileup matrix.
    fig = _plot_mat(
        mat,
        donorm=False,
        cmap="fall",
        ignore_diags=False,
        zmax=np.quantile(mat, 0.99),
        zmin=np.quantile(mat, 0.01),
        showscale=True,
    )
    # Unlabelled outside ticks at positions 30 and 60 on both axes
    # (presumably the edges of the rescaled TAD body - confirm).
    tick_axis = dict(
        tickvals=[30, 60],
        ticktext=["", ""],
        ticks="outside",
    )
    fig.update_layout(
        height=500,
        width=500,
        title=f"{coolp_name} pileup on {border_name} TADs<br>strength={strength:.2f}",
        xaxis=dict(tick_axis),
        yaxis=dict(tick_axis),
    )
    #fig.show(renderer="png")
    fig.write_image(outfile)
    # Also store the raw pileup matrix next to the figure as a headerless TSV.
    mat = pd.DataFrame(mat)
    mat.to_csv(outfile.with_suffix(".tsv"), sep="\t", index=False, header=False)

output_CM_balanced_pileup/GM12878_on_GM12878.pdf exists, skip.
output_CM_balanced_pileup/Human sperm_on_GM12878.pdf exists, skip.
output_CM_balanced_pileup/mESC_on_mESC.pdf exists, skip.
output_CM_balanced_pileup/Mouse sperm_on_mESC.pdf exists, skip.
output_CM_balanced_pileup/GM12878_on_Human sperm.pdf exists, skip.
output_CM_balanced_pileup/Human sperm_on_Human sperm.pdf exists, skip.
Processing output_CM_balanced_pileup/mESC_on_Mouse sperm.pdf
ref is treated as bedpe


chrom: 100%|██████████| 13/13 [01:18<00:00,  6.05s/it]


Processing output_CM_balanced_pileup/Mouse sperm_on_Mouse sperm.pdf
ref is treated as bedpe


chrom: 100%|██████████| 13/13 [01:22<00:00,  6.32s/it]
