In [1]:
%run "subhead.ipynb"

In [2]:
# Sample metadata table; the file to read depends on the analysis version.
if h.version in ["0", "1"]:
    meta = read_meta(h.base / "notebooks/A_meta/meta/All.meta.csv.gz")
elif h.version == "2":
    meta = read_meta(h.base / "notebooks/A_meta/meta/All_formal.meta.csv.gz")
else:
    # Without this branch `meta` stays undefined and the failure only shows up
    # later as a NameError at the first `meta.loc[...]`.
    raise ValueError(
        f"Unsupported h.version {h.version!r}; expected '0', '1' or '2'"
    )

# Sample-id lists used to select rows of `meta` for each batch.
Sperm_ga = load_json(
    h.fig1 / f"Fig1S_structure_quality/Sperm_ga.v{h.version}.json"
)
Sperm_hg_ga = load_json(
    h.fig1 / f"Fig1S_structure_quality/Sperm_hg_ga.v{h.version}.json"
)
mESC_gs = load_json(
    h.fig1 / f"Fig1S_tech_analysis/mESC_samples_with_structure.v{h.version}.json"
)
Tan2018_gs = load_json(
    h.fig1 / f"Fig1S_tech_analysis/Tan2018_samples_with_structure.v{h.version}.json"
)

# (batch prefix, sample ids, genome prefix of the CpG reference file)
batch_specs = [
    ("Tan2018", Tan2018_gs, "GRCh38"),
    ("mESC", mESC_gs, "mm10"),
    ("Sperm", Sperm_ga, "mm10"),
    ("Sperm_hg", Sperm_hg_ga, "GRCh38"),
]

binsizes = [20000, 50000, 200000, 500000]

# "<batch>_<binsize suffix>" -> {"meta", "binsize", "ref", "outfile"}
e_scAB_args = {}
for binsize in binsizes:
    suffix = h.binsize_suffix[binsize]
    scAB_args = {
        f"{prefix}_{suffix}": {
            "meta": meta.loc[samples],
            "binsize": binsize,
            # Each entry reads its own copy of the CpG table, so no DataFrame
            # is shared between batches.
            "ref": pd.read_table(
                h.ddir / f"{genome}.CpG.{suffix}.tsv.gz",
                names=["chrom", "start", "end", "CpG"],
            ),
            "outfile": h.ddir / f"{prefix}_{suffix}.color2.parquet.gz",
        }
        for prefix, samples, genome in batch_specs
    }
    e_scAB_args.update(scAB_args)

# "<batch>_<binsize suffix>" -> output parquet path
e_scAB_files = {
    batch: args["outfile"]
    for batch, args in e_scAB_args.items()
}

In [3]:
# Load the four pickled contour objects (lr/dv for Sperm and Sperm_hg) from
# the Fig1_contour directory, in the same order as the target names.
# NOTE(review): pd.read_pickle unpickles arbitrary objects; only use it on
# files produced by this project.
lr_contour, dv_contour, hg_lr_contour, hg_dv_contour = (
    pd.read_pickle(h.fig1 / "Fig1_contour" / contour_file)
    for contour_file in (
        f"Sperm.lr_contour.v{h.version}.pkl",
        f"Sperm.dv_contour.v{h.version}.pkl",
        f"Sperm_hg.lr_contour.v{h.version}.pkl",
        f"Sperm_hg.dv_contour.v{h.version}.pkl",
    )
)

In [4]:
batches = ["Sperm_scAB", "Sperm_hg_scAB", "Sperm", "Sperm_hg"]


def _bfs_paths(suffix, names=batches):
    """Map each batch name to its path under ``h.ddir`` with the suffix set to ``suffix``."""
    return {name: (h.ddir / name).with_suffix(suffix) for name in names}


# Per-batch file paths, one dict per kind of bfs output.
arg_bfs_vx_files = _bfs_paths(".vx.bfs.h5")  # single slice
arg_bfs_lr_ss_files = _bfs_paths(".lr_ss.bfs.pkl")  # single slice
arg_bfs_dv_ss_files = _bfs_paths(".dv_ss.bfs.pkl")
arg_bfs_lr_ft_files = _bfs_paths(".lr_ft.bfs.pkl")  # full thickness
arg_bfs_voxel_files = _bfs_paths(".voxel.bfs.pkl")

### Plot mouse sperm (`Sperm` batch)

In [5]:
# Settings for the mouse (Sperm) section.
ntop = 100  # number of samples taken from each end of a per-region ranking
feature_binsize = 1_000_000
scAB_file = e_scAB_files["Sperm_500k"]
# Region table indexed by region_id; later cells read its AB, chrom and center columns.
regions = pd.read_csv("Sperm.AB.center.1m.csv", index_col="region_id")

In [6]:
# Region ids split by their AB label ("A" first, then "B").
A_regions, B_regions = (
    regions.query(label_filter).index.tolist()
    for label_filter in ('AB == "A"', 'AB == "B"')
)

In [7]:
# For every annotated region, order the samples with sort_scAB over the window
# (chrom, center, center + feature_binsize) and keep `ntop` samples from each
# end of that ordering.
# NOTE(review): assumes sort_scAB returns a frame indexed by sample and sorted
# by the per-sample median over the window -- confirm against its definition.
Bstrong, Bweak = {}, {}  # agg_feature_name : sample_list
Astrong, Aweak = {}, {}

# (region ids, dict filled from the head of the ordering, dict filled from the tail)
for region_ids, head_samples, tail_samples in (
    (B_regions, Bstrong, Bweak),  # for B, heads are strong
    (A_regions, Aweak, Astrong),  # for A, tails are strong
):
    for region_id in region_ids:
        region = regions.loc[region_id, :]
        chrom, start = region["chrom"], region["center"]  # start pos of center bin
        # Window width comes from the config cell instead of a repeated literal.
        end = start + feature_binsize
        sorted_scAB_df = sort_scAB(
            scAB_file,
            chrom,
            (chrom, start, end),
            lambda x: x.median(),
        )
        sorted_scAB_samples = sorted_scAB_df.index.tolist()
        head_samples[region_id + "_dist"] = sorted_scAB_samples[:ntop]
        tail_samples[region_id + "_dist"] = sorted_scAB_samples[-ntop:]

In [10]:
import os  # only needed here, to make sure the output directory exists

# Render one density-normalised heatmap per region class and save it as PDF.
batch_name = "mouse"
bfs_file = arg_bfs_vx_files["Sperm_scAB"]
density = get_density(arg_bfs_vx_files["Sperm"])

sample_dicts = [Bstrong, Bweak, Astrong, Aweak]
region_types = ["Bstrong", "Bweak", "Astrong", "Aweak"]
titles = [f"{batch_name} {region_type}, density normed (lr=0 slice)" for region_type in region_types]
paths = [f"output/{batch_name}.{region_type}.pdf" for region_type in region_types]

# NOTE(review): fig looks like a plotly figure, whose write_image does not
# create missing parent directories -- create the target directory up front.
os.makedirs("output", exist_ok=True)

for sample_dict, region_type, title, path in zip(sample_dicts, region_types, titles, paths):
    data = scAB_feature_agg(
        bfs_file,
        sample_dict,  # agg_feature_name : sample_list
    )
    # Normalise by the batch density; NaN ratios are set to 0.
    ratio = (data / density).fillna(0)
    fig = plot_heatmap_with_bars(
        ratio.unstack(),
        cmap="RdBu_r",
    )
    fig.update_layout(title=title)
    fig.write_image(path)
    print(f"{region_type} done")

Bstrong done
Bweak done
Astrong done
Aweak done


### Plot human sperm (`Sperm_hg` batch)

In [11]:
# Settings for the human (Sperm_hg) section; these rebind the names used by the mouse section.
ntop = 100  # number of samples taken from each end of a per-region ranking
feature_binsize = 1_000_000
scAB_file = e_scAB_files["Sperm_hg_500k"]
# Region table indexed by region_id; later cells read its AB, chrom and center columns.
regions = pd.read_csv("Sperm_hg.AB.center.1m.csv", index_col="region_id")

In [12]:
# Region ids split by their AB label ("A" first, then "B").
A_regions, B_regions = (
    regions.query(label_filter).index.tolist()
    for label_filter in ('AB == "A"', 'AB == "B"')
)

In [14]:
# For every annotated region, order the samples with sort_scAB over the window
# (chrom, center, center + feature_binsize) and keep `ntop` samples from each
# end of that ordering.
# NOTE(review): assumes sort_scAB returns a frame indexed by sample and sorted
# by the per-sample median over the window -- confirm against its definition.
Bstrong, Bweak = {}, {}  # agg_feature_name : sample_list
Astrong, Aweak = {}, {}

# (region ids, dict filled from the head of the ordering, dict filled from the tail)
for region_ids, head_samples, tail_samples in (
    (B_regions, Bstrong, Bweak),  # for B, heads are strong
    (A_regions, Aweak, Astrong),  # for A, tails are strong
):
    for region_id in region_ids:
        region = regions.loc[region_id, :]
        chrom, start = region["chrom"], region["center"]  # start pos of center bin
        # Window width comes from the config cell instead of a repeated literal.
        end = start + feature_binsize
        sorted_scAB_df = sort_scAB(
            scAB_file,
            chrom,
            (chrom, start, end),
            lambda x: x.median(),
        )
        sorted_scAB_samples = sorted_scAB_df.index.tolist()
        head_samples[region_id + "_dist"] = sorted_scAB_samples[:ntop]
        tail_samples[region_id + "_dist"] = sorted_scAB_samples[-ntop:]

In [15]:
import os  # only needed here, to make sure the output directory exists

# Render one density-normalised heatmap per region class and save it as PDF.
batch_name = "human"
bfs_file = arg_bfs_vx_files["Sperm_hg_scAB"]
density = get_density(arg_bfs_vx_files["Sperm_hg"])

sample_dicts = [Bstrong, Bweak, Astrong, Aweak]
region_types = ["Bstrong", "Bweak", "Astrong", "Aweak"]
titles = [f"{batch_name} {region_type}, density normed (lr=0 slice)" for region_type in region_types]
paths = [f"output/{batch_name}.{region_type}.pdf" for region_type in region_types]

# NOTE(review): fig looks like a plotly figure, whose write_image does not
# create missing parent directories -- create the target directory up front.
os.makedirs("output", exist_ok=True)

for sample_dict, region_type, title, path in zip(sample_dicts, region_types, titles, paths):
    data = scAB_feature_agg(
        bfs_file,
        sample_dict,  # agg_feature_name : sample_list
    )
    # Normalise by the batch density; NaN ratios are set to 0.
    ratio = (data / density).fillna(0)
    fig = plot_heatmap_with_bars(
        ratio.unstack(),
        cmap="RdBu_r",
    )
    fig.update_layout(title=title)
    fig.write_image(path)
    print(f"{region_type} done")

Bstrong done
Bweak done
Astrong done
Aweak done
