In [1]:
import sys
import concurrent.futures
from pathlib import Path

sys.path.insert(0, "/share/home/ychi/dev/sperm_struct/notebooks")

import h2 as h
import numpy as np
import pandas as pd
from hic_basic.hicio import read_meta, load_pickle, load_json
from lib.struct import pileup_bf

### prepare data

In [8]:
# Load every input used by the pileup cells. Which files are read depends on
# the dataset version string exposed by the project module `h`.

# Fail fast on an unexpected version: without this check the branches below
# would silently define nothing and later cells would die with a NameError.
if h.version not in ("0", "1", "2"):
    raise ValueError(
        f"Unsupported h.version {h.version!r}; expected one of '0', '1', '2'"
    )

# for _3dg paths
if h.version in ["0","1"]:
    # Older versions keep the metadata in three separate tables that are
    # stacked row-wise into a single frame.
    meta = read_meta("../A_meta/meta/tillsperm55.meta.csv.gz")
    hg_meta = read_meta("../A_meta/meta/tillsperm62_hg.meta.csv.gz")
    GM_meta = read_meta("../A_meta/meta/tillsperm50_GM.meta.csv.gz")
    meta = pd.concat([meta, hg_meta, GM_meta],axis=0)
elif h.version == "2":
    meta = read_meta("../A_meta/meta/All_formal.meta.csv.gz")

# for axis
# Both pickles are merged into one mapping via dict.update, so on a key clash
# the Sperm_hg entry would overwrite the Sperm one.
# NOTE(review): presumably keyed by sample name with disjoint keys -- confirm.
primary_views = {}
primary_views.update(load_pickle(h.dsdir / "primary_views/Sperm.64.pkl"))
primary_views.update(load_pickle(h.dsdir / "primary_views/Sperm_hg.64.pkl"))

# for sample list
# The sample-list JSON files live in a different folder (and carry different
# names) for version "2" than for versions "0"/"1".
if h.version in ["0","1"]:
    Sperm_samples = load_json(f"../Fig1S_tech_analysis/Sperm_samples_with_structure.v{h.version}.json")
    Sperm_hg_samples = load_json(f"../Fig1S_tech_analysis/Sperm_hg_samples_with_structure.v{h.version}.json")
elif h.version == "2":
    Sperm_samples = load_json(f"../Fig1S_structure_quality/Sperm_ga.v{h.version}.json")
    Sperm_hg_samples = load_json(f"../Fig1S_structure_quality/Sperm_hg_ga.v{h.version}.json")

# round1 targets
# Read relative to the notebook's working directory; the first CSV column
# becomes the index, which later cells select from with the sample lists.
Sperm_round1_targets = pd.read_csv(
    f"Sperm.round1.targets.v{h.version}.csv",
    index_col = 0
)
Sperm_hg_round1_targets = pd.read_csv(
    f"Sperm_hg.round1.targets.v{h.version}.csv",
    index_col = 0
)

### pileup arguments

In [9]:
# Per-batch arguments for the pileup cells. Every mapping below is keyed by
# batch name, in the order given by `batches`.
batches = ["Sperm_round1", "Sperm_hg_round1"]

# batch_name : target df, restricted to the rows listed in the sample lists
arg_targetses = {
    "Sperm_round1": Sperm_round1_targets.loc[Sperm_samples],
    "Sperm_hg_round1": Sperm_hg_round1_targets.loc[Sperm_hg_samples],
}


def _load_particle_feature(csv_name):
    """Read a feature table from h.ddir and keep only its "particle" column.

    The first two CSV columns form the (two-level) index of the result.
    """
    feature_table = pd.read_csv(h.ddir / csv_name, index_col=[0, 1])
    return feature_table[["particle"]]


# batch_name : feature df
arg_featureses = {
    "Sperm_round1": _load_particle_feature("mm10.features.csv.gz"),
    "Sperm_hg_round1": _load_particle_feature("GRCh38.features.csv.gz"),
}

# batch_name : aggregation spec; each batch gets its own (identical) dict that
# sums "particle" into a column named "density".
arg_aggses = {name: {'density': ('particle', 'sum')} for name in batches}


def _round1_bfs_files(view):
    """Build the batch -> output pickle mapping for one view tag.

    Each path is h.ddir / batch with the suffix ".round1.<view>.bfs.pkl"
    applied through Path.with_suffix.
    """
    suffix = f".round1.{view}.bfs.pkl"
    return {name: (h.ddir / name).with_suffix(suffix) for name in batches}


# full thickness
arg_bfs_lr_ft_files = _round1_bfs_files("lr_ft")
arg_bfs_dv_ft_files = _round1_bfs_files("dv_ft")

# single slice
arg_bfs_lr_ss_files = _round1_bfs_files("lr_ss")
arg_bfs_dv_ss_files = _round1_bfs_files("dv_ss")

### do the pileups

In [12]:
Fin = False    # set to True to skip this cell's pileup run altogether
Force = False  # set to True to recompute even if the output pickle exists
outfiles = arg_bfs_lr_ft_files

def process_batch(batch):
    """Compute the pileup for one batch and pickle it to ``outfiles[batch]``.

    Parameters
    ----------
    batch : str
        Key into ``arg_targetses``, ``arg_featureses``, ``arg_aggses`` and
        ``outfiles``.

    Returns
    -------
    None
        The result is written to disk. When the output file already exists
        and ``Force`` is false, nothing is computed.
    """
    targets, bf_file = arg_targetses[batch], outfiles[batch]
    # Existing output acts as a cache unless Force is set.
    if bf_file.exists() and not Force:
        print(f"{bf_file} exists, skipping")
        return
    # NOTE(review): the lr_ft output is grouped by "ht" and "dv" -- presumably
    # a view along the lr axis; confirm against pileup_bf.
    bfs = pileup_bf(
        primary_views,
        targets,
        meta,
        features = arg_featureses[batch],
        agg = arg_aggses[batch],
        sub = None,
        grouping = ["ht", "dv"]
    )
    # Swap the two outermost column levels before saving.
    bfs = bfs.swaplevel(0,1,axis=1)
    bfs.to_pickle(bf_file)
    print(f"{bf_file} done")

if not Fin:
    with concurrent.futures.ProcessPoolExecutor(8) as executor:
        # Iterating the dict submits one task per batch name. The iterator
        # returned by executor.map only re-raises a worker's exception when
        # its results are retrieved, so it has to be consumed; otherwise a
        # failed batch would go unnoticed.
        list(executor.map(process_batch, arg_targetses))

/shareb/ychi/repo/sperm_struct/notebooks/data2/Sperm_round1.round1.lr_ft.bfs.pkl exists, skipping/shareb/ychi/repo/sperm_struct/notebooks/data2/Sperm_hg_round1.round1.lr_ft.bfs.pkl exists, skipping



In [13]:
Fin = False    # set to True to skip this cell's pileup run altogether
Force = False  # set to True to recompute even if the output pickle exists
outfiles = arg_bfs_dv_ft_files

def process_batch(batch):
    """Compute the pileup for one batch and pickle it to ``outfiles[batch]``.

    Parameters
    ----------
    batch : str
        Key into ``arg_targetses``, ``arg_featureses``, ``arg_aggses`` and
        ``outfiles``.

    Returns
    -------
    None
        The result is written to disk. When the output file already exists
        and ``Force`` is false, nothing is computed.
    """
    targets, bf_file = arg_targetses[batch], outfiles[batch]
    # Existing output acts as a cache unless Force is set.
    if bf_file.exists() and not Force:
        print(f"{bf_file} exists, skipping")
        return
    # NOTE(review): the dv_ft output is grouped by "ht" and "lr" -- presumably
    # a view along the dv axis; confirm against pileup_bf.
    bfs = pileup_bf(
        primary_views,
        targets,
        meta,
        features = arg_featureses[batch],
        agg = arg_aggses[batch],
        sub = None,
        grouping = ["ht", "lr"]
    )
    # Swap the two outermost column levels before saving.
    bfs = bfs.swaplevel(0,1,axis=1)
    bfs.to_pickle(bf_file)
    print(f"{bf_file} done")

if not Fin:
    with concurrent.futures.ProcessPoolExecutor(8) as executor:
        # Iterating the dict submits one task per batch name. The iterator
        # returned by executor.map only re-raises a worker's exception when
        # its results are retrieved, so it has to be consumed; otherwise a
        # failed batch would go unnoticed.
        list(executor.map(process_batch, arg_targetses))

/shareb/ychi/repo/sperm_struct/notebooks/data2/Sperm_hg_round1.round1.dv_ft.bfs.pkl exists, skipping
/shareb/ychi/repo/sperm_struct/notebooks/data2/Sperm_round1.round1.dv_ft.bfs.pkl exists, skipping
