In [1]:
import sys
import concurrent.futures
from pathlib import Path

sys.path.insert(0, "/share/home/ychi/dev/sperm_struct/notebooks")

import h2 as h
import numpy as np
import pandas as pd
from hic_basic.hicio import read_meta, load_pickle, load_json
from lib.struct import pileup_bf

### prepare data

In [2]:
meta = read_meta("../A_meta/meta/All_formal.meta.csv.gz")

# Primary views (used for the axis), merged from every per-batch pickle.
primary_views = {}
for _views_pkl in (
    "primary_views/Sperm.64.pkl",
    "primary_views/Sperm_hg.64.pkl",
    # "primary_views/contaminate_res.64.pkl",
):
    primary_views.update(load_pickle(h.dsdir / _views_pkl))

# Sample lists: only the samples recorded in the versioned JSON files.
# GM_samples = list(load_pickle(h.dsdir / "primary_views/GM.64.pkl").keys())
Sperm_samples = load_json(f"../Fig1S_tech_analysis/Sperm_samples_with_structure.v{h.version}.json")
Sperm_hg_samples = load_json(f"../Fig1S_tech_analysis/Sperm_hg_samples_with_structure.v{h.version}.json")


def _unit_targets(samples):
    """Build the "random" target table: one (1, 1, 1) row per sample.

    Rows are indexed by sample name; columns are the three axes
    ``ht``, ``dv`` and ``lr``.
    """
    rows = [[1, 1, 1] for _ in samples]
    return pd.DataFrame(rows, index=samples, columns=["ht", "dv", "lr"])


# Random targets, one table per batch.
# GM_random_targets = _unit_targets(GM_samples)
Sperm_random_targets = _unit_targets(Sperm_samples)
Sperm_hg_random_targets = _unit_targets(Sperm_hg_samples)

# Stack the per-batch tables row-wise into a single lookup table.
targets = pd.concat(
    [
        # GM_random_targets,
        Sperm_random_targets,
        Sperm_hg_random_targets,
    ],
    axis=0,
)

### pileup arguments

In [3]:
# Batches to process; every arg_* dict below is keyed by these names.
batches = [
    "Sperm",
    #"GM",
    "Sperm_hg",
]

arg_targetses = { # batch_name : target df
    "Sperm" : targets.loc[Sperm_samples],
    #"GM" : targets.loc[GM_samples],
    "Sperm_hg" : targets.loc[Sperm_hg_samples],
}


def _load_particle_features(filename):
    """Read a feature table from ``h.ddir`` and keep only the ``particle`` column.

    The first two CSV columns form the (Multi)index; the result is a
    one-column DataFrame.
    """
    return pd.read_csv(h.ddir / filename, index_col=[0, 1])[["particle"]]


arg_featureses = { # batch_name : feature df
    "Sperm" : _load_particle_features("mm10.features.csv.gz"),
    "Sperm_hg" : _load_particle_features("GRCh38.features.csv.gz"),
}

arg_aggses = { # batch_name : named aggregation passed as `agg`
    "Sperm" : {'density': ('particle', 'sum')},
    "Sperm_hg" : {'density': ('particle', 'sum')},
}


def _bfs_outfiles(tag):
    """Map every batch to its output pickle ``<batch>.random.<tag>.bfs.pkl``.

    The filename is built by string formatting rather than
    ``Path.with_suffix``: ``with_suffix`` *replaces* an existing extension,
    so a batch name containing a dot would silently lose its last component
    and could collide with another batch's output file.
    """
    return {
        batch : h.ddir / f"{batch}.random.{tag}.bfs.pkl"
        for batch in batches
    }


arg_bfs_lr_ft_files = _bfs_outfiles("lr_ft") # full thickness
arg_bfs_dv_ft_files = _bfs_outfiles("dv_ft")

arg_bfs_lr_ss_files = _bfs_outfiles("lr_ss") # single slice
arg_bfs_dv_ss_files = _bfs_outfiles("dv_ss")

### do the pileups

In [6]:
Fin = False    # True: skip the pileup run in this cell altogether
Force = False  # True: recompute even if the output pickle already exists
outfiles = arg_bfs_lr_ft_files

def process_batch(batch):
    """Run the pileup for one batch and pickle the result.

    Reads the module-level ``arg_targetses``, ``arg_featureses``,
    ``arg_aggses``, ``outfiles``, ``primary_views``, ``meta`` and ``Force``.
    When the output file already exists and ``Force`` is false, nothing is
    computed.

    Parameters
    ----------
    batch : str
        Key into the ``arg_*`` dicts and ``outfiles``.

    Returns
    -------
    None
        The result is written to ``outfiles[batch]``; progress is printed.
    """
    # Named batch_targets so the module-level `targets` table is not shadowed.
    batch_targets, bf_file = arg_targetses[batch], outfiles[batch]
    if bf_file.exists() and not Force:
        print(f"{bf_file} exists, skipping")
        return
    bfs = pileup_bf(
        primary_views,
        batch_targets,
        meta,
        features = arg_featureses[batch],
        agg = arg_aggses[batch],
        sub = None,
        grouping = ["ht", "dv"]
    )
    # Swap the two outermost column levels before saving
    # (assumes pileup_bf returns MultiIndex columns -- TODO confirm).
    bfs = bfs.swaplevel(0,1,axis=1)
    bfs.to_pickle(bf_file)
    print(f"{bf_file} done")

if not Fin:
    with concurrent.futures.ProcessPoolExecutor(8) as executor:
        # Executor.map only re-raises a worker's exception when its result is
        # retrieved. Draining the iterator makes a failed batch fail the cell
        # instead of passing silently.
        list(executor.map(process_batch, arg_targetses))

/shareb/ychi/repo/sperm_struct/notebooks/data2/Sperm.random.lr_ft.bfs.pkl exists, skipping/shareb/ychi/repo/sperm_struct/notebooks/data2/Sperm_hg.random.lr_ft.bfs.pkl exists, skipping



In [7]:
Fin = False    # True: skip the pileup run in this cell altogether
Force = False  # True: recompute even if the output pickle already exists
outfiles = arg_bfs_dv_ft_files

# NOTE: this replaces the `process_batch` defined in an earlier cell; the two
# differ only in `grouping` and in the `outfiles` mapping they write to.
def process_batch(batch):
    """Run the pileup for one batch, grouped by ``ht``/``lr``, and pickle it.

    Reads the module-level ``arg_targetses``, ``arg_featureses``,
    ``arg_aggses``, ``outfiles``, ``primary_views``, ``meta`` and ``Force``.
    When the output file already exists and ``Force`` is false, nothing is
    computed.

    Parameters
    ----------
    batch : str
        Key into the ``arg_*`` dicts and ``outfiles``.

    Returns
    -------
    None
        The result is written to ``outfiles[batch]``; progress is printed.
    """
    # Named batch_targets so the module-level `targets` table is not shadowed.
    batch_targets, bf_file = arg_targetses[batch], outfiles[batch]
    if bf_file.exists() and not Force:
        print(f"{bf_file} exists, skipping")
        return
    bfs = pileup_bf(
        primary_views,
        batch_targets,
        meta,
        features = arg_featureses[batch],
        agg = arg_aggses[batch],
        sub = None,
        grouping = ["ht", "lr"]
    )
    # Swap the two outermost column levels before saving
    # (assumes pileup_bf returns MultiIndex columns -- TODO confirm).
    bfs = bfs.swaplevel(0,1,axis=1)
    bfs.to_pickle(bf_file)
    print(f"{bf_file} done")

if not Fin:
    with concurrent.futures.ProcessPoolExecutor(8) as executor:
        # Executor.map only re-raises a worker's exception when its result is
        # retrieved. Draining the iterator makes a failed batch fail the cell
        # instead of passing silently.
        list(executor.map(process_batch, arg_targetses))

/shareb/ychi/repo/sperm_struct/notebooks/data2/Sperm.random.dv_ft.bfs.pkl exists, skipping/shareb/ychi/repo/sperm_struct/notebooks/data2/Sperm_hg.random.dv_ft.bfs.pkl exists, skipping

