In [1]:
import sys
from pathlib import Path

sys.path.insert(0, "/share/home/ychi/dev/sperm_struct/notebooks")

import h2 as h
import numpy as np
import pandas as pd
from hic_basic.binnify import GenomeIdeograph
from hic_basic.data import chromosomes, dupref_annote, fetch_cent_chromlen
from hic_basic.hicio import dump_pickle, load_pickle
from hic_basic.plot.render import centelo_relpos
from hic_basic.sequence import count_CpG
from hires_utils.hires_io import s2m_index, m2s_index

In [2]:
# When True, cached outputs are recomputed even if the output file already exists
# (checked by the CpG caching cell below).
Force = False

# Prepare features and aggs for mm10 genome

Initialize `features` (a pd.DataFrame) and `agg` (a dict).

In [3]:
# Build the 20 kb bin table for mm10 and use (chrom, start) as the feature index.
mm10 = GenomeIdeograph("mm10")
bins = mm10.bins(20e3, bed=True, order=True, flavor="hickit")
features = bins.set_index(["chrom", "start"]).drop(columns="end").copy()

# Aggregation spec filled in by the cells below: output name -> (feature column, reducer).
agg = dict()

Adding CpG density.

In [4]:
# Compute the per-bin CpG table once and cache it as a headerless, tab-separated file.
#   Force (set above): recompute even when the cache file exists.
#   Fin: manual switch that disables the computation regardless of Force.
Fin = False
outfile = h.ddir / "mm10.CpG.20k.tsv.gz"
if (not outfile.exists() or Force) and (not Fin):
    CpG = count_CpG(
        bins,
        "/share/Data/ychi/genome/GRCm38/raw/mm10.fa"
    )

    CpG.to_csv(
        outfile,
        index=False,
        header=False,
        sep="\t"
        )
elif outfile.exists():
    print(f"{outfile} exists, skipping")
else:
    # Only reachable when Fin is set and no cache exists. This case used to be
    # reported as "exists, skipping", which hid the real cause of the read
    # failure in the next cell.
    print(f"{outfile} is missing but Fin is set, skipping")

/shareb/ychi/repo/sperm_struct/notebooks/data2/mm10.CpG.20k.tsv.gz exists, skipping


In [5]:
# Read the cached CpG table (written without a header row, hence the explicit names).
CpG = pd.read_table(
    h.ddir / "mm10.CpG.20k.tsv.gz",
    names=["chrom", "start", "end", "CpG"],
    index_col=["chrom", "start"],
)
CpG = CpG.drop(columns="end")

# Outer concat followed by re-selection on the original index: keeps exactly the
# rows of `features`, in their original order, i.e. it acts like a left join.
bin_index = features.index
features = pd.concat([features, CpG], axis=1, join="outer").loc[bin_index]

agg["mean_CpG"] = ("CpG", "mean")

In [6]:
# Spot-check the CpG value distribution (10th / 50th / 90th / 95th percentiles).
CpG["CpG"].quantile([0.1,0.5,0.9,0.95])

0.10    0.00405
0.50    0.00730
0.90    0.01375
0.95    0.01595
Name: CpG, dtype: float64

Adding chromosome distribution

In [7]:
# NOTE: this rebinds `mm10`. It held the GenomeIdeograph object in the base-index cell;
# from here on it is whatever chromosomes("mm10") returns, and only its .index
# (used as chromosome names) is read below.
mm10 = chromosomes("mm10")

In [8]:
# One indicator column per chromosome: 0 everywhere, then 1 on the rows of that chromosome.
features = features.assign(**{chrom: 0 for chrom in mm10.index})
for chrom in mm10.index:
    features.loc[chrom, chrom] = 1

# Summing an indicator column counts the bins belonging to that chromosome.
agg.update({chrom + "_dist": (chrom, "sum") for chrom in mm10.index})

Adding density.

In [9]:
# Constant 1 per bin; summing it yields the number of bins in a group.
features = features.assign(particle=1)

agg["density"] = ("particle", "sum")

Adding per-sample depths.

In [10]:
# #depths_df = pd.read_parquet("/share/home/ychi/data/notebook/dailyT/sperm25/depth.parquet")
# depths_df = pd.read_parquet("/share/home/ychi/data/notebook/dailyT/sperm25/real_depth.parquet")

# agg.update(
#     {
#         "mean_depth" : ("depth","mean")
#     }
# )

Adding centelo.

In [11]:
# Per-bin value from centelo_relpos (presumably the relative position between
# centromere and telomere — confirm in hic_basic.plot.render).
# NOTE(review): s2m_index / m2s_index come from hires_utils and convert between two
# index layouts; their exact semantics are not visible in this notebook.
relpos = centelo_relpos(s2m_index(features.index), "mm10")
# Name the three returned columns so the table can be keyed by (chrom, pos).
relpos.columns = ["chrom","pos","centelo"]
relpos = relpos.set_index(["chrom","pos"])

In [12]:
# Drop sex-chromosome rows; those bins end up with NaN `centelo` after the join below.
sex_chroms = ["chrX","chrY"]
on_sex_chrom = relpos.index.get_level_values(0).isin(sex_chroms)
relpos = relpos.loc[~on_sex_chrom]

In [13]:
# Attach `centelo`, keeping exactly the rows (and order) of `features`.
bin_index = features.index
features = pd.concat([features, m2s_index(relpos)], axis=1, join="outer").loc[bin_index]

agg["mean_centelo"] = ("centelo", "mean")

Adding extreme centelo

In [14]:
# The tuple-range .loc slicing in the next cell needs a sorted MultiIndex.
features = features.sort_index()
features = s2m_index(features)

In [15]:
# For each window size, add a column "<size>M_centelo" that is
#   -1 for bins within `dis` of the centromere,
#   +1 for bins within `dis` of a chromosome end,
#    0 elsewhere.
# p / q select which side of the centromere / which chromosome end is annotated;
# only the q side is enabled here.
dists = [0.5e6, 1e6, 2e6, 5e6, 10e6]
p, q = False, True
centelo = fetch_cent_chromlen("mm10")
centelo_cols = []
for dis in dists:
    centelo_col = "%.1f" % (dis/1e6) + "M_centelo"
    centelo_cols.append(centelo_col)
    features = features.assign(**{centelo_col: 0})
    for chrom in centelo.index:
        # Sex chromosomes are left at 0.
        if chrom.startswith(("chrX", "chrY")):
            continue

        # Windows next to the centromere.
        cent_start, cent_end = centelo.loc[chrom, ["start", "end"]]
        if p:
            features.loc[(chrom, cent_start-dis):(chrom, cent_start), centelo_col] = -1
        if q:
            features.loc[(chrom, cent_end):(chrom, cent_end+dis), centelo_col] = -1

        # Windows next to the chromosome ends; written last, so they win on overlap.
        start, end = 0, centelo.loc[chrom, "chrom_length"]
        if p:
            features.loc[(chrom, start):(chrom, start+dis), centelo_col] = 1
        if q:
            features.loc[(chrom, end-dis):(chrom, end), centelo_col] = 1

# Register a sum and a mean reducer for every window column.
agg.update({col + "_dist": (col, "sum") for col in centelo_cols})
agg.update({col + "_mean_dist": (col, "mean") for col in centelo_cols})

In [16]:
# Convert the index back with m2s_index (the opposite direction of the s2m_index
# call made before the window loop; exact semantics live in hires_utils — TODO confirm).
features = m2s_index(features)

add separate extreme centelo

In [17]:
# Convert the index with s2m_index for the tuple-range slicing in the next cell.
# NOTE(review): this appears to undo the m2s_index call of the previous cell, so the
# round trip looks redundant — confirm before removing either call.
features = s2m_index(features)

In [18]:
# Same windows as the combined "<size>M_centelo" columns, but stored separately:
#   "<size>M_cent" is -1 near the centromere, "<size>M_telo" is +1 near a chromosome end.
# p / q select which side / end is annotated; only the q side is enabled here.
dists = [0.5e6, 1e6, 2e6, 5e6, 10e6]
p, q = False, True
centelo = fetch_cent_chromlen("mm10")
centelo_cols = []

# Pass 1: windows next to the centromere.
for dis in dists:
    centelo_col = "%.1f" % (dis/1e6) + "M_cent"
    centelo_cols.append(centelo_col)
    features = features.assign(**{centelo_col: 0})
    for chrom in centelo.index:
        # Sex chromosomes are left at 0.
        if chrom.startswith(("chrX", "chrY")):
            continue
        cent_start, cent_end = centelo.loc[chrom, ["start", "end"]]
        if p:
            features.loc[(chrom, cent_start-dis):(chrom, cent_start), centelo_col] = -1
        if q:
            features.loc[(chrom, cent_end):(chrom, cent_end+dis), centelo_col] = -1

# Pass 2: windows next to the chromosome ends.
for dis in dists:
    centelo_col = "%.1f" % (dis/1e6) + "M_telo"
    centelo_cols.append(centelo_col)
    features = features.assign(**{centelo_col: 0})
    for chrom in centelo.index:
        if chrom.startswith(("chrX", "chrY")):
            continue
        start, end = 0, centelo.loc[chrom, "chrom_length"]
        if p:
            features.loc[(chrom, start):(chrom, start+dis), centelo_col] = 1
        if q:
            features.loc[(chrom, end-dis):(chrom, end), centelo_col] = 1

# Register a sum and a mean reducer for every cent/telo column.
agg.update({col + "_dist": (col, "sum") for col in centelo_cols})
agg.update({col + "_mean_dist": (col, "mean") for col in centelo_cols})

In [19]:
# Convert the index back with m2s_index once the window columns are filled
# (semantics defined in hires_utils — TODO confirm).
features = m2s_index(features)

add centelo scan

In [20]:
# Band between 5 Mb and 10 Mb: difference of the two nested window columns, so the
# band keeps the sign of its source (+1 for telo, -1 for cent) and is 0 elsewhere.
scan_cols = {
    "5.0Mto10.0M_telo": features["10.0M_telo"] - features["5.0M_telo"],
    "5.0Mto10.0M_cent": features["10.0M_cent"] - features["5.0M_cent"],
}
features = features.assign(**scan_cols)

# Sum reducers first, then mean reducers, for both band columns.
agg.update({col + "_dist": (col, "sum") for col in scan_cols})
agg.update({col + "_mean_dist": (col, "mean") for col in scan_cols})


add bulk A and B compartments

In [21]:
# Table passed to AB_block_ends below to derive A/B compartment blocks
# (presumably bulk cis eigenvectors, judging by the file name — confirm).
vec = pd.read_table(h.ddir / "Sperm.cis.vecs.tsv")

In [22]:
from hic_basic.compartment import AB_block_ends

In [23]:
# Start both compartment indicators at 0; the block loop below switches bins to 1.
features = features.assign(**dict.fromkeys(["bulkA", "bulkB"], 0))

In [24]:
# Mark every bin inside an A block in `bulkA` and every bin inside a B block in `bulkB`.
# Blocks with any other label are ignored.
for idx, block in AB_block_ends(vec).iterrows():
    target = {"A": "bulkA", "B": "bulkB"}.get(block["AB"])
    if target is None:
        continue
    lo = (block["chrom"], block["start"])
    hi = (block["chrom"], block["end"])
    features.loc[lo:hi, target] = 1

In [25]:
# Summing the indicators counts the bins in A and in B compartments.
agg["bulkA_dist"] = ("bulkA", "sum")
agg["bulkB_dist"] = ("bulkB", "sum")

Dump to files.

In [26]:
# 0327, half bin
features.to_csv(h.ddir/"mm10.features.csv.gz")
dump_pickle(agg,h.ddir/"mm10.aggs.pkl")

# prepare features for Sperm

In [22]:
# Reload the mm10 feature table; the first two CSV columns form the index.
features = pd.read_csv(h.ddir/"mm10.features.csv.gz", index_col=[0, 1])
features.index = features.index.set_names(["chrom", "start"])

# Note the name: this section uses `aggs`, while the mm10 section above built `agg`.
aggs = load_pickle(h.ddir/"mm10.aggs.pkl")

### adding 1M intermingle

In [23]:
# Intermingling metrics keyed by (chrom, coarse bin start).
# NOTE(review): the saved run of this cell failed with FileNotFoundError — the parquet
# file has to be produced before this section (and every cell after it) can run.
intermingling = pd.read_parquet(
    h.ddir / "Sperm.intermingling.parquet"
)
intermingling.index.names = [
    "chrom",
    "coarsen_start" #1M reso
]

FileNotFoundError: [Errno 2] No such file or directory: '/shareb/ychi/repo/sperm_struct/notebooks/data2/Sperm.intermingling.parquet'

In [None]:
# Mapping from 20 kb bins to 1 Mb bins. The merge cell below assumes it is a Series
# aligned with `features` whose values hold the coarse start at position 1 — TODO confirm.
grouper = GenomeIdeograph("mm10").coarsen_grouper(20000, 1000000)

In [None]:
# Give every 20 kb bin the start of its coarse bin, then left-join the coarse-bin
# metrics on (chrom, coarsen_start) so each fine bin inherits its coarse bin's values.
fine_bins = features.assign(
    coarsen_start=grouper.apply(lambda key: key[1])
).reset_index()
coarse_metrics = intermingling.reset_index()

features = (
    pd.merge(fine_bins, coarse_metrics, on=["chrom", "coarsen_start"], how="left")
    .drop(columns="coarsen_start")
    .set_index(["chrom", "start"])
)

In [None]:
# Mean reducer for each intermingling metric, registered as "mean_<metric>".
aggs.update({
    "mean_" + metric: (metric, "mean")
    for metric in (
        "intermingling_ratio",
        "multi_chrom_intermingling",
        "species_richness",
    )
})

In [None]:
# Persist the Sperm-specific feature table and its aggregation spec.
features.to_csv(
    h.ddir / "Sperm.mm10.features.csv.gz",
    index=True
)
dump_pickle(aggs, h.ddir/"Sperm.mm10.aggs.pkl")

# Prepare features and aggs for mm10_dip genome

Initialize `features` (a pd.DataFrame) and `agg` (a dict).

In [3]:
# prepare base index
mm10_dip = GenomeIdeograph("mm10_dip")
bins = mm10_dip.bins(20e3,bed=True,order=True,flavor="hickit")
features = bins.set_index(["chrom","start"]).drop("end",axis=1).copy()

agg = {}

Adding CpG density.

In [4]:
# Read the cached mm10 CpG table (no header row, hence the explicit names).
CpG = pd.read_table(
    h.ddir / "mm10.CpG.20k.tsv.gz",
    names=["chrom", "start", "end", "CpG"],
    index_col=["chrom", "start"],
)
CpG = CpG.drop(columns="end")

# Annotate the mm10_dip bins from the mm10 table via dupref_annote (presumably it
# copies reference values onto both parental copies — confirm in hic_basic.data).
features = dupref_annote(features, CpG)

agg["mean_CpG"] = ("CpG", "mean")

In [5]:
# Spot-check the CpG value distribution (10th / 50th / 90th / 95th percentiles).
CpG["CpG"].quantile([0.1,0.5,0.9,0.95])

0.10    0.00405
0.50    0.00730
0.90    0.01375
0.95    0.01595
Name: CpG, dtype: float64

Adding chromosome distribution

In [6]:
# NOTE: this rebinds `mm10_dip`. It held the GenomeIdeograph object in the base-index
# cell; from here on it is whatever chromosomes("mm10_dip") returns, and only its
# .index (used as chromosome names) is read below.
mm10_dip = chromosomes("mm10_dip")

In [7]:
# One indicator column per chromosome: 0 everywhere, then 1 on the rows of that chromosome.
features = features.assign(**{chrom: 0 for chrom in mm10_dip.index})
for chrom in mm10_dip.index:
    features.loc[chrom, chrom] = 1

# Summing an indicator column counts the bins belonging to that chromosome.
agg.update({chrom + "_dist": (chrom, "sum") for chrom in mm10_dip.index})

Adding density.

In [8]:
# Constant 1 per bin; summing it yields the number of bins in a group.
features = features.assign(particle=1)

agg["density"] = ("particle", "sum")

Adding per-sample depths.

In [9]:
# #depths_df = pd.read_parquet("/share/home/ychi/data/notebook/dailyT/sperm25/depth.parquet")
# depths_df = pd.read_parquet("/share/home/ychi/data/notebook/dailyT/sperm25/real_depth.parquet")

# agg.update(
#     {
#         "mean_depth" : ("depth","mean")
#     }
# )

Adding centelo.

In [10]:
#relpos = centelo_relpos(s2m_index(features.index), "mm10", dupref=True) # remember to set dupref=True
# NOTE(review): the earlier variant (commented out above) used "mm10" with dupref=True;
# the active call uses "mm10_dip" and does not pass dupref — confirm this is intended.
relpos = centelo_relpos(s2m_index(features.index), "mm10_dip")
# Name the three returned columns so the table can be keyed by (chrom, pos).
relpos.columns = ["chrom","pos","centelo"]
relpos = relpos.set_index(["chrom","pos"])

In [11]:
# Drop sex-chromosome rows (both parental copies); those bins end up with NaN
# `centelo` after the join below.
sex_chroms = ["chrX(mat)","chrX(pat)","chrY(mat)","chrY(pat)"]
on_sex_chrom = relpos.index.get_level_values(0).isin(sex_chroms)
relpos = relpos.loc[~on_sex_chrom]

In [12]:
# Attach `centelo`, keeping exactly the rows (and order) of `features`.
bin_index = features.index
features = pd.concat([features, m2s_index(relpos)], axis=1, join="outer").loc[bin_index]

agg["mean_centelo"] = ("centelo", "mean")

Adding extreme centelo

In [13]:
# The tuple-range .loc slicing in the next cell needs a sorted MultiIndex.
features = features.sort_index()
features = s2m_index(features)

In [14]:
# For each window size, add a column "<size>M_centelo" that is
#   -1 for bins within `dis` of the centromere,
#   +1 for bins within `dis` of a chromosome end,
#    0 elsewhere.
# p / q select which side of the centromere / which chromosome end is annotated;
# only the q side is enabled here.
# NOTE(review): unlike the mm10 version of this cell, sex chromosomes are NOT skipped
# here, although they are excluded from `centelo` above — confirm this is intended.
dists = [0.5e6, 1e6, 2e6, 5e6, 10e6]
p, q = False, True
centelo = fetch_cent_chromlen("mm10_dip")
centelo_cols = []
for dis in dists:
    centelo_col = "%.1f" % (dis/1e6) + "M_centelo"
    centelo_cols.append(centelo_col)
    features = features.assign(**{centelo_col: 0})
    for chrom in centelo.index:
        # Windows next to the centromere.
        cent_start, cent_end = centelo.loc[chrom, ["start", "end"]]
        if p:
            features.loc[(chrom, cent_start-dis):(chrom, cent_start), centelo_col] = -1
        if q:
            features.loc[(chrom, cent_end):(chrom, cent_end+dis), centelo_col] = -1

        # Windows next to the chromosome ends; written last, so they win on overlap.
        start, end = 0, centelo.loc[chrom, "chrom_length"]
        if p:
            features.loc[(chrom, start):(chrom, start+dis), centelo_col] = 1
        if q:
            features.loc[(chrom, end-dis):(chrom, end), centelo_col] = 1

# Register a sum and a mean reducer for every window column.
agg.update({col + "_dist": (col, "sum") for col in centelo_cols})
agg.update({col + "_mean_dist": (col, "mean") for col in centelo_cols})

In [15]:
# Convert the index back with m2s_index (the opposite direction of the s2m_index
# call made before the window loop; exact semantics live in hires_utils — TODO confirm).
features = m2s_index(features)

### add separate centelo

In [16]:
# Re-sort and convert the index again for the tuple-range .loc slicing in the next cell.
features = features.sort_index()
features = s2m_index(features)

In [17]:
# Same windows as the combined "<size>M_centelo" columns, but stored separately:
#   "<size>M_cent" is -1 near the centromere, "<size>M_telo" is +1 near a chromosome end.
# p / q select which side / end is annotated; only the q side is enabled here.
# NOTE(review): unlike the mm10 version of this cell, sex chromosomes are NOT skipped
# here — confirm this is intended.
dists = [0.5e6, 1e6, 2e6, 5e6, 10e6]
p, q = False, True
centelo = fetch_cent_chromlen("mm10_dip")
centelo_cols = []

# Pass 1: windows next to the centromere.
for dis in dists:
    centelo_col = "%.1f" % (dis/1e6) + "M_cent"
    centelo_cols.append(centelo_col)
    features = features.assign(**{centelo_col: 0})
    for chrom in centelo.index:
        cent_start, cent_end = centelo.loc[chrom, ["start", "end"]]
        if p:
            features.loc[(chrom, cent_start-dis):(chrom, cent_start), centelo_col] = -1
        if q:
            features.loc[(chrom, cent_end):(chrom, cent_end+dis), centelo_col] = -1

# Pass 2: windows next to the chromosome ends.
for dis in dists:
    centelo_col = "%.1f" % (dis/1e6) + "M_telo"
    centelo_cols.append(centelo_col)
    features = features.assign(**{centelo_col: 0})
    for chrom in centelo.index:
        start, end = 0, centelo.loc[chrom, "chrom_length"]
        if p:
            features.loc[(chrom, start):(chrom, start+dis), centelo_col] = 1
        if q:
            features.loc[(chrom, end-dis):(chrom, end), centelo_col] = 1

# Register a sum and a mean reducer for every cent/telo column.
agg.update({col + "_dist": (col, "sum") for col in centelo_cols})
agg.update({col + "_mean_dist": (col, "mean") for col in centelo_cols})

In [18]:
# Convert the index back with m2s_index once the window columns are filled
# (semantics defined in hires_utils — TODO confirm).
features = m2s_index(features)

Dump to files.

In [19]:
# Persist the mm10_dip feature table (CSV, index written as leading columns) and the
# aggregation spec (pickle).
features.to_csv(h.ddir/"mm10_dip.features.csv.gz")
dump_pickle(agg,h.ddir/"mm10_dip.aggs.pkl")