In [1]:
import sys
from pathlib import Path

sys.path.insert(0, "/share/home/ychi/dev/sperm_struct/notebooks")

import h2 as h
import numpy as np
import pandas as pd
from hic_basic.binnify import GenomeIdeograph
from hic_basic.data import chromosomes, dupref_annote, fetch_cent_chromlen
from hic_basic.hicio import dump_pickle, load_pickle
from hic_basic.plot.render import centelo_relpos
from hic_basic.sequence import count_CpG
from hires_utils.hires_io import s2m_index, m2s_index

In [2]:
# Global override: when True, the cached CpG table is recomputed even if its file already exists.
Force = False

# Prepare features for GRCh38 genome

Initialize `features` (pd.DataFrame) and `agg` (dict).

In [15]:
# Base index: one row per GRCh38 bin (bin size 20e3), keyed by (chrom, start).
genomes = GenomeIdeograph("GRCh38")
bins = genomes.bins(20e3, bed=True, order=True, flavor="hickit")
features = (
    bins
    .set_index(["chrom", "start"])
    .drop(columns="end")
    .copy()
)

# Output name -> (feature column, aggregation); extended by each section below.
agg = dict()

Adding CpG density.

In [16]:
# Count CpG per bin and cache the table on disk.
# The computation runs only when Fin is False and the cache is missing (or Force is set).
Fin = False
outfile = h.ddir / "GRCh38.CpG.20k.tsv.gz"
if not Fin and (Force or not outfile.exists()):
    CpG = count_CpG(
        bins,
        "/share/Data/ychi/genome/GRCh38/raw/GRCh38.primary_assembly.genome.fa",
    )

    # Written without header or index; the loading cell supplies the column names.
    CpG.to_csv(outfile, index=False, header=False, sep="\t")

In [17]:
# Load the cached per-bin CpG table; the file is headerless, so columns are named here.
CpG = (
    pd.read_table(
        h.ddir / "GRCh38.CpG.20k.tsv.gz",
        names=["chrom", "start", "end", "CpG"],
        index_col=["chrom", "start"],
    )
    .drop(columns="end")
)

# strict left: align on the index, then keep exactly the rows of the feature table
features = pd.concat([features, CpG], axis=1, join="outer").loc[features.index]
# features = dupref_annote(features, CpG)

agg["mean_CpG"] = ("CpG", "mean")

Adding chromosome distribution

In [18]:
# Rebinds `genomes` (previously the GenomeIdeograph object) to the result of chromosomes();
# its index is iterated in the next cell as the chromosome names.
genomes = chromosomes("GRCh38")

In [19]:
# One indicator column per chromosome, initialised to 0 and set to 1 on that chromosome's rows.
features = features.assign(**{chrom: 0 for chrom in genomes.index})
for chrom in genomes.index:
    features.loc[chrom, chrom] = 1

# Summing an indicator column counts the rows belonging to that chromosome.
agg.update({chrom + "_dist": (chrom, "sum") for chrom in genomes.index})

Adding density.

In [20]:
# Constant 1 per row, so a "sum" aggregation yields the number of rows in a group.
features = features.assign(**{"particle": 1})

agg["density"] = ("particle", "sum")

Adding centelo.

In [21]:
# Compute the centelo value for every bin, then key the result by (chrom, pos).
relpos = centelo_relpos(s2m_index(features.index), "GRCh38")
relpos = (
    relpos
    .set_axis(["chrom", "pos", "centelo"], axis=1)
    .set_index(["chrom", "pos"])
)

In [22]:
# Drop chrX / chrY rows so those bins receive no centelo value in the join below.
relpos = relpos[~relpos.index.get_level_values(0).isin(["chrX", "chrY"])]

In [23]:
# strict left: attach the centelo column while keeping exactly the rows of the feature table
features = pd.concat(
    [features, m2s_index(relpos)],
    axis=1,
    join="outer",
).loc[features.index]
agg["mean_centelo"] = ("centelo", "mean")

Adding extreme centelo (bins close to a centromere or a telomere).

In [24]:
# Tuple-range .loc slicing in the next cell requires a sorted MultiIndex.
features = s2m_index(features.sort_index())

In [25]:
# For each distance, add a "<d>M_centelo" column: -1 for rows within `dis` of a centromere
# edge, +1 for rows within `dis` of a chromosome end, 0 otherwise. chrX / chrY are skipped.
dists = [0.5e6, 1e6, 2e6, 5e6, 10e6]
p, q = True, True  # p gates the arm1 windows, q gates the arm2 windows
centelo = fetch_cent_chromlen("GRCh38")
centelo_cols = []
for dis in dists:
    centelo_col = "%.1f" % (dis/1e6) + "M_centelo"
    centelo_cols.append(centelo_col)
    features = features.assign(**{centelo_col: 0})
    for chrom in centelo.index:
        if chrom.startswith(("chrX", "chrY")):
            continue
        cent_start, cent_end = centelo.loc[chrom, ["start","end"]]
        if p:
            # arm1 paracentric
            features.loc[(chrom, cent_start-dis):(chrom, cent_start), centelo_col] = -1
        if q:
            # arm2 paracentric
            features.loc[(chrom, cent_end):(chrom, cent_end+dis), centelo_col] = -1

        # Telomeric windows are written last, so they win where windows overlap.
        start, end = 0, centelo.loc[chrom, "chrom_length"]
        if p:
            # arm1 near telomere
            features.loc[(chrom, start):(chrom, start+dis), centelo_col] = 1
        if q:
            # arm2 near telomere
            features.loc[(chrom, end-dis):(chrom, end), centelo_col] = 1

# Register both the sum and the mean of every flag column.
agg.update({name + "_dist": (name, "sum") for name in centelo_cols})
agg.update({name + "_mean_dist": (name, "mean") for name in centelo_cols})

In [26]:
# Convert the index back after the window slicing (presumably the inverse of the
# s2m_index call made before it — confirm against hires_utils).
features = m2s_index(features)

Adding separate extreme centromere and telomere flags.

In [27]:
# NOTE(review): the preceding cell has just applied m2s_index, so this immediately
# converts back again; the round trip looks redundant when the notebook runs top to bottom.
features = s2m_index(features)

In [28]:
# Separate flag columns per distance: "<d>M_cent" is -1 for rows within `dis` of a
# centromere edge, "<d>M_telo" is +1 for rows within `dis` of a chromosome end.
# chrX / chrY are skipped in both passes.
dists = [0.5e6, 1e6, 2e6, 5e6, 10e6]
p, q = True, True  # p gates the arm1 windows, q gates the arm2 windows
centelo = fetch_cent_chromlen("GRCh38")
centelo_cols = []

# Pass 1: pericentromeric windows.
for dis in dists:
    centelo_col = "%.1f" % (dis/1e6) + "M_cent"
    centelo_cols.append(centelo_col)
    features = features.assign(**{centelo_col: 0})
    for chrom in centelo.index:
        if chrom.startswith(("chrX", "chrY")):
            continue
        cent_start, cent_end = centelo.loc[chrom, ["start","end"]]
        if p:
            # arm1 paracentric
            features.loc[(chrom, cent_start-dis):(chrom, cent_start), centelo_col] = -1
        if q:
            # arm2 paracentric
            features.loc[(chrom, cent_end):(chrom, cent_end+dis), centelo_col] = -1

# Pass 2: telomeric windows.
for dis in dists:
    centelo_col = "%.1f" % (dis/1e6) + "M_telo"
    centelo_cols.append(centelo_col)
    features = features.assign(**{centelo_col: 0})
    for chrom in centelo.index:
        if chrom.startswith(("chrX", "chrY")):
            continue
        start, end = 0, centelo.loc[chrom, "chrom_length"]
        if p:
            # arm1 near telomere
            features.loc[(chrom, start):(chrom, start+dis), centelo_col] = 1
        if q:
            # arm2 near telomere
            features.loc[(chrom, end-dis):(chrom, end), centelo_col] = 1

# Register both the sum and the mean of every flag column.
agg.update({name + "_dist": (name, "sum") for name in centelo_cols})
agg.update({name + "_mean_dist": (name, "mean") for name in centelo_cols})

In [29]:
# Convert the index back after the window slicing (presumably the inverse of the
# s2m_index call made before it — confirm against hires_utils).
features = m2s_index(features)

Adding centelo scan (5–10 Mb bands).

In [30]:
# Band columns: the 10 Mb flag minus the 5 Mb flag is non-zero only for rows flagged
# at 10 Mb but not at 5 Mb.
features = features.assign(**{
    "5.0Mto10.0M_telo": features["10.0M_telo"] - features["5.0M_telo"],
    "5.0Mto10.0M_cent": features["10.0M_cent"] - features["5.0M_cent"],
})

# Sums first, then means (same insertion order as two separate updates).
agg.update({
    "5.0Mto10.0M_telo_dist": ("5.0Mto10.0M_telo", "sum"),
    "5.0Mto10.0M_cent_dist": ("5.0Mto10.0M_cent", "sum"),
    "5.0Mto10.0M_telo_mean_dist": ("5.0Mto10.0M_telo", "mean"),
    "5.0Mto10.0M_cent_mean_dist": ("5.0Mto10.0M_cent", "mean"),
})


Adding GM12878 subcompartments.

In [31]:
# Read the headerless 9-column BED file and keep only the interval and its label.
subcompartments = pd.read_table(
    "/share/Data/ychi/raw/Rao2014/GM12878_subcompartments.hg38.bed",
    names=[
        "chrom", "start", "end",
        "subcompartment", "int_label",
        "chrom2", "start2", "end2",
        "RGB",
    ],
).loc[:, ["chrom", "start", "end", "subcompartment"]]

In [32]:
# Discard intervals that carry no subcompartment label.
subcompartments = subcompartments.loc[subcompartments["subcompartment"].notna()]

In [33]:
# One zero-initialised indicator column per distinct subcompartment label.
features = features.assign(
    **{label: 0 for label in subcompartments["subcompartment"].unique()}
)

In [34]:
# Set each interval's label column to 1 on the rows falling inside the interval.
# NOTE(review): .loc label slices include both endpoints, so a row whose key equals
# `end` is flagged too — confirm this is intended for BED-style intervals.
for i, row in subcompartments.iterrows():
    chrom, label = row["chrom"], row["subcompartment"]
    features.loc[(chrom, row["start"]):(chrom, row["end"]), label] = 1

In [35]:
# Disabled debug check: list intervals whose `start` and `start2` differ.
# NOTE(review): `start2` is dropped by the column selection at load time, so re-enabling
# this needs the full table.
#subcompartments.query('start != start2')

In [36]:
# Each label column is summed; the output name is the label itself.
agg.update(
    {label: (label, "sum") for label in subcompartments["subcompartment"].unique()}
)

Adding bulk A and B compartments.

In [37]:
# Table that is passed to AB_block_ends() below to obtain the A/B blocks.
vec = pd.read_table(h.ddir / "Sperm_hg.cis.vecs.tsv")

In [38]:
from hic_basic.compartment import AB_block_ends

In [39]:
# Zero-initialised indicator columns for the bulk A and B compartments.
features = features.assign(**dict.fromkeys(["bulkA", "bulkB"], 0))

In [43]:
# NOTE(review): assign() with no arguments adds no columns; this cell only rebinds
# `features` to a copy of itself and can likely be deleted.
features = features.assign(
    
)

In [45]:
# Flag the rows covered by each A block in "bulkA" and by each B block in "bulkB";
# blocks with any other label are ignored.
# NOTE(review): the .loc label slice includes the row keyed at `end` — confirm intended.
for i, row in AB_block_ends(vec).iterrows():
    block_rows = slice((row["chrom"], row["start"]), (row["chrom"], row["end"]))
    if row["AB"] == "A":
        features.loc[block_rows, "bulkA"] = 1
    elif row["AB"] == "B":
        features.loc[block_rows, "bulkB"] = 1

In [47]:
# Summing the indicators counts the rows in each bulk compartment.
agg["bulkA_dist"] = ("bulkA", "sum")
agg["bulkB_dist"] = ("bulkB", "sum")

Dump to files.

In [48]:
# Persist the feature table (CSV, index included by default) and the aggregation spec
# (pickle) next to each other in h.ddir.
features.to_csv(h.ddir / "GRCh38.features.csv.gz")
dump_pickle(agg, h.ddir / "GRCh38.aggs.pkl")

# prepare features for Sperm_hg

In [21]:
# Reload the GRCh38 feature table written above; the first two CSV columns form the index.
features = pd.read_csv(h.ddir/"GRCh38.features.csv.gz", index_col=[0, 1])
features.index = features.index.set_names(["chrom", "start"])
# Note: this section calls the aggregation spec `aggs`, not `agg`.
aggs = load_pickle(h.ddir/"GRCh38.aggs.pkl")

In [51]:
# Display the reloaded feature table.
features

Unnamed: 0_level_0,Unnamed: 1_level_0,CpG,chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,...,5.0M_telo,10.0M_telo,5.0Mto10.0M_telo,5.0Mto10.0M_cent,B1,A1,B2,B4,A2,B3
chr,pos,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
chr1,0,0.030700,1,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
chr1,20000,0.017900,1,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
chr1,40000,0.006600,1,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
chr1,60000,0.006750,1,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
chr1,80000,0.008450,1,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrY,57120000,0.007000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chrY,57140000,0.008900,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chrY,57160000,0.008900,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chrY,57180000,0.011450,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Adding 1 Mb intermingling

In [None]:
# Load the intermingling table and name its two index levels.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError for this path.
intermingling = pd.read_parquet(h.ddir / "Sperm_hg.intermingling.parquet")
intermingling.index.names = ["chrom", "coarsen_start"]  # 1M reso

FileNotFoundError: [Errno 2] No such file or directory: '/shareb/ychi/repo/sperm_struct/notebooks/data2/Sperm_hg.intermingling.parquet'

In [None]:
# Grouper from 20000 to 1000000 resolution; the next cell takes element [1] of each of
# its values as the `coarsen_start` key (presumably the coarse bin start — confirm).
grouper = GenomeIdeograph("GRCh38").coarsen_grouper(20000, 1000000)

In [None]:
# Attach the intermingling columns to every row via a temporary `coarsen_start` key,
# then restore the (chrom, start) index.
features = (
    features
    .assign(coarsen_start=grouper.apply(lambda x: x[1]))
    .reset_index()
    .merge(
        intermingling.reset_index(),
        on=["chrom", "coarsen_start"],
        how="left",
    )
    .drop(columns="coarsen_start")
    .set_index(["chrom", "start"])
)

In [None]:
# Each intermingling column is aggregated by its mean.
aggs["mean_intermingling_ratio"] = ("intermingling_ratio", "mean")
aggs["mean_multi_chrom_intermingling"] = ("multi_chrom_intermingling", "mean")
aggs["mean_species_richness"] = ("species_richness", "mean")

In [None]:
# Persist the Sperm_hg feature table (with its index) and the matching aggregation spec.
features.to_csv(
    h.ddir / "Sperm_hg.GRCh38.features.csv.gz",
    index=True
)
dump_pickle(aggs, h.ddir/"Sperm_hg.GRCh38.aggs.pkl")

# Prepare features for GRCh38_dip genome

Initialize `features` (pd.DataFrame) and `agg` (dict).

In [52]:
# prepare base index
genomes = GenomeIdeograph("GRCh38_dip")
bins = genomes.bins(20e3,bed=True,order=True,flavor="hickit")
features = bins.set_index(["chrom","start"]).drop("end",axis=1).copy()

agg = {}

Adding CpG density.

In [53]:
# Reuse the GRCh38 CpG table; the file is headerless, so columns are named here.
CpG = (
    pd.read_table(
        h.ddir / "GRCh38.CpG.20k.tsv.gz",
        names=["chrom", "start", "end", "CpG"],
        index_col=["chrom", "start"],
    )
    .drop(columns="end")
)
# Annotate via dupref_annote instead of a plain strict-left concat
# (presumably it maps the reference annotation onto the diploid rows — confirm).
features = dupref_annote(features, CpG)

agg["mean_CpG"] = ("CpG", "mean")

Adding chromosome distribution

In [54]:
# Rebinds `genomes` (previously the GenomeIdeograph object) to the result of chromosomes();
# its index is iterated in the next cell as the chromosome names.
genomes = chromosomes("GRCh38_dip")

In [55]:
# One indicator column per chromosome, initialised to 0 and set to 1 on that chromosome's rows.
features = features.assign(**{chrom: 0 for chrom in genomes.index})
for chrom in genomes.index:
    features.loc[chrom, chrom] = 1

# Summing an indicator column counts the rows belonging to that chromosome.
agg.update({chrom + "_dist": (chrom, "sum") for chrom in genomes.index})

Adding density.

In [56]:
# Constant 1 per row, so a "sum" aggregation yields the number of rows in a group.
features = features.assign(**{"particle": 1})

agg["density"] = ("particle", "sum")

Adding centelo.

In [57]:
# Compute the centelo value for every bin, then key the result by (chrom, pos).
relpos = centelo_relpos(s2m_index(features.index), "GRCh38_dip")
relpos = (
    relpos
    .set_axis(["chrom", "pos", "centelo"], axis=1)
    .set_index(["chrom", "pos"])
)

In [58]:
# Drop the maternal/paternal chrX and chrY rows so those bins receive no centelo value.
relpos = relpos[
    ~relpos.index.get_level_values(0).isin(
        ["chrX(mat)", "chrX(pat)", "chrY(mat)", "chrY(pat)"]
    )
]

In [59]:
# strict left: attach the centelo column while keeping exactly the rows of the feature table
features = pd.concat(
    [features, m2s_index(relpos)],
    axis=1,
    join="outer",
).loc[features.index]
agg["mean_centelo"] = ("centelo", "mean")

### Adding extreme centromere and telomere

In [60]:
# Tuple-range .loc slicing in the next cell requires a sorted MultiIndex.
features = s2m_index(features.sort_index())

In [61]:
# For each distance, add a "<d>M_centelo" column: -1 for rows within `dis` of a centromere
# edge, +1 for rows within `dis` of a chromosome end, 0 otherwise.
# NOTE(review): no chromosome is skipped here, although the centelo cell above drops the
# chrX/chrY (mat/pat) rows — confirm the difference is intended.
dists = [0.5e6, 1e6, 2e6, 5e6, 10e6]
p, q = True, True  # p gates the arm1 windows, q gates the arm2 windows
centelo = fetch_cent_chromlen("GRCh38_dip")
centelo_cols = []
for dis in dists:
    centelo_col = "%.1f" % (dis/1e6) + "M_centelo"
    centelo_cols.append(centelo_col)
    features = features.assign(**{centelo_col: 0})
    for chrom in centelo.index:
        cent_start, cent_end = centelo.loc[chrom, ["start","end"]]
        if p:
            # arm1 paracentric
            features.loc[(chrom, cent_start-dis):(chrom, cent_start), centelo_col] = -1
        if q:
            # arm2 paracentric
            features.loc[(chrom, cent_end):(chrom, cent_end+dis), centelo_col] = -1

        # Telomeric windows are written last, so they win where windows overlap.
        start, end = 0, centelo.loc[chrom, "chrom_length"]
        if p:
            # arm1 near telomere
            features.loc[(chrom, start):(chrom, start+dis), centelo_col] = 1
        if q:
            # arm2 near telomere
            features.loc[(chrom, end-dis):(chrom, end), centelo_col] = 1

# Register both the sum and the mean of every flag column.
agg.update({name + "_dist": (name, "sum") for name in centelo_cols})
agg.update({name + "_mean_dist": (name, "mean") for name in centelo_cols})

In [62]:
# Convert the index back after the window slicing (presumably the inverse of the
# s2m_index call made before it — confirm against hires_utils).
features = m2s_index(features)

Adding GM12878 subcompartments.

In [63]:
# Read the headerless 9-column BED file and keep only the interval and its label.
subcompartments = pd.read_table(
    "/share/Data/ychi/raw/Rao2014/GM12878_subcompartments.hg38.bed",
    names=[
        "chrom", "start", "end",
        "subcompartment", "int_label",
        "chrom2", "start2", "end2",
        "RGB",
    ],
).loc[:, ["chrom", "start", "end", "subcompartment"]]

In [64]:
# Discard intervals that carry no subcompartment label.
subcompartments = subcompartments.loc[subcompartments["subcompartment"].notna()]

In [65]:
# One zero-initialised indicator column per distinct subcompartment label.
features = features.assign(
    **{label: 0 for label in subcompartments["subcompartment"].unique()}
)

In [67]:
# Apply every interval to both parental copies by appending "(mat)" / "(pat)" to the
# chromosome name.
# NOTE(review): .loc label slices include both endpoints, so a row whose key equals
# `end` is flagged too — confirm this is intended for BED-style intervals.
for i, row in subcompartments.iterrows():
    for suffix in ["(mat)","(pat)"]:
        chrom_copy = row["chrom"]+suffix
        features.loc[
            (chrom_copy, row["start"]):(chrom_copy, row["end"]),
            row["subcompartment"],
        ] = 1

In [68]:
# Sanity check: column totals; for the indicator columns this is the number of flagged rows.
features.sum()

CpG           2947.065593
chr1(mat)    12448.000000
chr1(pat)    12448.000000
chr2(mat)    12110.000000
chr2(pat)    12110.000000
                 ...     
A1           39602.000000
B2           43394.000000
B4            1080.000000
A2           57670.000000
B3           85086.000000
Length: 62, dtype: float64

In [69]:
# Each label column is summed; the output name is the label itself.
agg.update(
    {label: (label, "sum") for label in subcompartments["subcompartment"].unique()}
)

Dump to files.

In [14]:
# Persist the diploid feature table (CSV, index included by default) and its aggregation spec.
features.to_csv(h.ddir / "GRCh38_dip.features.csv.gz")
dump_pickle(agg, h.ddir / "GRCh38_dip.aggs.pkl")

# Prepare features for hg19_dip genome

Initialize `features` (pd.DataFrame) and `agg` (dict).

In [None]:
# prepare base index
genomes = GenomeIdeograph("hg19_dip")
bins = genomes.bins(20e3,bed=True)
bins = bins.set_index(["chrom","start"])
bins.drop("end",axis=1,inplace=True)
features = bins.copy()

agg = {}

Adding CpG density.

In [None]:
# Load the headerless three-column hg19 CpG table, keyed by (chrom, pos).
CpG = pd.read_table(
    "/share/home/ychi/dev/dip-c/color/hg19.cpg.20k.txt",
    names=["chrom", "pos", "CpG"],
    index_col=["chrom", "pos"],
)

# Annotate via dupref_annote instead of a plain strict-left concat
# (presumably it maps the reference annotation onto the diploid rows — confirm).
features = dupref_annote(features, CpG)

agg["mean_CpG"] = ("CpG", "mean")

Adding chromosome distribution

In [None]:
# Rebinds `genomes` (previously the GenomeIdeograph object) to the result of chromosomes();
# its index is iterated in the next cell as the chromosome names.
genomes = chromosomes("hg19_dip")

In [None]:
# One indicator column per chromosome, initialised to 0 and set to 1 on that chromosome's rows.
features = features.assign(**{chrom: 0 for chrom in genomes.index})
for chrom in genomes.index:
    features.loc[chrom, chrom] = 1

# Summing an indicator column counts the rows belonging to that chromosome.
agg.update({chrom + "_dist": (chrom, "sum") for chrom in genomes.index})

Adding density.

In [None]:
# Constant 1 per row, so a "sum" aggregation yields the number of rows in a group.
features = features.assign(**{"particle": 1})

agg["density"] = ("particle", "sum")

Adding centelo.

In [None]:
# Disabled: hg19 is not supported yet, so the centelo step is skipped for this genome.
# relpos = centelo_relpos(features.index, "hg19")
# relpos.columns = ["chrom","pos","centelo"]
# relpos = relpos.set_index(["chrom","pos"])

In [None]:
# Disabled together with the cell above: join of the centelo column and its aggregation entry.
# features = pd.concat([features,relpos],axis=1,join="outer").loc[bins.index]
# agg.update(
#     {"mean_centelo" : ("centelo", "mean")}
# )

Dump to files.

In [None]:
# Persist the hg19_dip feature table (CSV, index included by default) and its aggregation spec.
features.to_csv(h.ddir / "hg19_dip.features.csv.gz")
dump_pickle(agg, h.ddir / "hg19_dip.aggs.pkl")