# Micro-C Contact vs Distance plot

Reference: [cooltools "Contacts vs. distance" tutorial](https://cooltools.readthedocs.io/en/latest/notebooks/contacts_vs_distance.html)

In [1]:
# INSTALL REQUIRED PACKAGES

#!pip install cooltools==0.7.0
#!pip install seaborn
#!pip install cooler

In [1]:
# Standard library imports
import os
import warnings
from itertools import combinations
from multiprocessing import Pool
import subprocess

# Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors
import bioframe
import cooler
import cooltools
from packaging import version

# Jupyter magic (only in notebooks)
%matplotlib inline

# ------------------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------------------

# Suppress non-critical warnings (this silences ALL warnings for the session)
warnings.filterwarnings('ignore')

# Determine number of CPUs; fall back to 1 when os.cpu_count() gives a falsy value
num_cpus = os.cpu_count() or 1

# The outer quotes are double so that the single-quoted literals inside the
# replacement field also parse on Python < 3.12 (nested same-type quotes in
# f-strings are only accepted from 3.12 onwards).
print(f"Using {num_cpus} CPU core{'s' if num_cpus > 1 else ''}.")

Using 10 CPU cores.


## Parameters

In [2]:
# Directories holding reference annotations and helper scripts
REFDIR = 'reference'
SCRDIR = 'script'

# Chromosome sizes for mm10, retrieved through bioframe (UCSC)
mm10_chromsizes = bioframe.fetch_chromsizes('mm10')

# Centromere definitions, read from a local tab-separated table
mm10_cens = pd.read_csv(
    f'{REFDIR}/mm10/mm10.centromere.txt',
    sep='\t',
)

# View of chromosome arms derived from the chromosome sizes and the centromeres
mm10_arms = bioframe.make_chromarms(mm10_chromsizes, mm10_cens)


## Calculating CvD with downsampled matrix
All samples are downsampled to the same read count and then KR-normalized so that their contact-vs-distance curves can be compared directly.

In [3]:
def process_cvd(MCOOLDIR, COOLDIR, OUTDIR, NAME, RES, DSCOUNT, mm10_arms, num_cpus, WORKDIR):
    '''
    Downsample, normalize and compute the contacts-vs-distance curve of one sample.

    1. Subsamples the merged .mcool at resolution RES to DSCOUNT reads (if needed)
    2. KR-normalizes the subsampled matrix via normCool.sh (if needed)
    3. Computes the expected cis-decay curve, smooths it, masks dist<2,
       merges duplicates, and writes out both the merged curve and its log-log derivative.

    Steps 1 and 2 are skipped when the downsampled .cool already exists. If either
    step fails, the partially written .cool is removed so that the next call does
    not mistake an un-normalized file for a finished one.

    Parameters
    ----------
    MCOOLDIR : str
        Directory containing the pooled .mcool file (expects {NAME}_allRes.mcool).
    COOLDIR : str
        Directory for normalized/downsampled .cool files. Must already contain
        {NAME}_{RES}bp_KR.cool, whose total count is used to derive the
        sampling fraction.
    OUTDIR : str
        Directory for output TSVs (created if missing).
    NAME : str
        Sample name prefix.
    RES : int
        Resolution in bp.
    DSCOUNT : int
        Number of contacts to subsample (e.g. 5_000_000).
    mm10_arms : pd.DataFrame
        A view_df defining chromosome arms.
    num_cpus : int
        Number of processes to use for cooltools.sample and expected_cis.
    WORKDIR : str
        Directory containing `normCool.sh`.

    Returns
    -------
    cvd : pd.DataFrame
        Output of cooltools.expected_cis with the smoothed columns set to NaN
        where dist < 2.
    cvd_merged : pd.DataFrame
        One row per distinct `dist`, columns `dist_bp` and
        `balanced.avg.smoothed.agg`.
    der_df : pd.DataFrame
        Single column `derivative`: slope of the merged curve in log-log space.

    Raises
    ------
    ValueError
        If sampling is required but DSCOUNT exceeds the total of the
        pre-normalized matrix (sampling fraction > 1).
    subprocess.CalledProcessError
        If normCool.sh exits with a non-zero status.
    '''
    os.makedirs(OUTDIR, exist_ok=True)

    # Common prefix of every file produced for this sample/resolution/depth
    tag = f'{NAME}_{RES}bp_downsampled-{DSCOUNT // 1_000_000}M'
    subsampled = f'{COOLDIR}/{tag}_KR.cool'

    # calculate subsampling fraction
    # NOTE(review): the fraction is derived from the pre-normalized .cool while
    # sampling is applied to the .mcool; this assumes both hold the same total.
    pre_norm = cooler.Cooler(f'{COOLDIR}/{NAME}_{RES}bp_KR.cool')
    total = pre_norm.info['sum']
    frac = DSCOUNT / total
    # Plain names inside the f-strings keep them valid on Python < 3.12
    print(f'prenorm: {total}')
    print(f'sampling fraction: {frac}')

    # 1) subsample + normalize if needed
    if not os.path.exists(subsampled):
        if frac > 1:
            raise ValueError(
                f'DSCOUNT ({DSCOUNT}) exceeds the total number of contacts '
                f'({total}); cannot downsample {NAME}.'
            )

        print(f'sampling to {subsampled}')
        mc = cooler.Cooler(f'{MCOOLDIR}/{NAME}_allRes.mcool::/resolutions/{RES}')
        script = os.path.join(WORKDIR, 'normCool.sh')
        try:
            cooltools.sample(mc,
                             out_clr_path=subsampled,
                             frac=frac,
                             nproc=num_cpus)

            print(f'normalizing {subsampled}')
            subprocess.run([script, subsampled, str(num_cpus)], check=True)
        except BaseException:
            # The existence check above doubles as the cache test, so a file
            # left behind by a failed/interrupted run would make the next call
            # skip normalization. Drop it before propagating the error.
            if os.path.exists(subsampled):
                os.remove(subsampled)
            raise
    else:
        print(f'skipping sample (exists): {subsampled}')

    # 2) compute expected cis-decay curve
    clr = cooler.Cooler(subsampled)
    cvd = cooltools.expected_cis(
        clr=clr,
        view_df=mm10_arms,
        smooth=True,
        aggregate_smoothed=True,
        smooth_sigma=0.1,
        nproc=num_cpus
    )

    # write smoothed curve (before masking, so the file holds the raw output)
    out_smooth = os.path.join(OUTDIR, f'{tag}_cvd_smooth.tsv')
    cvd.to_csv(out_smooth, sep='\t', index=False)

    # mask dist<2
    mask = cvd['dist'] < 2
    cvd.loc[mask, ['balanced.avg.smoothed',
                   'balanced.avg.smoothed.agg']] = np.nan

    # merge duplicates & save: keep the first row seen for each distance
    cvd_merged = cvd.drop_duplicates(subset=['dist'])[
        ['dist_bp', 'balanced.avg.smoothed.agg']
    ]
    out_merged = os.path.join(OUTDIR, f'{tag}_cvd_smooth_merged.tsv')
    cvd_merged.to_csv(out_merged, sep='\t', index=False)

    # derivative in log-log space & save
    # (masked rows are NaN, so the derivative is NaN at and next to them)
    der = np.gradient(
        np.log(cvd_merged['balanced.avg.smoothed.agg']),
        np.log(cvd_merged['dist_bp'])
    )
    der_df = pd.DataFrame({'derivative': der})
    out_deriv = os.path.join(OUTDIR, f'{tag}_cvd_smooth_merged_derivative.tsv')
    der_df.to_csv(out_deriv, sep='\t', index=False)

    print('Done.')
    return cvd, cvd_merged, der_df

In [4]:
# Pooled samples to run; the G1 conditions are currently switched off.
names = [
    # 'G1DMSO_pooled',
    # 'G1dTAG_pooled',
    # 'G1A485_pooled',
    'GSE178982_AsyncUT_pooled',
    'GSE178982_AsyncAID_pooled',
]

# Arguments that are identical for every sample
cvd_settings = dict(
    MCOOLDIR='../data/mcool_pooled',
    COOLDIR='../data/cool_norm_pooled',
    OUTDIR='../data/cvd',
    RES=1000,
    DSCOUNT=300_000_000,
    mm10_arms=mm10_arms,
    num_cpus=num_cpus,
    WORKDIR=SCRDIR,
)

for name in names:
    print(f"Processing {name}…")
    process_cvd(NAME=name, **cvd_settings)

INFO:root:creating a Pool of 10 workers


Processing GSE178982_AsyncUT_pooled…
prenorm: 547970606.0
sampling fraction: 0.5474746212938291
sampling to ../data/cool_norm_pooled/GSE178982_AsyncUT_pooled_1000bp_downsampled-300M_KR.cool
normalizing ../data/cool_norm_pooled/GSE178982_AsyncUT_pooled_1000bp_downsampled-300M_KR.cool


INFO:cooler.cli.balance:Balancing "../data/cool_norm_pooled/GSE178982_AsyncUT_pooled_1000bp_downsampled-300M_KR.cool"
INFO:cooler.balance:variance is 14809.524750937491
INFO:cooler.balance:variance is 15864.696217205408
INFO:cooler.balance:variance is 9281.704299816976
INFO:cooler.balance:variance is 13402.160493263798
INFO:cooler.balance:variance is 6488.1869022356805
INFO:cooler.balance:variance is 9288.78668019357
INFO:cooler.balance:variance is 4139.518913970893
INFO:cooler.balance:variance is 5911.178548852835
INFO:cooler.balance:variance is 2604.762537277713
INFO:cooler.balance:variance is 3650.058457815581
INFO:cooler.balance:variance is 1661.307450094676
INFO:cooler.balance:variance is 2241.6526975250567
INFO:cooler.balance:variance is 1079.0891425283066
INFO:cooler.balance:variance is 1383.976826842285
INFO:cooler.balance:variance is 711.4259933177158
INFO:cooler.balance:variance is 862.4654863247615
INFO:cooler.balance:variance is 473.7244511150567
INFO:cooler.balance:varianc

Done.
Processing GSE178982_AsyncAID_pooled…
prenorm: 2132479625.0
sampling fraction: 0.1406812972480335
sampling to ../data/cool_norm_pooled/GSE178982_AsyncAID_pooled_1000bp_downsampled-300M_KR.cool
normalizing ../data/cool_norm_pooled/GSE178982_AsyncAID_pooled_1000bp_downsampled-300M_KR.cool


INFO:cooler.cli.balance:Balancing "../data/cool_norm_pooled/GSE178982_AsyncAID_pooled_1000bp_downsampled-300M_KR.cool"
INFO:cooler.balance:variance is 12003.657236315594
INFO:cooler.balance:variance is 15919.708973148514
INFO:cooler.balance:variance is 9744.595582403606
INFO:cooler.balance:variance is 15403.19205516411
INFO:cooler.balance:variance is 7271.964853049178
INFO:cooler.balance:variance is 11611.215501841254
INFO:cooler.balance:variance is 4823.322328444558
INFO:cooler.balance:variance is 7882.161967074914
INFO:cooler.balance:variance is 3153.2162696807045
INFO:cooler.balance:variance is 5150.518096472962
INFO:cooler.balance:variance is 2105.378806316995
INFO:cooler.balance:variance is 3334.445654687017
INFO:cooler.balance:variance is 1441.9219011775278
INFO:cooler.balance:variance is 2165.046451176527
INFO:cooler.balance:variance is 1006.395825419514
INFO:cooler.balance:variance is 1416.3789661877029
INFO:cooler.balance:variance is 710.2895541364046
INFO:cooler.balance:varia

Done.
