# Micro-C Contact vs Distance plot

Reference: [cooltools "Contacts vs. distance" tutorial](https://cooltools.readthedocs.io/en/latest/notebooks/contacts_vs_distance.html)

In [1]:
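# INSTALL REQUIRED PACKAGES
# Uncomment and run the lines below once if these packages are missing
# from the kernel's environment.

#!pip install cooltools==0.7.0
#!pip install seaborn
#!pip install cooler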

In [2]:
# Standard library imports
import os
import warnings
from itertools import combinations
from multiprocessing import Pool
import subprocess

# Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors
import bioframe
import cooler
import cooltools
from packaging import version

# Jupyter magic (only in notebooks)
%matplotlib inline

# ------------------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------------------

# Suppress non-critical warnings.
# NOTE: this is a blanket filter, so every warning category is hidden for the
# rest of the session, not only the noisy ones.
warnings.filterwarnings('ignore')

# Determine number of CPUs; os.cpu_count() may return None, so fall back to 1.
num_cpus = os.cpu_count() or 1

# The outer quotes are double so the inner 's' / '' literals do not terminate
# the f-string on interpreters older than Python 3.12.
print(f"Using {num_cpus} CPU core{'s' if num_cpus > 1 else ''}.")

Using 10 CPU cores.


## Parameters

In [3]:
REFDIR = 'reference'
SCRDIR = 'script'

# Chromosome sizes for mm10, fetched through bioframe (UCSC).
mm10_chromsizes = bioframe.fetch_chromsizes('mm10')

# Centromere definitions come from a local tab-separated reference file.
mm10_cens = pd.read_csv(
    f'{REFDIR}/mm10/mm10.centromere.txt',
    sep='\t',
)

# Combine chromosome sizes and centromere definitions into a view of
# chromosome arms.
mm10_arms = bioframe.make_chromarms(mm10_chromsizes, mm10_cens)


## Calculating CvD with downsampled matrix
All samples are downsampled to the same read count and then KR-normalized so that their contact-vs-distance curves can be compared directly.

In [4]:
def process_cvd(MCOOLDIR, COOLDIR, OUTDIR, NAME, RES, DSCOUNT, mm10_arms, num_cpus, WORKDIR):
    '''
    Downsample one sample, normalize it, and write its contact-vs-distance curve.

    1. Derives the sampling fraction as DSCOUNT divided by the total contact
       count stored in {COOLDIR}/{NAME}_{RES}bp_KR.cool.
    2. If the downsampled .cool does not exist yet, samples the RES layer of
       {MCOOLDIR}/{NAME}_allRes.mcool with that fraction and runs normCool.sh
       on the result. If either step fails, the file created by this call is
       removed so that a later run does not mistake it for a finished one.
    3. Computes the smoothed expected cis-decay curve and writes it out.
    4. Masks dist<2, keeps one row per distance, and writes both the merged
       curve and its log-log derivative.

    Parameters
    ----------
    MCOOLDIR : str
        Directory containing the pooled .mcool file ({NAME}_allRes.mcool).
    COOLDIR : str
        Directory holding {NAME}_{RES}bp_KR.cool; the downsampled .cool file
        is written here as well. It is not created by this function.
    OUTDIR : str
        Directory for output TSVs (created if missing).
    NAME : str
        Sample name prefix.
    RES : int
        Resolution in bp.
    DSCOUNT : int
        Number of contacts to subsample (e.g. 5_000_000). File names carry
        DSCOUNT // 1_000_000 followed by 'M'.
    mm10_arms : pd.DataFrame
        A view_df defining chromosome arms.
    num_cpus : int
        Number of processes to use for cooltools.sample and expected_cis;
        also passed to normCool.sh as its second argument.
    WORKDIR : str
        Directory containing `normCool.sh`.

    Returns
    -------
    tuple
        (cvd, cvd_merged, der_df): the expected-cis table (with the dist<2
        rows of the smoothed columns set to NaN), the per-distance merged
        curve, and a one-column DataFrame named 'derivative'.

    Raises
    ------
    subprocess.CalledProcessError
        If normCool.sh exits with a non-zero status.
    '''
    os.makedirs(OUTDIR, exist_ok=True)

    # Common stem of every file produced for this sample/resolution/depth.
    tag = f'{NAME}_{RES}bp_downsampled-{DSCOUNT//1_000_000}M'
    subsampled = f'{COOLDIR}/{tag}_KR.cool'

    # calculate subsampling fraction
    pre_norm = cooler.Cooler(f'{COOLDIR}/{NAME}_{RES}bp_KR.cool')
    total_contacts = pre_norm.info['sum']
    frac = DSCOUNT / total_contacts
    # The value is pulled into a local first: reusing single quotes inside a
    # single-quoted f-string is a SyntaxError before Python 3.12.
    print(f'prenorm: {total_contacts}')
    print(f'sampling fraction: {frac}')

    # 1) subsample + normalize if needed
    if not os.path.exists(subsampled):
        try:
            print(f'sampling to {subsampled}')
            mc = cooler.Cooler(f'{MCOOLDIR}/{NAME}_allRes.mcool::/resolutions/{RES}')
            cooltools.sample(mc,
                             out_clr_path=subsampled,
                             frac=frac,
                             nproc=num_cpus)

            script = os.path.join(WORKDIR, 'normCool.sh')
            print(f'normalizing {subsampled}')
            subprocess.run([script, subsampled, str(num_cpus)], check=True)
        except BaseException:
            # The exists-check above is the only "already done" marker, so a
            # partially written or un-normalized file must not be left behind
            # (this also covers a kernel interrupt). The file did not exist
            # before this call, so removing it discards nothing older.
            if os.path.exists(subsampled):
                os.remove(subsampled)
            raise
    else:
        print(f'skipping sample (exists): {subsampled}')

    # 2) compute expected cis-decay curve
    clr = cooler.Cooler(subsampled)
    cvd = cooltools.expected_cis(
        clr=clr,
        view_df=mm10_arms,
        smooth=True,
        aggregate_smoothed=True,
        smooth_sigma=0.1,
        nproc=num_cpus
    )

    # write smoothed curve (before masking, so the file keeps all distances)
    out_smooth = os.path.join(OUTDIR, f'{tag}_cvd_smooth.tsv')
    cvd.to_csv(out_smooth, sep='\t', index=False)

    # mask dist<2
    mask = cvd['dist'] < 2
    cvd.loc[mask, ['balanced.avg.smoothed',
                   'balanced.avg.smoothed.agg']] = np.nan

    # merge duplicates & save: keep the first row seen for every distance
    cvd_merged = cvd.drop_duplicates(subset=['dist'])[
        ['dist_bp', 'balanced.avg.smoothed.agg']
    ]
    out_merged = os.path.join(OUTDIR, f'{tag}_cvd_smooth_merged.tsv')
    cvd_merged.to_csv(out_merged, sep='\t', index=False)

    # derivative in log-log space & save
    der = np.gradient(
        np.log(cvd_merged['balanced.avg.smoothed.agg']),
        np.log(cvd_merged['dist_bp'])
    )
    der_df = pd.DataFrame({'derivative': der})
    out_deriv = os.path.join(OUTDIR, f'{tag}_cvd_smooth_merged_derivative.tsv')
    der_df.to_csv(out_deriv, sep='\t', index=False)

    print('Done.')
    return cvd, cvd_merged, der_df

In [14]:
names = [
    'G1DMSO_pooled',
    'G1dTAG_pooled',
    'G1A485_pooled',
    'GSE178982_AsyncUT_pooled',
    'GSE178982_AsyncAID_pooled'
]

# Arguments that are identical for every sample; only NAME varies per call.
shared_cvd_kwargs = dict(
    MCOOLDIR='../data/mcool_pooled',
    COOLDIR='../data/cool_norm_pooled',
    OUTDIR='../data/cvd',
    RES=1000,
    DSCOUNT=300_000_000,
    mm10_arms=mm10_arms,
    num_cpus=num_cpus,
    WORKDIR=SCRDIR,
)

# Process each sample in turn; the returned tables are not kept here because
# process_cvd writes them to OUTDIR.
for name in names:
    print(f"Processing {name}…")
    process_cvd(NAME=name, **shared_cvd_kwargs)

INFO:root:creating a Pool of 10 workers


Processing G1DMSO_pooled…
prenorm: 1387522117.0
sampling fraction: 0.21621276974570922
sampling to ../data/cool_norm_pooled/G1DMSO_pooled_1000bp_downsampled-300M_KR.cool
normalizing ../data/cool_norm_pooled/G1DMSO_pooled_1000bp_downsampled-300M_KR.cool


INFO:cooler.cli.balance:Balancing "../data/cool_norm_pooled/G1DMSO_pooled_1000bp_downsampled-300M_KR.cool"
INFO:cooler.balance:variance is 19269.25735339472
INFO:cooler.balance:variance is 8704.289939100387
INFO:cooler.balance:variance is 2599.7889517330455
INFO:cooler.balance:variance is 4173.937561209409
INFO:cooler.balance:variance is 1260.2737061540067
INFO:cooler.balance:variance is 2046.703383754511
INFO:cooler.balance:variance is 702.3725307028188
INFO:cooler.balance:variance is 1039.6095829031628
INFO:cooler.balance:variance is 416.8074917404319
INFO:cooler.balance:variance is 545.2793512280296
INFO:cooler.balance:variance is 251.6071623936816
INFO:cooler.balance:variance is 293.18185218412856
INFO:cooler.balance:variance is 151.75276459543286
INFO:cooler.balance:variance is 160.57867411932236
INFO:cooler.balance:variance is 90.97064169501076
INFO:cooler.balance:variance is 89.17148932577115
INFO:cooler.balance:variance is 54.1690429575004
INFO:cooler.balance:variance is 50.032

Done.
Processing G1dTAG_pooled…
prenorm: 1236239723.0
sampling fraction: 0.24267138033065777
sampling to ../data/cool_norm_pooled/G1dTAG_pooled_1000bp_downsampled-300M_KR.cool
normalizing ../data/cool_norm_pooled/G1dTAG_pooled_1000bp_downsampled-300M_KR.cool


INFO:cooler.cli.balance:Balancing "../data/cool_norm_pooled/G1dTAG_pooled_1000bp_downsampled-300M_KR.cool"
INFO:cooler.balance:variance is 93514.96799323727
INFO:cooler.balance:variance is 4003.7700768558057
INFO:cooler.balance:variance is 982.4550812370488
INFO:cooler.balance:variance is 1594.0050313942093
INFO:cooler.balance:variance is 480.685888556365
INFO:cooler.balance:variance is 709.9711603816203
INFO:cooler.balance:variance is 263.4042241403402
INFO:cooler.balance:variance is 335.58033404062763
INFO:cooler.balance:variance is 148.19301700314438
INFO:cooler.balance:variance is 164.7929389243706
INFO:cooler.balance:variance is 83.2063566361227
INFO:cooler.balance:variance is 83.03360621043974
INFO:cooler.balance:variance is 46.31760570803385
INFO:cooler.balance:variance is 42.59008442905958
INFO:cooler.balance:variance is 25.561645647289968
INFO:cooler.balance:variance is 22.122097782947545
INFO:cooler.balance:variance is 14.007538974847384
INFO:cooler.balance:variance is 11.594

Done.
Processing G1A485_pooled…
prenorm: 1798711513.0
sampling fraction: 0.16678605648086492
sampling to ../data/cool_norm_pooled/G1A485_pooled_1000bp_downsampled-300M_KR.cool
normalizing ../data/cool_norm_pooled/G1A485_pooled_1000bp_downsampled-300M_KR.cool


INFO:cooler.cli.balance:Balancing "../data/cool_norm_pooled/G1A485_pooled_1000bp_downsampled-300M_KR.cool"
INFO:cooler.balance:variance is 48870.789731506295
INFO:cooler.balance:variance is 5518.36546215804
INFO:cooler.balance:variance is 1559.7893265834355
INFO:cooler.balance:variance is 1844.0038529490473
INFO:cooler.balance:variance is 578.8677788134617
INFO:cooler.balance:variance is 669.2959445458052
INFO:cooler.balance:variance is 254.9711661613593
INFO:cooler.balance:variance is 260.77079575729243
INFO:cooler.balance:variance is 117.88264008849424
INFO:cooler.balance:variance is 106.8211305748193
INFO:cooler.balance:variance is 54.782094299735384
INFO:cooler.balance:variance is 45.2241367102713
INFO:cooler.balance:variance is 25.315735417867767
INFO:cooler.balance:variance is 19.559986094792585
INFO:cooler.balance:variance is 11.62071925202241
INFO:cooler.balance:variance is 8.57849526073353
INFO:cooler.balance:variance is 5.305145953779798
INFO:cooler.balance:variance is 3.7968

Done.
Processing GSE178982_AsyncUT_pooled…
prenorm: 601264599.0
sampling fraction: 0.4989483839543329
sampling to ../data/cool_norm_pooled/GSE178982_AsyncUT_pooled_1000bp_downsampled-300M_KR.cool
normalizing ../data/cool_norm_pooled/GSE178982_AsyncUT_pooled_1000bp_downsampled-300M_KR.cool


INFO:cooler.cli.balance:Balancing "../data/cool_norm_pooled/GSE178982_AsyncUT_pooled_1000bp_downsampled-300M_KR.cool"
INFO:cooler.balance:variance is 770978.5036486239
INFO:cooler.balance:variance is 9302.415593120215
INFO:cooler.balance:variance is 5015.958420804777
INFO:cooler.balance:variance is 5433.101900412451
INFO:cooler.balance:variance is 2730.6780770134515
INFO:cooler.balance:variance is 2770.595673944817
INFO:cooler.balance:variance is 1381.7090447588614
INFO:cooler.balance:variance is 1345.2758879656487
INFO:cooler.balance:variance is 692.0793802963112
INFO:cooler.balance:variance is 648.3486507841446
INFO:cooler.balance:variance is 349.965879810914
INFO:cooler.balance:variance is 315.24444815794686
INFO:cooler.balance:variance is 179.28597940560434
INFO:cooler.balance:variance is 155.4972775357999
INFO:cooler.balance:variance is 92.91196360535264
INFO:cooler.balance:variance is 77.87309823717132
INFO:cooler.balance:variance is 48.58828190752461
INFO:cooler.balance:variance

Done.
Processing GSE178982_AsyncAID_pooled…
prenorm: 2088765526.0
sampling fraction: 0.14362550332516355
sampling to ../data/cool_norm_pooled/GSE178982_AsyncAID_pooled_1000bp_downsampled-300M_KR.cool
normalizing ../data/cool_norm_pooled/GSE178982_AsyncAID_pooled_1000bp_downsampled-300M_KR.cool


INFO:cooler.cli.balance:Balancing "../data/cool_norm_pooled/GSE178982_AsyncAID_pooled_1000bp_downsampled-300M_KR.cool"
INFO:cooler.balance:variance is 207648.8928140819
INFO:cooler.balance:variance is 11337.341290633969
INFO:cooler.balance:variance is 6512.623180673498
INFO:cooler.balance:variance is 7458.935719944595
INFO:cooler.balance:variance is 3677.2096575985875
INFO:cooler.balance:variance is 4048.94145225271
INFO:cooler.balance:variance is 1908.6164267195784
INFO:cooler.balance:variance is 2059.50755482569
INFO:cooler.balance:variance is 989.2054282202909
INFO:cooler.balance:variance is 1036.7663274515735
INFO:cooler.balance:variance is 523.8189490107254
INFO:cooler.balance:variance is 527.3743424281306
INFO:cooler.balance:variance is 283.6187567065722
INFO:cooler.balance:variance is 272.89118012501825
INFO:cooler.balance:variance is 156.19230917983376
INFO:cooler.balance:variance is 143.73785815471854
INFO:cooler.balance:variance is 87.01030572728119
INFO:cooler.balance:varian

Done.
