# Micro-C Contact vs Distance plot

Reference: [cooltools tutorial — Contacts vs. distance](https://cooltools.readthedocs.io/en/latest/notebooks/contacts_vs_distance.html)

In [1]:
# INSTALL REQUIRED PACKAGES

#!pip install cooltools==0.7.0
#!pip install seaborn
#!pip install cooler

In [1]:
# Standard library imports
import os
import warnings
from itertools import combinations
from multiprocessing import Pool
import subprocess

# Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors
import bioframe
import cooler
import cooltools
from packaging import version

# Jupyter magic (only in notebooks)
%matplotlib inline

# ------------------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------------------

# Silence every warning category for the rest of the session. Note that this
# is a blanket filter: it also hides numerical warnings (e.g. log of zero)
# raised by later cells.
warnings.filterwarnings('ignore')

# Number of worker processes to hand to cooltools; os.cpu_count() may return
# None, in which case fall back to a single process.
num_cpus = os.cpu_count() or 1

# Build the label first instead of nesting same-type quotes inside the
# f-string, which is a SyntaxError on Python < 3.12.
core_label = 'cores' if num_cpus > 1 else 'core'
print(f'Using {num_cpus} CPU {core_label}.')

# All relative paths used below (reference/, script/, ../data/...) are resolved
# against this directory. Set the MTOG1_PROJECT_DIR environment variable to
# run the notebook on another machine; the default is the original location.
PROJECT_DIR = os.environ.get(
    'MTOG1_PROJECT_DIR',
    "/Volumes/UKJIN_SSD/MtoG1_analysis_code",
)
os.chdir(PROJECT_DIR)

Using 10 CPU cores.


## Parameters

In [2]:
# Project sub-directories, relative to the working directory set above.
REFDIR = 'reference'
SCRDIR = 'script'

# Chromosome sizes for the mm10 assembly, obtained through bioframe.
# NOTE(review): whether this hits UCSC or a bundled table depends on the
# installed bioframe version — confirm if offline use matters.
mm10_chromsizes = bioframe.fetch_chromsizes('mm10')

# Tab-separated centromere table shipped with the project.
centromere_table = f'{REFDIR}/mm10/mm10.centromere.txt'
mm10_cens = pd.read_csv(centromere_table, sep='\t')

# View frame that splits each chromosome into arms at its centromere;
# it is passed as `view_df` when computing the expected curve.
mm10_arms = bioframe.make_chromarms(mm10_chromsizes, mm10_cens)


## Calculating CvD with downsampled matrix
All samples are downsampled to the same read count and then KR-normalized so that their contact-vs-distance curves can be compared directly.

In [3]:
def process_cvd(MCOOLDIR, COOLDIR, OUTDIR, NAME, RES, DSCOUNT, mm10_arms, num_cpus, WORKDIR):
    '''
    Downsample one sample, normalize it, and write its contacts-vs-distance curves.

    Steps
    -----
    1. Read the total contact count from ``{COOLDIR}/{NAME}_{RES}bp_KR.cool``
       and derive the sampling fraction ``DSCOUNT / total``.
    2. If the downsampled cooler does not exist yet, sample
       ``{MCOOLDIR}/{NAME}_allRes.mcool`` at resolution RES with that fraction
       and run ``normCool.sh <cool> <num_cpus>`` on the result. If either step
       fails, the partially written cooler is removed so that a later run does
       not mistake it for a finished, normalized file.
    3. Compute the smoothed expected cis curve on the view `mm10_arms`, write
       it out, blank the smoothed values at dist < 2, keep one row per
       distance, and write that merged curve and its log-log derivative.

    Parameters
    ----------
    MCOOLDIR : str
        Directory containing the pooled ``{NAME}_allRes.mcool`` file.
    COOLDIR : str
        Directory holding ``{NAME}_{RES}bp_KR.cool`` (must already exist) and
        receiving the downsampled ``.cool`` file.
    OUTDIR : str
        Directory for output TSVs; created if missing.
    NAME : str
        Sample name prefix.
    RES : int
        Resolution in bp.
    DSCOUNT : int
        Target number of contacts after subsampling (e.g. 300_000_000).
        File names carry ``DSCOUNT // 1_000_000`` followed by ``M``.
    mm10_arms : pd.DataFrame
        A view_df defining chromosome arms.
    num_cpus : int
        Number of processes passed to cooltools.sample, normCool.sh and
        cooltools.expected_cis.
    WORKDIR : str
        Directory containing the executable `normCool.sh`.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame, pd.DataFrame)
        ``(cvd, cvd_merged, der_df)``: the full expected table (with the
        dist < 2 smoothed values set to NaN), the per-distance table with
        columns ``dist_bp`` and ``balanced.avg.smoothed.agg``, and a
        single-column ``derivative`` table aligned row-for-row with
        ``cvd_merged``.

    Raises
    ------
    ValueError
        If sampling is required but ``DSCOUNT / total`` is not in (0, 1].
    subprocess.CalledProcessError
        If `normCool.sh` exits with a non-zero status.
    '''
    os.makedirs(OUTDIR, exist_ok=True)

    # Common prefix shared by the downsampled cooler and every output table.
    ds_label = f'{NAME}_{RES}bp_downsampled-{DSCOUNT // 1_000_000}M'
    subsampled = f'{COOLDIR}/{ds_label}_KR.cool'

    # Sampling fraction is derived from the contact total of the existing
    # single-resolution cooler for this sample.
    pre_norm = cooler.Cooler(f'{COOLDIR}/{NAME}_{RES}bp_KR.cool')
    total = pre_norm.info['sum']
    frac = DSCOUNT / total
    print(f'prenorm: {total}')
    print(f'sampling fraction: {frac}')

    # 1) subsample + normalize if needed
    if not os.path.exists(subsampled):
        if not 0 < frac <= 1:
            raise ValueError(
                f'cannot downsample {NAME} to {DSCOUNT} contacts: '
                f'only {total} available (fraction {frac})'
            )

        print(f'sampling to {subsampled}')
        mc = cooler.Cooler(f'{MCOOLDIR}/{NAME}_allRes.mcool::/resolutions/{RES}')
        script = os.path.join(WORKDIR, 'normCool.sh')
        try:
            cooltools.sample(mc,
                             out_clr_path=subsampled,
                             frac=frac,
                             nproc=num_cpus)

            print(f'normalizing {subsampled}')
            subprocess.run([script, subsampled, str(num_cpus)], check=True)
        except BaseException:
            # The existence of `subsampled` is the only cache marker, so a
            # file left behind by a failed/interrupted run must not survive.
            if os.path.exists(subsampled):
                os.remove(subsampled)
            raise
    else:
        print(f'skipping sample (exists): {subsampled}')

    # 2) compute expected cis-decay curve
    clr = cooler.Cooler(subsampled)
    cvd = cooltools.expected_cis(
        clr=clr,
        view_df=mm10_arms,
        smooth=True,
        aggregate_smoothed=True,
        smooth_sigma=0.1,
        nproc=num_cpus
    )

    # write smoothed curve (before masking, so short distances are retained)
    out_smooth = os.path.join(OUTDIR, f'{ds_label}_cvd_smooth.tsv')
    cvd.to_csv(out_smooth, sep='\t', index=False)

    # mask dist<2 (the first two diagonals)
    mask = cvd['dist'] < 2
    cvd.loc[mask, ['balanced.avg.smoothed',
                   'balanced.avg.smoothed.agg']] = np.nan

    # merge duplicates & save: keep the first row for each distance
    cvd_merged = cvd.drop_duplicates(subset=['dist'])[
        ['dist_bp', 'balanced.avg.smoothed.agg']
    ]
    out_merged = os.path.join(OUTDIR, f'{ds_label}_cvd_smooth_merged.tsv')
    cvd_merged.to_csv(out_merged, sep='\t', index=False)

    # derivative in log-log space & save
    # The masked short-distance rows are NaN, so the first few derivative
    # values are NaN as well.
    der = np.gradient(
        np.log(cvd_merged['balanced.avg.smoothed.agg']),
        np.log(cvd_merged['dist_bp'])
    )
    der_df = pd.DataFrame({'derivative': der})
    out_deriv = os.path.join(OUTDIR, f'{ds_label}_cvd_smooth_merged_derivative.tsv')
    der_df.to_csv(out_deriv, sep='\t', index=False)

    print('Done.')
    return cvd, cvd_merged, der_df

In [6]:
# Samples to process in this run; commented-out entries are kept for reference.
names = [
    # 'G1DMSO_pooled',
    # 'G1dTAG_pooled',
    # 'G1A485_pooled',
    # 'GSE178982_AsyncUT_pooled',
    # 'GSE178982_AsyncAID_pooled',
    'EpiG1DMSO_pooled',
    'EpiG1dTAG_pooled',
]

# Arguments shared by every sample: 1 kb resolution, downsampled to 300M contacts.
cvd_settings = dict(
    MCOOLDIR='../data/mcool_pooled',
    COOLDIR='../data/cool_norm_pooled',
    OUTDIR='../data/cvd',
    RES=1000,
    DSCOUNT=300_000_000,
    mm10_arms=mm10_arms,
    num_cpus=num_cpus,
    WORKDIR=SCRDIR,
)

for name in names:
    print(f"Processing {name}…")
    process_cvd(NAME=name, **cvd_settings)

INFO:root:creating a Pool of 10 workers


Processing EpiG1DMSO_pooled…
prenorm: 589766630.0
sampling fraction: 0.5086757790958772
sampling to ../data/cool_norm_pooled/EpiG1DMSO_pooled_1000bp_downsampled-300M_KR.cool
normalizing ../data/cool_norm_pooled/EpiG1DMSO_pooled_1000bp_downsampled-300M_KR.cool


INFO:cooler.cli.balance:Balancing "../data/cool_norm_pooled/EpiG1DMSO_pooled_1000bp_downsampled-300M_KR.cool"
INFO:cooler.balance:variance is 6760.251403039081
INFO:cooler.balance:variance is 2798.9136028891235
INFO:cooler.balance:variance is 684.8314254002188
INFO:cooler.balance:variance is 1125.1477831387447
INFO:cooler.balance:variance is 368.7805334096497
INFO:cooler.balance:variance is 518.7708326840657
INFO:cooler.balance:variance is 208.56923040059777
INFO:cooler.balance:variance is 252.16862199816552
INFO:cooler.balance:variance is 118.4558167479216
INFO:cooler.balance:variance is 126.59163761986355
INFO:cooler.balance:variance is 66.85743878172588
INFO:cooler.balance:variance is 64.98058326493741
INFO:cooler.balance:variance is 37.4469387356199
INFO:cooler.balance:variance is 33.889269013814925
INFO:cooler.balance:variance is 20.833208634204457
INFO:cooler.balance:variance is 17.878009721664345
INFO:cooler.balance:variance is 11.527854705362584
INFO:cooler.balance:variance is 

Done.
Processing EpiG1dTAG_pooled…
prenorm: 598040187.0
sampling fraction: 0.5016385295190873
sampling to ../data/cool_norm_pooled/EpiG1dTAG_pooled_1000bp_downsampled-300M_KR.cool
normalizing ../data/cool_norm_pooled/EpiG1dTAG_pooled_1000bp_downsampled-300M_KR.cool


INFO:cooler.cli.balance:Balancing "../data/cool_norm_pooled/EpiG1dTAG_pooled_1000bp_downsampled-300M_KR.cool"
INFO:cooler.balance:variance is 7025.482840831921
INFO:cooler.balance:variance is 2357.45524550447
INFO:cooler.balance:variance is 565.6461770468138
INFO:cooler.balance:variance is 800.4343320924423
INFO:cooler.balance:variance is 265.577426443142
INFO:cooler.balance:variance is 312.9050547268929
INFO:cooler.balance:variance is 129.68344302802453
INFO:cooler.balance:variance is 130.05837471228332
INFO:cooler.balance:variance is 63.04318760092147
INFO:cooler.balance:variance is 56.123236134835054
INFO:cooler.balance:variance is 30.311976509657388
INFO:cooler.balance:variance is 24.814357102059432
INFO:cooler.balance:variance is 14.431299914368408
INFO:cooler.balance:variance is 11.14820930606221
INFO:cooler.balance:variance is 6.818849976629974
INFO:cooler.balance:variance is 5.061802778146457
INFO:cooler.balance:variance is 3.204253488998575
INFO:cooler.balance:variance is 2.31

Done.
