# Downsampling
Downsample each Micro-C dataset to the depth of EpiLC G1 DMSO, the sample with the fewest valid read pairs (474,973,552).

In [4]:
# Standard library imports
import os
import warnings
from itertools import combinations
from multiprocessing import Pool
import subprocess

# Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors
import bioframe
import cooler
import cooltools
from packaging import version

# Jupyter magic (only in notebooks)
%matplotlib inline

# ------------------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------------------

# Suppress warnings.
# NOTE: this filter ignores every warning category for the rest of the session,
# not only non-critical ones.
warnings.filterwarnings('ignore')

# Determine number of CPUs; os.cpu_count() may return None, hence the fallback to 1.
num_cpus = os.cpu_count() or 1

# Build the plural suffix outside the f-string: reusing the enclosing quote
# character inside a replacement field is a SyntaxError before Python 3.12.
core_suffix = 's' if num_cpus > 1 else ''
print(f'Using {num_cpus} CPU core{core_suffix}.')

# Every relative path used later in the notebook ('../reference', '../data/...',
# 'script') is resolved against this working directory.
# NOTE(review): this is a machine-specific absolute path; the notebook will not
# run elsewhere without editing it.
os.chdir("/Volumes/UKJIN_SSD/MtoG1_analysis_code")

Using 10 CPU cores.


## Parameters

In [5]:
# Reference annotations and helper scripts, relative to the working directory.
REFDIR, SCRDIR = '../reference', 'script'

# Chromosome sizes of the mm10 assembly, obtained through bioframe.
mm10_chromsizes = bioframe.fetch_chromsizes('mm10')

# Centromere table for mm10, stored as a tab-separated text file.
mm10_cens = pd.read_csv(
    f'{REFDIR}/mm10/mm10.centromere.txt',
    sep='\t',
)

# Chromosome-arm view derived from the chromosome sizes and the centromere table.
mm10_arms = bioframe.make_chromarms(mm10_chromsizes, mm10_cens)


## Downsampling

In [6]:
# Directory of the multi-resolution coolers, opened below as '{NAME}_allRes.mcool'.
MCOOLDIR = '../data/mcool_pooled'
# Directory of the single-resolution '{NAME}_{RES}bp_KR.cool' files; the
# downsampled coolers are written to this directory as well.
COOLDIR = '../data/cool_norm_pooled'

In [7]:
# Bin size in base pairs, kept as a string because it is interpolated into
# file names and cooler URIs.
RES = '5000'

# Samples processed in this run; commented-out entries are skipped.
NAMES = [
    # 'G1DMSO_pooled',
    # 'G1dTAG_pooled',
    'G1A485_pooled',
    'GSE178982_AsyncUT_pooled',
    'GSE178982_AsyncAID_pooled',
    # 'EpiG1DMSO_pooled',
    # 'EpiG1dTAG_pooled'
]

# Report the total stored in each sample's cooler (the 'sum' entry of its info).
for NAME in NAMES:
    pre_norm = cooler.Cooler(f'{COOLDIR}/{NAME}_{RES}bp_KR.cool')
    # Double quotes inside the replacement field: reusing the enclosing single
    # quote is a SyntaxError before Python 3.12.
    print(f'{NAME} valid readcounts in 5kb: {pre_norm.info["sum"]}')


G1A485_pooled valid readcounts in 5kb: 1798822049.0
GSE178982_AsyncUT_pooled valid readcounts in 5kb: 624875662.0
GSE178982_AsyncAID_pooled valid readcounts in 5kb: 2168284483.0


In [8]:
# Target total count for downsampling: each sample's sampling fraction is
# DSCOUNT divided by the 'sum' of its cooler, and DSCOUNT // 1_000_000 labels
# the output file names.
# NOTE(review): the notebook introduction cites 474,973,552 valid read pairs,
# which does not match this value — confirm which target is intended.
DSCOUNT = 589828407

In [10]:
# Downsample every sample in NAMES to DSCOUNT total counts and normalize the
# result with the normCool.sh helper script. Samples whose output file already
# exists are skipped.
for NAME in NAMES:
    # paths
    subsampled = f'{COOLDIR}/{NAME}_{RES}bp_downsampled-{DSCOUNT//1_000_000}M_KR.cool'

    # calculate subsampling fraction
    # NOTE(review): the fraction is derived from the '_KR.cool' file while the
    # sampling itself is applied to the mcool at the same resolution — confirm
    # that both hold the same total.
    pre_norm = cooler.Cooler(f'{COOLDIR}/{NAME}_{RES}bp_KR.cool')
    frac = DSCOUNT / pre_norm.info['sum']
    # Double quotes inside the replacement field: reusing the enclosing single
    # quote is a SyntaxError before Python 3.12.
    print(f'prenorm: {pre_norm.info["sum"]}')
    print(f'sampling fraction: {frac}')

    # 1) subsample if needed
    if not os.path.exists(subsampled):
        # A fraction above 1 would mean upsampling; fail early with a clear
        # message instead of erroring inside the worker pool.
        if frac > 1:
            raise ValueError(
                f'{NAME}: sampling fraction {frac} exceeds 1 '
                f'(DSCOUNT={DSCOUNT} is larger than the available total)'
            )

        print(f'sampling to {subsampled}')
        mc = cooler.Cooler(f'{MCOOLDIR}/{NAME}_allRes.mcool::/resolutions/{RES}')
        script = os.path.join(SCRDIR, 'normCool.sh')
        try:
            cooltools.sample(mc,
                             out_clr_path=subsampled,
                             frac=frac,
                             nproc=num_cpus)

            print(f'normalizing {subsampled}')
            subprocess.run([script, subsampled, str(num_cpus)], check=True)
        except BaseException:
            # Sampling or normalization failed or was interrupted. Remove the
            # partial output so that the existence check above does not skip
            # this sample on the next run and leave an unfinished file in use.
            if os.path.exists(subsampled):
                os.remove(subsampled)
            raise
    else:
        print(f'skipping sample (exists): {subsampled}')




INFO:root:creating a Pool of 10 workers


prenorm: 1798822049.0
sampling fraction: 0.32789702979674784
sampling to ../data/cool_norm_pooled/G1A485_pooled_5000bp_downsampled-589M_KR.cool
normalizing ../data/cool_norm_pooled/G1A485_pooled_5000bp_downsampled-589M_KR.cool


INFO:cooler.cli.balance:Balancing "../data/cool_norm_pooled/G1A485_pooled_5000bp_downsampled-589M_KR.cool"
INFO:cooler.balance:variance is 1398484.0306356624
INFO:cooler.balance:variance is 871701.3182940386
INFO:cooler.balance:variance is 112113.55067759093
INFO:cooler.balance:variance is 164502.07088804635
INFO:cooler.balance:variance is 39672.718435856186
INFO:cooler.balance:variance is 47260.059147578955
INFO:cooler.balance:variance is 16366.396675249416
INFO:cooler.balance:variance is 15848.217110704563
INFO:cooler.balance:variance is 6898.039404729178
INFO:cooler.balance:variance is 5740.432572767155
INFO:cooler.balance:variance is 2881.4790712768468
INFO:cooler.balance:variance is 2170.841747298077
INFO:cooler.balance:variance is 1190.5107952010405
INFO:cooler.balance:variance is 842.174918074949
INFO:cooler.balance:variance is 488.02029781882476
INFO:cooler.balance:variance is 331.90417697488294
INFO:cooler.balance:variance is 199.08847152695512
INFO:cooler.balance:variance is 

prenorm: 624875662.0
sampling fraction: 0.9439132340539133
sampling to ../data/cool_norm_pooled/GSE178982_AsyncUT_pooled_5000bp_downsampled-589M_KR.cool
normalizing ../data/cool_norm_pooled/GSE178982_AsyncUT_pooled_5000bp_downsampled-589M_KR.cool


INFO:cooler.cli.balance:Balancing "../data/cool_norm_pooled/GSE178982_AsyncUT_pooled_5000bp_downsampled-589M_KR.cool"
INFO:cooler.balance:variance is 419927.0338484352
INFO:cooler.balance:variance is 632648.4777388093
INFO:cooler.balance:variance is 194433.3564141205
INFO:cooler.balance:variance is 294017.38089562196
INFO:cooler.balance:variance is 97488.22581863504
INFO:cooler.balance:variance is 139797.23210399886
INFO:cooler.balance:variance is 50051.86609441799
INFO:cooler.balance:variance is 67369.65956709087
INFO:cooler.balance:variance is 27058.991961527652
INFO:cooler.balance:variance is 33337.52362732791
INFO:cooler.balance:variance is 15136.142327620237
INFO:cooler.balance:variance is 16966.65534206902
INFO:cooler.balance:variance is 8602.676512499096
INFO:cooler.balance:variance is 8847.742780196364
INFO:cooler.balance:variance is 4915.235887096963
INFO:cooler.balance:variance is 4704.089834658719
INFO:cooler.balance:variance is 2809.156707706091
INFO:cooler.balance:variance

prenorm: 2168284483.0
sampling fraction: 0.27202537841525476
sampling to ../data/cool_norm_pooled/GSE178982_AsyncAID_pooled_5000bp_downsampled-589M_KR.cool
normalizing ../data/cool_norm_pooled/GSE178982_AsyncAID_pooled_5000bp_downsampled-589M_KR.cool


INFO:cooler.cli.balance:Balancing "../data/cool_norm_pooled/GSE178982_AsyncAID_pooled_5000bp_downsampled-589M_KR.cool"
INFO:cooler.balance:variance is 457550.60959858046
INFO:cooler.balance:variance is 756007.2241496583
INFO:cooler.balance:variance is 255004.35391039462
INFO:cooler.balance:variance is 437314.30865829287
INFO:cooler.balance:variance is 134109.9032381488
INFO:cooler.balance:variance is 230292.98612782813
INFO:cooler.balance:variance is 72965.30731353287
INFO:cooler.balance:variance is 119710.66542773525
INFO:cooler.balance:variance is 42585.87645179388
INFO:cooler.balance:variance is 63276.31380055596
INFO:cooler.balance:variance is 25865.475834448218
INFO:cooler.balance:variance is 34206.851770914094
INFO:cooler.balance:variance is 15919.41731223413
INFO:cooler.balance:variance is 18872.717068728227
INFO:cooler.balance:variance is 9798.229638918738
INFO:cooler.balance:variance is 10585.738198054565
INFO:cooler.balance:variance is 6001.593938898641
INFO:cooler.balance:va