# Downsampling
Downsample each Micro-C dataset to the depth of EpiLC G1 DMSO, which has the fewest valid read pairs (474,973,552).

In [6]:
# Standard library imports
import os
import warnings
from itertools import combinations
from multiprocessing import Pool
import subprocess

# Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors
import bioframe
import cooler
import cooltools
from packaging import version

# Jupyter magic (only in notebooks)
%matplotlib inline

# ------------------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------------------

# Suppress non-critical warnings.
# NOTE(review): 'ignore' with no category silences every warning for the
# rest of the session, including deprecation notices from the libraries used here.
warnings.filterwarnings('ignore')

# Determine number of CPUs; os.cpu_count() can return None, so fall back to 1.
num_cpus = os.cpu_count() or 1

# The outer quotes are double so the inner single-quoted literals parse on
# Python versions older than 3.12 (quote reuse inside f-strings is 3.12+ only).
print(f"Using {num_cpus} CPU core{'s' if num_cpus > 1 else ''}.")

# Move into the project root so the relative paths used below resolve.
# The location can be overridden through the MTOG1_PROJECT_DIR environment
# variable; when it is unset, the original hard-coded path is used.
PROJECT_DIR = os.environ.get(
    'MTOG1_PROJECT_DIR',
    "/Volumes/UKJIN_SSD/MtoG1_analysis_code",
)
os.chdir(PROJECT_DIR)

Using 10 CPU cores.


## Parameters

In [7]:
# Directories for reference annotations and helper scripts, relative to the
# current working directory.
REFDIR = '../reference'
SCRDIR = 'script'

# mm10 chromosome sizes, fetched through bioframe.
mm10_chromsizes = bioframe.fetch_chromsizes('mm10')

# Centromere table, read from a tab-separated text file under REFDIR.
mm10_cens = pd.read_csv(
    f'{REFDIR}/mm10/mm10.centromere.txt',
    sep='\t',
)

# Build a chromosome-arm view from the chromosome sizes and the centromere table.
mm10_arms = bioframe.make_chromarms(
    mm10_chromsizes,
    mm10_cens,
)


## Downsampling

In [8]:
# Data directories, relative to the current working directory.
MCOOLDIR = "../data/mcool_pooled"  # where the '<NAME>_allRes.mcool' files are read from
COOLDIR = "../data/cool_norm_pooled"  # where the '<NAME>_<RES>bp_*.cool' files live

In [None]:
# Bin size in base pairs. Kept as a string: it is interpolated into file
# names and into the mcool resolution path.
RES = "5000"

# Samples to process. Commented-out entries are skipped in this run.
NAMES = [
    # 'G1DMSO_pooled',
    # 'G1dTAG_pooled',
    "G1A485_pooled",
    "GSE178982_AsyncUT_pooled",
    "GSE178982_AsyncAID_pooled",
    # 'EpiG1DMSO_pooled',
    # 'EpiG1dTAG_pooled'
]


# NAME = 'EpiG1DMSO_pooled'
# Print the 'sum' entry of the info metadata for each sample's
# '<NAME>_<RES>bp_KR.cool' file.
# NOTE(review): the message says "5kb" regardless of RES — update it if RES changes.
for NAME in NAMES:
    pre_norm = cooler.Cooler(f'{COOLDIR}/{NAME}_{RES}bp_KR.cool')
    # Double outer quotes keep the ['sum'] lookup valid on Python < 3.12.
    print(f"{NAME} valid readcounts in 5kb: {pre_norm.info['sum']}")


G1DMSO_pooled valid readcounts in 5kb: 1387814791.0
G1dTAG_pooled valid readcounts in 5kb: 1235278421.0


In [10]:
# Target total count used to compute each sample's subsampling fraction.
# NOTE(review): the notebook introduction quotes 474,973,552 valid read pairs,
# which differs from this value — confirm which number is intended.
DSCOUNT = 589_828_407

In [None]:
# Downsample every sample in NAMES to DSCOUNT total contacts, then normalize.
#
# For each sample:
#   * compute the fraction DSCOUNT / (total 'sum' of the existing KR cooler),
#   * if the downsampled cooler does not exist yet, sample the RES-bp
#     resolution of the sample's mcool to that fraction and run the
#     normalization script on the result,
#   * otherwise leave the existing file alone.
#
# NOTE(review): the existence check guards sampling and normalization
# together. If normalization fails after sampling has written the file
# (subprocess.run raises because of check=True), a re-run takes the
# "skipping" branch and never normalizes it — delete the partial output first.

# Normalization helper script; the path is the same for every sample.
script = os.path.join(SCRDIR, 'normCool.sh')

for NAME in NAMES:
    # paths
    subsampled = f'{COOLDIR}/{NAME}_{RES}bp_downsampled-{DSCOUNT//1_000_000}M_KR.cool'

    # calculate subsampling fraction (read the metadata once and reuse it)
    pre_norm = cooler.Cooler(f'{COOLDIR}/{NAME}_{RES}bp_KR.cool')
    pre_norm_sum = pre_norm.info['sum']
    frac = DSCOUNT / pre_norm_sum
    print(f'prenorm: {pre_norm_sum}')
    print(f'sampling fraction: {frac}')

    # 1) subsample if needed
    if not os.path.exists(subsampled):
        print(f'sampling to {subsampled}')
        mc = cooler.Cooler(f'{MCOOLDIR}/{NAME}_allRes.mcool::/resolutions/{RES}')
        cooltools.sample(mc,
                         out_clr_path=subsampled,
                         frac=frac,
                         nproc=num_cpus)

        # 2) normalize the freshly written cooler; check=True raises
        #    CalledProcessError if the script exits non-zero.
        print(f'normalizing {subsampled}')
        subprocess.run([script, subsampled, str(num_cpus)], check=True)
    else:
        print(f'skipping sample (exists): {subsampled}')




prenorm: 1387814791.0
sampling fraction: 0.42500513096203196
sampling to ../data/cool_norm_pooled/G1DMSO_pooled_5000bp_downsampled-589M_KR.cool
normalizing ../data/cool_norm_pooled/G1DMSO_pooled_5000bp_downsampled-589M_KR.cool


INFO:root:creating a Pool of 10 workers
  groups = dict(iter(bins.groupby("chrom")[clr_weight_name]))


prenorm: 1235278421.0
sampling fraction: 0.47748620632627403
sampling to ../data/cool_norm_pooled/G1dTAG_pooled_5000bp_downsampled-589M_KR.cool
normalizing ../data/cool_norm_pooled/G1dTAG_pooled_5000bp_downsampled-589M_KR.cool


INFO:root:creating a Pool of 10 workers
  groups = dict(iter(bins.groupby("chrom")[clr_weight_name]))
