# Setting

In [1]:
%load_ext autoreload
%autoreload 0

In [None]:
# Python 3.10

# mamba instsall python==3.12.4
# mamba install jupyterlab==4.2.4

# THESE NEED TO BE INSTALLED IN SEQUENTIAL ORDER
# %pip install numpy==1.26.4
# %pip install matplotlib==3.8.0
# %pip install svgutils==0.3.4
# %pip install seaborn==0.13.2
# %pip install coolpuppy==1.1.0
# %pip install pathos

In [2]:
# Import standard python libraries
import matplotlib as mpl
%matplotlib inline
mpl.rcParams['figure.dpi'] = 300
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os, subprocess

from coolpuppy import coolpup
from coolpuppy.lib import numutils
from coolpuppy.lib.puputils import divide_pups
from coolpuppy import plotpup
import cooler
import bioframe
import cooltools
from cooltools import expected_cis, expected_trans
from cooltools.lib import plotting

# Multiprocessing
from pathos.multiprocessing import ProcessingPool as Pool


# Loop score calculation

In [3]:
# LOADING COOL FILE
coolDir = '/Volumes/UKJIN_SSD/data/cool_norm_pooled'
dataDir = '/Volumes/UKJIN_SSD/data/'

res = 10

### Defining function for extracting observed and O/E value of loops

In [4]:
# Defining function
def extract_obs_exp(loop, matrix, expected_df, bin_size):
    mid1 = (loop.start1 + loop.end1) // 2
    mid2 = (loop.start2 + loop.end2) // 2
    obs_matrix = matrix.fetch(
        (loop.chrom1, mid1,   mid1+1),
        (loop.chrom2, mid2,   mid2+1))
    observed_values = float(obs_matrix)

    # Calculate genomic distance and look up expected value
    bin1 = loop.start1 // bin_size
    bin2 = loop.start2 // bin_size
    diag_bin = abs(bin2 - bin1)
    expected_row = expected_df[(expected_df['region1'] == loop.chrom1) & (expected_df['dist'] == diag_bin)]
    expected_value = expected_row['balanced.avg'].values[0] if not expected_row.empty else None

    # Calculate O/E
    ooe_value = observed_values / expected_value if expected_value is not None else None
    return observed_values, expected_value, ooe_value


### Chromosight loops

In [7]:
loopDir = dataDir + "loop_chromosight"
resolutions = [25000, 10000, 5000]


if __name__ == "__main__":
    # Loop through each resolution
    for res in resolutions:
        # File paths based on current resolution
        clr_files = [
            os.path.join(coolDir, f'G1DMSO_pooled_{res}bp_KR.cool'),
            os.path.join(coolDir, f'G1dTAG_pooled_{res}bp_KR.cool'),
            os.path.join(coolDir, f'G1A485_pooled_{res}bp_KR.cool'),
            os.path.join(coolDir, f'EpiG1DMSO_pooled_{res}bp_KR.cool'),
            os.path.join(coolDir, f'EpiG1dTAG_pooled_{res}bp_KR.cool'),
            os.path.join(coolDir, f'GSE178982_AsyncUT_pooled_{res}bp_KR.cool'),
            os.path.join(coolDir, f'GSE178982_AsyncAID_pooled_{res}bp_KR.cool'),
        ]
        
        exp_files = [
            os.path.join(coolDir, f'G1DMSO_pooled_{res}bp_KR_exp.tsv'),
            os.path.join(coolDir, f'G1dTAG_pooled_{res}bp_KR_exp.tsv'),
            os.path.join(coolDir, f'G1A485_pooled_{res}bp_KR_exp.tsv'),
            os.path.join(coolDir, f'EpiG1DMSO_pooled_{res}bp_KR_exp.tsv'),
            os.path.join(coolDir, f'EpiG1dTAG_pooled_{res}bp_KR_exp.tsv'),
            os.path.join(coolDir, f'GSE178982_AsyncUT_pooled_{res}bp_KR_exp.tsv'),
            os.path.join(coolDir, f'GSE178982_AsyncAID_pooled_{res}bp_KR_exp.tsv'),
        ]
        
        # Load loops file based on resolution
        loops = pd.read_csv(os.path.join(loopDir, f'chromo_union_allRes_postprocessed_{res}bp.bedpe'), delimiter='\t', 
                            header=None, names=['chrom1', 'start1', 'end1', 'chrom2', 'start2', 'end2'])
        
        # Process each matrix and expected file combination
        for clr_file, exp_file in zip(clr_files, exp_files):
            # Extract condition name (e.g., "G1.DMSO") from file path
            condition = os.path.basename(clr_file).split('_pooled')[0] + '_pooled'

            print("Processing...")
            print(clr_file)
            print(exp_file)

            # Load matrix and expected values
            clr = cooler.Cooler(clr_file)
            matrix = clr.matrix(balance=True)
            expected_df = pd.read_csv(exp_file, delimiter='\t')
            
            # Get bin size and prepare loop data
            bin_size = clr.binsize
            loops_list = [row for _, row in loops.iterrows()]
            
            # Parallel processing for each loop
            args = [(loop, matrix, expected_df, bin_size) for loop in loops_list]

            with Pool() as pool:
                results = pool.map(lambda arg: extract_obs_exp(*arg), args)
            
            # Add observed and O/E values to the DataFrame
            loops_df = loops.copy()
            loops_df['obs'], loops_df['exp'], loops_df['ooe'] = zip(*results)

            
            # Save each processed DataFrame to a TSV file
            output_file = os.path.join(loopDir, f'chromo_union_allRes_postprocessed_{res}bp_obxexp_{condition}.tsv')
            loops_df.to_csv(output_file, sep='\t', index=False)

Processing...
/Volumes/UKJIN_SSD/data/cool_norm_pooled/G1DMSO_pooled_25000bp_KR.cool
/Volumes/UKJIN_SSD/data/cool_norm_pooled/G1DMSO_pooled_25000bp_KR_exp.tsv
Processing...
/Volumes/UKJIN_SSD/data/cool_norm_pooled/G1dTAG_pooled_25000bp_KR.cool
/Volumes/UKJIN_SSD/data/cool_norm_pooled/G1dTAG_pooled_25000bp_KR_exp.tsv
Processing...
/Volumes/UKJIN_SSD/data/cool_norm_pooled/G1A485_pooled_25000bp_KR.cool
/Volumes/UKJIN_SSD/data/cool_norm_pooled/G1A485_pooled_25000bp_KR_exp.tsv
Processing...
/Volumes/UKJIN_SSD/data/cool_norm_pooled/EpiG1DMSO_pooled_25000bp_KR.cool
/Volumes/UKJIN_SSD/data/cool_norm_pooled/EpiG1DMSO_pooled_25000bp_KR_exp.tsv
Processing...
/Volumes/UKJIN_SSD/data/cool_norm_pooled/EpiG1dTAG_pooled_25000bp_KR.cool
/Volumes/UKJIN_SSD/data/cool_norm_pooled/EpiG1dTAG_pooled_25000bp_KR_exp.tsv
Processing...
/Volumes/UKJIN_SSD/data/cool_norm_pooled/GSE178982_AsyncUT_pooled_25000bp_KR.cool
/Volumes/UKJIN_SSD/data/cool_norm_pooled/GSE178982_AsyncUT_pooled_25000bp_KR_exp.tsv
Processing..

### HiCDC+ loops

In [6]:
loopDir = dataDir + "loop_hicdcp"
resolutions = [10000]


if __name__ == "__main__":
    # Loop through each resolution
    for res in resolutions:
        # File paths based on current resolution
        clr_files = [
            os.path.join(coolDir, f'G1DMSO_pooled_{res}bp_KR.cool'),
            os.path.join(coolDir, f'G1dTAG_pooled_{res}bp_KR.cool'),
            os.path.join(coolDir, f'G1A485_pooled_{res}bp_KR.cool'),
            os.path.join(coolDir, f'EpiG1DMSO_pooled_{res}bp_KR.cool'),
            os.path.join(coolDir, f'EpiG1dTAG_pooled_{res}bp_KR.cool'),
            os.path.join(coolDir, f'GSE178982_AsyncUT_pooled_{res}bp_KR.cool'),
            os.path.join(coolDir, f'GSE178982_AsyncAID_pooled_{res}bp_KR.cool'),
        ]
        
        exp_files = [
            os.path.join(coolDir, f'G1DMSO_pooled_{res}bp_KR_exp.tsv'),
            os.path.join(coolDir, f'G1dTAG_pooled_{res}bp_KR_exp.tsv'),
            os.path.join(coolDir, f'G1A485_pooled_{res}bp_KR_exp.tsv'),
            os.path.join(coolDir, f'EpiG1DMSO_pooled_{res}bp_KR_exp.tsv'),
            os.path.join(coolDir, f'EpiG1dTAG_pooled_{res}bp_KR_exp.tsv'),
            os.path.join(coolDir, f'GSE178982_AsyncUT_pooled_{res}bp_KR_exp.tsv'),
            os.path.join(coolDir, f'GSE178982_AsyncAID_pooled_{res}bp_KR_exp.tsv'),
        ]
        
        # Load loops file based on resolution
        loops = pd.read_csv(os.path.join(loopDir, f'hicdcp_union_{res}bp.bedpe'), delimiter='\t', 
                            header=None, names=['chrom1', 'start1', 'end1', 'chrom2', 'start2', 'end2'])
        
        # Process each matrix and expected file combination
        for clr_file, exp_file in zip(clr_files, exp_files):
            # Extract condition name (e.g., "G1.DMSO") from file path
            condition = os.path.basename(clr_file).split('_pooled')[0] + '_pooled'

            print("Processing...")
            print(clr_file)
            print(exp_file)

            # Load matrix and expected values
            clr = cooler.Cooler(clr_file)
            matrix = clr.matrix(balance=True)
            expected_df = pd.read_csv(exp_file, delimiter='\t')
            
            # Get bin size and prepare loop data
            bin_size = clr.binsize
            loops_list = [row for _, row in loops.iterrows()]
            
            # Parallel processing for each loop
            args = [(loop, matrix, expected_df, bin_size) for loop in loops_list]

            with Pool() as pool:
                results = pool.map(lambda arg: extract_obs_exp(*arg), args)
            
            # Add observed and O/E values to the DataFrame
            loops_df = loops.copy()
            loops_df['obs'], loops_df['exp'], loops_df['ooe'] = zip(*results)

            
            # Save each processed DataFrame to a TSV file
            output_file = os.path.join(loopDir, f'hicdcp_union_{res}bp_obxexp_{condition}.tsv')
            loops_df.to_csv(output_file, sep='\t', index=False)

Processing...
/Volumes/UKJIN_SSD/data/cool_norm_pooled/G1DMSO_pooled_10000bp_KR.cool
/Volumes/UKJIN_SSD/data/cool_norm_pooled/G1DMSO_pooled_10000bp_KR_exp.tsv
Processing...
/Volumes/UKJIN_SSD/data/cool_norm_pooled/G1dTAG_pooled_10000bp_KR.cool
/Volumes/UKJIN_SSD/data/cool_norm_pooled/G1dTAG_pooled_10000bp_KR_exp.tsv
Processing...
/Volumes/UKJIN_SSD/data/cool_norm_pooled/G1A485_pooled_10000bp_KR.cool
/Volumes/UKJIN_SSD/data/cool_norm_pooled/G1A485_pooled_10000bp_KR_exp.tsv
Processing...
/Volumes/UKJIN_SSD/data/cool_norm_pooled/EpiG1DMSO_pooled_10000bp_KR.cool
/Volumes/UKJIN_SSD/data/cool_norm_pooled/EpiG1DMSO_pooled_10000bp_KR_exp.tsv
Processing...
/Volumes/UKJIN_SSD/data/cool_norm_pooled/EpiG1dTAG_pooled_10000bp_KR.cool
/Volumes/UKJIN_SSD/data/cool_norm_pooled/EpiG1dTAG_pooled_10000bp_KR_exp.tsv
Processing...
/Volumes/UKJIN_SSD/data/cool_norm_pooled/GSE178982_AsyncUT_pooled_10000bp_KR.cool
/Volumes/UKJIN_SSD/data/cool_norm_pooled/GSE178982_AsyncUT_pooled_10000bp_KR_exp.tsv
Processing..

### Hansen loops

In [None]:
loopDir = dataDir + "loop_Hansen"
resolutions = [5000, 2000, 1000]

if __name__ == "__main__":
    # Loop through each resolution
    for res in resolutions:
        # File paths based on current resolution
        clr_files = [
            os.path.join(coolDir, f'G1DMSO_pooled_{res}bp_KR.cool'),
            os.path.join(coolDir, f'G1dTAG_pooled_{res}bp_KR.cool'),
            os.path.join(coolDir, f'G1A485_pooled_{res}bp_KR.cool'),
            os.path.join(coolDir, f'EpiG1DMSO_pooled_{res}bp_KR.cool'),
            os.path.join(coolDir, f'EpiG1dTAG_pooled_{res}bp_KR.cool'),
            os.path.join(coolDir, f'GSE178982_AsyncUT_pooled_{res}bp_KR.cool'),
            os.path.join(coolDir, f'GSE178982_AsyncAID_pooled_{res}bp_KR.cool'),
        ]
        
        exp_files = [
            os.path.join(coolDir, f'G1DMSO_pooled_{res}bp_KR_exp.tsv'),
            os.path.join(coolDir, f'G1dTAG_pooled_{res}bp_KR_exp.tsv'),
            os.path.join(coolDir, f'G1A485_pooled_{res}bp_KR_exp.tsv'),
            os.path.join(coolDir, f'EpiG1DMSO_pooled_{res}bp_KR_exp.tsv'),
            os.path.join(coolDir, f'EpiG1dTAG_pooled_{res}bp_KR_exp.tsv'),
            os.path.join(coolDir, f'GSE178982_AsyncUT_pooled_{res}bp_KR_exp.tsv'),
            os.path.join(coolDir, f'GSE178982_AsyncAID_pooled_{res}bp_KR_exp.tsv'),
        ]
        
        # Load loops file based on resolution
        loops = pd.read_csv(os.path.join(loopDir, f'Hansen_union_allRes_postprocessed_{res}bp.bedpe'), delimiter='\t', 
                            header=None, names=['chrom1', 'start1', 'end1', 'chrom2', 'start2', 'end2'])
        
        # Process each matrix and expected file combination
        for clr_file, exp_file in zip(clr_files, exp_files):
            # Extract condition name (e.g., "G1.DMSO") from file path
            condition = os.path.basename(clr_file).split('_pooled')[0] + '_pooled'

            print("Processing...")
            print(clr_file)
            print(exp_file)

            # Load matrix and expected values
            clr = cooler.Cooler(clr_file)
            matrix = clr.matrix(balance=True)
            expected_df = pd.read_csv(exp_file, delimiter='\t')
            
            # Get bin size and prepare loop data
            bin_size = clr.binsize
            loops_list = [row for _, row in loops.iterrows()]
            
            # Parallel processing for each loop
            args = [(loop, matrix, expected_df, bin_size) for loop in loops_list]

            with Pool() as pool:
                results = pool.map(lambda arg: extract_obs_exp(*arg), args)
            
            # Add observed and O/E values to the DataFrame
            loops_df = loops.copy()
            loops_df['obs'], loops_df['exp'], loops_df['ooe'] = zip(*results)

            
            # Save each processed DataFrame to a TSV file
            output_file = os.path.join(loopDir, f'Hansen_union_allRes_postprocessed_{res}bp_obxexp_{condition}.tsv')
            loops_df.to_csv(output_file, sep='\t', index=False)

Processing...
/Volumes/UKJIN_SSD/data/cool_norm_pooled/G1DMSO_pooled_2000bp_KR.cool
/Volumes/UKJIN_SSD/data/cool_norm_pooled/G1DMSO_pooled_2000bp_KR_exp.tsv
Processing...
/Volumes/UKJIN_SSD/data/cool_norm_pooled/G1dTAG_pooled_2000bp_KR.cool
/Volumes/UKJIN_SSD/data/cool_norm_pooled/G1dTAG_pooled_2000bp_KR_exp.tsv
Processing...
/Volumes/UKJIN_SSD/data/cool_norm_pooled/G1A485_pooled_2000bp_KR.cool
/Volumes/UKJIN_SSD/data/cool_norm_pooled/G1A485_pooled_2000bp_KR_exp.tsv
Processing...
/Volumes/UKJIN_SSD/data/cool_norm_pooled/EpiG1DMSO_pooled_2000bp_KR.cool
/Volumes/UKJIN_SSD/data/cool_norm_pooled/EpiG1DMSO_pooled_2000bp_KR_exp.tsv
Processing...
/Volumes/UKJIN_SSD/data/cool_norm_pooled/EpiG1dTAG_pooled_2000bp_KR.cool
/Volumes/UKJIN_SSD/data/cool_norm_pooled/EpiG1dTAG_pooled_2000bp_KR_exp.tsv
Processing...
/Volumes/UKJIN_SSD/data/cool_norm_pooled/GSE178982_AsyncUT_pooled_2000bp_KR.cool
/Volumes/UKJIN_SSD/data/cool_norm_pooled/GSE178982_AsyncUT_pooled_2000bp_KR_exp.tsv
Processing...
/Volumes/U

ValueError: Unknown sequence label: chr4_GL456216_random