# Load Dataset

In [10]:
import os
import numpy as np
import scipy.io
from scipy.spatial.distance import pdist, squareform

# Directory that holds the Sox9 .mat files
folder_path = '/mnt/home/tudomlumleart/ceph/05_Sox9Dataset/'

# Only the first 80 probes are relevant, so every array is cut to 80 along
# its probe axis (both probe axes for the distance maps).

# Distance maps, one variable per cell type
cnc_maps = scipy.io.loadmat(os.path.join(folder_path, 'cncMaps.mat'))['cncMaps'][:80, :80, :]
esc_maps = scipy.io.loadmat(os.path.join(folder_path, 'escMaps.mat'))['escMaps'][:80, :80, :]

# Monomer coordinates, one variable per cell type; missing values are
# filled in by a later cell, not here
cnc_polys = scipy.io.loadmat(os.path.join(folder_path, 'cncPols.mat'))['cncPols'][:80, :, :]
esc_polys = scipy.io.loadmat(os.path.join(folder_path, 'escPols.mat'))['escPols'][:80, :, :]


In [4]:
# Sanity check on the loaded distance maps: axes are (probes, probes, cells).
print(cnc_maps.shape)

(80, 80, 1757)


In [5]:
# Sanity check on the loaded coordinates: axes are (probes, 3, cells).
print(cnc_polys.shape)

(80, 3, 1757)


# Data Structure 

`cnc_maps` and `esc_maps` are 3-dimensional numpy arrays containing **distance maps** of CNC and ESC cell types, respectively. The dimensions of these arrays are (number of probes $\times$ number of probes $\times$ number of cells).

Similarly, `cnc_polys` and `esc_polys` contain **monomer coordinates** of chromatin in CNC and ESC cell types. The dimensions of these are (number of probes $\times$ 3 $\times$ number of cells).


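# Linear Interpolation to Fill in Missing Values
Monomers that are missing in a cell are stored as NaN in the coordinate arrays. I fill them in by linear interpolation along the probe index, separately for each coordinate axis and each cell; missing monomers before the first or after the last measured probe take the value of the nearest measured probe. The distance maps are then recomputed from the interpolated coordinates.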

In [11]:
def interpolate_polymers(polys):
    """Fill NaN monomer coordinates by linear interpolation along the probe axis.

    Every (coordinate axis, cell) pair is treated as an independent 1-D trace
    indexed by probe number. NaN entries are replaced by values linearly
    interpolated between the nearest non-NaN probes. NaNs before the first or
    after the last non-NaN probe take the value of that nearest probe
    (``np.interp`` holds the end values constant).

    A trace that is NaN at every probe has nothing to interpolate from and is
    left as NaN instead of raising, so callers can filter such cells
    afterwards.

    Parameters
    ----------
    polys : array_like, shape (num_probes, num_coords, num_cells)
        Monomer coordinates with missing values encoded as NaN.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape with interpolated values filled in.
        The input array is not modified.
    """
    polys = np.asarray(polys, dtype=float)
    num_probes, num_coords, num_cells = polys.shape
    # Start from a copy so valid entries carry over without a second write.
    new_polys = polys.copy()
    probe_idx = np.arange(num_probes)
    for c in range(num_cells):
        for x in range(num_coords):
            curr_coords = polys[:, x, c]
            missing_indices = np.isnan(curr_coords)
            # Nothing to fill, or no valid sample to interpolate from
            # (np.interp raises ValueError on an empty sample array).
            if not missing_indices.any() or missing_indices.all():
                continue
            valid_indices = ~missing_indices
            new_polys[missing_indices, x, c] = np.interp(
                probe_idx[missing_indices],
                probe_idx[valid_indices],
                curr_coords[valid_indices],
            )
    return new_polys

In [17]:
# Fill in missing monomer coordinates for both cell types.
esc_polys_interp = interpolate_polymers(esc_polys)
cnc_polys_interp = interpolate_polymers(cnc_polys)

# Rebuild one pairwise probe-probe distance matrix per cell from the
# interpolated coordinates, then move the cell axis to the end so the result
# is (probes, probes, cells).
esc_maps_interp = np.transpose(
    np.array([
        squareform(pdist(esc_polys_interp[..., cell]))
        for cell in range(esc_polys_interp.shape[2])
    ]),
    (1, 2, 0),
)
cnc_maps_interp = np.transpose(
    np.array([
        squareform(pdist(cnc_polys_interp[..., cell]))
        for cell in range(cnc_polys_interp.shape[2])
    ]),
    (1, 2, 0),
)