#### Generation of consensus boundaries supported by at least two TAD callers

In [None]:
import pandas as pd
import numpy as np

def process_file(file_path):
    """Load a tab-separated, header-less table and return its distinct values.

    Every entry of the first column except the one in the first row is
    decreased by one before the table is flattened.

    Args:
        file_path: Path of the tab-separated file to read.

    Returns:
        Sorted 1-D NumPy array holding each value of the table once.
    """
    table = pd.read_csv(file_path, sep='\t', header=None)
    # Shift the first column by -1 for all rows after the first one.
    shifted_first_column = table.iloc[1:, 0] - 1
    table.iloc[1:, 0] = shifted_first_column
    flat_values = np.asarray(table).ravel()
    return np.unique(flat_values)

def _merge_adjacent_bins(bins):
    """Collapse runs of touching [start, end] bins into one representative bin.

    Two bins "touch" when the end of one equals the start of the next.  The
    list is scanned from the left: when three bins in a row touch, only the
    middle one is kept; when exactly two touch, only the second is kept;
    an isolated bin is kept unchanged.  Longer runs are therefore consumed in
    chunks of at most three bins.

    Args:
        bins: List of [start, end] integer pairs, in ascending order.

    Returns:
        New list with the representative bins.
    """
    merged = []
    to_skip = 0
    for i, current in enumerate(bins):
        if to_skip:
            # This bin was already absorbed by the run handled earlier.
            to_skip -= 1
            continue
        touches_next = i + 1 < len(bins) and current[1] == bins[i + 1][0]
        touches_next_two = (
            touches_next
            and i + 2 < len(bins)
            and bins[i + 1][1] == bins[i + 2][0]
        )
        if touches_next_two:
            merged.append(bins[i + 1])
            to_skip = 2
        elif touches_next:
            merged.append(bins[i + 1])
            to_skip = 1
        else:
            merged.append(current)
    return merged


if __name__ == "__main__":
    callers = ['CaTCH', 'CHAC', 'deDoc', 'DI', 'TopDom', 'Arrowhead']
    chromosomes = list(range(1, 23)) + ['X']
    # `chrom` instead of `chr`: the loop variable is a module-level name and
    # would otherwise shadow the builtin chr().
    for chrom in chromosomes:
        # One array of distinct bin indices per caller for this chromosome.
        per_caller = [
            process_file(f'/home/wangxiaoyan/deepTAD/work/all_TADs/bin/{caller}/HIC002_{caller}.chr{chrom}')
            for caller in callers
        ]
        all_data = np.concatenate(per_caller)
        unique_data, counts = np.unique(all_data, return_counts=True)
        # Keep the values reported by at least two callers (each caller
        # contributes a value at most once because process_file de-duplicates).
        result = unique_data[counts >= 2]
        print("Data appearing at least twice in six documents:")
        print(result)
        if len(result) == 0:
            print("No data available to satisfy the conditions")
            continue

        # Build [bin, bin + 1] pairs directly as integers; no string round-trip,
        # so float-typed input such as 5.0 no longer breaks the conversion.
        bins = [[int(value), int(value) + 1] for value in result]
        new_lines = _merge_adjacent_bins(bins)
        print(new_lines)

        text = '\n'.join(' '.join(map(str, line)) for line in new_lines)
        with open(f'/home/wangxiaoyan/deepTAD/work_public/boundary-generate/HIC002/chr{chrom}.txt', 'w') as f:
            f.write(text)

### chr1-12: generation of all positive and negative training samples

In [None]:
import numpy as np

def openreadtxt(file_name):
    """Read a text file into a list of token lists.

    Args:
        file_name: Path of the text file to read.

    Returns:
        One list per line of the file, holding that line's
        whitespace-separated tokens as strings (empty list for a blank line).
    """
    with open(file_name, 'r') as handle:
        # str.split() without arguments also discards the trailing newline.
        return [line.split() for line in handle]

if __name__ == '__main__':
    # For every chromosome: cut one 10 x 10 window per diagonal bin out of the
    # zero-padded contact matrix, flatten it to 100 values, append a 0/1 label
    # and save the resulting (n_bins, 101) float32 array.
    # `chrom` instead of `chr` so the builtin chr() is not shadowed.
    for chrom in range(1, 13):
        boundary_rows = openreadtxt('/home/wangxiaoyan/deepTAD/work_public/boundary-generate/HIC002/chr%s.txt' % chrom)
        # Parse the matrix to numbers once instead of padding and slicing an
        # array of strings and converting only at the very end.
        big_matrix = np.array(
            openreadtxt('/home/wangxiaoyan/deepTAD/prepare_data/HIC002/HIC002_25k_KR.chr%s' % chrom),
            dtype='float32',
        )
        print(big_matrix.shape)
        # 4 rows/cols of zeros before and 5 after, so a 10 x 10 window exists
        # for every diagonal position of the original matrix.
        padded_matrix = np.pad(big_matrix, ((4, 5), (4, 5)), 'constant')
        print(padded_matrix.shape)

        # Zero-based positions labelled positive: first column of the
        # boundary file minus one.  A set lookup replaces the former running
        # cursor, which stopped advancing for good as soon as one entry could
        # not be matched (index < 1, duplicate or out-of-order line) and which
        # raised NameError when the boundary file was empty.
        boundary_bins = {int(fields[0]) - 1 for fields in boundary_rows if fields}

        n_bins = len(big_matrix)
        all_data = np.empty((n_bins, 101), dtype='float32')
        for idx in range(n_bins):
            window = padded_matrix[idx:idx + 10, idx:idx + 10]
            all_data[idx, :100] = window.ravel()
            all_data[idx, 100] = 1.0 if idx in boundary_bins else 0.0
        print(all_data.shape)

        np.save('/home/wangxiaoyan/deepTAD/work_public/samples-generate/train/HIC002_chr%s-matrix.npy' % chrom, all_data)


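## Chr1-12: balance the samples — 4 random negatives per original positive; positives are doubled by a 90° rotation, giving a 1:2 positive-to-negative ratio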

In [None]:
import numpy as np

# Chromosomes whose per-chromosome sample files are loaded below (1..12).
chrs = list(range(1, 13))
# Collects one balanced sample array per chromosome; concatenated after the loop.
train_samples = []

def rotate_matrix(matrix):
    """Return *matrix* rotated by 90 degrees clockwise."""
    # Three counter-clockwise quarter turns equal one clockwise quarter turn.
    quarter_turns = 3
    return np.rot90(matrix, quarter_turns)

# Build the balanced training set: per chromosome keep every positive sample,
# add a rotated copy of each positive, and draw 4 negatives per original
# positive at random.  `chrom` instead of `chr` keeps the builtin chr() usable.
for chrom in chrs:
    chrom_data = np.load('/home/wangxiaoyan/deepTAD/work_public/samples-generate/train/HIC002_chr%s-matrix.npy' % chrom)
    # The last column is the label; the first 100 columns are a flattened
    # 10 x 10 window.
    pos_samples = chrom_data[chrom_data[:, -1] == 1]
    neg_samples = chrom_data[chrom_data[:, -1] == 0]

    # Augmentation: one rotated copy of every positive window, labelled 1.
    rotated_matrices = [
        rotate_matrix(sample[:-1].reshape((10, 10))) for sample in pos_samples
    ]
    rotated_rows = np.hstack((
        np.array(rotated_matrices).reshape((-1, 100)),
        np.ones((len(rotated_matrices), 1)),
    ))
    rotated_data = np.vstack((pos_samples, rotated_rows))

    # Sampling without replacement raises ValueError when more items are
    # requested than exist, so cap the request at the available negatives.
    wanted_neg = len(pos_samples) * 4
    n_neg = min(wanted_neg, len(neg_samples))
    if n_neg < wanted_neg:
        print('chr%s: only %d negative samples available, %d requested' % (chrom, n_neg, wanted_neg))
    random_indices = np.random.choice(len(neg_samples), size=n_neg, replace=False)
    relneg_samples = neg_samples[random_indices]
    train_samples.append(np.concatenate((rotated_data, relneg_samples), axis=0))

train_samples = np.concatenate(train_samples, axis=0)
np.save('/home/wangxiaoyan/deepTAD/work_public/samples-generate/data enhancement/HIC002_train_data-1bi2.npy', train_samples)

## chr13-19: generation of all positive and negative samples for the validation set

In [None]:
import numpy as np

def openreadtxt(file_name):
    """Return the content of a text file as a list of token lists.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list with one entry per line; each entry is the list of
        whitespace-separated string tokens found on that line.
    """
    with open(file_name, 'r') as handle:
        # Splitting on whitespace also drops the line terminator.
        return list(map(str.split, handle))

if __name__ == '__main__':
    # For every validation chromosome: cut one 10 x 10 window per diagonal bin
    # out of the zero-padded contact matrix, flatten it to 100 values, append
    # a 0/1 label and save the resulting (n_bins, 101) float32 array.
    # `chrom` instead of `chr` so the builtin chr() is not shadowed.
    for chrom in range(13, 20):
        # NOTE(review): this path previously pointed at 'work_public3'.  Every
        # other cell of this file reads and writes 'work_public', which is
        # where the boundary-generation cell stores chr*.txt - confirm.
        boundary_rows = openreadtxt('/home/wangxiaoyan/deepTAD/work_public/boundary-generate/HIC002/chr%s.txt' % chrom)
        # Parse the matrix to numbers once instead of padding and slicing an
        # array of strings and converting only at the very end.
        big_matrix = np.array(
            openreadtxt('/home/wangxiaoyan/deepTAD/prepare_data/HIC002/HIC002_25k_KR.chr%s' % chrom),
            dtype='float32',
        )
        print(big_matrix.shape)
        # 4 rows/cols of zeros before and 5 after, so a 10 x 10 window exists
        # for every diagonal position of the original matrix.
        padded_matrix = np.pad(big_matrix, ((4, 5), (4, 5)), 'constant')
        print(padded_matrix.shape)

        # Zero-based positions labelled positive: first column of the
        # boundary file minus one.  A set lookup replaces the former running
        # cursor, which stopped advancing for good as soon as one entry could
        # not be matched (index < 1, duplicate or out-of-order line) and which
        # raised NameError when the boundary file was empty.
        boundary_bins = {int(fields[0]) - 1 for fields in boundary_rows if fields}

        n_bins = len(big_matrix)
        all_data = np.empty((n_bins, 101), dtype='float32')
        for idx in range(n_bins):
            window = padded_matrix[idx:idx + 10, idx:idx + 10]
            all_data[idx, :100] = window.ravel()
            all_data[idx, 100] = 1.0 if idx in boundary_bins else 0.0
        print(all_data.shape)

        np.save('/home/wangxiaoyan/deepTAD/work_public/samples-generate/validation/HIC002_chr%s-matrix.npy' % chrom, all_data)

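## Chr13-19: balance the samples — 4 random negatives per original positive; positives are doubled by a 90° rotation, giving a 1:2 positive-to-negative ratio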

In [None]:
import numpy as np

# Chromosomes whose per-chromosome validation sample files are loaded below (13..19).
chrs = list(range(13, 20))
# Collects one balanced sample array per chromosome; concatenated after the loop.
# (Despite the name, this cell fills it with validation samples.)
train_samples = []

def rotate_matrix(matrix):
    """Rotate *matrix* a quarter turn clockwise and return the result."""
    # k=3 counter-clockwise quarter turns is the same as one clockwise turn.
    clockwise_as_ccw_turns = 3
    return np.rot90(matrix, k=clockwise_as_ccw_turns)

# Build the balanced validation set: per chromosome keep every positive
# sample, add a rotated copy of each positive, and draw 4 negatives per
# original positive at random.  `chrom` instead of `chr` keeps the builtin
# chr() usable.
for chrom in chrs:
    chrom_data = np.load('/home/wangxiaoyan/deepTAD/work_public/samples-generate/validation/HIC002_chr%s-matrix.npy' % chrom)
    # The last column is the label; the first 100 columns are a flattened
    # 10 x 10 window.
    pos_samples = chrom_data[chrom_data[:, -1] == 1]
    neg_samples = chrom_data[chrom_data[:, -1] == 0]

    # Augmentation: one rotated copy of every positive window, labelled 1.
    rotated_matrices = [
        rotate_matrix(sample[:-1].reshape((10, 10))) for sample in pos_samples
    ]
    rotated_rows = np.hstack((
        np.array(rotated_matrices).reshape((-1, 100)),
        np.ones((len(rotated_matrices), 1)),
    ))
    rotated_data = np.vstack((pos_samples, rotated_rows))

    # Sampling without replacement raises ValueError when more items are
    # requested than exist, so cap the request at the available negatives.
    wanted_neg = len(pos_samples) * 4
    n_neg = min(wanted_neg, len(neg_samples))
    if n_neg < wanted_neg:
        print('chr%s: only %d negative samples available, %d requested' % (chrom, n_neg, wanted_neg))
    random_indices = np.random.choice(len(neg_samples), size=n_neg, replace=False)
    relneg_samples = neg_samples[random_indices]
    train_samples.append(np.concatenate((rotated_data, relneg_samples), axis=0))

train_samples = np.concatenate(train_samples, axis=0)

np.save('/home/wangxiaoyan/deepTAD/work_public/samples-generate/data enhancement/HIC002_validation_data-1bi2.npy', train_samples)

### chr20-22 and chrX: generation of prediction data

In [None]:
import numpy as np

def openreadtxt(file_name):
    """Tokenise a text file line by line.

    Args:
        file_name: Path of the text file to read.

    Returns:
        List of rows, one per line of the file; every row is the list of
        whitespace-separated string tokens of that line.
    """
    rows = []
    with open(file_name, 'r') as handle:
        for line in handle:
            # Whitespace splitting removes the trailing newline as well.
            rows.append(line.split())
    return rows

if __name__ == '__main__':
    # For every prediction chromosome: cut one 10 x 10 window per diagonal bin
    # out of the zero-padded contact matrix, flatten it to 100 values, append
    # a 0/1 label and save the resulting (n_bins, 101) float32 array.
    chromosomes = [str(number) for number in range(20, 23)]
    chromosomes.append('X')
    # `chrom` instead of `chr` so the builtin chr() is not shadowed.
    for chrom in chromosomes:
        boundary_rows = openreadtxt('/home/wangxiaoyan/deepTAD/work_public/boundary-generate/HIC002/chr%s.txt' % chrom)
        # Parse the matrix to numbers once instead of padding and slicing an
        # array of strings and converting only at the very end.
        big_matrix = np.array(
            openreadtxt('/home/wangxiaoyan/deepTAD/prepare_data/HIC002/HIC002_25k_KR.chr%s' % chrom),
            dtype='float32',
        )
        print(big_matrix.shape)
        # 4 rows/cols of zeros before and 5 after, so a 10 x 10 window exists
        # for every diagonal position of the original matrix.
        padded_matrix = np.pad(big_matrix, ((4, 5), (4, 5)), 'constant')
        print(padded_matrix.shape)

        # Zero-based positions labelled positive: first column of the
        # boundary file minus one.  A set lookup replaces the former running
        # cursor, which stopped advancing for good as soon as one entry could
        # not be matched (index < 1, duplicate or out-of-order line) and which
        # raised NameError when the boundary file was empty.
        boundary_bins = {int(fields[0]) - 1 for fields in boundary_rows if fields}

        n_bins = len(big_matrix)
        all_data = np.empty((n_bins, 101), dtype='float32')
        for idx in range(n_bins):
            window = padded_matrix[idx:idx + 10, idx:idx + 10]
            all_data[idx, :100] = window.ravel()
            all_data[idx, 100] = 1.0 if idx in boundary_bins else 0.0
        print(all_data.shape)

        np.save('/home/wangxiaoyan/deepTAD/work_public/samples-generate/predict/HIC002_chr%s-matrix.npy' % chrom, all_data)


### Generation of unlabeled prediction samples at different resolutions

### *This code can also be used to generate prediction samples for other cell lines

In [None]:
import numpy as np
# Print arrays in full: with an infinite threshold NumPy never abbreviates
# large arrays with "..." when they are printed.
np.set_printoptions(threshold=np.inf)  
def openreadtxt(file_name):
    """Load a text file as rows of whitespace-separated tokens.

    Args:
        file_name: Path of the text file to read.

    Returns:
        List containing, for each line of the file, the list of its string
        tokens (an empty list for a blank line).
    """
    with open(file_name, 'r') as handle:
        raw_lines = handle.readlines()
    # Drop the line terminator, then split on runs of whitespace.
    return [raw.rstrip('\n').split() for raw in raw_lines]

if __name__ == '__main__':
    # For every chromosome and resolution: cut one 10 x 10 window per diagonal
    # bin out of the zero-padded contact matrix, flatten it to 100 values and
    # save the resulting (n_bins, 100) array.  No label column is added.
    chromosomes = [str(number) for number in range(20, 23)]
    chromosomes.append('X')
    resolutions = [10000, 25000, 50000, 100000]
    # `chrom` instead of `chr` so the builtin chr() is not shadowed.
    for chrom in chromosomes:
        for resolution in resolutions:
            # Resolution in kb for the file names; integer division keeps it
            # an int for the %d placeholders.
            display_reso = resolution // 1000
            # Parse the matrix to float32 up front.  Previously the values
            # stayed strings all the way to np.save, so the .npy file held a
            # unicode array, unlike the float32 files written by the labelled
            # sample generators in this file.
            big_matrix = np.array(
                openreadtxt("/home/wangxiaoyan/deepTAD/prepare_data/HIC002/HIC002_%dk_KR.chr%s" % (display_reso, chrom)),
                dtype='float32',
            )
            print(big_matrix.shape)
            # 4 rows/cols of zeros before and 5 after, so a 10 x 10 window
            # exists for every diagonal position of the original matrix.
            padded_matrix = np.pad(big_matrix, ((4, 5), (4, 5)), 'constant')

            n_bins = len(big_matrix)
            samples = np.empty((n_bins, 100), dtype='float32')
            for idx in range(n_bins):
                samples[idx] = padded_matrix[idx:idx + 10, idx:idx + 10].ravel()
            print(samples.shape)
            np.save('/home/wangxiaoyan/deepTAD/work_public/samples-generate/predict_diffres-withoutlabel/HIC002_chr%s-matrix_%dk.npy' % (chrom, display_reso), samples)
