In [1]:
import pandas as pd
import numpy as np
import os
import pybedtools
from pybedtools import BedTool
from itertools import combinations

In [2]:
# Chromosome directory names processed by this notebook:
# the autosomes chr1..chr22 in numeric order, followed by chrX.
chr_list = [f"chr{number}" for number in range(1, 23)]
chr_list.append("chrX")

In [3]:
def func(key, value):
    """Build every pairwise contact among the rows of a single read.

    Parameters
    ----------
    key : hashable
        Read identifier; copied into the ``read_name`` column of every row.
    value : pandas.DataFrame
        The rows belonging to that read. The columns ``pos``, ``fid`` and
        ``HType`` are read; ``pos`` values must be convertible with ``int()``.
        Rows are paired by position, so the index labels do not matter.

    Returns
    -------
    pandas.DataFrame
        One row per unordered pair of input rows, in
        ``itertools.combinations`` order, with the columns ``read_name``,
        ``chrom1``, ``pos1``, ``frag1``, ``chrom2``, ``pos2``, ``frag2`` and
        ``HType``. ``chrom1``/``chrom2`` hold the placeholder string ``"tmp"``
        (the processing cell of this notebook overwrites them). ``HType`` is
        taken from the first row of each pair. Fewer than two input rows give
        an empty frame that still carries these columns.
    """
    # Pull the needed columns out once instead of slicing a two-row
    # sub-DataFrame for every pair.
    positions = value["pos"].values
    fragments = value["fid"].values
    htypes = value["HType"].values

    contacts = {"read_name": [], "chrom1": [], "pos1": [], "frag1": [],
                "chrom2": [], "pos2": [], "frag2": [], "HType": []}

    for first, second in combinations(range(len(value)), 2):
        contacts["read_name"].append(key)
        contacts["HType"].append(htypes[first])

        contacts["chrom1"].append("tmp")
        contacts["pos1"].append(int(positions[first]))
        contacts["frag1"].append(fragments[first])

        contacts["chrom2"].append("tmp")
        contacts["pos2"].append(int(positions[second]))
        contacts["frag2"].append(fragments[second])

    return pd.DataFrame(contacts)
import multiprocessing
def func1_wrapper(args):
    """Adapter for ``Pool.map``: expand one packed argument tuple into a ``func`` call.

    ``args`` is an iterable of positional arguments (``file_tackle`` passes
    ``(key, value)`` tuples); whatever ``func`` returns is handed back unchanged.
    """
    outcome = func(*args)
    return outcome
def file_tackle(df, processes=20):
    """Expand a table of read rows into pairwise contacts, one read per task.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a ``rid`` column; rows sharing a ``rid`` form one read.
    processes : int, optional
        Number of worker processes in the pool (default 20, the value that
        used to be hard-coded).

    Returns
    -------
    pandas.DataFrame
        Concatenation of the frames returned by ``func`` for every read. When
        ``df`` contains no reads, an empty frame with the contact columns is
        returned.
    """
    # One (read id, rows of that read) task per read; the index is reset so
    # each worker sees rows labelled 0..n-1.
    args_list = [(read_id, read_df.reset_index(drop=True))
                 for read_id, read_df in df.groupby(df.rid)]

    if not args_list:
        # pd.concat([]) raises ValueError, so return an empty table instead.
        # The column names mirror the frame built by `func`.
        return pd.DataFrame(columns=["read_name", "chrom1", "pos1", "frag1",
                                     "chrom2", "pos2", "frag2", "HType"])

    # The former `if __name__ == '__main__':` guard lived inside this function,
    # where it protects nothing and left `result` unbound whenever it was
    # false. Such a guard belongs around the top-level call site.
    with multiprocessing.Pool(processes=processes) as pool:
        result = pool.map(func1_wrapper, args_list)
    return pd.concat(result)

In [4]:
def LoadMatrixReader(filename, Chunksize,  sepstr="\t"):
    """Open ``filename`` as a chunked pandas reader.

    The first line of the file is used as the header, no column becomes the
    index, and iterating the returned reader yields DataFrames of at most
    ``Chunksize`` rows. ``sepstr`` is the field separator (tab by default).
    """
    read_options = dict(
        sep=sepstr,
        chunksize=Chunksize,
        iterator=True,
        header=0,
        index_col=None,
        low_memory=False,
    )
    return pd.read_table(filename, **read_options)

def CompleteDF(df, dfhold, Chunksize):
    """Keep a read from being split across two consecutive chunks.

    Parameters
    ----------
    df : pandas.DataFrame
        The chunk just read; must contain at least one row and a ``rid`` column.
    dfhold : pandas.DataFrame or None
        Rows held back by the previous call (may be empty).
    Chunksize : int
        The chunk size requested from the reader. A chunk with fewer rows
        than this is treated as the final chunk.

    Returns
    -------
    (df, dfhold) : tuple of pandas.DataFrame
        ``df`` is the held rows followed by the chunk. Unless this is the
        final chunk, every row of the chunk's last read is removed from it.
        ``dfhold`` is a copy of those last-read rows, to be passed to the
        next call.
    """
    # Read the id straight from the column: going through the row Series
    # (df.iloc[-1]) can upcast it when the frame mixes numeric dtypes.
    lastread = df["rid"].iloc[-1]

    # Decide "more chunks follow" from the raw chunk, before the held rows
    # are added. Measuring after the concat can mistake a short final chunk
    # for a full one, and its last read would then be held back forever.
    more_chunks_expected = len(df) >= Chunksize

    # Skip the concat when nothing was held, so the chunk's dtypes pass
    # through untouched.
    if dfhold is not None and len(dfhold) > 0:
        df = pd.concat([dfhold, df])

    # Rows of the last read, which may continue in the next chunk.
    is_last_read = df["rid"] == lastread
    dfhold = df.loc[is_last_read, :].copy()
    if more_chunks_expected:
        # Boolean mask instead of drop-by-label: removes exactly the matching
        # rows even if index labels repeat.
        df = df.loc[~is_last_read, :]

    return (df, dfhold)

In [None]:
# Build the pairwise contact table of every chromosome / haplotype.
Chunksize = 2500000  # rows requested from the reader per chunk
Rawdir = "./figure_processing_data/Fig6"

# `files` was never defined in this notebook. The directories under Rawdir are
# per-chromosome (the later cell walks `chr_list` the same way), so the
# chromosome list is used here.
files = chr_list


def _write_contacts(reads, chrom, out_path, first_write):
    """Expand `reads` into contacts and write them to `out_path`.

    The first write truncates the file and emits the header; later writes
    append without a header. Returns the `first_write` flag for the next call.
    Empty input is skipped, because `file_tackle` cannot process a table
    without reads.
    """
    if reads.empty:
        return first_write
    result = file_tackle(reads)
    result['chrom1'] = chrom
    result['chrom2'] = chrom
    modify_list = ['read_name', 'pos1', 'frag1', 'pos2', 'frag2']
    result[modify_list] = result[modify_list].astype('int')
    result.to_csv(out_path, sep="\t", index=False,
                  mode='w' if first_write else 'a', header=first_write)
    return False


for file in files:
    fhappath = f"{Rawdir}/{file}"
    fhaps = ['h1', 'h2']
    for fhap in fhaps:
        paffile = fhappath + '/' + fhap + '_fhap.txt'
        out_path = f"{Rawdir}/{file}/contact_matrix_{fhap}.txt"
        print(f"Loading {paffile}")
        reader = LoadMatrixReader(paffile, Chunksize, sepstr="\t")
        df_hold = pd.DataFrame()
        first_write = True
        hold_pending = False  # True while the held-back read is still unprocessed
        for chunk in reader:
            chunk, df_hold = CompleteDF(chunk, df_hold, Chunksize)
            # CompleteDF either leaves the last read in `chunk` or removes all
            # of its rows; in the second case those rows live only in df_hold.
            hold_pending = (not df_hold.empty) and (
                chunk.empty or chunk["rid"].iloc[-1] != df_hold["rid"].iloc[0]
            )
            first_write = _write_contacts(chunk, file, out_path, first_write)
        if hold_pending:
            # The final chunk still had its last read held back: flush it,
            # otherwise that read would be lost.
            first_write = _write_contacts(df_hold, file, out_path, first_write)
        print(f"write contact_matrix_{fhap}.txt done!")

In [None]:
# Run `contact_matrix` on the directory of every chromosome in `chr_list`.
# NOTE: `contact_matrix` is defined in a later cell of this notebook and
# `Rawdir` in an earlier one; both cells must have been executed before this
# loop, otherwise it stops with a NameError.
for file in chr_list:
    path = f"{Rawdir}/{file}"  # per-chromosome directory under Rawdir
    print(f"Loading {path}")
    contact_matrix(path)

In [None]:
def contact_matrix(path):
    """Re-shape the two per-haplotype matrix tables stored in ``path``.

    Reads ``h1_matrix.txt`` and ``h2_matrix.txt`` (tab-separated, no header
    row), adds six constant-valued columns, keeps input columns 0, 1, 2, 4
    and 5, and writes the eleven resulting columns, without header or index,
    to ``contact_matrix_h1.txt`` / ``contact_matrix_h2.txt`` in the same
    directory. Both inputs are read and re-shaped before either output is
    written. Returns ``None``.
    """
    haplotypes = ("h1", "h2")
    # Columns filled with one fixed value for every row, in insertion order.
    fillers = (
        ("strand1", 0),
        ("fragid1", 0),
        ("strand2", 0),
        ("fragid2", 1),
        ("mapq1", 30),
        ("mapq2", 60),
    )
    # Output column order; integers refer to columns of the header-less input.
    # NOTE(review): this layout resembles Juicer's "medium" pairs format - confirm.
    layout = [0, "strand1", 1, 2, "fragid1", "strand2", 4, 5, "fragid2", "mapq1", "mapq2"]

    tables = [
        pd.read_csv(path + '/' + hap + "_matrix.txt", sep="\t", header=None)
        for hap in haplotypes
    ]
    for table in tables:
        for column, constant in fillers:
            table[column] = constant
    tables = [table[layout] for table in tables]

    for hap, table in zip(haplotypes, tables):
        target = f"{path}/contact_matrix_{hap}.txt"
        print(f"writing to {target}")
        table.to_csv(target, sep="\t", index=None, header=None)