# stitch chromosomal contigs based on nucmer results
Input: a tab-separated pad file (columns: seq_id, revcom, pad; no header row) and a FASTA file containing the contigs to stitch.

In [1]:
import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

# Contig FASTA to read, and where the stitched, gapped chromosome is written.
file_in = 'Broken-to-be-gapped/Bol29_yang_luft_8_chrom.fasta'
file_out = 'Broken-to-be-gapped/Bol29-chrom-gapped.fasta'

# The pad table is tab-separated with no header row, so the three columns are
# named here:
#   seq_id - FASTA record id of the contig
#   revcom - orientation flag; a negative value means "use the reverse complement"
#   pad    - gap size after the contig (>= 0: pad with N's, < 0: overlap to trim)
df_pad = pd.read_csv(
    "Broken-to-be-gapped/Bol29.pad",
    sep="\t",
    header=None,
)
df_pad.columns = ['seq_id', 'revcom', 'pad']

# Quick sanity check: first few rows, then the net pad total over all contigs.
print(df_pad.head())
print(df_pad['pad'].sum())


                          seq_id  revcom   pad
0  unitig_10|quiver|quiver|pilon      -1  1114
1  unitig_18|quiver|quiver|pilon      -1   269
2  unitig_15|quiver|quiver|pilon      -1   -33
3  unitig_19|quiver|quiver|pilon      -1   354
4   unitig_4|quiver|quiver|pilon       1  2737
6841


In [3]:
# Load every contig from the input FASTA, keyed by record id. The context
# manager guarantees the handle is closed once parsing is finished.
with open(file_in, mode='r') as fasta_in:
    seqs = {seq_record.id: seq_record for seq_record in SeqIO.parse(fasta_in, 'fasta')}

# Walk the pad table in row order. Each row contributes one contig (optionally
# reverse-complemented) followed by its gap, or trimmed by its overlap.
# Pieces are collected in a list and joined once at the end, which avoids
# repeatedly re-copying an ever-growing string.
pieces = []
contig_len = 0  # total length of the untrimmed contigs
pad_len = 0     # net pad: N's added minus overlapping bases removed
for index, row in df_pad.iterrows():
    seq_id = row['seq_id']  # not named `id`, to avoid shadowing the builtin
    revcom = row['revcom']
    pad = int(row['pad'])

    record = seqs[seq_id]
    if revcom < 0:
        seq_contig = str(record.reverse_complement().seq)
    else:
        seq_contig = str(record.seq)

    contig_len += len(record.seq)

    # pad or merge
    if pad >= 0:
        # gap: append `pad` N's after the contig
        seq_contig += 'N' * pad
    elif revcom < 0:
        # overlap on a reverse-complemented contig: drop the first |pad| bases
        # NOTE(review): this trims the start of the oriented sequence while
        # the forward case trims the end -- confirm this asymmetry matches
        # how the pad file defines overlaps.
        seq_contig = seq_contig[abs(pad):]
    else:
        # overlap on a forward contig: drop the last |pad| bases
        seq_contig = seq_contig[:pad]

    pieces.append(seq_contig)
    pad_len += pad

seq_str = "".join(pieces)

# An explicit empty description keeps Biopython's "<unknown description>"
# placeholder out of the FASTA header line.
seq_out = SeqRecord(id="Bol29_chromosome_gapped", seq=Seq(seq_str), description="")

# check sum: contig_len + pad_len should equal the stitched length
print(contig_len)
print(pad_len)
print(len(seq_str))

with open(file_out, "w") as f_out:
    f_out.write(seq_out.format('fasta'))


904558
6841
911399
