# stitch chromosomal contigs based on nucmer results
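Input: a pad file and a FASTA file of the contigs. The pad file is tab-separated with three columns per contig, listed in stitching order: contig ID, orientation (1 = use as is, -1 = reverse-complement), and pad length in bp (>= 0: that many N's are appended after the contig; < 0: that many bases are trimmed from the contig as an overlap).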

Sherwood (April 7, 2023): Remove wraparound 2337 bp at left end.  Has telomere after this removal.
Remove wraparound 6099 bp at right end.  Has telomere after this removal.  Has extension very similar to that of 217_5.

Z9.pad:

unitig_22|quiver|pilon  -1      1244
unitig_18|quiver|pilon  1       1191
unitig_27|quiver|pilon  -1      -4
unitig_32|quiver|pilon  1       2086
unitig_25|quiver|pilon  1       2766
unitig_20|quiver|pilon  1       1308
unitig_23|quiver|pilon  -1      2958
unitig_4|quiver|pilon   -1      1893
unitig_11|quiver|pilon  -1      1907
unitig_19|quiver|pilon  -1      371
unitig_16|quiver|pilon  -1      1869
unitig_12|quiver|pilon  1       6222
unitig_31|quiver|pilon  1       1061
unitig_40|quiver|pilon  1       512
unitig_5|quiver|pilon   1       3762
unitig_24|quiver|pilon  1       785
unitig_36|quiver|pilon  1       457
unitig_33|quiver|pilon  1       3320
unitig_17|quiver|pilon  -1      1616
unitig_28|quiver|pilon  -1      2266
unitig_21|quiver|pilon  1       1421
unitig_26|quiver|pilon  1       3429
unitig_10|quiver|pilon  -1      1024
unitig_35|quiver|pilon  1       2306
unitig_13|quiver|pilon  1       0

In [32]:
import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

# The pad table is tab-separated and has no header row, so the three
# columns are named explicitly after loading.
df_pad = pd.read_csv(
    "Broken-to-be-gapped/Z9.pad",
    sep="\t",
    header=None,
)
df_pad.columns = ['seq_id', 'revcom', 'pad']

# Input contigs (FASTA) and the destination of the stitched chromosome.
file_in = 'Broken-to-be-gapped/Z9_yang_luft_7.pilon_polishedMay2020.fasta'
file_out = 'Broken-to-be-gapped/Z9-chrom-gapped-v2.fasta'

# Sanity check: preview the first rows and print the sum of the pad column.
print(df_pad.head())
print(df_pad['pad'].sum())


                   seq_id  revcom   pad
0  unitig_22|quiver|pilon      -1  1244
1  unitig_18|quiver|pilon       1  1191
2  unitig_27|quiver|pilon      -1    -4
3  unitig_32|quiver|pilon       1  2086
4  unitig_25|quiver|pilon       1  2766
45770


In [35]:
# Stitch the contigs listed in df_pad into one gapped chromosome sequence.
#
# Reads:  file_in (FASTA of contigs), df_pad (columns seq_id / revcom / pad).
# Writes: file_out (single-record FASTA, id "Z9_chromosome_gapped").
# Leaves seqs, seq_str, contig_len, pad_len and seq_out defined for later cells.

# Number of bases dropped from each end of the stitched sequence
# (the wrap-arounds: 2337 at left and 6099 at right).
LEFT_WRAPAROUND = 2337
RIGHT_WRAPAROUND = 6099

# Index every FASTA record by its id; the `with` block closes the handle.
with open(file_in, mode='r') as fasta_in:
    seqs = {seq_record.id: seq_record for seq_record in SeqIO.parse(fasta_in, 'fasta')}

pieces = []      # oriented, padded/trimmed contig strings in stitching order
contig_len = 0   # summed length of the untrimmed contigs (check sum)
pad_len = 0      # net pad: gaps add, overlaps (negative pads) subtract
for seq_id, revcom, pad in zip(df_pad['seq_id'], df_pad['revcom'], df_pad['pad']):
    pad = int(pad)  # plain int for string repetition and slicing
    record = seqs[seq_id]

    # Orient the contig: a negative revcom flag means reverse-complement it.
    if revcom < 0:
        seq_contig = str(record.reverse_complement().seq)
    else:
        seq_contig = str(record.seq)

    contig_len += len(record.seq)

    # pad or merge
    if pad >= 0:
        # gap: append `pad` N's after the contig
        seq_contig += 'N' * pad
    elif revcom < 0:
        # overlap on a reverse-complemented contig: drop abs(pad) bases from
        # the start of the oriented string (e.g. pad=-4 keeps [4:]).
        # NOTE(review): confirm this is the side that overlaps the neighbour.
        seq_contig = seq_contig[abs(pad):]
    else:
        # overlap on a forward contig: drop abs(pad) bases from its end
        # (e.g. pad=-4 keeps [:-4]).
        seq_contig = seq_contig[:pad]

    pieces.append(seq_contig)
    pad_len += pad

# Join once instead of growing a string inside the loop.
seq_str = "".join(pieces)

# remove wrap-arounds 2337 at left and 6099 at right; an explicit end index
# stays correct even if the right trim were 0 (a `[:-0]` slice would be empty).
seq_str = seq_str[LEFT_WRAPAROUND:len(seq_str) - RIGHT_WRAPAROUND]

seq_out = SeqRecord(id="Z9_chromosome_gapped", seq=Seq(seq_str))
# check sum: len(seq_str) should equal contig_len + pad_len minus both trims
print(contig_len)
print(pad_len)
print(len(seq_str))

with open(file_out, "w") as f_out:
    f_out.write(seq_out.format('fasta'))


872737
45770
910071
