# stitch chromosomal contigs based on nucmer results
Input: a pad file & a fasta file

Sherwood (April 7, 2023): Remove the 25 bp wraparound at the left end; the left end has a telomere after removal.
There is no telomere consensus at the right end; it has N’s at the right end.

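217_5_chr.pad (tab-separated, no header; columns: contig ID, orientation [negative = reverse-complement the contig], number of N's to insert after the contig):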

unitig_12|quiver|quiver|pilon   1       1629
unitig_14|quiver|quiver|pilon   -1      2601
unitig_1|quiver|quiver|pilon    -1      2382
unitig_3|quiver|quiver|pilon    -1      5523
unitig_0|quiver|quiver|pilon    -1      1272
unitig_2|quiver|quiver|pilon    1       1383
unitig_13|quiver|quiver|pilon   -1      1419
unitig_11|quiver|quiver|pilon   1       820
unitig_15|quiver|quiver|pilon   1       0

In [6]:
import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

# Input multi-FASTA holding the contigs, and the path of the stitched output.
file_in = 'Broken-to-be-gapped/217_5_yang_luft_8_chrom.fasta'
file_out = 'Broken-to-be-gapped/217_5_chrom-gapped-v2.fasta'

# Pad file: tab-separated, no header row, one contig per line in stitching order.
#   seq_id : contig ID, looked up among the FASTA record IDs
#   revcom : orientation; a negative value means "use the reverse complement"
#   pad    : number of N's appended after the contig
df_pad = pd.read_csv("Broken-to-be-gapped/217_5_chr.pad", sep="\t", header=None)
# Assigning the names afterwards also fails loudly if the file does not have
# exactly three columns.
df_pad.columns = ['seq_id', 'revcom', 'pad']

# Total number of N's that will be inserted as gaps.
# (A bare `df_pad.head()` used to sit here; mid-cell its result was discarded,
# so it displayed nothing and has been removed.)
print(df_pad['pad'].sum())

17029


In [7]:
# Number of wrap-around bases trimmed from the left end of the stitched chromosome.
WRAPAROUND_BP = 25

# Index every FASTA record by its ID. The `with` block guarantees the input
# handle is closed once parsing is done (a later duplicate ID overrides an
# earlier one, as before).
with open(file_in, mode='r') as fasta_in:
    seqs = {seq_record.id: seq_record for seq_record in SeqIO.parse(fasta_in, 'fasta')}

# Fail early, with the full list, if the pad file names contigs that the FASTA lacks.
missing_ids = [seq_id for seq_id in df_pad['seq_id'] if seq_id not in seqs]
if missing_ids:
    raise KeyError(f"contigs listed in the pad file but missing from {file_in}: {missing_ids}")

# Walk the pad table in order: append each contig (reverse-complemented when
# revcom is negative) followed by its run of N's. Pieces are collected in a
# list and joined once instead of growing a string character by character.
pieces = []
contig_len = 0
pad_len = 0
for seq_id, revcom, pad in zip(df_pad['seq_id'], df_pad['revcom'], df_pad['pad']):
    contig_seq = seqs[seq_id].seq
    if revcom < 0:
        contig_seq = contig_seq.reverse_complement()
    pieces.append(str(contig_seq))
    contig_len += len(contig_seq)

    # A non-positive pad adds nothing, matching range(pad) being empty.
    n_pad = max(int(pad), 0)
    pieces.append('N' * n_pad)
    pad_len += n_pad

# remove wrap-around left 25 bp:
seq_str = ''.join(pieces)[WRAPAROUND_BP:]

# Sanity check: stitched length = contigs + gaps - trimmed wrap-around.
assert len(seq_str) == max(contig_len + pad_len - WRAPAROUND_BP, 0)

# description="" keeps Biopython's "<unknown description>" placeholder out of
# the FASTA header; the header is now just ">217_5_chrom_gapped".
seq_out = SeqRecord(id="217_5_chrom_gapped", seq=Seq(seq_str), description="")
print(contig_len)
print(pad_len)
print(len(seq_str))

with open(file_out, "w") as f_out:
    f_out.write(seq_out.format('fasta'))


892794
17029
909798
