In [1]:
import Bio
from Bio import SeqIO
from Bio.Seq import Seq
import numpy as np
import pandas as pd
from pandas import DataFrame as dtf
import seaborn as sns
import matplotlib.pyplot as plt
import subprocess
import os
from mothur_py import Mothur
from shutil import copy
import warnings
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas FutureWarning/DeprecationWarning
# messages, so API removals only show up as hard errors after an upgrade.
warnings.filterwarnings("ignore")

In [2]:
# The hit table has no header row, so the twelve column names are supplied
# explicitly (they follow the BLAST tabular "outfmt 6" field names).
blast_columns = [
    "qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
    "qstart", "qend", "sstart", "send", "evalue", "bitscore",
]

# Load the tab-separated BLAST hits into a DataFrame.
df = pd.read_csv(
    "../../analysis/blast_results/zymo/zymoseptoria_tritici.outfmt6",
    sep="\t",
    header=None,
    names=blast_columns,
)

In [3]:
# Forward-primer hits with an e-value below 0.01, selected with one
# combined boolean mask over the full hit table.
is_forward_hit = df['qseqid'] == 'Forward_primer'
forward_df = df.loc[is_forward_hit & (df['evalue'] < 0.01)]

In [4]:
# Reverse-primer hits with an e-value below 0.01, selected with one
# combined boolean mask over the full hit table.
is_reverse_hit = df['qseqid'] == 'Reverse_primer'
reverse_df = df.loc[is_reverse_hit & (df['evalue'] < 0.01)]

In [5]:
# Show the forward-primer hits that passed the e-value filter.
forward_df

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,Forward_primer,7,100.0,21,0,0,1,21,1677192,1677212,5.8e-05,42.1
1,Forward_primer,7,100.0,21,0,0,1,21,1685462,1685482,5.8e-05,42.1


In [6]:
# Show the reverse-primer hits that passed the e-value filter.
reverse_df

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
75342,Reverse_primer,7,100.0,17,0,0,1,17,1680031,1680015,0.008,34.2
75343,Reverse_primer,7,100.0,17,0,0,1,17,1688301,1688285,0.008,34.2


In [7]:
# Convert the forward-primer hits into BED-style rows (chrom, chromStart, chromEnd).
#
# A hit may be listed with sstart > send (see the reverse-primer hits), so the
# interval is normalised to (min - 1, max): the smaller coordinate is shifted
# down by one to give the BED start, the larger one is kept as the BED end.
#
# The table is built in a single vectorised step. The earlier row-by-row loop
# looked rows up by label using range(len(...)), which only works when the
# index happens to be 0..n-1, and it grew the frame with DataFrame.append,
# which is quadratic and no longer exists in pandas >= 2.0.
forward_coords = forward_df[['sstart', 'send']]
forward_bed = pd.DataFrame({
    'chrom': forward_df['sseqid'].to_numpy(),
    'chromStart': forward_coords.min(axis=1).to_numpy() - 1,
    'chromEnd': forward_coords.max(axis=1).to_numpy(),
})

In [8]:
# Show the BED-style coordinates derived from the forward-primer hits.
forward_bed

Unnamed: 0,chrom,chromStart,chromEnd
0,7,1677191,1677212
1,7,1685461,1685482


In [9]:
# Convert the reverse-primer hits into BED-style rows (chrom, chromStart, chromEnd).
#
# These hits are listed with sstart > send, so the interval is normalised to
# (min - 1, max): the smaller coordinate is shifted down by one to give the
# BED start, the larger one is kept as the BED end.
#
# The table is built in a single vectorised step instead of appending one row
# per iteration: DataFrame.append is quadratic and no longer exists in
# pandas >= 2.0. The result has a fresh 0..n-1 index, as before.
reverse_coords = reverse_df[['sstart', 'send']]
reverse_bed = pd.DataFrame({
    'chrom': reverse_df['sseqid'].to_numpy(),
    'chromStart': reverse_coords.min(axis=1).to_numpy() - 1,
    'chromEnd': reverse_coords.max(axis=1).to_numpy(),
})

In [10]:
# Show the BED-style coordinates derived from the reverse-primer hits.
reverse_bed

Unnamed: 0,chrom,chromStart,chromEnd
0,7,1680014,1680031
1,7,1688284,1688301


In [11]:
# Region enclosed by each primer pair: it starts where the forward-primer hit
# ends and stops where the reverse-primer hit begins.
#
# Hits are paired by position (first forward hit with first reverse hit, and
# so on). This works for any number of pairs rather than exactly two, and
# fails loudly when the two tables cannot be paired one-to-one.
if len(forward_bed) != len(reverse_bed):
    raise ValueError(
        "Cannot pair primer hits: %d forward vs %d reverse"
        % (len(forward_bed), len(reverse_bed))
    )

intervals = pd.DataFrame({
    'chrom': forward_bed['chrom'].to_numpy(),
    'chromStart': forward_bed['chromEnd'].to_numpy(),
    'chromEnd': reverse_bed['chromStart'].to_numpy(),
}).infer_objects()  # restore integer dtypes if the inputs were object columns

In [12]:
# Show the regions spanned between each forward/reverse primer pair.
intervals

Unnamed: 0,chrom,chromStart,chromEnd
0,7,1677212,1680014
1,7,1685482,1688284


In [13]:
# Write the intervals as a tab-separated BED file.
# BED has no header row and its first three fields must be chrom, chromStart,
# chromEnd, so neither the column names nor the DataFrame index are written
# (the index would otherwise become the first field of every line).
# NOTE(review): presumably this file is the input used to extract
# bed_output.fasta — confirm nothing downstream relied on the extra column.
intervals.to_csv(
    '../../analysis/blast_results/zymo/BED/intervals.bed',
    sep='\t',
    header=False,
    index=False,
)

In [14]:
# Read the FASTA records and index them by record id.
# NOTE(review): this is an absolute, machine-specific path, unlike the
# relative paths used elsewhere in the notebook — consider a shared base dir.
fasta_records = SeqIO.parse(
    "/media/MassStorage/tmp/TE/summer_project/analysis/blast_results/zymo/BED/bed_output.fasta",
    "fasta",
)
gdict = SeqIO.to_dict(fasta_records)

In [15]:
# Collect the sequence of every FASTA record, in the dictionary's order.
seqs = [record.seq for record in gdict.values()]

In [16]:
# Global alignment of the first two sequences. Scoring scheme:
# +1 per match, -1 per mismatch, -1 to open a gap, 0 to extend it.
match_score, mismatch_score = 1, -1
gap_open_score, gap_extend_score = -1, 0
alignments = pairwise2.align.globalms(
    seqs[0],
    seqs[1],
    match_score,
    mismatch_score,
    gap_open_score,
    gap_extend_score,
)

In [17]:
# Fail when the two regions are identical.
# Each pairwise2 alignment is (seqA, seqB, score, begin, end). With +1 per
# match and no positive contribution from mismatches or gaps, the score equals
# the aligned length only when every column is an exact match.
# The check previously compared the length with element 4 (`end`), which for a
# global alignment is always the aligned length, so it failed unconditionally.
best_alignment = alignments[0]
aligned_length = len(best_alignment[0])
alignment_score = best_alignment[2]
assert alignment_score != aligned_length, "The two regions of the genome are identical"

AssertionError: The two regions of the genome are identical

In [20]:
# Render the first alignment as text and print it.
best_alignment_text = format_alignment(*alignments[0])
print(best_alignment_text)

GCGGTAATTCCAGCTCCAATAGCGTATATTAAAGTTGTTGCAGTTAAAAAGCTCGTAGTTGAACCTTGGGCCTGGCTGGCCGGTCCGCCTCACCGCGTGTACTGGTCCGGCCGGGCCTTTCCTTCTGGGGAGCCGCATGCCCTTCACTGGGCGTGTCGGGGAACCAGGACTTTTACTTTGAAAAAATTAGAGTGTTCAAAGCAGGCCTTTGCTCGAATACATTAGCATGGAATAATAGAATAGGACGTGTGGTTCTATTTTGTTGGTTTCTAGGACCGCCGTAATGATTAATAGGGATAGTCGGGGGCATCCGTATTCAATTGTCAGAGGTGAAATTCTTGGATTTATTGAAGACGAACTACTGCGAAAGCATTTGCCAAGGATGTTTTCATTAATCAGTGAACGAAAGTTAGGGGATCGAAGACGATCAGATACCGTCGTAGTCTTAACCATAAACTATGCCGACTAGGGATCGGTGGATGTTATCTTTTTGACTCCATCGGCACCTTACGAGAAATCAAAGTTTTTGGGTTCTGGGGGGAGTATGGTCGCAAGGCTGAAACTTAAAGAAATTGACGGAAGGGCACCACCAGGCGTGGAGCCTGCGGCTTAATTTGACTCAACACGGGGAAACTCACCAGGTCCAGACACAAGTAGGATTGACAGATTGAGAGCTCTTTCTTGATTTTGTGGGTGGTGGTGCATGGCCGTTCTTAGTTGGTGGAGTGATTTGTCTGCTTAATTGCGATAACGAACGAGACCTTAACCTGCTAAATAGCCAGGCCCGCTTTGGCGGGTCGCCGGCTTCTTAGAGGGACTATCGGCTCAAGCCGATGGAAGTTTGAGGCAATAACAGGTCTGTGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACACTGACGGAGCCAACGAGTTCATCACCTTGGCCGAAAGGTCTGGGTAATCTTGTTAAACTCCGTCGTGCTGGGGATAGAGCATTGCAATTATTGCTCTTCAA