# Test alignment algorithm
Take the top results and make an alignment from it

1. Make some fake data
2. Run it through the scoring and search part and get the results
3. align them

## 1. Make fake data

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
prot1seq = 'MALWARSMVTQYP'
prot1name = 'prot1'
prot2seq = 'LLQVYAMEMS'
prot2name = 'prot2'

hybpep = 'LLQVYSMVTQ'
nonhybpep1 = 'MALWAR'
nonhybpep2 = 'AMEMS'

### 1.1 make spectra from these

In [2]:
from src.spectra.gen_spectra import gen_spectrum

hybpepspec = gen_spectrum(hybpep)
nonhybpep1spec = gen_spectrum(nonhybpep1)
nonhybpep2spec = gen_spectrum(nonhybpep2)

## 2. Run the data through the search

In [3]:
from src.database.database import Database

db = Database()
db.add_entry(prot1name, prot1seq)
db.add_entry(prot2name, prot2seq)
db.index()

24 unique kmers


In [4]:
from src.alignment.search import search_database

hybresb, hybresy = search_database(hybpepspec, db)
nonhyb1resb, nonhyb1resy = search_database(nonhybpep1spec, db)
nonhyb2resb, nonhyb2resy = search_database(nonhybpep2spec, db)

## 3. Make alignment

In [32]:
from src.scoring.scoring import score_subsequence
import copy

def align_overlap(b_entry: dict, y_entry: dict) -> list:
    # if there's a gap, fill it with X's for now
    gap_len = y_entry['starting_position'] - b_entry['ending_position']
    if gap_len > 0:
        return b_entry['sequence'] + ['X' for _ in range(gap_len)] + y_entry['sequence']
    # check to see if either fully encompasses the other
    if b_entry['starting_position'] <= y_entry['starting_position'] and b_entry['ending_position'] >= y_entry['ending_position']:
        return b_entry['sequence']
    elif y_entry['starting_position'] <= b_entry['starting_position'] and y_entry['ending_position'] >= b_entry['ending_position']:
        return y_entry['sequence']
    # align the overlap
    else:
        for i in range(len(b_entry['sequence'])):
            if b_entry['sequence'][i:] in y_entry['sequence']:
                overlapped_seq = b_entry['sequence'][i:]
                b_cont = b_entry['sequence'][:i]
                y_cont = y_entry['sequence'][len(overlapped_seq):]
                return b_cont + overlapped_seq + y_cont

def make_alignment(spectrum: list, b_entry: dict, y_entry: dict, actual_sequence='', missing_overlap=5) -> dict:
    if b_entry['protein_name'] == y_entry['protein_name'] and y_entry['starting_position'] - b_entry['ending_position'] < missing_overlap:
        spliced_name = b_entry['protein_name']
    else:
        spliced_name = '{}-{}-hybrid'.format(b_entry['protein_name'], y_entry['protein_name'])
    if 'hybrid' not in spliced_name:
        # find the overlap
        spliced_seq = align_overlap(b_entry, y_entry)
    else:
        spliced_seq = b_entry['sequence'] + y_entry['sequence']
    bscore, yscore = score_subsequence(spectrum, spliced_seq)
    hybrid = 'hybrid' in spliced_name
    print('predicted sequence: {} \t b score: {} \t y score: {} \t actual sequence: {}'.format(spliced_seq, bscore, yscore, actual_sequence))
    return {
        'sequence': spliced_seq,
        'name': spliced_name,
        'score': bscore+yscore,
        'hybrid': hybrid,
        'junction_sequence': spliced_seq if not hybrid else b_entry['sequence'] + '-' + y_entry['sequence']
    }
        

In [33]:
hybalignments = [make_alignment(hybpepspec['spectrum'], hybresb[i], hybresy[i], hybpep) for i in range(min(len(hybresb), len(hybresy)))]
nonhyb1alignments = [make_alignment(nonhybpep1spec['spectrum'], nonhyb1resb[i], nonhyb1resy[i], nonhybpep1) for i in range(min(len(nonhyb1resb), len(nonhyb1resy)))]
nonhyb2alignments = [make_alignment(nonhybpep2spec['spectrum'], nonhyb2resb[i], nonhyb2resy[i], nonhybpep2) for i in range(min(len(nonhyb2resb), len(nonhyb2resy)))]


predicted sequence: LLQVYSMVTQ 	 b score: 1.0 	 y score: 1.0 	 actual sequence: LLQVYSMVTQ
predicted sequence: LWALLQ 	 b score: 0.1 	 y score: 0.075 	 actual sequence: LLQVYSMVTQ
predicted sequence: MALWAR 	 b score: 1.0 	 y score: 1.0 	 actual sequence: MALWAR
predicted sequence: AMEMS 	 b score: 1.0 	 y score: 1.0 	 actual sequence: AMEMS
predicted sequence: ALWMS 	 b score: 0.2 	 y score: 0.35 	 actual sequence: AMEMS
predicted sequence: ARS 	 b score: 0.2 	 y score: 0.2 	 actual sequence: AMEMS


In [34]:
print(hybalignments)

[{'sequence': 'LLQVYSMVTQ', 'name': 'prot2-prot1-hybrid', 'score': 2.0, 'hybrid': True, 'junction_sequence': 'LLQVY-SMVTQ'}, {'sequence': 'LWALLQ', 'name': 'prot1-prot2-hybrid', 'score': 0.175, 'hybrid': True, 'junction_sequence': 'LWA-LLQ'}]


In [11]:
print(hybresb)

{0: {'k': 5, 'sequence': 'LLQVY', 'starting_position': 0, 'ending_position': 4, 'b_score': 0.5, 'y_score': 0.0, 'protein_name': 'prot2'}, 1: {'b_score': 0.1, 'y_score': 0.0, 'k': 3, 'sequence': 'LWA', 'starting_position': 2, 'ending_position': 4, 'protein_name': 'prot1'}}


In [12]:
print(hybresy)

{0: {'k': 5, 'sequence': 'SMVTQ', 'starting_position': 6, 'ending_position': 10, 'b_score': 0.0, 'y_score': 0.5, 'protein_name': 'prot1'}, 1: {'b_score': 0.3, 'y_score': 0.075, 'k': 3, 'sequence': 'LLQ', 'starting_position': 0, 'ending_position': 2, 'protein_name': 'prot2'}}


In [26]:
print(score_subsequence(hybpepspec, 'LLQVY'))

(0.0, 0.0)
