In [None]:
from Bio import Seq, SeqIO

import sys
sys.path.insert(1, "../")

from src.algorithms import NeedlemanWunschAlgorithm, WatermanSmithAlgorithm

## Score functions

My tests use two variants of the scoring function:
* the first gives 1 for a match, -1 for a mismatch and -2 for a gap
* the second gives 2 for a match, -1 for a mismatch and -1 for a gap

In [2]:
def score1(x, y):
    """Score one aligned pair of symbols: match +1, mismatch -1, gap -2.

    Args:
        x: Symbol from the first sequence, or "-" for a gap.
        y: Symbol from the second sequence, or "-" for a gap.

    Returns:
        int: -2 if either symbol is the gap marker "-", 1 if the symbols
        compare equal, and -1 otherwise.
    """
    # The gap check comes first so that a "-"/"-" pair is scored as a gap, not a match.
    if x == "-" or y == "-":
        return -2
    if x == y:
        return 1
    # Unconditional fallback: every non-gap, non-equal pair is a mismatch, so the
    # function can never fall through and return None.
    return -1

In [3]:
def score2(x, y):
    """Score one aligned pair of symbols: match +2, mismatch -1, gap -1.

    Args:
        x: Symbol from the first sequence, or "-" for a gap.
        y: Symbol from the second sequence, or "-" for a gap.

    Returns:
        int: -1 if either symbol is the gap marker "-", 2 if the symbols
        compare equal, and -1 otherwise.
    """
    # The gap check comes first so that a "-"/"-" pair is scored as a gap, not a match.
    if x == "-" or y == "-":
        return -1
    if x == y:
        return 2
    # Unconditional fallback: every non-gap, non-equal pair is a mismatch, so the
    # function can never fall through and return None.
    return -1

## Insulin comparison

In [4]:
# Read every record of the FASTA file into a list so records can be indexed below.
proteins = list(SeqIO.parse("insulin.fasta", "fasta"))

In [5]:
# Inspect the first record (its id in the output below is INS_HUMAN).
proteins[0]

SeqRecord(seq=Seq('MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKT...YCN', SingleLetterAlphabet()), id='sp|P01308.1|INS_HUMAN', name='sp|P01308.1|INS_HUMAN', description='sp|P01308.1|INS_HUMAN RecName: Full=Insulin; Contains: RecName: Full=Insulin B chain; Contains: RecName: Full=Insulin A chain; Flags: Precursor', dbxrefs=[])

In [6]:
# Inspect the second record (described below as insulin from Mesocricetus auratus).
proteins[1]

SeqRecord(seq=Seq('MTLWMRLLPLLALLVLWEPNPAQAFVNQHLCGSHLVEALYLVCGERGFFYTPKS...YCN', SingleLetterAlphabet()), id='XP_012968060.1', name='XP_012968060.1', description='XP_012968060.1 insulin [Mesocricetus auratus]', dbxrefs=[])

In [7]:
# Keep only the sequences; the species names follow the record descriptions shown above.
human_ins = proteins[0].seq
hamster_ins = proteins[1].seq

### NeedlemanWunschAlgorithm

In [8]:
# One aligner per scoring function, both on the same insulin pair.
# NOTE(review): presumably global (Needleman-Wunsch) alignment; the class lives in src.algorithms and is not visible here.
nwa1 = NeedlemanWunschAlgorithm(human_ins, hamster_ins, score1)
nwa2 = NeedlemanWunschAlgorithm(human_ins, hamster_ins, score2)

In [9]:
# Each calculate_score() call renders a progress bar and yields the score printed below.
print(nwa1.calculate_score())
print(nwa2.calculate_score())

100%|███████████████████████████████████████████████████████████████████████████████| 110/110 [00:00<00:00, 562.82it/s]


74


100%|███████████████████████████████████████████████████████████████████████████████| 110/110 [00:00<00:00, 567.48it/s]


166


In [10]:
# Show the best alignment(s) found with score1; the output is pairs of aligned strings.
nwa1.best_alignments_()

MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN
MTLWMRLLPLLALLVLWEPNPAQAFVNQHLCGSHLVEALYLVCGERGFFYTPKSRRGVEDPQVAQLELGGGPGADDLQTLALEVAQQKRGIVDQCCTSICSLYQLENYCN


In [11]:
# Show the best alignment(s) found with score2; the output is pairs of aligned strings.
nwa2.best_alignments_()

MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN
MTLWMRLLPLLALLVLWEPNPAQAFVNQHLCGSHLVEALYLVCGERGFFYTPKSRRGVEDPQVAQLELGGGPGADDLQTLALEVAQQKRGIVDQCCTSICSLYQLENYCN


### WatermanSmithAlgorithm

In [12]:
# One aligner per scoring function, both on the same insulin pair.
# NOTE(review): presumably local (Smith-Waterman) alignment; the class lives in src.algorithms and is not visible here.
wsa1 = WatermanSmithAlgorithm(human_ins, hamster_ins, score1)
wsa2 = WatermanSmithAlgorithm(human_ins, hamster_ins, score2)

In [13]:
# Each calculate_score() call renders a progress bar and yields the score printed below.
print(wsa1.calculate_score())
print(wsa2.calculate_score())

100%|███████████████████████████████████████████████████████████████████████████████| 110/110 [00:00<00:00, 481.69it/s]


74


100%|███████████████████████████████████████████████████████████████████████████████| 110/110 [00:00<00:00, 506.66it/s]


166


In [14]:
# Show the best alignment(s) found with score1; the recorded output lists two pairs of aligned strings.
wsa1.best_alignments_()

LWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN
LWMRLLPLLALLVLWEPNPAQAFVNQHLCGSHLVEALYLVCGERGFFYTPKSRRGVEDPQVAQLELGGGPGADDLQTLALEVAQQKRGIVDQCCTSICSLYQLENYCN
MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN
MTLWMRLLPLLALLVLWEPNPAQAFVNQHLCGSHLVEALYLVCGERGFFYTPKSRRGVEDPQVAQLELGGGPGADDLQTLALEVAQQKRGIVDQCCTSICSLYQLENYCN


In [15]:
# Show the best alignment(s) found with score2; the recorded output lists one pair of aligned strings.
wsa2.best_alignments_()

MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN
MTLWMRLLPLLALLVLWEPNPAQAFVNQHLCGSHLVEALYLVCGERGFFYTPKSRRGVEDPQVAQLELGGGPGADDLQTLALEVAQQKRGIVDQCCTSICSLYQLENYCN


## Chosen homologies comparison

In [16]:
# Read every record of the FASTA file into a list so records can be indexed below.
genes = list(SeqIO.parse("homologies.fasta", "fasta"))

In [17]:
# Inspect the first record (described below as Homo sapiens chromosome 1).
genes[0]

SeqRecord(seq=Seq('ATGACGATAAAGGCACGGCCTCCAACGAGACCTGTGGGCACGGCCATGTTGGGG...TCA', SingleLetterAlphabet()), id='NC_000001.11:c11806103-11785723', name='NC_000001.11:c11806103-11785723', description='NC_000001.11:c11806103-11785723 Homo sapiens chromosome 1, GRCh38.p13 Primary Assembly', dbxrefs=[])

In [18]:
# Inspect the second record (described below as Rattus norvegicus chromosome 5).
genes[1]

SeqRecord(seq=Seq('CGCCTCCCGCTCTTTCTGTTTCCCTTAGCAACCGACAGCTAAGCCCCGCCTTCT...CAT', SingleLetterAlphabet()), id='NC_005104.4:164844642-164864360', name='NC_005104.4:164844642-164864360', description='NC_005104.4:164844642-164864360 Rattus norvegicus strain mixed chromosome 5, Rnor_6.0', dbxrefs=[])

In [19]:
# Keep only the sequences; the species names follow the record descriptions shown above.
human_gen = genes[0].seq
rat_gen = genes[1].seq

In [20]:
# Length of the human sequence.
# NOTE(review): the record id spans roughly 20 kb of coordinates but the length is 5180 — presumably the FASTA was trimmed; confirm.
len(human_gen)

5180

In [21]:
# Length of the rat sequence, for comparison with the human one.
len(rat_gen)

5180

### NeedlemanWunschAlgorithm

In [22]:
# One aligner per scoring function, both on the same human/rat pair.
# NOTE(review): presumably global (Needleman-Wunsch) alignment; the class lives in src.algorithms and is not visible here.
nwa3 = NeedlemanWunschAlgorithm(human_gen, rat_gen, score1)
nwa4 = NeedlemanWunschAlgorithm(human_gen, rat_gen, score2)

In [23]:
# Slow cell: the recorded run took about 7 minutes for the 5180-long inputs.
print(nwa3.calculate_score())

100%|██████████████████████████████████████████████████████████████████████████████| 5180/5180 [07:21<00:00, 12.04it/s]


-565


In [24]:
# Slow cell: the recorded run took about 7.5 minutes for the 5180-long inputs.
print(nwa4.calculate_score())

100%|██████████████████████████████████████████████████████████████████████████████| 5180/5180 [07:35<00:00, 12.15it/s]


4331


In [25]:
# Show the best alignment(s) found with score1; gaps appear as "-" in the output.
nwa3.best_alignments_()

ATGACGATAAAGGCACGGCCTCCAACGAGACCTGTGGGCACGGCCATGTTGGGGGCGGGGCTTC-CGGTCACCCGCGCCGGTGGTTTCCGCCCTGTAGGCCCGCCTCTCCAGCAACCTGACACCTGCGCCGCGCCCCTTCACTGCGTTCCCCGCCCCTGCAGCGGCCACAGTGGTG-CGGCCGGCGGC-CGA-GCGTTCTGAGTCACCCGGGACTGGAGGGTG-AGTG-ACGGCGAGGCCGGGGTCGCCGGGAGGGAGATCCTGGAGCCGGCAAACAACCTCCCGGGGGCAAGGACGTGCTTGTGGGCGGGGAGCGCTGGAGGCC-GGCCTGCCTCTCTTCTTGGGGGGGGCTGCCGCCTCCCTTGCGCACCCTTCGCGGGATTAGTGTAACTCCCAATGGCTACCACTTCCAGCGACCGCCAACCCTCAAGCGAAGACTGACTTTGGCTCCCTGCCTGG-ACGGAGGGGCCCCTGAGCCAGGGGTGACGATCCCGCCCCTCTGACCGGCCCAGGCCCGTGTCCTCGCCC-CCATCGGTGACTCAGTGACCTGGT--GACT-GGATTCTCG-GCCA----C-CTGGGC-GCCG-AGACGGCTTCCGG--CTC-CTGCCTTTTAAACCTGCCTCCCCGGC-GATCACCTGGAGAAGAGCGCTGGGCCCGGGGCACTGCGGT-CCCTGGCGCCCACTGCGTCCCGCTGCGCACGGGGGTCCGCCGG-GACCTTTCTGGGAGTCGTAGGCTTAGTATCCCAGTGCTTGGCGCAGACT-AGTTGTTCAGTAAGTGGCAGAGGCTTATTTTGAGAGAGTGGCAGCACCTGGCCCTTTGGCGCTCAGTGAATGTTGGCTATCAC-CGTGTGCCAAA-CTCTGGGGAT-ACCCCAGGCAGGACACCGGTCCTGTCTCAGGGAACTGGGGAAAGAGAAAGGAGACAGGCCTTTTCACCCACAGTTACAACCCAGGGTGCTATGGGAGTCCAGCTGATAACGGATAAAT

In [26]:
# Show the best alignment(s) found with score2; gaps appear as "-" in the output.
nwa4.best_alignments_()

ATGACGATAAAGGCACGGCCTCCAAC-GAGACC-TGTGGGCA-CGGCCATGTTGGGGGCGGGGCTTC-C-G---GT-CACCCGCGCCGGTGGTTTCCGCCCTGTAGGC-CC-GCC-TCTCCA-GCA--ACCTGA--CAC----CT-G---CGCCGCGCCCC--TTCACTGCGTTCCCCGC-C-CCT-GCAGC-GGCCACAG-TGGTGCGGCCGGCGGC-C-G-AGCGTTCTGAGTCACC-CGG--GA-C-TGGAGGG--TGAGTGACGGCG-A-GGCCGGG-GTCGCCGGGAGGGAGA-T-CCTGGAGCCGGCAAAC-AACCT--CCCGGGGGCAAGGACGTGCTTGTGGGCGGGGAGC-GCTGGAGGCCGGCC--TGCCTCTCTTCTTGGGGGGGGCTGCCGCCTCCCTTGCGCACCCTTCGCGGGATTAGTGTAACTCCCAATGGCTACCACTTCCAGCGACCGCCAA--CCC-TCAAGCGAAGACTGACTTTGGCTCCCTGCCTGGACGG-AGGG-GCC--CCTGAGCCAGGGGTGACGA-TCCCGCCCCT-CTGA-C-CGGCCCAGGCCCGTGTCCTCGCCCCC-ATC-GGTGACTCAGTGACCTGGTGACTGGATTCTCG-GCCA----C-CTG-G--GCGCCGAGACGGCTTCCGG--CTCCTGCCTTTTAAACCTG-CCTCCCCG--GCGATCACCTGGAGAAGAGCGCTGGGCC-CG--GGG-CACT-GCG---GT-CCCTGGCGCCCACT-GCGTCCCGCTGCGCACGGGGGTCCG-C-CGG-GACCTTTCTGGGAGTCGTAGGCTTAGTATCCCAGTGCTTGGCGCAGACT-AGTTGTTCAGTAAGTGGCAGAGG--CTTATTTTGAGAGAGTGGCAGCACCTGGCCCTTTGGCGCTCAGTGAA-TGTTGGCTAT-C-AC-CGTGTGCCAAA-CTCTGGG---GATACCCCAGGCAGGACACCGGTCCTGTCTCAGGGAACTGGGGAAAG

### WatermanSmithAlgorithm

In [29]:
# One aligner per scoring function, both on the same human/rat pair.
# NOTE(review): presumably local (Smith-Waterman) alignment; the class lives in src.algorithms and is not visible here.
wsa3 = WatermanSmithAlgorithm(human_gen, rat_gen, score1)
wsa4 = WatermanSmithAlgorithm(human_gen, rat_gen, score2)

In [30]:
# Slow cell: the recorded run took about 9 minutes for the 5180-long inputs.
print(wsa3.calculate_score())

100%|██████████████████████████████████████████████████████████████████████████████| 5180/5180 [08:49<00:00, 11.66it/s]


487


In [31]:
# Slow cell: the recorded run took about 13 minutes for the 5180-long inputs.
print(wsa4.calculate_score())

100%|██████████████████████████████████████████████████████████████████████████████| 5180/5180 [13:19<00:00,  6.48it/s]


4590


In [32]:
# Show the best alignment(s) found with score1; gaps appear as "-" in the output.
wsa3.best_alignments_()

CAGTAGCTG-GGTCACTGTGATGATTT-GAATTGAATTCTGTGATGTGTAAGAAGAGCAGCCTGCAAGGCAAGCACAGATGGGGCAGCTTTTGTTCTGAGA---A-ATTC-G--TG-CCCTTACTGAACTTGGGTCTGGCTATTTTTGGAACAT-GGCCAGCATCAAGTTCTAAC-C-C-ACAA-CACG---GTCTTTTTGGAGTAGCATGAATTCAGGAGAAATCTGGCTGCATAGTCAAGCCCTCACCCCTTCCATCCTGTGCACGAACTGTTTCAAGTAACAGATGTTCCAGGCAGAGCCAGCCAGAGTGAGCTGTTCCTTCTCTGGAGGGTGAT-C-TGGT-ATCCCTGAACGCCTGTTGGCCTCATCTCCACCAACCCCTGCAGTCTCTGCCC-CTGAGTCC-CCC-TCCT-TCC---ATCCGCCTCCC--CTT--ACTAGAGCCTCAGCCCTCCCTCCTCGCCTGGAAGCCTTGCCCCCGCCCCCTTGTGCTGGCTGGAGCTCAAGC-CTCTTC-CTTTGTCGCAGCTCCGCCCAGTTGAACACACCCGCTGGGGAAGGTGCCTCTGTTCCCTCCCCACGCACTCTGGGCCTGAGCTGACAGAGATGGACCATCGAAAAGCCAGGGTCCTCCCAGCTGGGCACTACTGCCCCTCGCTAGGAATATGGGCCTCGCAGGTCGGCAGCGTGAGGTCCT-CTGTGCCACCTTCCATCAGGTAGCT-GTCACCGAGGAGCATGTTGCAG-TGCCGGGTGGGGGCTGCCTTGCATGCAAGGAGCCTGGCAGCAGCGGAGGGCAAGGCTTTGAGTGAGGCGGCCCGGACAGCCATAGCTGAGGAGCATGGAGCCACTGGGAGGGGGCAG-TGTCACCTTTTTTGGCCTTCTTCCTGTGTGGAAATACAGCGCCTCCGGCTTGAACCTGCC-ACTCAGGTGTCTTGATGTGTCGGGGGTGTGGCTGCCTGCCCCCTGATGCTCCCTGCCCC-ACCCTGTGCA

In [33]:
# Show the best alignment(s) found with score2; gaps appear as "-" in the output.
wsa4.best_alignments_()

C-CCGCGCCGGTGGTTTCCGCCCTGTAGGCCCGCCTCTCCAGCAACCTGACACCT--GCGCCGCGCCCCTTCACTGCGTTCCCCGCCCCTGCAGCGGCCACAGTGGTGCGGCCGGCGGCCGAGCGTTCTGAGTCACCCGGGACTGGAGGGTGAGTGACGGCGA--GGCCGGGGTCGCCG-GGAGGGAGATC-CTGGAGCCGGCAAACAACCTC-CCGGGGGC-AAGGACGTGCTTGTGGGCGGGGAGCGCTGGAGGCCGGC-CTGCCTCTCTT-CTTGGGGGGGGCTGCCGCCTCCCTTGCGCACCCTTCGCGGGATTAGTGTAACTCCCAATGGCTACCACTTCCAG-CGACCGCCAACCCTCAAGCGAAGACTGACTTTGGCTCCCTGCCTGGACGGAGGG-GCCCCTGAGCCAGGGGTGACGATCCCGCCCCTCTGACCGGCCCAGGCCCGTG-TCCTCGCCCCCATCGGTGACTCAGTGACCTGGTGACTGGATTCTCGGCCACCTGGGCGCCGAGACGGCTTCCGGCTCCTGCCTTTTAAACCTGCCTCCCCGGCGATCACCTGGAGAAGAGCGCTGGGCCCGGGGCACTGCGGTCCC-TGGCGCC-CACTGCGTCC-CGCTGCG-CACGGGGGTCC-GCCGGGACCTTTCTGGGAGTCGTAGGCTTAG-TATCCCAGTGCTTGGC-G-C-AG--A-CTAGT-TG---TTCAGTAAGTGGCAGAGGCTTATTTTGAGAGAGTGGCAGCACCTGGCCCTTTGGCGCTCAGTGAATGTTGGCTATCACCGTGTGCCAAACTCTGGGGATACCCCAGGCAGGACACCGGTCCTGTCTCAGG-GAACTGGGGAAAGAGAAAGGAGACAGGCCTTTTCACCCACAGTTACAACCCAGGGTGCT-ATGGGAGTCCAGCTGATAACGGATAAATCG-TGGGAGTTGGCTTACAAATATGGCACATGCGTGG-CATATACTAG-GAATGCAATAAGTCTTTGA