<a href="https://colab.research.google.com/github/viktoriapatouna/Thesis-work-2025/blob/main/levenshtein_distance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!touch human1.fasta arabidopsis1.fasta

ARABIDOPSIS



>gene1
AYYYYHHHGG
>gene1.1
AYYYYHHHGG
>prot2
GGHYPLSKYYY
>prot3
LLLYY
>prot3.1
LLLYK
YKKK
>gene67
PPLLYYH
>gene67.1
PPLLYYH

HUMAN



>gene1
AYYYYHHHGG
>prot2
GGHYPLSKYYY
>prot3
LLLYY
YKKK
>gene67
PPLLYYH




In [None]:
import numpy as np

def parse_fasta(species_proteins_list):
    """Parse one FASTA file into a ``{header: sequence}`` dictionary.

    Args:
        species_proteins_list: Path of a single FASTA file. (The name is
            kept for backward compatibility; it is one path, not a list.)

    Returns:
        dict: Maps each header line (leading ``>`` removed, surrounding
        whitespace stripped) to its sequence. Sequences that span several
        lines are concatenated; blank lines contribute nothing. If the
        same header appears more than once, the last record wins.

    Raises:
        OSError: If the file cannot be opened.
    """
    sequences = {}
    entry_name = None
    chunks = []
    with open(species_proteins_list, "r") as file:
        for line in file:
            line = line.strip()
            if line.startswith(">"):
                # Compare against None rather than truthiness: a bare ">"
                # header yields an empty name, and that record must still
                # be stored instead of being silently dropped.
                if entry_name is not None:
                    sequences[entry_name] = "".join(chunks)
                entry_name = line[1:]
                chunks = []
            elif entry_name is not None:
                # Text before the first header belongs to no record.
                chunks.append(line)
    if entry_name is not None:
        sequences[entry_name] = "".join(chunks)
    return sequences

def levenshtein(q, t, lowercase=True):
    """Calculate the Levenshtein (edit) distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``q`` into ``t``.

    Only two rows of the dynamic-programming table are kept, as plain
    Python integers, so memory use is proportional to the shorter string
    instead of ``len(q) * len(t)`` floats.

    Args:
        q: First string.
        t: Second string.
        lowercase: If True (default), both strings are lower-cased first,
            making the comparison case-insensitive.

    Returns:
        int: The edit distance between ``q`` and ``t``.

    Raises:
        TypeError: If ``q`` or ``t`` is not a ``str``.
    """
    if not isinstance(q, str):
        raise TypeError("First argument is not a string!")
    if not isinstance(t, str):
        raise TypeError("Second argument is not a string!")

    if lowercase:
        q = q.lower()
        t = t.lower()

    if q == t:
        return 0

    # The distance is symmetric, so keep the shorter string along the row
    # to minimise the size of the two working rows.
    if len(t) > len(q):
        q, t = t, q
    if not t:
        return len(q)

    # previous[j] = distance between the processed prefix of q and t[:j].
    previous = list(range(len(t) + 1))
    for i, q_char in enumerate(q, start=1):
        current = [i]
        for j, t_char in enumerate(t, start=1):
            current.append(
                min(
                    previous[j] + 1,  # drop q_char
                    current[j - 1] + 1,  # add t_char
                    previous[j - 1] + (q_char != t_char),  # match / substitute
                )
            )
        previous = current

    return previous[-1]

def find_identical_sequences(species_proteins_list):
    """Print the Levenshtein distance for every protein pair across FASTA files.

    Each file in ``species_proteins_list`` is parsed once. Every file is then
    compared with itself and with each file that follows it in the list.
    Despite the function name, nothing is filtered: the distance of every
    pair is printed, and a distance of 0 indicates identical sequences.

    Args:
        species_proteins_list: Sequence of FASTA file paths.

    Returns:
        None. Results are written to standard output.
    """
    parsed_data = {path: parse_fasta(path) for path in species_proteins_list}

    for i, query_species in enumerate(species_proteins_list):
        query_proteins = parsed_data[query_species]
        # Start at i so each unordered pair of files is visited only once.
        for j, target_species in enumerate(species_proteins_list[i:], start=i):
            target_proteins = parsed_data[target_species]
            same_file = i == j

            print(f"Comparing {query_species} with {target_species}:")
            for q_idx, (qname, q_seq) in enumerate(query_proteins.items()):
                for t_idx, (tname, t_seq) in enumerate(target_proteins.items()):
                    # Within one file, do not compare an entry with itself.
                    if same_file and q_idx == t_idx:
                        continue

                    distance = levenshtein(q_seq, t_seq)
                    print(f"  {qname} vs {tname}: Distance = {distance}")


# Compare every protein in the two example FASTA files, both within each
# file and across the two files; results are printed below.
species_proteins_list = ["arabidopsis1.fasta", "human1.fasta"]
find_identical_sequences(species_proteins_list)

Comparing arabidopsis1.fasta with arabidopsis1.fasta:
  gene1 vs gene1.1: Distance = 0
  gene1 vs prot2: Distance = 10
  gene1 vs prot3: Distance = 8
  gene1 vs prot3.1: Distance = 9
  gene1 vs gene67: Distance = 8
  gene1 vs gene67.1: Distance = 8
  gene1.1 vs gene1: Distance = 0
  gene1.1 vs prot2: Distance = 10
  gene1.1 vs prot3: Distance = 8
  gene1.1 vs prot3.1: Distance = 9
  gene1.1 vs gene67: Distance = 8
  gene1.1 vs gene67.1: Distance = 8
  prot2 vs gene1: Distance = 10
  prot2 vs gene1.1: Distance = 10
  prot2 vs prot3: Distance = 8
  prot2 vs prot3.1: Distance = 9
  prot2 vs gene67: Distance = 7
  prot2 vs gene67.1: Distance = 7
  prot3 vs gene1: Distance = 8
  prot3 vs gene1.1: Distance = 8
  prot3 vs prot2: Distance = 8
  prot3 vs prot3.1: Distance = 4
  prot3 vs gene67: Distance = 3
  prot3 vs gene67.1: Distance = 3
  prot3.1 vs gene1: Distance = 9
  prot3.1 vs gene1.1: Distance = 9
  prot3.1 vs prot2: Distance = 9
  prot3.1 vs prot3: Distance = 4
  prot3.1 vs gene67: D

In [None]:
import numpy as np

def parse_fasta(species_proteins_list):
    """Parse one FASTA file into a ``{header: sequence}`` dictionary.

    Args:
        species_proteins_list: Path of a single FASTA file. (The name is
            kept for backward compatibility; it is one path, not a list.)

    Returns:
        dict: Maps each header line (leading ``>`` removed, surrounding
        whitespace stripped) to its sequence. Sequences that span several
        lines are concatenated; blank lines contribute nothing. If the
        same header appears more than once, the last record wins.

    Raises:
        OSError: If the file cannot be opened.
    """
    sequences = {}
    entry_name = None
    chunks = []
    with open(species_proteins_list, "r") as file:
        for line in file:
            line = line.strip()
            if line.startswith(">"):
                # Compare against None rather than truthiness: a bare ">"
                # header yields an empty name, and that record must still
                # be stored instead of being silently dropped.
                if entry_name is not None:
                    sequences[entry_name] = "".join(chunks)
                entry_name = line[1:]
                chunks = []
            elif entry_name is not None:
                # Text before the first header belongs to no record.
                chunks.append(line)
    if entry_name is not None:
        sequences[entry_name] = "".join(chunks)
    return sequences

def levenshtein(q, t, lowercase=True):
    """Calculate the Levenshtein (edit) distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``q`` into ``t``.

    Only two rows of the dynamic-programming table are kept, as plain
    Python integers, so memory use is proportional to the shorter string
    instead of ``len(q) * len(t)`` floats.

    Args:
        q: First string.
        t: Second string.
        lowercase: If True (default), both strings are lower-cased first,
            making the comparison case-insensitive.

    Returns:
        int: The edit distance between ``q`` and ``t``.

    Raises:
        TypeError: If ``q`` or ``t`` is not a ``str``.
    """
    if not isinstance(q, str):
        raise TypeError("First argument is not a string!")
    if not isinstance(t, str):
        raise TypeError("Second argument is not a string!")

    if lowercase:
        q = q.lower()
        t = t.lower()

    if q == t:
        return 0

    # The distance is symmetric, so keep the shorter string along the row
    # to minimise the size of the two working rows.
    if len(t) > len(q):
        q, t = t, q
    if not t:
        return len(q)

    # previous[j] = distance between the processed prefix of q and t[:j].
    previous = list(range(len(t) + 1))
    for i, q_char in enumerate(q, start=1):
        current = [i]
        for j, t_char in enumerate(t, start=1):
            current.append(
                min(
                    previous[j] + 1,  # drop q_char
                    current[j - 1] + 1,  # add t_char
                    previous[j - 1] + (q_char != t_char),  # match / substitute
                )
            )
        previous = current

    return previous[-1]

def find_identical_sequences(species_proteins_list):
    """Print the Levenshtein distance for every protein pair across FASTA files.

    Each file in ``species_proteins_list`` is parsed once. Every file is then
    compared with itself and with each file that follows it in the list.
    Despite the function name, nothing is filtered: the distance of every
    pair is printed, and a distance of 0 indicates identical sequences.

    Args:
        species_proteins_list: Sequence of FASTA file paths.

    Returns:
        None. Results are written to standard output.
    """
    parsed_data = {path: parse_fasta(path) for path in species_proteins_list}

    for i, query_species in enumerate(species_proteins_list):
        query_proteins = parsed_data[query_species]
        # Start at i so each unordered pair of files is visited only once.
        for j, target_species in enumerate(species_proteins_list[i:], start=i):
            target_proteins = parsed_data[target_species]
            same_file = i == j

            print(f"Comparing {query_species} with {target_species}:")
            for q_idx, (qname, q_seq) in enumerate(query_proteins.items()):
                for t_idx, (tname, t_seq) in enumerate(target_proteins.items()):
                    # Within one file, do not compare an entry with itself.
                    if same_file and q_idx == t_idx:
                        continue

                    distance = levenshtein(q_seq, t_seq)
                    print(f"  {qname} vs {tname}: Distance = {distance}")


# Run over six species files: a distance is printed for every protein pair,
# within each file and across files, so the output is very long.
# NOTE(review): "Human" has no ".fasta" extension, unlike the other
# entries — confirm this matches the actual file name on disk.
species_proteins_list = ["Human", "arabidopsis.fasta", "c.elegans.fasta", "drosophila.fasta", "mouse.fasta", "s.cerevisiae.fasta"]
find_identical_sequences(species_proteins_list)

Streaming output truncated to the last 5000 lines.
  sp|Q4LDR2|CTXN3_HUMAN Cortexin-3 OS=Homo sapiens OX=9606 GN=CTXN3 PE=1 SV=1 vs sp|Q2V4F0|DF266_ARATH Defensin-like protein 266 OS=Arabidopsis thaliana OX=3702 GN=At1g65352 PE=2 SV=1: Distance = 74
  sp|Q4LDR2|CTXN3_HUMAN Cortexin-3 OS=Homo sapiens OX=9606 GN=CTXN3 PE=1 SV=1 vs sp|Q2V4F3|DEF35_ARATH Defensin-like protein 35 OS=Arabidopsis thaliana OX=3702 GN=At1g64195 PE=2 SV=2: Distance = 67
  sp|Q4LDR2|CTXN3_HUMAN Cortexin-3 OS=Homo sapiens OX=9606 GN=CTXN3 PE=1 SV=1 vs sp|Q2V4F4|DF269_ARATH Defensin-like protein 269 OS=Arabidopsis thaliana OX=3702 GN=At1g64107 PE=3 SV=1: Distance = 76
  sp|Q4LDR2|CTXN3_HUMAN Cortexin-3 OS=Homo sapiens OX=9606 GN=CTXN3 PE=1 SV=1 vs sp|Q2V4F6|DF279_ARATH Putative defensin-like protein 279 OS=Arabidopsis thaliana OX=3702 GN=At1g63535 PE=3 SV=1: Distance = 64
  sp|Q4LDR2|CTXN3_HUMAN Cortexin-3 OS=Homo sapiens OX=9606 GN=CTXN3 PE=1 SV=1 vs sp|Q2V4F7|DF278_ARATH Defensin-like protein 278 OS