In [5]:
import pandas as pd
import numpy as np
import os
import glob
import ast
import time
from Bio import Align
THRESHOLD = 0.9
import sys

def edit_distance(str1, str2):
    """Return the Levenshtein edit distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``str1`` into ``str2``.

    Only two rows of the dynamic-programming table are alive at any time,
    so memory is O(min(len(str1), len(str2))) rather than the full
    (len(str1) + 1) x (len(str2) + 1) matrix. The result is identical.

    Args:
        str1: First sequence (e.g. a string).
        str2: Second sequence.

    Returns:
        int: The edit distance; 0 when both inputs are equal.
    """
    # The distance is symmetric, so keep the shorter sequence on the column
    # axis to make the rolling rows as small as possible.
    if len(str2) > len(str1):
        str1, str2 = str2, str1

    # Row for the empty prefix of str1: "" -> str2[:j] costs j insertions.
    previous = list(range(len(str2) + 1))

    for i, ch1 in enumerate(str1, start=1):
        # Column 0: str1[:i] -> "" costs i deletions.
        current = [i]
        for j, ch2 in enumerate(str2, start=1):
            if ch1 == ch2:
                # Matching last elements cost nothing extra.
                current.append(previous[j - 1])
            else:
                current.append(1 + min(current[j - 1],    # insert
                                       previous[j],       # remove
                                       previous[j - 1]))  # replace
        previous = current

    return previous[-1]

def similarity_seqs(seq1, seq2):
    """Return an edit-distance based similarity between two sequences.

    The score is ``1 - edit_distance(seq1, seq2) / min(len(seq1), len(seq2))``.
    Identical sequences score 1.0. Because the distance is divided by the
    length of the *shorter* sequence, the score is not bounded below by 0:
    it becomes negative when more edits are needed than the shorter
    sequence has elements.

    Empty input is handled explicitly because the formula would otherwise
    divide by zero: two empty sequences are treated as identical (1.0) and
    an empty sequence compared with a non-empty one scores 0.0.

    Args:
        seq1: First sequence (e.g. a string).
        seq2: Second sequence.

    Returns:
        float: The similarity score described above.
    """
    # An alignment-score variant based on Bio.Align.PairwiseAligner was
    # sketched here previously; the edit-distance formula is the one in use.
    shorter_len = min(len(seq1), len(seq2))
    if shorter_len == 0:
        return 1.0 if len(seq1) == len(seq2) else 0.0
    return 1 - (edit_distance(seq1, seq2) / shorter_len)

def similarity_score_vecs(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Computed as the dot product of ``vec1`` and ``vec2`` divided by the
    product of their Euclidean norms. No guard is applied for zero-norm
    vectors, in which case the denominator is zero.

    Args:
        vec1: First vector (array-like accepted by ``np.dot``).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The cosine similarity as a NumPy scalar.
    """
    inner_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return inner_product / norm_product




In [6]:
# Wall-clock start; the processing loop reports elapsed time relative to it.
t1 = time.time()

# Locate the paired-sequence CSV files to analyse.
path = os.getcwd() + '/../paired_data_sim_seqs/'
naming_format = '*_pairs.csv'
# glob returns matches in arbitrary (filesystem-dependent) order; sorting
# makes the processing order, and therefore the progress log, reproducible.
files = sorted(glob.glob(f'{path}/{naming_format}'))

# Accumulators filled by the processing loop, one entry per CSV row.
sim_s = []      # sequence similarity scores
sim_v = []      # values read from the 'sim_vecs' column
delta_km = []   # values read from the 'value_diff' column

# Number of files processed so far.
j = 0


In [7]:
# Number of CSV files matched by the glob pattern.
len(files)

2587

In [None]:
# Recompute the sequence similarity of every pair in every CSV file, write it
# back into the file's 'sim_seqs' column, and collect the per-row values in
# the global accumulators sim_s / sim_v / delta_km.
nums = 0  # not read in the visible cells; kept in case another cell uses it
for file in files:
    df = pd.read_csv(file)

    # One similarity score per (seq_w, seq_m) row, in row order.
    seq_scores = [
        similarity_seqs(seq_w, seq_m)
        for seq_w, seq_m in zip(df['seq_w'], df['seq_m'])
    ]
    # Assign the whole column at once instead of cell-by-cell via iterrows.
    df['sim_seqs'] = pd.Series(seq_scores, index=df.index, dtype=float)

    sim_s.extend(seq_scores)
    sim_v.extend(df['sim_vecs'].tolist())
    delta_km.extend(df['value_diff'].tolist())

    # index=False: the file is read back without index_col, so writing the
    # index would add one more unnamed column on every rerun.
    df.to_csv(file, index=False)

    j += 1
    if j % 10 == 0:
        print(f"Processed {j} files and time elapsed: {time.time()-t1}")




Processed 120 files and time elapsed: 2253.2428987026215
Processed 130 files and time elapsed: 2360.0298643112183
Processed 140 files and time elapsed: 2367.470332622528
Processed 150 files and time elapsed: 2380.6686141490936
Processed 160 files and time elapsed: 2411.3219137191772
Processed 170 files and time elapsed: 2448.2720518112183
Processed 180 files and time elapsed: 2454.405106306076
Processed 190 files and time elapsed: 2461.0572085380554
Processed 200 files and time elapsed: 2486.7208683490753
Processed 210 files and time elapsed: 2520.399113178253
Processed 220 files and time elapsed: 2542.006586790085
Processed 230 files and time elapsed: 2635.7387948036194
Processed 240 files and time elapsed: 2712.2564821243286
Processed 250 files and time elapsed: 2715.508184194565
Processed 260 files and time elapsed: 2718.3499619960785
Processed 270 files and time elapsed: 2773.8002276420593
Processed 280 files and time elapsed: 2797.237571954727
Processed 290 files and time elapsed:

In [None]:
# Summary statistics, correlations and plots for the collected scores.
import logging

import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.metrics import r2_score

# `logger` is not defined in any visible cell; fall back to a module logger so
# this cell does not die with NameError on a fresh kernel. A logger that was
# already set up elsewhere is left untouched.
try:
    logger
except NameError:
    logger = logging.getLogger(__name__)

# All artefacts go to ./plots; create it so the saves below cannot fail on a
# missing directory.
plots_dir = os.getcwd() + '/plots'
os.makedirs(plots_dir, exist_ok=True)

# --- means -----------------------------------------------------------------
mean_sim_s = np.mean(sim_s)
mean_sim_v = np.mean(sim_v)
mean_delta_km = np.mean(delta_km)

print(f"Mean similarity score for seqs: {mean_sim_s}")
logger.info(f"Mean similarity score for seqs: {mean_sim_s}")
print(f"Mean similarity score for vecs: {mean_sim_v}")
logger.info(f"Mean similarity score for vecs: {mean_sim_v}")
print(f"Mean delta km: {mean_delta_km}")
logger.info(f"Mean delta km: {mean_delta_km}")

# --- Pearson correlations between the similarity scores and delta km --------
corr, _ = pearsonr(sim_s, delta_km)
print(f"Pearson correlation- sim_s - del_km: {corr}")
logger.info(f"Pearson correlation- sim_s - del_km: {corr}")
corr, _ = pearsonr(sim_v, delta_km)
print(f"Pearson correlation- sim_v - del_km: {corr}")
logger.info(f"Pearson correlation - sim_v - del_km: {corr}")

# ... and between the two similarity scores themselves.
corr, _ = pearsonr(sim_s, sim_v)
print(f"Pearson correlation sim_s and sim_v: {corr}")
logger.info(f"Pearson correlation sim_s and sim_v: {corr}")

# --- R2 ---------------------------------------------------------------------
# NOTE(review): r2_score is not symmetric; sim_v is passed as the first
# (y_true) argument and sim_s as the second (y_pred) — confirm this is the
# intended direction.
r2 = r2_score(sim_v, sim_s)
print(f"R2 score sim_s and sim_v: {r2}")
logger.info(f"R2 score between sim_s and sim_v: {r2}")

# --- persist the raw values -------------------------------------------------
np.savez(plots_dir + '/similarity_scores.npz', sim_s = sim_s, sim_v = sim_v, delta_km = delta_km)

# --- plots ------------------------------------------------------------------
# Each plot gets its own figure. Drawing all of them through the implicit
# pyplot figure would overlay the later plots on the earlier ones, so the
# saved sim_v.png and scatter PNG would also contain the previous histograms.
fig_s, ax_s = plt.subplots()
ax_s.hist(sim_s, bins=20)
ax_s.set_xlabel('Similarity score seq')
fig_s.savefig(plots_dir + '/sim_s.png')

fig_v, ax_v = plt.subplots()
ax_v.hist(sim_v, bins=20)
ax_v.set_xlabel('Similarity score vec')
fig_v.savefig(plots_dir + '/sim_v.png')

fig_sv, ax_sv = plt.subplots()
ax_sv.scatter(sim_s, sim_v)
ax_sv.set_xlabel('Similarity score seq')
ax_sv.set_ylabel('Similarity score vec')
ax_sv.set_title('Similarity score seq vs vec')
fig_sv.savefig(plots_dir + '/sim_s_vs_sim_v.png')

In [None]:
# Scratch check: similarity of two hard-coded example sequences over their
# full length.
seq1 = "MRAVRLVEIGKPLSLQEIGVPKPKGPQVLIKVEAAGVCHSDVHMRQGRFGNLRIVEDLGVKLPVTLGHEIAGKIEEVGDEVVGYSKGDLVAVNPWQGEGNCYYCRIGEEHLCDSPRWLGINFDGAYAEYVIVPHYKYMYKLRRLNAVEAAPLTCSGITTYRAVRKASLDPTKTLLVVGAGGGLGTMAVQIAKAVSGATIIGVDVREEAVEAAKRAGADYVINASMQDPLAEIRRITESKGVDAVIDLNNSEKTLSVYPKALAKQGKYVMVGLFGADLHYHAPLITLSEIQFVGSLVGNQSDFLGIMRLAEAGKVKPMITKTMKLEEANEAIDNLENFKAIGRQVLIP"
seq2 = "MSIPETQKGVIFYESHGKLEYKDIPVPKPKANELLINVKYSGVCHTDLHAWHGDWPLPTKLPLVGGHEGAGVVVGMGENVKGWKIGDYAGIKWLNGSCMACEYCELGNESNCPHADLSGYTHDGSFQEYATADAVQAAHIPQGTDLAEVAPVLCAGITVYKALKSANLMAGHWVAISGAAGGLGSLAVQYAKAMGYRVLGIDGGEGKEELFRSIGGEVFIDFTKEKDIVGAVLKATDGGAHGVINVSVSEAAIEASTRYVRANGTTVLVGMPAGAKCCSDVFNQVVKSISIVGSYVGNRADTREALDFFARGLVKSPIKVVGLSTLPEIYEKMEKGQIVGRYVVDTSK"

# 1 - edit_distance / min(len), as defined by similarity_seqs.
score = similarity_seqs(seq1,seq2)

print(f"the score is {score}")

In [None]:
# Scratch cell: defines two more example sequences (seq_a, seq_b), redefines
# seq1/seq2, and keeps the tails of seq1/seq2 from index 300 onward.
seq_a = "MATATVLEKANIGVFTNTKHDLWVADAKPTLEEVKNGQGLQPGEVTIEVRSTGICGSDVHFWHAGCIGPMIVTGDHILGHESAGQVVAVAPDVTSLKPGDRVAVEPNIICNACEPCLTGRYNGCENVQFLSTPPVDGLLRRYVNHPAIWCHKIGDMSYEDGALLEPLSVSLAGIERSGLRLGDPCLVTGAGPIGLITLLSARAAGASPIVITDIDEGRLEFAKSLVPDVRTYKVQIGLSAEQNAEGIINVFNDGQGSGPGALRPRIAMECTGVESSVASAIWSVKFGGKVFVIGVGKNEMTVPFMRLSTWEIDLQYQYRYCNTWPRAIRLVRNGVIDLKKLVTHRFLLEDAIKAFETAANPKTGAIKVQIMSSEDDVKAASAGQKI"
seq_b = "MATATVLEKANIGVFTNTKHDLWVADAKPTLEEVKNGQGLQPGEVTIEVRSTGICGSDVHFWHAGCIGPFIVTGDHILGHESAGQVVAVAPDVTSLKPGDRVAVEPNIICNACEPCLTGRYNGCENVQFLSTPPVDGLLRRYVNHPAIWCHKIGDMSYEDGALLEPLSVSLAGIERSGLRLGDPCLVTGAGPIGLITLLSARAAGASPIVITDIDEGRLEFAKSLVPDVRTYKVQIGLSAEQNAEGIINVFNDGQGSGPGALRPRIAMECTGVESSVASAIWSVKFGGKVFVIGVGKNEMTVPFMRLSTWEIDLQYQYRYCNTWPRAIRLVRNGVIDLKKLVTHRFLLEDAIKAFETAANPKTGAIKVQIMSSEDDVKAASAGQKI"
seq1 = "MRAVRLVEIGKPLSLQEIGVPKPKGPQVLIKVEAAGVCHSDVHMRQGRFGNLRIVEDLGVKLPVTLGHEIAGKIEEVGDEVVGYSKGDLVAVNPWQGEGNCYYCRIGEEHLCDSPRWLGINFDGAYAEYVIVPHYKYMYKLRRLNAVEAAPLTCSGITTYRAVRKASLDPTKTLLVVGAGGGLGTMAVQIAKAVSGATIIGVDVREEAVEAAKRAGADYVINASMQDPLAEIRRITESKGVDAVIDLNNSEKTLSVYPKALAKQGKYVMVGLFGADLHYHAPLITLSEIQFVGSLVGNQSDFLGIMRLAEAGKVKPMITKTMKLEEANEAIDNLENFKAIGRQVLIP"
seq2 = "MSIPETQKGVIFYESHGKLEYKDIPVPKPKANELLINVKYSGVCHTDLHAWHGDWPLPTKLPLVGGHEGAGVVVGMGENVKGWKIGDYAGIKWLNGSCMACEYCELGNESNCPHADLSGYTHDGSFQEYATADAVQAAHIPQGTDLAEVAPVLCAGITVYKALKSANLMAGHWVAISGAAGGLGSLAVQYAKAMGYRVLGIDGGEGKEELFRSIGGEVFIDFTKEKDIVGAVLKATDGGAHGVINVSVSEAAIEASTRYVRANGTTVLVGMPAGAKCCSDVFNQVVKSISIVGSYVGNRADTREALDFFARGLVKSPIKVVGLSTLPEIYEKMEKGQIVGRYVVDTSK"
# Tails of both sequences, starting at index 300.
seq1_ = seq1[300:]
seq2_ = seq2[300:]

# NOTE(review): the tail comparison below is commented out, so `score` is not
# reassigned in this cell and the print shows whatever value `score` already
# holds in the kernel (or raises NameError if it was never set).
#score = similarity_seqs(seq1_,seq2_)

print(f"the score is {score}")

In [None]:
# Display the tail of seq2 (seq2[300:]) for inspection.
seq2_

In [None]:
# Replace the seq1 tail with a hand-written sequence; this overrides the
# seq1[300:] slice assigned to seq1_ in an earlier cell.
seq1_='DTREALDFFARGLVKSPIKVVGLSTLPELEEANEAIDNLENFKAIGRQVLIP'

In [None]:
# Similarity of the two tail sequences currently bound to seq1_ and seq2_.
score = similarity_seqs(seq1_,seq2_)

In [5]:
# Scratch cell: three further hard-coded example sequences. Only seq_e and
# seq_d are compared here; seq_c is defined but not used in this cell.
seq_c = "MITMAPHQNQFLLFIGVSLVLLSSYATANNSFNRSAFPDDFIFGASAAAYQYEGEANKSGRGPSVWDIFTHEYPEKITDRSNGDEAIDFYHRYKEDIQRMKNMNLDAFRFSISWTRIIPNGQISAGVNQEGIDFYNDLIDELISNGLEPFVTIFHWDSPQGLEDKYTGFLSRSIVKDFQDFAELCYKTFGDRVKYWTTFNEPRAYATRGYDSGLGAPGRCSEWVDRSCEAGNSATEPYIVSHHIILAHAAAVQVYRQKYQASQNGKIGITLNAYWYVPYSNNTVDEEAAQVAFDFFTGWHLDPITYGHYPRTMQALVGDRLPKFTEEEFMVIKGSYDFLGLNYYGAYYAYFNDHPDPNPLHKRYTTDSHVNTTGKRDGKPMGPQGTTSMFNIYPEGIRYLLNYTKDAYRNPTMYITENGYNQDDNGTVPMSILLNDTRRIIYYETHLENVLRSIKEYNVDVKGFIAWSFEDNFEWSSGYTQRFGLYYIDYKNHLERHAKNSTEWFTNFLQKNQSSTISEGSGSRWIRPFGYSIRSAAA"
seq_d = "QSGLYQQCGGIGWTGATTCVSGATCTVLNPYYSQCLPGAATTSVSSSHSSSSSVSSHSSSASSSSISSTSTSPPAPSQTVANVSPEWAAAYVKAQAAVAKLSVTDMVNLATGVQWEKGPCVGNTPAISSIPGFTGLCLQDSPVGVRYADGTSVFPPEINVAATWNRTLMRQRGAAMGAEFKGKGVHVALGPMMNLMRVPAAGRNWEGGGGDPFLSGELAFETITGIQSSGAQACAKHFINNEQEHFRDSSSSNVDDRTEHELYGHPFLRSVQANVASVMCSYNQINGTFSCENEKTLSGLLKGEYGFQGYVMSDWWATHSGAPAVNAGLDMTMPGDETTNSGTTYFGQNLVNAVNSGQVSQARIKDMATRILAAWYLLGQDQNFPAVNFNSWNSGQGQHVNVSGNHASLIRTIGAASQILLKNVNGALPLKKPKTIGIIGNGAGSNPSGPNAFSDRAGDVGVLALGWGSGTANFPYLVAPVDAITARASQDGTTVSSSLSDTDLTGAANTATGKDVAMVFITADSGEGYLTVEGNAGDRNDLQAWHGGDALVQQVASHNKNTIVVINSVGPINMEAWVNHPNVTAIVWSGLPGQEAGNAVTDVLFGAVNPGGKLPFTIGKSISDYSAQIITTGSGIVPIPYNEGLFIDYRHFDQAGIAPRFEFGFGLSYTTFDYSNLVITGSTAGGTRQPPGPGSSLDPWLHDSVVTVSFTLTNNGTVDGTEVPQLYLSPPTSAKSAPQNLKGFDSGFLPAGASTTVSFELSRYSFSVWDVVSQSWQIPAGVTGISVGASSRDLRLKGSITN"
seq_e = "MKIQNILVALTCGLVSQVFATSWSEADEKAKSFMSDLSESEKIDIVTGYMNMQGTCVGNIKPLDRKNFKGLCLQDGPAGVRFNGGTSTTWQAGINNAATFNKDLLYKIGKDQGAEFYAKGINIALAPSMNILRAPASGRVWENFGEDPYLSGVCGAQITKGYQDSGVIVAAKHYVANDIEHNREASSSNMDDQTLMEIHVEPFYRTIKDGDAGSVMASYNAVNNIYVVQNKKVLTEILKEGIGFQGFVMSDWWAIHDLEGSFNAGMDMNMPGGKAWGPDYVNNSFWGSNISNAIRSGQVSSSRLDDAVRRIIRTLYRFDQMSGYPNVNLKAPSMHADTNRQAAIESSVLLKNADDILPLTKKYRKIAIIGKDADKAQSCTDTACSGGNIIQGWGSGTTDFTGISDPITAIKNRASKEGISIVSSISDSANEGANVAKDADVAVVFVRATSGEEYIVVDNNKGDRNNLDLWHGGNDLVKSVAAVNKNTVVVIHAPATVNLPFLNNVKAIIHAGMPGAESGNAIASILFGDSNPSGHLPFTWAAREDYCCDVSYPAELPHGGNSKTAYDYKEGLFVGYRWFDKKNKTPIFPFGHGLSYTTFDYSNLSVSLKKSGTQVTGLEATVTVANTGSYEGATVPMLFLGFPAVSELGDYPVRNLKAFEKVNLKAGEKKTVTLTVDQHGLSYYNTSKKSFVVPTGGEFTVYVGKSAGDLPLKKAIKNTQGTNESSSSVGDENNNNPNNNADCSVNGYKCCSNSNAEVVYTDGDGNWGVENGQWCIIKEQQQQQTCFSIKLGYPCCKGNEVAYTDNDGQWGFENGQWCGIATATSGAGGCPYTSKNGYPVCQTTTKVEYVDSDKWGVENGNWCIMCN"

# Similarity of seq_e and seq_d (1 - edit_distance / min(len)).
sc = similarity_seqs(seq_e,seq_d)

In [6]:
# Display the similarity score computed for seq_e vs seq_d.
sc

0.23566084788029928

In [None]:
# Number of distinct characters that occur in seq2.
st = set(seq2)
print(len(st))

In [None]:
# NOTE: this redefines edit_distance, which is already defined near the top of
# the notebook; the later definition shadows the earlier one once this cell
# runs. Keep the two in sync, or drop this copy.
def edit_distance(str1, str2):
    """Return the Levenshtein edit distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``str1`` into ``str2``.

    Only two rows of the dynamic-programming table are alive at any time,
    so memory is O(min(len(str1), len(str2))) rather than the full
    (len(str1) + 1) x (len(str2) + 1) matrix. The result is identical.

    Args:
        str1: First sequence (e.g. a string).
        str2: Second sequence.

    Returns:
        int: The edit distance; 0 when both inputs are equal.
    """
    # The distance is symmetric, so keep the shorter sequence on the column
    # axis to make the rolling rows as small as possible.
    if len(str2) > len(str1):
        str1, str2 = str2, str1

    # Row for the empty prefix of str1: "" -> str2[:j] costs j insertions.
    previous = list(range(len(str2) + 1))

    for i, ch1 in enumerate(str1, start=1):
        # Column 0: str1[:i] -> "" costs i deletions.
        current = [i]
        for j, ch2 in enumerate(str2, start=1):
            if ch1 == ch2:
                # Matching last elements cost nothing extra.
                current.append(previous[j - 1])
            else:
                current.append(1 + min(current[j - 1],    # insert
                                       previous[j],       # remove
                                       previous[j - 1]))  # replace
        previous = current

    return previous[-1]

# Test the function on the textbook example: kitten -> sitting takes 3 edits
# (two substitutions and one insertion).
str1 = "kitten"
str2 = "sitting"
assert edit_distance(str1, str2) == 3

# Similarity of seq_a and seq_b, normalised by the shorter length. The value
# printed is a similarity (1 - distance / min length), not a raw edit
# distance, and the sequences themselves are too long to echo usefully.
similarity_ab = 1 - edit_distance(seq_a, seq_b)/min(len(seq_a),len(seq_b))
print(f"The similarity between seq_a and seq_b is {similarity_ab}")

