In [None]:
import pandas as pd
import numpy as np
import os
import glob
import ast
import time
from Bio import Align

# Name of a value column in the input CSVs (not referenced again in the
# code visible here).
mean_type = 'num_value_am'

# Input directory and filename pattern of the per-file CSVs to be paired.
path = f"{os.getcwd()}/../refined_data_esm/"
naming_format = '*sub_seq_esm.csv'
files = glob.glob(path + '/' + naming_format)

# Threshold constant (not referenced again in the code visible here).
THRESHOLD = 0.9


import sys
import logging
import logging.handlers

def setup_logger(log_file_path):
    """Configure and return the shared ``'my_logger'`` logger.

    The logger writes DEBUG-and-above records to ``log_file_path`` and
    INFO-and-above records to ``sys.stdout``, both with the same
    timestamped format.

    The function is safe to call repeatedly (e.g. when a notebook cell is
    re-run): handlers installed by an earlier call are removed first, so
    each record is emitted exactly once per destination.

    Args:
        log_file_path: Path of the log file. Its parent directory is
            created if it does not exist yet.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    logger = logging.getLogger('my_logger')
    logger.setLevel(logging.DEBUG)

    # logging.getLogger returns the same object for the same name, so
    # handlers from a previous call are still attached. Drop them, otherwise
    # every message is duplicated once per call.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    # FileHandler does not create missing directories by itself.
    log_dir = os.path.dirname(log_file_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # File handler: full detail (DEBUG and above).
    file_handler = logging.FileHandler(log_file_path)
    file_handler.setLevel(logging.DEBUG)

    # Console handler: INFO and above, written to stdout.
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(logging.INFO)

    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    for handler in (file_handler, console_handler):
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    return logger


# Initialise the logger for this run and record the start of processing.
log_file_path = f"{os.getcwd()}/../logs/make_pairs_sim_seq.log"
logger = setup_logger(log_file_path)
logger.info('Starting make_pairs.py')




def similarity_seqs(seq1, seq2, aligner=None):
    """Return the global-alignment score of two sequences, length-normalised.

    The optimal alignment score is divided by the length of the longer
    sequence.

    Args:
        seq1: First sequence.
        seq2: Second sequence.
        aligner: Optional pre-configured aligner exposing
            ``score(seq1, seq2)``. When omitted, a default
            ``Align.PairwiseAligner`` in ``'global'`` mode is created.
            Passing one lets callers reuse a single aligner across many
            calls.

    Returns:
        ``score / max(len(seq1), len(seq2))``.

    Raises:
        ValueError: If both sequences are empty (no length to normalise by).
    """
    longest = max(len(seq1), len(seq2))
    if longest == 0:
        raise ValueError("cannot compare two empty sequences")

    if aligner is None:
        aligner = Align.PairwiseAligner()
        aligner.mode = 'global'

    # Every alignment yielded by aligner.align() has the same (optimal)
    # score, and the number of such alignments can be astronomically large.
    # Ask for the score directly instead of enumerating them.
    return aligner.score(seq1, seq2) / longest






def similarity_score_vecs(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Computed as the dot product of ``vec1`` and ``vec2`` divided by the
    product of their Euclidean norms.
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return dot_product / norm_product

# Column order of every pairs CSV written below.
_PAIR_COLUMNS = ['km_wild', 'value_diff', 'esm_wild', 'esm_diff',
                 'seq_w', 'seq_m', 'sim_vecs', 'sim_seqs']


def _build_pairs(df):
    """Return one row per ordered pair of distinct rows of ``df``.

    For rows ``w`` and ``m`` (``w != m``, compared by position; this equals
    the original label comparison for the default index ``read_csv``
    produces) the output row holds:

    * ``km_wild``    - log10 of ``num_value_gm`` of row ``w``
    * ``value_diff`` - log10(``num_value_gm`` of ``m``) minus that of ``w``
    * ``esm_wild``   - ``esm`` vector of row ``w`` (plain list)
    * ``esm_diff``   - ``esm`` of ``m`` minus ``esm`` of ``w`` (plain list)
    * ``seq_w`` / ``seq_m`` - the ``seq_str`` values of ``w`` and ``m``
    * ``sim_vecs``   - ``similarity_score_vecs`` of the two ``esm`` vectors
    * ``sim_seqs``   - placeholder ``0``, filled in by a later pass
    """
    # Convert each row once instead of once per pair.
    embeddings = [np.array(vec) for vec in df['esm']]
    log_km = [np.log10(value) for value in df['num_value_gm']]
    sequences = list(df['seq_str'])

    records = []
    n_rows = len(df)
    for w in range(n_rows):
        for m in range(n_rows):
            if w == m:
                continue
            records.append({
                'km_wild': log_km[w],
                'value_diff': log_km[m] - log_km[w],
                # .tolist() stores plain Python floats, so the CSV cell
                # stays parseable with ast.literal_eval on any NumPy version.
                'esm_wild': embeddings[w].tolist(),
                'esm_diff': (embeddings[m] - embeddings[w]).tolist(),
                'seq_w': sequences[w],
                'seq_m': sequences[m],
                'sim_vecs': similarity_score_vecs(embeddings[w], embeddings[m]),
                'sim_seqs': 0,
            })
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(records, columns=_PAIR_COLUMNS)


i = 0
start = time.time()
sim_s = []
sim_v = []
delta_km = []

# Make sure the output directory exists before the first write.
pairs_dir = os.getcwd() + "/../paired_data_sim_seqs/"
os.makedirs(pairs_dir, exist_ok=True)

# Report the number of input files.
print(f"Number of files: {len(files)}")
logger.info(f"Number of files: {len(files)}")
num_pairs = 0
saved = 0
for file in files:
    # The EC number is the first '_'-separated token of the file name; the
    # substrate id is the fourth token from the end of the path.
    ec_num = file.strip().split('/')[-1].split('_')[0]
    sub_id = file.strip().split('_')[-4]
    df = pd.read_csv(file)

    # A single row cannot form a pair; skip before the costly parsing.
    if len(df) < 2:
        continue

    df['esm'] = df['esm'].apply(ast.literal_eval)
    df['num_value_gm'] = df['num_value_gm'].astype(float)
    df['num_value_am'] = df['num_value_am'].astype(float)
    # NOTE(review): 'num_value_gm' is hard-coded in the pair computation
    # while the module-level mean_type names 'num_value_am' - confirm which
    # column is intended.

    saved += 1
    pairs = _build_pairs(df)
    num_pairs += len(pairs)
    pairs.to_csv(pairs_dir + f"{ec_num}_{sub_id}_pairs.csv")

    i += 1
    if i % 10 == 0:
        print(f"Processed {i} files and {num_pairs} pairs, saved {saved} time elapsed: {time.time()-start}")
        logger.info(f"Processed {i} files and {num_pairs} pairs, saved {saved}, time elapsed: {time.time()-start}")


print(f"Time taken: {time.time()-start}")
logger.info(f"Time taken: {time.time()-start}")
print(f"Total files: {len(files)} and saved {saved}")
logger.info(f"Total files: {len(files)} and saved {saved}")
print(f"Total pairs: {num_pairs}")
logger.info(f"Total pairs: {num_pairs}")

t1 = time.time()
# Second pass: compute the sequence similarity for every stored pair, write
# it back into each pairs file, and collect the values for the analysis.
path = os.getcwd() + '/../paired_data_sim_seqs/'
naming_format = '*_pairs.csv'
files = glob.glob(f'{path}/{naming_format}')
sim_s = []
sim_v = []
delta_km = []
j = 0
for file in files:
    # The pairs files are written with their index as the first column.
    # Reading it back as the index keeps the file layout stable; otherwise
    # every read/write cycle adds another 'Unnamed: 0' column.
    df = pd.read_csv(file, index_col=0)
    print(df.head())

    file_scores = [
        similarity_seqs(seq_w, seq_m)
        for seq_w, seq_m in zip(df['seq_w'], df['seq_m'])
    ]
    sim_s.extend(file_scores)
    sim_v.extend(df['sim_vecs'].tolist())
    delta_km.extend(df['value_diff'].tolist())

    # Replace the whole column at once: the placeholder column is integer
    # typed, and writing floats into it cell by cell is a dtype mismatch.
    df['sim_seqs'] = file_scores

    # Save the updated dataframe in place.
    df.to_csv(file)
    print(df.head())
    j += 1
    if j % 2 == 0:
        print(f"Processed {j} files and time elapsed: {time.time()-t1}")

# Report the mean of each collected quantity on stdout and in the log.
seq_mean_msg = f"Mean similarity score for seqs: {np.mean(sim_s)}"
print(seq_mean_msg)
logger.info(seq_mean_msg)
vec_mean_msg = f"Mean similarity score for vecs: {np.mean(sim_v)}"
print(vec_mean_msg)
logger.info(vec_mean_msg)
km_mean_msg = f"Mean delta km: {np.mean(delta_km)}"
print(km_mean_msg)
logger.info(km_mean_msg)

# Pearson correlation of each similarity measure with the delta km values.
from scipy.stats import pearsonr
corr, _ = pearsonr(sim_s, delta_km)
seq_km_msg = f"Pearson correlation- sim_s - del_km: {corr}"
print(seq_km_msg)
logger.info(seq_km_msg)
corr, _ = pearsonr(sim_v, delta_km)
print(f"Pearson correlation- sim_v - del_km: {corr}")
logger.info(f"Pearson correlation - sim_v - del_km: {corr}")

# Pearson correlation between the two similarity measures themselves.
corr, _ = pearsonr(sim_s, sim_v)

seq_vec_msg = f"Pearson correlation sim_s and sim_v: {corr}"
print(seq_vec_msg)
logger.info(seq_vec_msg)

# R2 score with sim_v passed as the first argument and sim_s as the second.

from sklearn.metrics import r2_score
r2 = r2_score(sim_v, sim_s)
print(f"R2 score sim_s and sim_v: {r2}")
logger.info(f"R2 score between sim_s and sim_v: {r2}")

# Persist the collected values and plot them. Everything goes to
# '<cwd>/plots/', which is created if it does not exist yet.
plots_dir = os.getcwd() + '/plots/'
os.makedirs(plots_dir, exist_ok=True)

# Save sim_s, sim_v and delta_km together in one .npz archive.
np.savez(plots_dir + 'similarity_scores.npz', sim_s = sim_s, sim_v = sim_v, delta_km = delta_km)

import matplotlib.pyplot as plt

# Each plot gets its own figure. Drawing everything through the implicit
# pyplot figure would stack the second histogram and the scatter plot on top
# of the earlier plots in the saved images. The figures are left open so a
# notebook front end can still display them.

# Histogram of the sequence similarity scores.
fig_seq, ax_seq = plt.subplots()
ax_seq.hist(sim_s, bins=20)
ax_seq.set_xlabel('Similarity score seq')
fig_seq.savefig(plots_dir + 'sim_s.png')

# Histogram of the vector similarity scores.
fig_vec, ax_vec = plt.subplots()
ax_vec.hist(sim_v, bins=20)
ax_vec.set_xlabel('Similarity score vec')
fig_vec.savefig(plots_dir + 'sim_v.png')

# Scatter plot of sequence similarity against vector similarity.
fig_scatter, ax_scatter = plt.subplots()
ax_scatter.scatter(sim_s, sim_v)
ax_scatter.set_xlabel('Similarity score seq')
ax_scatter.set_ylabel('Similarity score vec')
ax_scatter.set_title('Similarity score seq vs vec')
fig_scatter.savefig(plots_dir + 'sim_s_vs_sim_v.png')


2024-06-21 17:08:59,702 - my_logger - INFO - Starting make_pairs.py
2024-06-21 17:08:59,702 - my_logger - INFO - Starting make_pairs.py
Number of files: 3596
2024-06-21 17:08:59,706 - my_logger - INFO - Number of files: 3596
2024-06-21 17:08:59,706 - my_logger - INFO - Number of files: 3596
Processed 10 files and 1092 pairs, saved 10 time elapsed: 7.868977069854736
2024-06-21 17:09:07,575 - my_logger - INFO - Processed 10 files and 1092 pairs, saved 10, time elapsed: 7.869185209274292
2024-06-21 17:09:07,575 - my_logger - INFO - Processed 10 files and 1092 pairs, saved 10, time elapsed: 7.869185209274292
Processed 20 files and 2006 pairs, saved 20 time elapsed: 14.733084678649902
2024-06-21 17:09:14,439 - my_logger - INFO - Processed 20 files and 2006 pairs, saved 20, time elapsed: 14.733198165893555
2024-06-21 17:09:14,439 - my_logger - INFO - Processed 20 files and 2006 pairs, saved 20, time elapsed: 14.733198165893555
Processed 30 files and 2120 pairs, saved 30 time elapsed: 16.1572

2024-06-21 17:10:19,502 - my_logger - INFO - Processed 250 files and 9754 pairs, saved 250, time elapsed: 79.79593181610107
Processed 260 files and 11614 pairs, saved 260 time elapsed: 92.55181646347046
2024-06-21 17:10:32,258 - my_logger - INFO - Processed 260 files and 11614 pairs, saved 260, time elapsed: 92.55191111564636
2024-06-21 17:10:32,258 - my_logger - INFO - Processed 260 files and 11614 pairs, saved 260, time elapsed: 92.55191111564636
Processed 270 files and 12058 pairs, saved 270 time elapsed: 96.01952767372131
2024-06-21 17:10:35,725 - my_logger - INFO - Processed 270 files and 12058 pairs, saved 270, time elapsed: 96.01962375640869
2024-06-21 17:10:35,725 - my_logger - INFO - Processed 270 files and 12058 pairs, saved 270, time elapsed: 96.01962375640869
Processed 280 files and 12166 pairs, saved 280 time elapsed: 97.2710292339325
2024-06-21 17:10:36,977 - my_logger - INFO - Processed 280 files and 12166 pairs, saved 280, time elapsed: 97.2711284160614
2024-06-21 17:10

2024-06-21 17:11:22,185 - my_logger - INFO - Processed 500 files and 16874 pairs, saved 500, time elapsed: 142.47890067100525
Processed 510 files and 16922 pairs, saved 510 time elapsed: 143.23872137069702
2024-06-21 17:11:22,945 - my_logger - INFO - Processed 510 files and 16922 pairs, saved 510, time elapsed: 143.23879599571228
2024-06-21 17:11:22,945 - my_logger - INFO - Processed 510 files and 16922 pairs, saved 510, time elapsed: 143.23879599571228
Processed 520 files and 17154 pairs, saved 520 time elapsed: 145.42062973976135
2024-06-21 17:11:25,127 - my_logger - INFO - Processed 520 files and 17154 pairs, saved 520, time elapsed: 145.42072772979736
2024-06-21 17:11:25,127 - my_logger - INFO - Processed 520 files and 17154 pairs, saved 520, time elapsed: 145.42072772979736
Processed 530 files and 17256 pairs, saved 530 time elapsed: 146.81376361846924
2024-06-21 17:11:26,520 - my_logger - INFO - Processed 530 files and 17256 pairs, saved 530, time elapsed: 146.81385111808777
2024

2024-06-21 17:12:13,735 - my_logger - INFO - Processed 750 files and 22436 pairs, saved 750, time elapsed: 194.02867364883423
Processed 760 files and 22518 pairs, saved 760 time elapsed: 195.10401034355164
2024-06-21 17:12:14,810 - my_logger - INFO - Processed 760 files and 22518 pairs, saved 760, time elapsed: 195.1041030883789
2024-06-21 17:12:14,810 - my_logger - INFO - Processed 760 files and 22518 pairs, saved 760, time elapsed: 195.1041030883789
Processed 770 files and 22680 pairs, saved 770 time elapsed: 196.70047521591187
2024-06-21 17:12:16,406 - my_logger - INFO - Processed 770 files and 22680 pairs, saved 770, time elapsed: 196.70055198669434
2024-06-21 17:12:16,406 - my_logger - INFO - Processed 770 files and 22680 pairs, saved 770, time elapsed: 196.70055198669434
Processed 780 files and 22862 pairs, saved 780 time elapsed: 198.45581936836243
2024-06-21 17:12:18,162 - my_logger - INFO - Processed 780 files and 22862 pairs, saved 780, time elapsed: 198.45591282844543
2024-0

2024-06-21 17:13:00,256 - my_logger - INFO - Processed 1000 files and 27242 pairs, saved 1000, time elapsed: 240.55058979988098
Processed 1010 files and 27274 pairs, saved 1010 time elapsed: 241.2207202911377
2024-06-21 17:13:00,927 - my_logger - INFO - Processed 1010 files and 27274 pairs, saved 1010, time elapsed: 241.22079181671143
2024-06-21 17:13:00,927 - my_logger - INFO - Processed 1010 files and 27274 pairs, saved 1010, time elapsed: 241.22079181671143
Processed 1020 files and 27584 pairs, saved 1020 time elapsed: 243.80262351036072
2024-06-21 17:13:03,509 - my_logger - INFO - Processed 1020 files and 27584 pairs, saved 1020, time elapsed: 243.80269813537598
2024-06-21 17:13:03,509 - my_logger - INFO - Processed 1020 files and 27584 pairs, saved 1020, time elapsed: 243.80269813537598
Processed 1030 files and 27802 pairs, saved 1030 time elapsed: 246.37644743919373
2024-06-21 17:13:06,082 - my_logger - INFO - Processed 1030 files and 27802 pairs, saved 1030, time elapsed: 246.37

Processed 1250 files and 36974 pairs, saved 1250 time elapsed: 318.39243364334106
2024-06-21 17:14:18,098 - my_logger - INFO - Processed 1250 files and 36974 pairs, saved 1250, time elapsed: 318.39251685142517
2024-06-21 17:14:18,098 - my_logger - INFO - Processed 1250 files and 36974 pairs, saved 1250, time elapsed: 318.39251685142517
Processed 1260 files and 37158 pairs, saved 1260 time elapsed: 320.2926199436188
2024-06-21 17:14:19,999 - my_logger - INFO - Processed 1260 files and 37158 pairs, saved 1260, time elapsed: 320.29269003868103
2024-06-21 17:14:19,999 - my_logger - INFO - Processed 1260 files and 37158 pairs, saved 1260, time elapsed: 320.29269003868103
Processed 1270 files and 38348 pairs, saved 1270 time elapsed: 328.5082018375397
2024-06-21 17:14:28,214 - my_logger - INFO - Processed 1270 files and 38348 pairs, saved 1270, time elapsed: 328.5082938671112
2024-06-21 17:14:28,214 - my_logger - INFO - Processed 1270 files and 38348 pairs, saved 1270, time elapsed: 328.5082

2024-06-21 17:15:40,241 - my_logger - INFO - Processed 1490 files and 47456 pairs, saved 1490, time elapsed: 400.53466987609863
Processed 1500 files and 47582 pairs, saved 1500 time elapsed: 401.95777130126953
2024-06-21 17:15:41,664 - my_logger - INFO - Processed 1500 files and 47582 pairs, saved 1500, time elapsed: 401.9578421115875
2024-06-21 17:15:41,664 - my_logger - INFO - Processed 1500 files and 47582 pairs, saved 1500, time elapsed: 401.9578421115875
Processed 1510 files and 48330 pairs, saved 1510 time elapsed: 407.62808775901794
2024-06-21 17:15:47,334 - my_logger - INFO - Processed 1510 files and 48330 pairs, saved 1510, time elapsed: 407.6281831264496
2024-06-21 17:15:47,334 - my_logger - INFO - Processed 1510 files and 48330 pairs, saved 1510, time elapsed: 407.6281831264496
Processed 1520 files and 48488 pairs, saved 1520 time elapsed: 409.19547963142395
2024-06-21 17:15:48,901 - my_logger - INFO - Processed 1520 files and 48488 pairs, saved 1520, time elapsed: 409.19555

Processed 1740 files and 57090 pairs, saved 1740 time elapsed: 479.77386474609375
2024-06-21 17:16:59,480 - my_logger - INFO - Processed 1740 files and 57090 pairs, saved 1740, time elapsed: 479.773962020874
2024-06-21 17:16:59,480 - my_logger - INFO - Processed 1740 files and 57090 pairs, saved 1740, time elapsed: 479.773962020874
Processed 1750 files and 57130 pairs, saved 1750 time elapsed: 480.6264958381653
2024-06-21 17:17:00,332 - my_logger - INFO - Processed 1750 files and 57130 pairs, saved 1750, time elapsed: 480.6265754699707
2024-06-21 17:17:00,332 - my_logger - INFO - Processed 1750 files and 57130 pairs, saved 1750, time elapsed: 480.6265754699707
Processed 1760 files and 57656 pairs, saved 1760 time elapsed: 484.9000155925751
2024-06-21 17:17:04,606 - my_logger - INFO - Processed 1760 files and 57656 pairs, saved 1760, time elapsed: 484.90010380744934
2024-06-21 17:17:04,606 - my_logger - INFO - Processed 1760 files and 57656 pairs, saved 1760, time elapsed: 484.900103807

2024-06-21 17:17:33,996 - my_logger - INFO - Processed 1980 files and 59894 pairs, saved 1980, time elapsed: 514.2898316383362
Processed 1990 files and 61594 pairs, saved 1990 time elapsed: 525.5648601055145
2024-06-21 17:17:45,271 - my_logger - INFO - Processed 1990 files and 61594 pairs, saved 1990, time elapsed: 525.5649516582489
2024-06-21 17:17:45,271 - my_logger - INFO - Processed 1990 files and 61594 pairs, saved 1990, time elapsed: 525.5649516582489
Processed 2000 files and 64620 pairs, saved 2000 time elapsed: 545.991893529892
2024-06-21 17:18:05,698 - my_logger - INFO - Processed 2000 files and 64620 pairs, saved 2000, time elapsed: 545.9919829368591
2024-06-21 17:18:05,698 - my_logger - INFO - Processed 2000 files and 64620 pairs, saved 2000, time elapsed: 545.9919829368591
Processed 2010 files and 67608 pairs, saved 2010 time elapsed: 566.1111073493958
2024-06-21 17:18:25,817 - my_logger - INFO - Processed 2010 files and 67608 pairs, saved 2010, time elapsed: 566.1113026142

2024-06-21 17:20:12,020 - my_logger - INFO - Processed 2230 files and 81518 pairs, saved 2230, time elapsed: 672.3143846988678
Processed 2240 files and 82048 pairs, saved 2240 time elapsed: 676.5395910739899
2024-06-21 17:20:16,245 - my_logger - INFO - Processed 2240 files and 82048 pairs, saved 2240, time elapsed: 676.5396635532379
2024-06-21 17:20:16,245 - my_logger - INFO - Processed 2240 files and 82048 pairs, saved 2240, time elapsed: 676.5396635532379
Processed 2250 files and 82104 pairs, saved 2250 time elapsed: 677.5778386592865
2024-06-21 17:20:17,284 - my_logger - INFO - Processed 2250 files and 82104 pairs, saved 2250, time elapsed: 677.5779201984406
2024-06-21 17:20:17,284 - my_logger - INFO - Processed 2250 files and 82104 pairs, saved 2250, time elapsed: 677.5779201984406
Processed 2260 files and 82196 pairs, saved 2260 time elapsed: 678.7083773612976
2024-06-21 17:20:18,414 - my_logger - INFO - Processed 2260 files and 82196 pairs, saved 2260, time elapsed: 678.708454370

2024-06-21 17:21:31,485 - my_logger - INFO - Processed 2480 files and 91270 pairs, saved 2480, time elapsed: 751.7787766456604
Processed 2490 files and 91318 pairs, saved 2490 time elapsed: 753.3951275348663
2024-06-21 17:21:33,101 - my_logger - INFO - Processed 2490 files and 91318 pairs, saved 2490, time elapsed: 753.3951959609985
2024-06-21 17:21:33,101 - my_logger - INFO - Processed 2490 files and 91318 pairs, saved 2490, time elapsed: 753.3951959609985
Processed 2500 files and 91352 pairs, saved 2500 time elapsed: 754.1270201206207
2024-06-21 17:21:33,833 - my_logger - INFO - Processed 2500 files and 91352 pairs, saved 2500, time elapsed: 754.1270925998688
2024-06-21 17:21:33,833 - my_logger - INFO - Processed 2500 files and 91352 pairs, saved 2500, time elapsed: 754.1270925998688
Processed 2510 files and 91784 pairs, saved 2510 time elapsed: 757.5021476745605
2024-06-21 17:21:37,208 - my_logger - INFO - Processed 2510 files and 91784 pairs, saved 2510, time elapsed: 757.502233982