<a href="https://colab.research.google.com/github/vihan-lakshman/mutagenic/blob/main/masking_pipeline_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Evaluation Dataset

In [4]:
!pip install Bio

Collecting Bio
  Downloading bio-1.7.1-py3-none-any.whl.metadata (5.7 kB)
Collecting biopython>=1.80 (from Bio)
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl.metadata (11 kB)
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.4.1-py3-none-any.whl.metadata (10 kB)
Downloading bio-1.7.1-py3-none-any.whl (280 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.0/281.0 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m65.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gprofiler_official-1.0.0-py3-none-any.whl (9.3

In [1]:
import torch
import pandas as pd
import os
import random
from sklearn.metrics.pairwise import cosine_similarity
#df = pd.read_csv('/content/prelim-deletion-validation-dataset-functional-annotations-with-interpro.csv')

In [2]:
if os.path.exists("/content/merged_data_noduplicates_new.csv"):
  merged_df = pd.read_csv('/content/merged_data_noduplicates_new.csv')
else:
  esm3_test_df = pd.read_csv('/content/selectedrcsbpdb.tsv', sep='\t', header = None)

  # Drop the last two columns
  esm3_test_df = esm3_test_df.iloc[:, :-2]
  esm3_test_df.columns = ["Protein Accession", "Seq Digest", "Seq Length", "Analysis", "Signature Accession", "Signature description", "Start location", "Stop location", "Score", "Status", "Date", "InterPro accession", "InterPro description"]

  merged_df = esm3_test_df.groupby('Protein Accession').agg({
      'Seq Length': 'first',
      'InterPro accession': lambda x: ';'.join(x[x != '-'].unique()),
      'InterPro description': lambda x: ';'.join(x[x != '-'].unique())
  }).reset_index()

  merged_df = merged_df.rename(columns={"InterPro accession": "InterPro_original", "InterPro description": "InterPro_original_description"})


interpro_go_df = pd.read_csv('/content/InterProDescriptions.tsv', sep='\t')

# Initialize an empty string to store the accessions
list_of_membrane_terms = ""

# Iterate through the DataFrame rows
for index, row in interpro_go_df.iterrows():
    go_terms = row['GO Terms']
    if isinstance(go_terms, str) and "GO:0016020" in go_terms:  # Check if the value is a string and contains "GO:0016020"
        accession = row['Accession']
        if isinstance(accession, str):  # Check if the accession is a string
          if list_of_membrane_terms == "":
            list_of_membrane_terms = accession
          else:
            list_of_membrane_terms += ';' + accession

# Add the new column
merged_df['InterPro_target'] = [list_of_membrane_terms] * len(merged_df)
df = merged_df
df

Unnamed: 0,Protein Accession,Seq Length,InterPro_original,InterPro_original_description,InterPro_target
0,5R1U_1|Chain,419,IPR034163;IPR021109;IPR001461;IPR001969;IPR033121,Aspergillopepsin-like catalytic domain;Asparti...,IPR000015;IPR000018;IPR000021;IPR000025;IPR000...
1,6KBQ_1|Chain,373,,,IPR000015;IPR000018;IPR000021;IPR000025;IPR000...
2,6KFD_1|Chain,162,IPR035992,Ricin B-like lectins,IPR000015;IPR000018;IPR000021;IPR000025;IPR000...
3,6L9T_1|Chain,595,IPR019791;IPR037120;IPR010255,"Haem peroxidase, animal-type;Haem peroxidase d...",IPR000015;IPR000018;IPR000021;IPR000025;IPR000...
4,6LM2_1|Chain,184,IPR012674;IPR015304,Calycin;ZinT domain,IPR000015;IPR000018;IPR000021;IPR000025;IPR000...
...,...,...,...,...,...
60,8WDG_1|Chain,385,IPR001998;IPR036237;IPR013022;IPR013453,Xylose isomerase;Xylose isomerase-like superfa...,IPR000015;IPR000018;IPR000021;IPR000025;IPR000...
61,8YJJ_1|Chain,190,IPR001137;IPR013319;IPR013320;IPR018208;IPR033...,Glycoside hydrolase family 11;Glycoside hydrol...,IPR000015;IPR000018;IPR000021;IPR000025;IPR000...
62,9C6E_1|Chain,365,IPR014756;IPR000698;IPR011022;IPR017864;IPR014...,Immunoglobulin E-set;Arrestin;Arrestin C-termi...,IPR000015;IPR000018;IPR000021;IPR000025;IPR000...
63,9EAT_1|Chain,220,IPR015892;IPR001765;IPR036874,"Carbonic anhydrase, prokaryotic-like, conserve...",IPR000015;IPR000018;IPR000021;IPR000025;IPR000...


In [5]:
from Bio import SeqIO
# Read FASTA file into a dictionary
def parse_fasta(fasta_file):
    fasta_dict = {}
    for record in SeqIO.parse(fasta_file, "fasta"):
        accession = record.id.split("|")[0]  # Extract only the part before the first "|"
        fasta_dict[accession] = str(record.seq)
    return fasta_dict

# Update DataFrame with sequences
def update_dataframe_with_sequences(df, fasta_dict):
    # Preprocess 'Protein Accession' column
    df["Protein Accession"] = df["Protein Accession"].str.split("|").str[0]
    df["Sequence"] = df["Protein Accession"].map(fasta_dict)
    return df

# Example usage
fasta_file = "/content/selectedrcsbpdb.fasta"
fasta_dict = parse_fasta(fasta_file)
df = update_dataframe_with_sequences(df, fasta_dict)

df

Unnamed: 0,Protein Accession,Seq Length,InterPro_original,InterPro_original_description,InterPro_target,Sequence
0,5R1U_1,419,IPR034163;IPR021109;IPR001461;IPR001969;IPR033121,Aspergillopepsin-like catalytic domain;Asparti...,IPR000015;IPR000018;IPR000021;IPR000025;IPR000...,MSSPLKNALVTAMLAGGALSSPTKQHVGIPVNASPEVGPGKYSFKQ...
1,6KBQ_1,373,,,IPR000015;IPR000018;IPR000021;IPR000025;IPR000...,MVLLVIGLPLVSLVVALVAAAAPDSQVCDVDSTATCKITATPSQFQ...
2,6KFD_1,162,IPR035992,Ricin B-like lectins,IPR000015;IPR000018;IPR000021;IPR000025;IPR000...,LTCDQLPKAAINPIQEFIDSNPLEFEYVLTETFECTTRIYVQPARW...
3,6L9T_1,595,IPR019791;IPR037120;IPR010255,"Haem peroxidase, animal-type;Haem peroxidase d...",IPR000015;IPR000018;IPR000021;IPR000025;IPR000...,SWEVGCGAPVPLVKCDENSPYRTITGDCNNRRSPALGAANRALARW...
4,6LM2_1,184,IPR012674;IPR015304,Calycin;ZinT domain,IPR000015;IPR000018;IPR000021;IPR000025;IPR000...,ALTEVEQKAANGVFDDANVQNRTLSDWDGVWQSVYPLLQSGKLDPV...
...,...,...,...,...,...,...
60,8WDG_1,385,IPR001998;IPR036237;IPR013022;IPR013453,Xylose isomerase;Xylose isomerase-like superfa...,IPR000015;IPR000018;IPR000021;IPR000025;IPR000...,YQPTPEDRFTFGLWTVGWQGRDPFGDATRRALDPVESVRRLAELGA...
61,8YJJ_1,190,IPR001137;IPR013319;IPR013320;IPR018208;IPR033...,Glycoside hydrolase family 11;Glycoside hydrol...,IPR000015;IPR000018;IPR000021;IPR000025;IPR000...,QTIQPGTGYNNGYFYSYWNDGHGGVTYTNGPGGQFSVNWSNSGNFV...
62,9C6E_1,365,IPR014756;IPR000698;IPR011022;IPR017864;IPR014...,Immunoglobulin E-set;Arrestin;Arrestin C-termi...,IPR000015;IPR000018;IPR000021;IPR000025;IPR000...,ANKPAPNHVIFKKISRDKSVTIYLGKRDYIDHVERVEPVDGVVLVD...
63,9EAT_1,220,IPR015892;IPR001765;IPR036874,"Carbonic anhydrase, prokaryotic-like, conserve...",IPR000015;IPR000018;IPR000021;IPR000025;IPR000...,MKDIDTLISNNALWSKMLVEEDPGFFEKLAQAQKPRFLWIGCSDSR...


In [6]:
# Dictionary to hold the results
embeddings_dict = {}

# Iterate through each row in the DataFrame
for _, row in df.iterrows():
    entry = row['Protein Accession']
    interpro_target = row['InterPro_target']
    interpro_original = row['InterPro_original']

    # Skip rows where 'Interpro' is None
    if pd.isna(interpro_target) or not interpro_target.strip() or pd.isna(interpro_original) or not interpro_target.strip():
        continue

    # Split the InterPro IDs by semicolons
    interpro_ids_target = interpro_target.split(';')
    interpro_ids_target = interpro_ids_target[:-1]
    interpro_ids_original = interpro_original.split(';')
    interpro_ids_original = interpro_ids_original[:-1]

    # Initialize entry in the dictionary if not present
    if entry not in embeddings_dict:
        embeddings_dict[entry] = {
            'InterPro_ids_target': interpro_ids_target,
            'InterPro_ids_original': interpro_ids_original
        }


In [7]:
first_key, first_value = next(iter(embeddings_dict.items()))
print(first_key, first_value['InterPro_ids_original'])

5R1U_1 ['IPR034163', 'IPR021109', 'IPR001461', 'IPR001969']


In [8]:
pip install esm

Collecting esm
  Downloading esm-3.1.3-py3-none-any.whl.metadata (15 kB)
Collecting torchtext (from esm)
  Downloading torchtext-0.18.0-cp311-cp311-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting transformers<4.47.0 (from esm)
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting biotite==0.41.2 (from esm)
  Downloading biotite-0.41.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.1 kB)
Collecting msgpack-numpy (from esm)
  Downloading msgpack_numpy-0.4.8-py2.py3-none-any.whl.metadata (5.0 kB)
Collecting brotli (from esm)
  Downloading Brotli-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting zstd (from esm)
  Downloading zstd-1.5.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.2.0->e

In [9]:
from huggingface_hub import login
from esm.models.esm3 import ESM3
from esm.sdk.api import ESM3InferenceClient, ESMProtein, GenerationConfig

# Will instruct you how to get an API key from huggingface hub, make one with "Read" permission.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
model: ESM3InferenceClient = ESM3.from_pretrained("esm3_sm_open_v1").to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 22 files:   0%|          | 0/22 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

data/entry_list_safety_29026.list:   0%|          | 0.00/1.60M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.06k [00:00<?, ?B/s]

data/esm3_entry.list:   0%|          | 0.00/1.93M [00:00<?, ?B/s]

data/ParentChildTreeFile.txt:   0%|          | 0.00/595k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.00 [00:00<?, ?B/s]

data/1utn.pdb:   0%|          | 0.00/569k [00:00<?, ?B/s]

data/interpro2keywords.csv:   0%|          | 0.00/7.32M [00:00<?, ?B/s]

(…)ata/interpro_29026_to_keywords_58641.csv:   0%|          | 0.00/10.1M [00:00<?, ?B/s]

hyperplanes_8bit_68103.npz:   0%|          | 0.00/34.9M [00:00<?, ?B/s]

hyperplanes_8bit_58641.npz:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

data/keywords.txt:   0%|          | 0.00/788k [00:00<?, ?B/s]

data/tag_dict_4.json:   0%|          | 0.00/691k [00:00<?, ?B/s]

(…)ord_vocabulary_safety_filtered_58641.txt:   0%|          | 0.00/788k [00:00<?, ?B/s]

keyword_idf_safety_filtered_58641.npy:   0%|          | 0.00/469k [00:00<?, ?B/s]

data/tag_dict_4_safety_filtered.json:   0%|          | 0.00/569k [00:00<?, ?B/s]

(…)0_residue_annotations_gt_1k_proteins.csv:   0%|          | 0.00/109k [00:00<?, ?B/s]

tfidf_safety_filtered_58641.pkl:   0%|          | 0.00/2.02M [00:00<?, ?B/s]

esm3_function_decoder_v0.pth:   0%|          | 0.00/1.30G [00:00<?, ?B/s]

esm3_structure_decoder_v0.pth:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

esm3_structure_encoder_v0.pth:   0%|          | 0.00/62.3M [00:00<?, ?B/s]

esm3_sm_open_v1.pth:   0%|          | 0.00/2.80G [00:00<?, ?B/s]

  state_dict = torch.load(


In [11]:
import torch.nn as nn
from esm.tokenization import InterProQuantizedTokenizer
from esm.utils.types import FunctionAnnotation
def get_keywords_from_interpro(
    interpro_annotations,
    interpro2keywords=InterProQuantizedTokenizer().interpro2keywords,
):
    keyword_annotations_list = []
    for interpro_annotation in interpro_annotations:
        keywords = interpro2keywords.get(interpro_annotation.label, [])
        keyword_annotations_list.extend([
            FunctionAnnotation(
                label=keyword,
                start=interpro_annotation.start,
                end=interpro_annotation.end,
            )
            for keyword in keywords
        ])
    return keyword_annotations_list

In [12]:
#protein that only has one function?
#longer sequences of all As, all Gs, or completely random
def get_label_embedding(interpro_label,sequence):
  hostProtein = ESMProtein(sequence=sequence)
  embedding_function = model.encoder.function_embed
  hostProtein.function_annotations = get_keywords_from_interpro([FunctionAnnotation(label=interpro_label, start=1, end=len(sequence))])
  hostProtein_tensor = model.encode(hostProtein)
  device = hostProtein_tensor.function.device  # Get the device of protein2_tensor.function
  embedding_function = embedding_function.to(device)  # Move embedding_function to the device

  function_embed = torch.cat(
      [
          embed_fn(funcs.to(device)) # Ensure funcs is on the same device
          for embed_fn, funcs in zip(
              embedding_function, hostProtein_tensor.function.unbind(-1)
          )
      ],
      -1,
  )

  if function_embed.shape[0] >= 3:
      row_sum = function_embed.sum(dim=0)  # Sum all rows
      row_avg = row_sum / (function_embed.shape[0] - 2)  # Divide by (number of rows - 2)
      row_avg_np = row_avg.cpu().detach().type(torch.float32).numpy()
      return row_avg_np
  else:
      return None

In [13]:
import numpy as np
embedding_target_dict = {}  # Store the average embedding_target for each set of InterPro_ids_target

for entry, interpro_ids in embeddings_dict.items():
    # Check if the embedding_target for this set has already been calculated
    if tuple(interpro_ids['InterPro_ids_target']) not in embedding_target_dict:
        embeddings_target = []
        for interpro_id in interpro_ids['InterPro_ids_target']:
            embeddings_target.append(get_label_embedding(interpro_id, "A"))
        avg_embedding_target = np.mean(embeddings_target, axis=0)
        embedding_target_dict[tuple(interpro_ids['InterPro_ids_target'])] = avg_embedding_target
    else:
        avg_embedding_target = embedding_target_dict[tuple(interpro_ids['InterPro_ids_target'])]

    embeddings_dict[entry]['embedding_target'] = avg_embedding_target

    # Calculate embedding_original as before
    embeddings_original = []
    for interpro_id in interpro_ids['InterPro_ids_original']:
        embeddings_original.append(get_label_embedding(interpro_id, "A"))
    avg_embedding_original = np.mean(embeddings_original, axis=0)
    embeddings_dict[entry]['embedding_original'] = avg_embedding_original

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [14]:
first_key, first_value = next(iter(embeddings_dict.items()))
print(first_key, first_value)

5R1U_1 {'InterPro_ids_target': ['IPR000015', 'IPR000018', 'IPR000021', 'IPR000025', 'IPR000044', 'IPR000045', 'IPR000046', 'IPR000057', 'IPR000060', 'IPR000066', 'IPR000068', 'IPR000072', 'IPR000076', 'IPR000105', 'IPR000109', 'IPR000112', 'IPR000133', 'IPR000136', 'IPR000141', 'IPR000142', 'IPR000144', 'IPR000147', 'IPR000154', 'IPR000155', 'IPR000161', 'IPR000162', 'IPR000163', 'IPR000174', 'IPR000175', 'IPR000189', 'IPR000190', 'IPR000202', 'IPR000204', 'IPR000207', 'IPR000223', 'IPR000248', 'IPR000265', 'IPR000272', 'IPR000276', 'IPR000283', 'IPR000292', 'IPR000293', 'IPR000298', 'IPR000301', 'IPR000314', 'IPR000321', 'IPR000332', 'IPR000333', 'IPR000337', 'IPR000344', 'IPR000351', 'IPR000353', 'IPR000355', 'IPR000356', 'IPR000363', 'IPR000366', 'IPR000369', 'IPR000370', 'IPR000371', 'IPR000374', 'IPR000376', 'IPR000378', 'IPR000382', 'IPR000388', 'IPR000390', 'IPR000393', 'IPR000405', 'IPR000412', 'IPR000425', 'IPR000431', 'IPR000434', 'IPR000452', 'IPR000460', 'IPR000462', 'IPR00

In [16]:
def embedding_masking_model(
    prompt,
    model,
    df,
    embeddings_dict,
    percentage=10,
):
    """
    Helper function to process a protein sequence, calculate similarities,
    and return indices for masking.

    Args:
        prompt (str): The protein sequence to be processed.
        model: The model used for protein generation and embeddings.
        df (pd.DataFrame): DataFrame containing protein data.
        embeddings_dict (dict): Dictionary storing embeddings and other details.

    Returns:
        List[int]: Indices used for masking in the sequence.
    """
    # Create an ESMProtein object
    protein = ESMProtein(sequence=prompt)

    # Configure the model for generation
    generation_config = GenerationConfig(track="function", num_steps=8)

    # Generate the protein
    generated_protein = model.generate(protein, generation_config)

    # Check if function annotations are available
    entry = df.loc[df['Sequence'] == prompt, 'Protein Accession'].iloc[0]
    if generated_protein.function_annotations is None:
        embeddings_dict[entry]['hamming_distance'] = None
        return []

    # Getting embedding for the protein
    protein_tensor = model.encode(generated_protein)
    embedding_function = model.encoder.function_embed
    device = protein_tensor.function.device  # Get the device of protein_tensor.function
    embedding_function = embedding_function.to(device)  # Move embedding_function to the device

    function_embed = torch.cat(
        [
            embed_fn(funcs.to(device))  # Ensure funcs is on the same device
            for embed_fn, funcs in zip(
                embedding_function, protein_tensor.function.unbind(-1)
            )
        ],
        -1,
    )

    # Exclude start and end tokens
    function_embed = function_embed[1:-1, :]

    # Convert the protein_tensor.function to a NumPy array
    protein_np = function_embed.cpu().detach().type(torch.float32).numpy()

    # Retrieve embedding for target function's InterPro IDs (target function = function we're trying to add in addition to, not in place of, original)
    embedding_target = embeddings_dict[entry]['embedding_target']

    # Retrieve embedding for original function's InterPro IDs
    embedding_original = embeddings_dict[entry]['embedding_original']

    # Calculate cosine similarity
    similarities_target = cosine_similarity(protein_np, embedding_target.reshape(1, -1))
    similarities_original = cosine_similarity(protein_np, embedding_original.reshape(1, -1))

    # Get ranks for similarity from highest similarity to lowest similarity
    ranks_target = np.argsort(similarities_target.flatten())[::-1]
    ranks_original = np.argsort(similarities_original.flatten())[::-1]

    #Get number of indices to mask
    num_indices = int(len(prompt) * percentage / 100)

    # Ensure we select at least 1 index
    num_indices = max(1, num_indices)

    # Sum ranks and find top 10 indices with lowest (largest) rank
    ranks = ranks_target + ranks_original
    num_negated = -1*num_indices
    indices = np.argsort(ranks)[num_negated:]

    # Store the indices in the embeddings_dict
    embeddings_dict[entry]['indices'] = indices.tolist()

    return indices.tolist()

In [17]:
def get_random_indices(prompt, percentage):
    """
    Randomly select indices to mask based on the percentage of the prompt's length.
    """
    num_indices = int(len(prompt) * percentage / 100)
    # Ensure we select at least one index
    num_indices = max(1, num_indices)

    # Randomly select unique indices to mask
    return random.sample(range(len(prompt)), num_indices)

In [18]:
for idx, prompt in enumerate(df['Sequence']):
  try:
    random_indices = get_random_indices(prompt, 8)
    # Replace locations in prompt corresponding to indices in 'indices' with "_"
    target_seq = df.loc[df['Sequence'] == prompt, 'Sequence'].iloc[0]
    entry = df.loc[df['Sequence'] == prompt, 'Protein Accession'].iloc[0]
    if entry not in embeddings_dict:
        continue
    for method in ['embedding', 'random']:
      if method == 'embedding':
        indices = embedding_masking_model(prompt, model, df, embeddings_dict,percentage=10)
        if not indices:
          continue
      elif method == 'random':
        continue
        # indices = get_random_indices(prompt, 8)
      modified_prompt = list(prompt)
      for index in indices:
          modified_prompt[index] = "_"
      modified_prompt = "".join(modified_prompt)
      protein_prompt = ESMProtein(sequence=modified_prompt)
      sequence_generation = model.generate(
          protein_prompt,
          GenerationConfig(
              track="sequence",
              num_steps=protein_prompt.sequence.count("_") // 2,
              temperature=0.5,
          ),
      )
      # print("Generated Sequence: " + str(sequence_generation.sequence))
      generated_sequence = sequence_generation.sequence
      # print("Target Sequence: " + str(target_seq))
      # Ensure sequences are of equal length
      if len(generated_sequence) != len(target_seq):
          raise ValueError("Sequences must be of the same length to calculate Hamming distance.")

      # Calculate Hamming distance
      hamming_distance = sum(1 for gen, target in zip(generated_sequence, target_seq) if gen != target)

      # Print results
      print(f'Index {idx}: {method} Hamming Distance:', hamming_distance)
      embeddings_dict[entry][f'{method}_hamming_distance'] = hamming_distance
      embeddings_dict[entry][f'{method}_edited_sequence'] = generated_sequence
      torch.cuda.empty_cache()
  except Exception as e:
    print(f"An error occurred for index {idx}: {e}")
    continue

100%|██████████| 8/8 [00:09<00:00,  1.20s/it]
  state_dict = torch.load(
  state_dict = torch.load(
100%|██████████| 20/20 [00:11<00:00,  1.74it/s]


Index 0: embedding Hamming Distance: 25


100%|██████████| 8/8 [00:01<00:00,  4.48it/s]
100%|██████████| 8/8 [00:06<00:00,  1.18it/s]


An error occurred for index 3: CUDA out of memory. Tried to allocate 150.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 62.12 MiB is free. Process 14942 has 14.68 GiB memory in use. Of the allocated memory 13.58 GiB is allocated by PyTorch, and 998.21 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


100%|██████████| 8/8 [00:01<00:00,  4.24it/s]
100%|██████████| 9/9 [00:02<00:00,  4.05it/s]


Index 4: embedding Hamming Distance: 14


100%|██████████| 8/8 [00:03<00:00,  2.62it/s]
100%|██████████| 8/8 [00:03<00:00,  2.50it/s]
100%|██████████| 15/15 [00:06<00:00,  2.49it/s]


Index 6: embedding Hamming Distance: 6


100%|██████████| 8/8 [00:03<00:00,  2.55it/s]
100%|██████████| 14/14 [00:05<00:00,  2.53it/s]


Index 7: embedding Hamming Distance: 18


100%|██████████| 8/8 [00:01<00:00,  4.54it/s]
100%|██████████| 6/6 [00:01<00:00,  4.52it/s]


Index 8: embedding Hamming Distance: 7


100%|██████████| 8/8 [00:01<00:00,  4.20it/s]
100%|██████████| 8/8 [00:02<00:00,  3.91it/s]


Index 9: embedding Hamming Distance: 7


100%|██████████| 8/8 [00:02<00:00,  3.21it/s]
100%|██████████| 10/10 [00:03<00:00,  3.20it/s]


Index 10: embedding Hamming Distance: 11


100%|██████████| 8/8 [00:02<00:00,  3.02it/s]
100%|██████████| 12/12 [00:04<00:00,  2.99it/s]


Index 11: embedding Hamming Distance: 20


100%|██████████| 8/8 [00:03<00:00,  2.56it/s]
100%|██████████| 13/13 [00:05<00:00,  2.54it/s]


Index 12: embedding Hamming Distance: 14


100%|██████████| 8/8 [00:02<00:00,  2.92it/s]
100%|██████████| 12/12 [00:04<00:00,  2.93it/s]


Index 13: embedding Hamming Distance: 7


100%|██████████| 8/8 [00:06<00:00,  1.19it/s]


An error occurred for index 14: CUDA out of memory. Tried to allocate 140.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 100.12 MiB is free. Process 14942 has 14.64 GiB memory in use. Of the allocated memory 13.60 GiB is allocated by PyTorch, and 938.94 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


100%|██████████| 8/8 [00:04<00:00,  1.98it/s]
100%|██████████| 16/16 [00:08<00:00,  1.96it/s]


Index 15: embedding Hamming Distance: 29


100%|██████████| 8/8 [00:11<00:00,  1.48s/it]


An error occurred for index 16: CUDA out of memory. Tried to allocate 846.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 154.12 MiB is free. Process 14942 has 14.59 GiB memory in use. Of the allocated memory 13.37 GiB is allocated by PyTorch, and 1.10 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


100%|██████████| 8/8 [00:11<00:00,  1.48s/it]


An error occurred for index 17: CUDA out of memory. Tried to allocate 844.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 154.12 MiB is free. Process 14942 has 14.59 GiB memory in use. Of the allocated memory 13.36 GiB is allocated by PyTorch, and 1.10 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


100%|██████████| 8/8 [00:02<00:00,  2.92it/s]
100%|██████████| 11/11 [00:03<00:00,  2.93it/s]


Index 18: embedding Hamming Distance: 6


100%|██████████| 8/8 [00:04<00:00,  1.91it/s]
100%|██████████| 16/16 [00:08<00:00,  1.93it/s]


Index 19: embedding Hamming Distance: 9


100%|██████████| 8/8 [00:05<00:00,  1.34it/s]


An error occurred for index 20: CUDA out of memory. Tried to allocate 498.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 380.12 MiB is free. Process 14942 has 14.37 GiB memory in use. Of the allocated memory 13.09 GiB is allocated by PyTorch, and 1.15 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


100%|██████████| 8/8 [00:07<00:00,  1.07it/s]


An error occurred for index 21: CUDA out of memory. Tried to allocate 148.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 54.12 MiB is free. Process 14942 has 14.69 GiB memory in use. Of the allocated memory 13.56 GiB is allocated by PyTorch, and 1022.67 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


100%|██████████| 8/8 [00:06<00:00,  1.19it/s]


An error occurred for index 22: CUDA out of memory. Tried to allocate 136.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 54.12 MiB is free. Process 14942 has 14.69 GiB memory in use. Of the allocated memory 13.17 GiB is allocated by PyTorch, and 1.39 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


100%|██████████| 8/8 [00:04<00:00,  1.94it/s]
100%|██████████| 16/16 [00:08<00:00,  1.95it/s]


Index 23: embedding Hamming Distance: 14


100%|██████████| 8/8 [00:01<00:00,  4.12it/s]
100%|██████████| 7/7 [00:01<00:00,  4.13it/s]


Index 24: embedding Hamming Distance: 5


100%|██████████| 8/8 [00:04<00:00,  1.96it/s]
100%|██████████| 16/16 [00:08<00:00,  1.96it/s]


Index 25: embedding Hamming Distance: 24


100%|██████████| 8/8 [00:02<00:00,  3.83it/s]
100%|██████████| 8/8 [00:02<00:00,  2.84it/s]
100%|██████████| 12/12 [00:04<00:00,  2.82it/s]


Index 27: embedding Hamming Distance: 8


100%|██████████| 8/8 [00:03<00:00,  2.36it/s]
100%|██████████| 13/13 [00:05<00:00,  2.35it/s]


Index 28: embedding Hamming Distance: 7


100%|██████████| 8/8 [00:02<00:00,  2.83it/s]
100%|██████████| 11/11 [00:03<00:00,  2.85it/s]


Index 29: embedding Hamming Distance: 10


100%|██████████| 8/8 [00:04<00:00,  1.92it/s]
100%|██████████| 16/16 [00:08<00:00,  1.97it/s]


Index 30: embedding Hamming Distance: 13


100%|██████████| 8/8 [00:03<00:00,  2.42it/s]
100%|██████████| 13/13 [00:05<00:00,  2.40it/s]


Index 31: embedding Hamming Distance: 8


100%|██████████| 8/8 [00:03<00:00,  2.36it/s]
100%|██████████| 14/14 [00:06<00:00,  2.32it/s]


Index 32: embedding Hamming Distance: 21


100%|██████████| 8/8 [00:09<00:00,  1.13s/it]


An error occurred for index 33: CUDA out of memory. Tried to allocate 694.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 614.12 MiB is free. Process 14942 has 14.14 GiB memory in use. Of the allocated memory 12.97 GiB is allocated by PyTorch, and 1.05 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


100%|██████████| 8/8 [00:02<00:00,  3.99it/s]
100%|██████████| 7/7 [00:01<00:00,  3.87it/s]


Index 34: embedding Hamming Distance: 8


100%|██████████| 8/8 [00:05<00:00,  1.37it/s]


An error occurred for index 35: CUDA out of memory. Tried to allocate 486.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 90.12 MiB is free. Process 14942 has 14.65 GiB memory in use. Of the allocated memory 13.44 GiB is allocated by PyTorch, and 1.09 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


  0%|          | 0/8 [00:00<?, ?it/s]


An error occurred for index 36: CUDA out of memory. Tried to allocate 1.47 GiB. GPU 0 has a total capacity of 14.74 GiB of which 592.12 MiB is free. Process 14942 has 14.16 GiB memory in use. Of the allocated memory 12.57 GiB is allocated by PyTorch, and 1.47 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


100%|██████████| 8/8 [00:02<00:00,  2.85it/s]
100%|██████████| 12/12 [00:04<00:00,  2.83it/s]


Index 37: embedding Hamming Distance: 18


100%|██████████| 8/8 [00:01<00:00,  4.08it/s]
100%|██████████| 7/7 [00:01<00:00,  4.08it/s]


Index 38: embedding Hamming Distance: 6


100%|██████████| 8/8 [00:00<00:00,  8.34it/s]
100%|██████████| 8/8 [00:02<00:00,  2.79it/s]
100%|██████████| 12/12 [00:04<00:00,  2.77it/s]


Index 40: embedding Hamming Distance: 22


100%|██████████| 8/8 [00:01<00:00,  5.58it/s]
100%|██████████| 4/4 [00:00<00:00,  5.68it/s]


Index 41: embedding Hamming Distance: 6


100%|██████████| 8/8 [00:04<00:00,  1.83it/s]


An error occurred for index 42: Input contains NaN.


100%|██████████| 8/8 [00:03<00:00,  2.38it/s]
100%|██████████| 13/13 [00:05<00:00,  2.41it/s]


Index 43: embedding Hamming Distance: 11


100%|██████████| 8/8 [00:02<00:00,  3.90it/s]
100%|██████████| 8/8 [00:02<00:00,  3.87it/s]


Index 44: embedding Hamming Distance: 6


100%|██████████| 8/8 [00:01<00:00,  4.12it/s]
100%|██████████| 7/7 [00:01<00:00,  3.94it/s]


Index 45: embedding Hamming Distance: 9


100%|██████████| 8/8 [00:04<00:00,  1.63it/s]
100%|██████████| 19/19 [00:11<00:00,  1.61it/s]


Index 46: embedding Hamming Distance: 5


100%|██████████| 8/8 [00:06<00:00,  1.16it/s]


An error occurred for index 47: CUDA out of memory. Tried to allocate 550.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 30.12 MiB is free. Process 14942 has 14.71 GiB memory in use. Of the allocated memory 13.70 GiB is allocated by PyTorch, and 909.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


100%|██████████| 8/8 [00:02<00:00,  2.90it/s]
100%|██████████| 10/10 [00:03<00:00,  2.93it/s]


Index 48: embedding Hamming Distance: 10


100%|██████████| 8/8 [00:01<00:00,  5.33it/s]
100%|██████████| 8/8 [00:01<00:00,  5.62it/s]
100%|██████████| 8/8 [00:10<00:00,  1.35s/it]


An error occurred for index 51: CUDA out of memory. Tried to allocate 782.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 94.12 MiB is free. Process 14942 has 14.65 GiB memory in use. Of the allocated memory 13.68 GiB is allocated by PyTorch, and 860.33 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


100%|██████████| 8/8 [00:11<00:00,  1.39s/it]


An error occurred for index 52: CUDA out of memory. Tried to allocate 832.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 314.12 MiB is free. Process 14942 has 14.43 GiB memory in use. Of the allocated memory 13.27 GiB is allocated by PyTorch, and 1.04 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


100%|██████████| 8/8 [00:02<00:00,  2.89it/s]
100%|██████████| 11/11 [00:03<00:00,  2.92it/s]


Index 53: embedding Hamming Distance: 16


100%|██████████| 8/8 [00:04<00:00,  1.63it/s]
100%|██████████| 19/19 [00:11<00:00,  1.64it/s]


Index 54: embedding Hamming Distance: 23


100%|██████████| 8/8 [00:03<00:00,  2.45it/s]
100%|██████████| 13/13 [00:05<00:00,  2.45it/s]


Index 55: embedding Hamming Distance: 25


100%|██████████| 8/8 [00:02<00:00,  2.90it/s]
100%|██████████| 11/11 [00:03<00:00,  2.89it/s]


Index 56: embedding Hamming Distance: 11


100%|██████████| 8/8 [00:07<00:00,  1.06it/s]


An error occurred for index 57: CUDA out of memory. Tried to allocate 150.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 98.12 MiB is free. Process 14942 has 14.64 GiB memory in use. Of the allocated memory 13.55 GiB is allocated by PyTorch, and 986.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


100%|██████████| 8/8 [00:14<00:00,  1.79s/it]


An error occurred for index 58: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 2.12 MiB is free. Process 14942 has 14.74 GiB memory in use. Of the allocated memory 12.98 GiB is allocated by PyTorch, and 1.63 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


100%|██████████| 8/8 [00:02<00:00,  2.91it/s]
100%|██████████| 10/10 [00:03<00:00,  2.96it/s]


Index 59: embedding Hamming Distance: 16


100%|██████████| 8/8 [00:04<00:00,  1.61it/s]
100%|██████████| 19/19 [00:10<00:00,  1.76it/s]


Index 60: embedding Hamming Distance: 17


100%|██████████| 8/8 [00:01<00:00,  4.17it/s]
100%|██████████| 9/9 [00:02<00:00,  4.18it/s]


Index 61: embedding Hamming Distance: 4


100%|██████████| 8/8 [00:03<00:00,  2.04it/s]
100%|██████████| 18/18 [00:08<00:00,  2.03it/s]


Index 62: embedding Hamming Distance: 26


100%|██████████| 8/8 [00:02<00:00,  3.20it/s]
100%|██████████| 11/11 [00:03<00:00,  3.19it/s]


Index 63: embedding Hamming Distance: 6


100%|██████████| 8/8 [00:01<00:00,  4.38it/s]
100%|██████████| 7/7 [00:01<00:00,  4.37it/s]

Index 64: embedding Hamming Distance: 6





In [21]:
# prompt: go through embeddings_dict and turn it into a dataframe where the key becomes the first column and each key value pair in the value dict becomes subsequent columns

df_embeddings = pd.DataFrame.from_dict(embeddings_dict, orient='index')

# Reset the index to make the keys a regular column
df_embeddings = df_embeddings.reset_index()

# Rename the 'index' column to 'Protein Accession'
df_embeddings = df_embeddings.rename(columns={'index': 'Protein Accession'})

df_embeddings = update_dataframe_with_sequences(df_embeddings, fasta_dict)
df_embeddings


Unnamed: 0,Protein Accession,InterPro_ids_target,InterPro_ids_original,embedding_target,embedding_original,indices,embedding_hamming_distance,embedding_edited_sequence,hamming_distance,Sequence
0,5R1U_1,"[IPR000015, IPR000018, IPR000021, IPR000025, I...","[IPR034163, IPR021109, IPR001461, IPR001969]","[-0.024249457, 0.08652738, 0.03575097, -0.0881...","[0.115722656, 0.099609375, -0.13769531, -0.084...","[170, 171, 167, 172, 218, 219, 220, 221, 222, ...",25.0,MSSPLKNALVTAMLAGGALSSPTKQHVGIPVNASPEVGPGKYSFKQ...,,MSSPLKNALVTAMLAGGALSSPTKQHVGIPVNASPEVGPGKYSFKQ...
1,6KFD_1,"[IPR000015, IPR000018, IPR000021, IPR000025, I...",[],"[-0.024249457, 0.08652738, 0.03575097, -0.0881...",,,,,,LTCDQLPKAAINPIQEFIDSNPLEFEYVLTETFECTTRIYVQPARW...
2,6L9T_1,"[IPR000015, IPR000018, IPR000021, IPR000025, I...","[IPR019791, IPR037120]","[-0.024249457, 0.08652738, 0.03575097, -0.0881...","[-0.17578125, 0.07470703, 0.54296875, -0.41210...",,,,,SWEVGCGAPVPLVKCDENSPYRTITGDCNNRRSPALGAANRALARW...
3,6LM2_1,"[IPR000015, IPR000018, IPR000021, IPR000025, I...",[IPR012674],"[-0.024249457, 0.08652738, 0.03575097, -0.0881...","[-1.0859375, 1.1640625, 0.111816406, -0.394531...","[129, 52, 110, 57, 56, 140, 34, 0, 1, 138, 124...",14.0,MSTEVEQKAANGVFDDANVQNRTLSDWDGVWQSVLPLLQSGKLDPV...,,ALTEVEQKAANGVFDDANVQNRTLSDWDGVWQSVYPLLQSGKLDPV...
4,6LOV_1,"[IPR000015, IPR000018, IPR000021, IPR000025, I...","[IPR016139, IPR036041, IPR017989, IPR017988, I...","[-0.024249457, 0.08652738, 0.03575097, -0.0881...","[-0.059765626, 0.19375, 0.16875, -0.0625, 0.16...",,,,,MSRFSVLSFLILAIFLGGSIVKGDVSFRLSGADPRSYGMFIKDLRN...
...,...,...,...,...,...,...,...,...,...,...
59,8WDG_1,"[IPR000015, IPR000018, IPR000021, IPR000025, I...","[IPR001998, IPR036237, IPR013022]","[-0.024249457, 0.08652738, 0.03575097, -0.0881...","[-0.33528647, 0.43196616, 0.12630208, -0.09114...","[253, 383, 254, 382, 261, 75, 46, 380, 74, 30,...",17.0,YQPTPEDRFTFGLWTVGWQGRDPFGDATHRPVDPVESVRRLAELGA...,,YQPTPEDRFTFGLWTVGWQGRDPFGDATRRALDPVESVRRLAELGA...
60,8YJJ_1,"[IPR000015, IPR000018, IPR000021, IPR000025, I...","[IPR001137, IPR013319, IPR013320, IPR018208, I...","[-0.024249457, 0.08652738, 0.03575097, -0.0881...","[-0.65234375, 0.00546875, 0.00234375, 0.069726...","[119, 136, 130, 135, 134, 133, 122, 121, 120, ...",4.0,MTIQPGTGYNNGYFYSYWNDGHGGVTYTNGPGGQFSVNWSNSGNFV...,,QTIQPGTGYNNGYFYSYWNDGHGGVTYTNGPGGQFSVNWSNSGNFV...
61,9C6E_1,"[IPR000015, IPR000018, IPR000021, IPR000025, I...","[IPR014756, IPR000698, IPR011022, IPR017864, I...","[-0.024249457, 0.08652738, 0.03575097, -0.0881...","[-0.708903, -0.036295574, 0.08138021, 0.289550...","[233, 229, 129, 158, 215, 130, 230, 232, 231, ...",26.0,ANKPAPNHVIFKKISRDKSVTIYLGKRDYIDHVERVEPVDGVVLVD...,,ANKPAPNHVIFKKISRDKSVTIYLGKRDYIDHVERVEPVDGVVLVD...
62,9EAT_1,"[IPR000015, IPR000018, IPR000021, IPR000025, I...","[IPR015892, IPR001765]","[-0.024249457, 0.08652738, 0.03575097, -0.0881...","[0.005126953, 0.09716797, 0.06933594, 0.429687...","[189, 195, 155, 145, 184, 196, 186, 187, 185, ...",6.0,MKDIDTLISNNALWSKMLVEEDPGFFEKLAQAQKPRFLWIGCSDSR...,,MKDIDTLISNNALWSKMLVEEDPGFFEKLAQAQKPRFLWIGCSDSR...


In [22]:
df_embeddings.to_csv("interpro_scan_test_data_w_edited_sequences.csv", index = False)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Extract Hamming distances for 'embedding' and 'random' methods
embedding_distances = []
random_distances = []

# Iterate over embeddings_dict to gather the hamming distances
for key, entry in embeddings_dict.items():
    embedding_distance = entry.get('embedding_hamming_distance', None)
    random_distance = entry.get('random_hamming_distance', None)
    seq_length = len(df.loc[df['Protein Accession'] == key, 'Sequence'].iloc[0])
    embeddings_dict[key]['seq_length'] = seq_length

    # Skip None values and collect valid distances (only compare on proteins w/ both values)
    if embedding_distance is not None and random_distance is not None:
        embedding_distances.append(embedding_distance/seq_length)
        random_distances.append(random_distance/seq_length)

# Calculate average and standard deviation for both methods
embedding_avg = np.mean(embedding_distances) if embedding_distances else 0
embedding_std = np.std(embedding_distances) if embedding_distances else 0
print(embedding_avg)
print(embedding_std)

random_avg = np.mean(random_distances) if random_distances else 0
random_std = np.std(random_distances) if random_distances else 0
print(random_avg)
print(random_std)

# Prepare data for the bar plot
methods = ['Embedding', 'Random']
averages = [embedding_avg, random_avg]
std_devs = [embedding_std, random_std]

# Create the bar plot
fig, ax = plt.subplots()

# Set positions for the bars
bar_width = 0.35
index = np.arange(len(methods))

# Plot the bars with error bars
ax.bar(index, averages, bar_width, yerr=std_devs, capsize=5, label='Average Hamming Distance')

# Customize the plot
ax.set_xlabel('Method')
ax.set_ylabel('Hamming Distance')
ax.set_title('Average Hamming Distance and Standard Deviation by Method')
ax.set_xticks(index)
ax.set_xticklabels(methods)
ax.legend()

# Display the plot
plt.show()

In [None]:
import json
import numpy as np

# Specify the file path where you want to save the JSON file
file_path = 'hamming_distance_dict.json'

# Define a custom encoder to handle NumPy arrays
class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()  # Convert NumPy array to list
        return json.JSONEncoder.default(self, obj)

# Save the dictionary to a JSON file using the custom encoder
with open(file_path, 'w') as json_file:
    json.dump(embeddings_dict, json_file, indent=4, cls=NumpyEncoder)
    # Use cls=NumpyEncoder to apply the custom encoding