## Calculate average similarity

### PubMedBERT

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm  # Import the tqdm library for the progress bar
import os

# Function to parse the embedding file and return a dictionary of {phrase: embedding}
def load_embeddings(file_path):
    """Parse an embedding dump into a ``{phrase: vector}`` dictionary.

    Each non-blank line is expected to look like
    ``Phrase: <text>, Embedding: [v1, v2, ...]``.

    The phrase is everything before the ``Embedding:`` marker (minus the
    ``Phrase:`` label and the separating comma), not everything before the
    first comma. Phrases that themselves contain commas, such as
    ``1,2-dichloroethane``, are therefore kept whole instead of being
    truncated and overwriting one another.

    Args:
        file_path: Path of the text file to read.

    Returns:
        dict mapping each phrase (str) to a 1-D numpy float array. If a
        phrase appears on several lines, the last one wins.

    Raises:
        ValueError: If a non-blank line has no ``Embedding:`` marker, or a
            vector component cannot be converted to float.
    """
    embeddings = {}
    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            if not line.strip():  # Ignore empty lines
                continue

            # Split on the marker, not on commas: the vector is full of
            # commas and the phrase may contain some too.
            head, marker, tail = line.rpartition('Embedding:')
            if not marker:
                raise ValueError(
                    f"{file_path}, line {line_number}: missing 'Embedding:' marker"
                )

            phrase = head.strip()
            if phrase.endswith(','):
                # Drop only the single comma separating phrase and embedding.
                phrase = phrase[:-1]
            phrase = phrase.strip()
            if phrase.startswith('Phrase:'):
                phrase = phrase[len('Phrase:'):].strip()

            embedding_str = tail.strip().strip('[]')
            embeddings[phrase] = np.array([float(e) for e in embedding_str.split(',')])
    return embeddings

# Embedding dumps to compare for the Disease entity type: the four "-1" .. "-4"
# variants plus the "Sum" variant (presumably per-layer and summed embeddings;
# confirm against the script that generated them).
files = [
    '3gen_wordembeddings/PubMedBERT/Disease/-1/combined_embeddings_pubmedbert-1.txt',
    '3gen_wordembeddings/PubMedBERT/Disease/-2/combined_embeddings_pubmedbert-2.txt',
    '3gen_wordembeddings/PubMedBERT/Disease/-3/combined_embeddings_pubmedbert-3.txt',
    '3gen_wordembeddings/PubMedBERT/Disease/-4/combined_embeddings_pubmedbert-4.txt',
    '3gen_wordembeddings/PubMedBERT/Disease/Sum/combined_embeddings_pubmedbertsum.txt'
]

# Parse every file up front; embeddings_dict[k] is the {phrase: vector}
# mapping loaded from files[k] (a list of dicts, despite the name).
embeddings_dict = list(map(load_embeddings, files))

# Function to calculate cosine similarity between two embeddings
def calculate_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embedding vectors as a scalar.

    scikit-learn's ``cosine_similarity`` works on 2-D inputs, so each vector
    is first reshaped into a single-row matrix; the only entry of the
    resulting 1x1 matrix is the value returned.
    """
    row_a = embedding1.reshape(1, -1)
    row_b = embedding2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]

# Average cosine similarity for every unordered pair of embedding files,
# keyed by "<file_i> vs <file_j>".
pairwise_similarities = {}

# Use tqdm to add a progress bar for each pair of files
for i in range(len(files) - 1):
    for j in range(i + 1, len(files)):
        first, second = embeddings_dict[i], embeddings_dict[j]

        # Compare the phrases these two files actually share. Taking the
        # candidates from the first file only would silently drop phrases
        # that files i and j both contain but file 0 does not.
        shared_phrases = [phrase for phrase in first if phrase in second]

        similarities = [
            calculate_similarity(first[phrase], second[phrase])
            for phrase in tqdm(shared_phrases, desc=f"Processing {files[i]} vs {files[j]}", unit="phrase")
        ]

        if similarities:
            avg_similarity = np.mean(similarities)  # Average similarity between the two files
        else:
            # np.mean([]) would emit a RuntimeWarning and return NaN; make the
            # "nothing to compare" case explicit instead.
            print(f"Warning: no shared phrases between {files[i]} and {files[j]}")
            avg_similarity = float('nan')
        pairwise_similarities[f"{files[i]} vs {files[j]}"] = avg_similarity

# Specify the output file folder
output_folder = 'output_folder/PubMedBERT/Disease'
os.makedirs(output_folder, exist_ok=True)  # Create the folder if it doesn't exist

# Specify the output file path
output_file_path = os.path.join(output_folder, 'pairwise_similarity_results.txt')

# Write one line per file pair, with the average rounded to four decimals
with open(output_file_path, 'w') as f:
    for pair, avg_sim in pairwise_similarities.items():
        f.write(f"Average similarity between {pair}: {avg_sim:.4f}\n")

print(f"Pairwise results saved to {output_file_path}")


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm  # Import the tqdm library for the progress bar
import os

# Function to parse the embedding file and return a dictionary of {phrase: embedding}
def load_embeddings(file_path):
    """Parse an embedding dump into a ``{phrase: vector}`` dictionary.

    Each non-blank line is expected to look like
    ``Phrase: <text>, Embedding: [v1, v2, ...]``.

    The phrase is everything before the ``Embedding:`` marker (minus the
    ``Phrase:`` label and the separating comma), not everything before the
    first comma. Phrases that themselves contain commas, such as
    ``1,2-dichloroethane``, are therefore kept whole instead of being
    truncated and overwriting one another.

    Args:
        file_path: Path of the text file to read.

    Returns:
        dict mapping each phrase (str) to a 1-D numpy float array. If a
        phrase appears on several lines, the last one wins.

    Raises:
        ValueError: If a non-blank line has no ``Embedding:`` marker, or a
            vector component cannot be converted to float.
    """
    embeddings = {}
    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            if not line.strip():  # Ignore empty lines
                continue

            # Split on the marker, not on commas: the vector is full of
            # commas and the phrase may contain some too.
            head, marker, tail = line.rpartition('Embedding:')
            if not marker:
                raise ValueError(
                    f"{file_path}, line {line_number}: missing 'Embedding:' marker"
                )

            phrase = head.strip()
            if phrase.endswith(','):
                # Drop only the single comma separating phrase and embedding.
                phrase = phrase[:-1]
            phrase = phrase.strip()
            if phrase.startswith('Phrase:'):
                phrase = phrase[len('Phrase:'):].strip()

            embedding_str = tail.strip().strip('[]')
            embeddings[phrase] = np.array([float(e) for e in embedding_str.split(',')])
    return embeddings

# Embedding dumps to compare for the Chemical entity type: the four "-1" .. "-4"
# variants plus the "Sum" variant (presumably per-layer and summed embeddings;
# confirm against the script that generated them).
files = [
    '3gen_wordembeddings/PubMedBERT/Chemical/-1/combined_embeddings_pubmedbert-1.txt',
    '3gen_wordembeddings/PubMedBERT/Chemical/-2/combined_embeddings_pubmedbert-2.txt',
    '3gen_wordembeddings/PubMedBERT/Chemical/-3/combined_embeddings_pubmedbert-3.txt',
    '3gen_wordembeddings/PubMedBERT/Chemical/-4/combined_embeddings_pubmedbert-4.txt',
    '3gen_wordembeddings/PubMedBERT/Chemical/Sum/combined_embeddings_pubmedbertsum.txt'
]

# Parse every file up front; embeddings_dict[k] is the {phrase: vector}
# mapping loaded from files[k] (a list of dicts, despite the name).
embeddings_dict = list(map(load_embeddings, files))

# Function to calculate cosine similarity between two embeddings
def calculate_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embedding vectors as a scalar.

    scikit-learn's ``cosine_similarity`` works on 2-D inputs, so each vector
    is first reshaped into a single-row matrix; the only entry of the
    resulting 1x1 matrix is the value returned.
    """
    row_a = embedding1.reshape(1, -1)
    row_b = embedding2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]

# Average cosine similarity for every unordered pair of embedding files,
# keyed by "<file_i> vs <file_j>".
pairwise_similarities = {}

# Use tqdm to add a progress bar for each pair of files
for i in range(len(files) - 1):
    for j in range(i + 1, len(files)):
        first, second = embeddings_dict[i], embeddings_dict[j]

        # Compare the phrases these two files actually share. Taking the
        # candidates from the first file only would silently drop phrases
        # that files i and j both contain but file 0 does not.
        shared_phrases = [phrase for phrase in first if phrase in second]

        similarities = [
            calculate_similarity(first[phrase], second[phrase])
            for phrase in tqdm(shared_phrases, desc=f"Processing {files[i]} vs {files[j]}", unit="phrase")
        ]

        if similarities:
            avg_similarity = np.mean(similarities)  # Average similarity between the two files
        else:
            # np.mean([]) would emit a RuntimeWarning and return NaN; make the
            # "nothing to compare" case explicit instead.
            print(f"Warning: no shared phrases between {files[i]} and {files[j]}")
            avg_similarity = float('nan')
        pairwise_similarities[f"{files[i]} vs {files[j]}"] = avg_similarity

# Specify the output file folder
output_folder = 'output_folder/PubMedBERT/Chemical'
os.makedirs(output_folder, exist_ok=True)  # Create the folder if it doesn't exist

# Specify the output file path
output_file_path = os.path.join(output_folder, 'pairwise_similarity_results.txt')

# Write one line per file pair, with the average rounded to four decimals
with open(output_file_path, 'w') as f:
    for pair, avg_sim in pairwise_similarities.items():
        f.write(f"Average similarity between {pair}: {avg_sim:.4f}\n")

print(f"Pairwise results saved to {output_file_path}")


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm  # Import the tqdm library for the progress bar
import os

# Function to parse the embedding file and return a dictionary of {phrase: embedding}
def load_embeddings(file_path):
    """Parse an embedding dump into a ``{phrase: vector}`` dictionary.

    Each non-blank line is expected to look like
    ``Phrase: <text>, Embedding: [v1, v2, ...]``.

    The phrase is everything before the ``Embedding:`` marker (minus the
    ``Phrase:`` label and the separating comma), not everything before the
    first comma. Phrases that themselves contain commas, such as
    ``1,2-dichloroethane``, are therefore kept whole instead of being
    truncated and overwriting one another.

    Args:
        file_path: Path of the text file to read.

    Returns:
        dict mapping each phrase (str) to a 1-D numpy float array. If a
        phrase appears on several lines, the last one wins.

    Raises:
        ValueError: If a non-blank line has no ``Embedding:`` marker, or a
            vector component cannot be converted to float.
    """
    embeddings = {}
    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            if not line.strip():  # Ignore empty lines
                continue

            # Split on the marker, not on commas: the vector is full of
            # commas and the phrase may contain some too.
            head, marker, tail = line.rpartition('Embedding:')
            if not marker:
                raise ValueError(
                    f"{file_path}, line {line_number}: missing 'Embedding:' marker"
                )

            phrase = head.strip()
            if phrase.endswith(','):
                # Drop only the single comma separating phrase and embedding.
                phrase = phrase[:-1]
            phrase = phrase.strip()
            if phrase.startswith('Phrase:'):
                phrase = phrase[len('Phrase:'):].strip()

            embedding_str = tail.strip().strip('[]')
            embeddings[phrase] = np.array([float(e) for e in embedding_str.split(',')])
    return embeddings

# Embedding dumps to compare for the Gene entity type: the four "-1" .. "-4"
# variants plus the "Sum" variant (presumably per-layer and summed embeddings;
# confirm against the script that generated them).
files = [
    '3gen_wordembeddings/PubMedBERT/Gene/-1/combined_embeddings_pubmedbert-1.txt',
    '3gen_wordembeddings/PubMedBERT/Gene/-2/combined_embeddings_pubmedbert-2.txt',
    '3gen_wordembeddings/PubMedBERT/Gene/-3/combined_embeddings_pubmedbert-3.txt',
    '3gen_wordembeddings/PubMedBERT/Gene/-4/combined_embeddings_pubmedbert-4.txt',
    '3gen_wordembeddings/PubMedBERT/Gene/Sum/combined_embeddings_pubmedbertsum.txt'
]

# Parse every file up front; embeddings_dict[k] is the {phrase: vector}
# mapping loaded from files[k] (a list of dicts, despite the name).
embeddings_dict = list(map(load_embeddings, files))

# Function to calculate cosine similarity between two embeddings
def calculate_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embedding vectors as a scalar.

    scikit-learn's ``cosine_similarity`` works on 2-D inputs, so each vector
    is first reshaped into a single-row matrix; the only entry of the
    resulting 1x1 matrix is the value returned.
    """
    row_a = embedding1.reshape(1, -1)
    row_b = embedding2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]

# Average cosine similarity for every unordered pair of embedding files,
# keyed by "<file_i> vs <file_j>".
pairwise_similarities = {}

# Use tqdm to add a progress bar for each pair of files
for i in range(len(files) - 1):
    for j in range(i + 1, len(files)):
        first, second = embeddings_dict[i], embeddings_dict[j]

        # Compare the phrases these two files actually share. Taking the
        # candidates from the first file only would silently drop phrases
        # that files i and j both contain but file 0 does not.
        shared_phrases = [phrase for phrase in first if phrase in second]

        similarities = [
            calculate_similarity(first[phrase], second[phrase])
            for phrase in tqdm(shared_phrases, desc=f"Processing {files[i]} vs {files[j]}", unit="phrase")
        ]

        if similarities:
            avg_similarity = np.mean(similarities)  # Average similarity between the two files
        else:
            # np.mean([]) would emit a RuntimeWarning and return NaN; make the
            # "nothing to compare" case explicit instead.
            print(f"Warning: no shared phrases between {files[i]} and {files[j]}")
            avg_similarity = float('nan')
        pairwise_similarities[f"{files[i]} vs {files[j]}"] = avg_similarity

# Specify the output file folder
output_folder = 'output_folder/PubMedBERT/Gene'
os.makedirs(output_folder, exist_ok=True)  # Create the folder if it doesn't exist

# Specify the output file path
output_file_path = os.path.join(output_folder, 'pairwise_similarity_results.txt')

# Write one line per file pair, with the average rounded to four decimals
with open(output_file_path, 'w') as f:
    for pair, avg_sim in pairwise_similarities.items():
        f.write(f"Average similarity between {pair}: {avg_sim:.4f}\n")

print(f"Pairwise results saved to {output_file_path}")


Note: Once the pairwise similarities have been calculated, we can plot them on a single chart to compare the three entity types.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Hard-coded average similarities for each file pair, per entity type.
# NOTE(review): these look like values transcribed from the
# pairwise_similarity_results.txt outputs; confirm and refresh if those change.
data = {
    'Chemical': {
        'File1': ['-1', '-1', '-1', '-1', '-2', '-2', '-2', '-3', '-3', '-4'],
        'File2': ['-2', '-3', '-4', 'Sum', '-3', '-4', 'Sum', '-4', 'Sum', 'Sum'],
        'Avg_similarity': [0.969, 0.9018, 0.918, 0.9693, 0.9516, 0.9401, 0.9893, 0.9441, 0.9726, 0.9735]
    },
    'Disease': {
        'File1': ['-1', '-1', '-1', '-1', '-2', '-2', '-2', '-3', '-3', '-4'],
        'File2': ['-2', '-3', '-4', 'Sum', '-3', '-4', 'Sum', '-4', 'Sum', 'Sum'],
        'Avg_similarity': [0.9689, 0.9047, 0.9242, 0.972, 0.9544, 0.9354, 0.9893, 0.9296, 0.9711, 0.9705]
    },
    'Gene': {
        'File1': ['-1', '-1', '-1', '-1', '-2', '-2', '-2', '-3', '-3', '-4'],
        'File2': ['-2', '-3', '-4', 'Sum', '-3', '-4', 'Sum', '-4', 'Sum', 'Sum'],
        'Avg_similarity': [0.9711, 0.9037, 0.9224, 0.9711, 0.9507, 0.9417, 0.9895, 0.9422, 0.9717, 0.9739]
    }
}

# Build one long-format table: one row per (category, file pair), with a
# readable "A vs B" label for the x-axis and the category used as the hue.
combined_data = []
for category in ['Chemical', 'Disease', 'Gene']:
    df = pd.DataFrame(data[category])
    df = df.assign(
        Pair=df['File1'].str.cat(df['File2'], sep=' vs '),
        Category=category,
    )
    combined_data.append(df)

combined_df = pd.concat(combined_data)

# One fixed colour per category.
# NOTE(review): the hex values do not match the names used in the plot title;
# '#11002F' is a very dark purple and '#44D700' is a bright green.
custom_palette = {
    'Chemical': '#008080',  # teal
    'Disease': '#11002F',   # very dark purple
    'Gene': '#44D700'       # bright green
}

# Grouped bar chart: file pairs along x, one bar per category within each pair
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=combined_df,
    x='Pair',
    y='Avg_similarity',
    hue='Category',
    palette=custom_palette,
    ax=ax,
)

# Slant the pair labels so they do not overlap
plt.xticks(rotation=45, ha="right")

# Title and axis labels
ax.set_title("Avg Similarity Bar Plot for Chemical, Disease, and Gene (Teal, Dark Magenta, and Greenish Gold)")
ax.set_xlabel("File Pairs")
ax.set_ylabel("Avg Similarity")

# Tighten the layout before writing the high-resolution PNG
fig.tight_layout()
fig.savefig("Avg_Similarity_Bar_Plot_Custom.png", dpi=300)

# Display the figure
plt.show()
