## Disease-Chemical 

In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
import re

# Parse an embeddings dump into a {lowercased label: vector} lookup
def load_embeddings(filepath, marker):
    """Read labelled embedding vectors from a text file.

    Only lines containing ``marker`` are used; everything after the first
    occurrence of ``marker`` is treated as ``<label>, Embedding: <numbers>``.
    When the ``Embedding:`` tag is absent, the text before the first comma is
    taken as the label and the rest as the vector.

    Args:
        filepath: Path of the embeddings text file.
        marker: Tag that precedes the label on each line (e.g. ``'Phrase:'``).

    Returns:
        dict mapping each lowercased label to a 1-D ``numpy`` float array.
        If a label occurs more than once, the last occurrence wins.
    """
    # Signed integers/decimals with an optional exponent, e.g. -0.5, 3, 1.2e-05
    number_re = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?")
    embeddings = {}
    with open(filepath, 'r') as file:
        for line in file:
            if marker not in line:
                continue
            payload = line.split(marker, 1)[1]
            if 'Embedding:' in payload:
                # Split on the tag so commas and digits inside the name stay in the label
                label_part, _, vector_part = payload.partition('Embedding:')
                label = label_part.strip().rstrip(',').strip()
            else:
                label_part, _, vector_part = payload.partition(',')
                label = label_part.strip()
            # Numbers are taken from the vector text only, never from the label
            values = [float(token) for token in number_re.findall(vector_part)]
            # Lowercase keys so callers can match names case-insensitively
            embeddings[label.lower()] = np.array(values)
    return embeddings

# Cosine similarity between a disease and a chemical, matched case-insensitively
def calculate_cosine_similarity_ci(disease, chemical, disease_embeddings, chemical_embeddings):
    """Return the cosine similarity of a disease/chemical pair.

    Both names are lowercased before lookup, so the embedding dictionaries
    are expected to be keyed by lowercased labels.

    Args:
        disease: Disease name to look up.
        chemical: Chemical name to look up.
        disease_embeddings: Mapping of lowercased disease label to vector.
        chemical_embeddings: Mapping of lowercased chemical label to vector.

    Returns:
        ``1 - cosine distance`` of the two vectors, or ``np.nan`` when either
        name is not a string or has no embedding.
    """
    # Non-string names (e.g. the NaN pandas uses for an empty cell) have no
    # .lower(); treat them as "no embedding" instead of crashing.
    if not isinstance(disease, str) or not isinstance(chemical, str):
        return np.nan

    disease_vec = disease_embeddings.get(disease.lower())
    chemical_vec = chemical_embeddings.get(chemical.lower())
    if disease_vec is None or chemical_vec is None:
        return np.nan  # Return NaN if embeddings not found

    # Vectors of unequal length are zero-padded at the end so scipy accepts them.
    # NOTE(review): a length mismatch may indicate a parsing problem upstream.
    len_diff = len(disease_vec) - len(chemical_vec)
    if len_diff > 0:
        chemical_vec = np.pad(chemical_vec, (0, len_diff), 'constant', constant_values=0)
    elif len_diff < 0:
        disease_vec = np.pad(disease_vec, (0, -len_diff), 'constant', constant_values=0)

    return 1 - cosine(disease_vec, chemical_vec)

# Load the disease-chemical pair table
input_path = 'F_CTD/SkipGram/updated_CTD_disease_chemicals.csv'
df = pd.read_csv(input_path)

# Load embeddings
disease_embeddings = load_embeddings('gen_wordembeddings/SkipGram/Disease/SkipGram_Disease_embeddings_combined.txt', 'Phrase:')
chemical_embeddings = load_embeddings('gen_wordembeddings/SkipGram/Chemical/SkipGram_chemical_embeddings_combined.txt', 'Phrase:')

# Create the output column up front so it exists even when no row needs filling
if 'CosineSimilarity_lc' not in df.columns:
    df['CosineSimilarity_lc'] = np.nan

# Only rows without a CosineSimilarity value get the case-insensitive fallback
missing = df['CosineSimilarity'].isna()
if missing.any():
    df.loc[missing, 'CosineSimilarity_lc'] = [
        calculate_cosine_similarity_ci(disease, chemical, disease_embeddings, chemical_embeddings)
        for disease, chemical in zip(df.loc[missing, 'DiseaseName'], df.loc[missing, 'ChemicalName'])
    ]

# Save the updated DataFrame
# NOTE(review): output_path equals input_path, so the input CSV is overwritten
# in place - confirm this is intended.
output_path = 'F_CTD/SkipGram/updated_CTD_disease_chemicals.csv'
df.to_csv(output_path, index=False)

print(f"Updated file with case-insensitive cosine similarity saved to {output_path}")


## Disease-Gene

In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
import re

# Parse an embeddings dump into a {lowercased label: vector} lookup
def load_embeddings(filepath, marker):
    """Read labelled embedding vectors from a text file.

    Only lines containing ``marker`` are used; everything after the first
    occurrence of ``marker`` is treated as ``<label>, Embedding: <numbers>``.
    When the ``Embedding:`` tag is absent, the text before the first comma is
    taken as the label and the rest as the vector.

    Args:
        filepath: Path of the embeddings text file.
        marker: Tag that precedes the label on each line (e.g. ``'Gene:'``).

    Returns:
        dict mapping each lowercased label to a 1-D ``numpy`` float array.
        If a label occurs more than once, the last occurrence wins.
    """
    # Signed integers/decimals with an optional exponent, e.g. -0.5, 3, 1.2e-05
    number_re = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?")
    embeddings = {}
    with open(filepath, 'r') as file:
        for line in file:
            if marker not in line:
                continue
            payload = line.split(marker, 1)[1]
            if 'Embedding:' in payload:
                # Split on the tag so commas and digits inside the name stay in the label
                label_part, _, vector_part = payload.partition('Embedding:')
                label = label_part.strip().rstrip(',').strip()
            else:
                label_part, _, vector_part = payload.partition(',')
                label = label_part.strip()
            # Numbers are taken from the vector text only, never from the label
            values = [float(token) for token in number_re.findall(vector_part)]
            # Lowercase keys so callers can match names case-insensitively
            embeddings[label.lower()] = np.array(values)
    return embeddings

# Cosine similarity between a disease and a gene, matched case-insensitively
def calculate_cosine_similarity_ci(disease, gene, disease_embeddings, gene_embeddings):
    """Return the cosine similarity of a disease/gene pair.

    Both names are lowercased before lookup, so the embedding dictionaries
    are expected to be keyed by lowercased labels.

    Args:
        disease: Disease name to look up.
        gene: Gene symbol to look up.
        disease_embeddings: Mapping of lowercased disease label to vector.
        gene_embeddings: Mapping of lowercased gene label to vector.

    Returns:
        ``1 - cosine distance`` of the two vectors, or ``np.nan`` when either
        name is not a string or has no embedding.
    """
    # Non-string names (e.g. the NaN pandas uses for an empty cell) have no
    # .lower(); treat them as "no embedding" instead of crashing.
    if not isinstance(disease, str) or not isinstance(gene, str):
        return np.nan

    disease_vec = disease_embeddings.get(disease.lower())
    gene_vec = gene_embeddings.get(gene.lower())
    if disease_vec is None or gene_vec is None:
        return np.nan  # Return NaN if embeddings not found

    # Vectors of unequal length are zero-padded at the end so scipy accepts them.
    # NOTE(review): a length mismatch may indicate a parsing problem upstream.
    len_diff = len(disease_vec) - len(gene_vec)
    if len_diff > 0:
        gene_vec = np.pad(gene_vec, (0, len_diff), 'constant', constant_values=0)
    elif len_diff < 0:
        disease_vec = np.pad(disease_vec, (0, -len_diff), 'constant', constant_values=0)

    return 1 - cosine(disease_vec, gene_vec)

# Load the disease-gene pair table
input_path = 'F_CTD/CBOW/updated_CTD_disease_genes.csv'
df = pd.read_csv(input_path)

# Load embeddings
disease_embeddings = load_embeddings('gen_wordembeddings/CBOW/Disease/CBOW_Disease_embeddings_combined.txt', 'Phrase:')
gene_embeddings = load_embeddings('gen_wordembeddings/CBOW/Gene/CBOW_gene_embeddings_combined.txt', 'Gene:')

# Create the output column up front so it exists even when no row needs filling
if 'CosineSimilarity_lc' not in df.columns:
    df['CosineSimilarity_lc'] = np.nan

# Only rows without a CosineSimilarity value get the case-insensitive fallback
missing = df['CosineSimilarity'].isna()
if missing.any():
    df.loc[missing, 'CosineSimilarity_lc'] = [
        calculate_cosine_similarity_ci(disease, gene, disease_embeddings, gene_embeddings)
        for disease, gene in zip(df.loc[missing, 'DiseaseName'], df.loc[missing, 'GeneSymbol'])
    ]

# Save the updated DataFrame
output_path = 'F_CTD/CBOW/updated_CTD_disease_genes_with_ci.csv'
df.to_csv(output_path, index=False)

print(f"Updated file with case-insensitive cosine similarity saved to {output_path}")


## Gene-Chemical

In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine

# Function to load embeddings from a file
def load_embeddings(file_path, label_marker):
    """Read labelled embedding vectors from a text file.

    Each usable line has the form
    ``<label_marker> <label>, Embedding: [v1, v2, ...]``. Lines missing either
    the marker or the ``Embedding:`` tag are skipped.

    Args:
        file_path: Path of the embeddings text file.
        label_marker: Tag that precedes the label (e.g. ``'Gene:'``).

    Returns:
        dict mapping each lowercased label to a 1-D ``numpy`` float array.
        If a label occurs more than once, the last occurrence wins.
    """
    embeddings = {}
    with open(file_path, 'r') as file:
        for line in file:
            head, tag, tail = line.partition('Embedding:')
            # Skip lines that do not carry both a label and a vector
            if not tag or label_marker not in head:
                continue
            # Drop only the separator comma before 'Embedding:'; commas inside the
            # name are kept. Lowercase because lookups use lowercased names.
            label = head.split(label_marker, 1)[1].strip().rstrip(',').strip().lower()
            # Remove the surrounding brackets and any leading/trailing whitespace
            embedding_str = tail.strip().strip('()[]')
            # Split on commas and convert each non-empty token to a float
            embedding = np.array([float(token) for token in embedding_str.split(',') if token.strip()])
            embeddings[label] = embedding
    return embeddings

# Cosine similarity of two vectors, expressed as 1 - cosine distance
def cosine_similarity(vec_a, vec_b):
    """Return ``1 - cosine(vec_a, vec_b)``, or ``np.nan`` if either vector contains NaN."""
    # A single NaN component makes the result undefined, so bail out early
    if any(np.isnan(vec).any() for vec in (vec_a, vec_b)):
        return np.nan
    distance = cosine(vec_a, vec_b)
    return 1 - distance

# Input locations: the two CBOW embedding dumps and the gene-chemical pair table
chemical_embeddings_path = 'gen_wordembeddings/CBOW/Chemical/CBOW_chemical_embeddings_combined.txt'
gene_embeddings_path = 'gen_wordembeddings/CBOW/Gene/CBOW_gene_embeddings_combined.txt'
input_csv_path = 'F_CTD/CBOW/CTD_gene_chems_curated_with_cosine_similarity.csv'

# Build the label -> vector lookups (chemicals are tagged 'Phrase:', genes 'Gene:')
chemical_embeddings = load_embeddings(chemical_embeddings_path, 'Phrase:')
gene_embeddings = load_embeddings(gene_embeddings_path, 'Gene:')

# Read the pair table that will receive the extra similarity column
df = pd.read_csv(input_csv_path)

# Function to calculate cosine similarity if 'CosineSimilarity' is empty
def calculate_similarity_lc(row):
    """Return the lowercase-matched chemical/gene similarity for one row.

    Args:
        row: Mapping-like row providing ``'CosineSimilarity'``,
            ``'ChemicalName'`` and ``'GeneSymbol'``.

    Returns:
        The cosine similarity of the embeddings found under the lowercased
        names, or ``np.nan`` when the row already has a ``'CosineSimilarity'``
        value, a name is not a string, or an embedding is missing.
    """
    # Rows that already carry a similarity are left alone
    if not pd.isna(row['CosineSimilarity']):
        return np.nan

    chemical_name = row['ChemicalName']
    gene_symbol = row['GeneSymbol']
    # Non-string names (e.g. the NaN pandas uses for an empty cell) have no .lower()
    if not isinstance(chemical_name, str) or not isinstance(gene_symbol, str):
        return np.nan

    chemical_vec = chemical_embeddings.get(chemical_name.lower())
    gene_vec = gene_embeddings.get(gene_symbol.lower())
    if chemical_vec is None or gene_vec is None:
        return np.nan

    return cosine_similarity(chemical_vec, gene_vec)

# Compute the fallback similarity row by row and attach it as a new column
similarity_lc = df.apply(calculate_similarity_lc, axis='columns')
df['CosineSimilarity_lc'] = similarity_lc

# Write the augmented table to its own CSV, without the index column
output_csv_path_lc = 'F_CTD/CBOW/CTD_gene_chems_curated_with_cosine_similarity_lc.csv'
df.to_csv(output_csv_path_lc, index=False)

print(f"Cosine similarities calculated and saved to {output_csv_path_lc}")


Note: Similarly, calculate the cosine similarity to measure the functional relatedness of pairs for all the other models by changing the file paths.