## Filter new entries

In [None]:
import pandas as pd

# Compare two releases of the same CTD table and keep the rows that appear
# only in the newer file.
new_file = '1CTD/CTD_disease_genes_curated.csv'
old_file = 'ctd/CTD_disease_genes_curated.csv'

df_new = pd.read_csv(new_file)
df_old = pd.read_csv(old_file)

# Check the structure of the data to ensure we can compare
print(f"Columns in new file: {df_new.columns}")
print(f"Columns in old file: {df_old.columns}")

# Drop exact duplicate rows so each distinct row is compared once.
df_new_unique = df_new.drop_duplicates()
df_old_unique = df_old.drop_duplicates()

# Anti-join: with no `on=` argument, merge joins on every column the two frames
# share. A left join is enough to flag rows missing from the old file; an outer
# join would also build all old-only rows just to throw them away, and it does
# not keep the new file's row order.
new_entries = (
    pd.merge(df_new_unique, df_old_unique, how='left', indicator=True)
    .query('_merge == "left_only"')
    .drop(columns='_merge')
)

# Show the new entries
if new_entries.empty:
    print("No new entries found.")
else:
    print("New entries found:")
    print(new_entries)

# Save the new entries (written even when empty, so the file always exists).
new_entries.to_csv('new_entries_found.csv', index=False)


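Note: The same filtering can be applied to the chemical–disease, disease–gene, and chemical–gene pair files; the cells below read the results as `new_entries_founddisease_chems.csv`, `new_entries_founddisease_genes.csv`, and `new_entries_foundgene_chems.csv`. Steps for downloading the CTD dataset will be added to the README.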

## Cosine similarity for new pairs present only in the August 2024 CTD release

In [2]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
import os

# Function to load embeddings from file
def load_embeddings(file_path, phrase_key):
    """Load phrase embeddings from a text file into a dictionary.

    Each non-empty line is expected to look like
    ``<phrase_key> <phrase>, Embedding: [v1, v2, ...]``.
    NOTE(review): this assumes the phrase is the only field before
    ``Embedding:`` -- confirm against the files that are actually generated.

    The phrase is taken as everything before the ``Embedding:`` marker, so
    phrases that themselves contain commas (e.g. "1,2-dichloroethane") are
    kept whole instead of being cut at their first comma.

    Args:
        file_path: Path of the embeddings text file.
        phrase_key: Label that prefixes the phrase on each line (e.g. 'Phrase:').

    Returns:
        dict mapping the lower-cased phrase to a 1-D ``numpy`` float array.

    Raises:
        ValueError: If a non-empty line has no ``Embedding:`` marker or a
            vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(file_path, 'r') as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if not line:  # Ignore empty lines
                continue

            # Split on the last marker: the vector holds only numbers, so
            # everything before the final 'Embedding:' belongs to the phrase.
            phrase_part, marker, embedding_part = line.rpartition('Embedding:')
            if not marker:
                raise ValueError(
                    f"{file_path}:{line_number}: missing 'Embedding:' marker"
                )

            phrase = phrase_part.strip()
            if phrase.endswith(','):  # separator between phrase and marker
                phrase = phrase[:-1]
            if phrase_key and phrase.startswith(phrase_key):
                phrase = phrase[len(phrase_key):]
            phrase = phrase.strip()

            embedding_str = embedding_part.strip().strip('[]')
            embedding = np.array([float(e) for e in embedding_str.split(',')])
            # Lower-case keys so lookups are case-insensitive.
            embeddings[phrase.lower()] = embedding
    return embeddings

# Function to calculate cosine similarity
def calculate_cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    The similarity is one minus SciPy's cosine distance. If the vectors have
    different lengths, NaN is returned instead of raising.
    """
    lengths_match = len(vec1) == len(vec2)
    return 1 - cosine(vec1, vec2) if lengths_match else np.nan

# Load the chemical and disease embedding tables (keys are lower-cased phrases).
chemical_embeddings = load_embeddings('1filteredgen_wordembeddings/BioBERT/Chemical/-1/filtered_embeddings_biobert-1.txt', 'Phrase:')
disease_embeddings = load_embeddings('1filteredgen_wordembeddings/BioBERT/Disease/-1/filtered_embeddings_biobert-1.txt', 'Phrase:')

# Load the new chemical-disease pairs.
df = pd.read_csv('new_entries_founddisease_chems.csv')

# Cast to str so .lower() also works on missing values (they become 'nan').
df['ChemicalName'] = df['ChemicalName'].astype(str)
df['DiseaseName'] = df['DiseaseName'].astype(str)

# Fixed column order, reused below so an empty result still gets a header row.
result_columns = ['Index', 'ChemicalName', 'DiseaseName', 'CosineSimilarity']
results = []

# Score every pair for which both names have an embedding; other pairs are skipped.
for idx, row in df.iterrows():
    chemical_name = row['ChemicalName'].lower()  # embedding keys are lower-case
    disease_name = row['DiseaseName'].lower()

    if chemical_name in chemical_embeddings and disease_name in disease_embeddings:
        chemical_vec = chemical_embeddings[chemical_name]
        disease_vec = disease_embeddings[disease_name]
        similarity = calculate_cosine_similarity(chemical_vec, disease_vec)
        # 'Index' is the 1-based row position in the input CSV.
        results.append({'Index': idx + 1, 'ChemicalName': chemical_name, 'DiseaseName': disease_name, 'CosineSimilarity': similarity})

# Passing columns explicitly keeps the header even when no pair matched,
# so the saved CSV can always be read back.
results_df = pd.DataFrame(results, columns=result_columns)

# Save results to CSV
results_df.to_csv('cosine_new_entries_founddisease_chems.csv', index=False)

print("Cosine similarity calculations are saved in 'cosine_new_entries_founddisease_chems.csv'.")


Cosine similarity calculations are saved in 'cosine_new_entries_founddisease_chems.csv'.


In [5]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
import os

# Function to load embeddings from file
def load_embeddings(file_path, phrase_key):
    """Load phrase embeddings from a text file into a dictionary.

    Each non-empty line is expected to look like
    ``<phrase_key> <phrase>, Embedding: [v1, v2, ...]``.
    NOTE(review): this assumes the phrase is the only field before
    ``Embedding:`` -- confirm against the files that are actually generated.

    The phrase is taken as everything before the ``Embedding:`` marker, so
    phrases that themselves contain commas are kept whole instead of being
    cut at their first comma.

    Args:
        file_path: Path of the embeddings text file.
        phrase_key: Label that prefixes the phrase on each line (e.g. 'Phrase:').

    Returns:
        dict mapping the lower-cased phrase to a 1-D ``numpy`` float array.

    Raises:
        ValueError: If a non-empty line has no ``Embedding:`` marker or a
            vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(file_path, 'r') as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if not line:  # Ignore empty lines
                continue

            # Split on the last marker: the vector holds only numbers, so
            # everything before the final 'Embedding:' belongs to the phrase.
            phrase_part, marker, embedding_part = line.rpartition('Embedding:')
            if not marker:
                raise ValueError(
                    f"{file_path}:{line_number}: missing 'Embedding:' marker"
                )

            phrase = phrase_part.strip()
            if phrase.endswith(','):  # separator between phrase and marker
                phrase = phrase[:-1]
            if phrase_key and phrase.startswith(phrase_key):
                phrase = phrase[len(phrase_key):]
            phrase = phrase.strip()

            embedding_str = embedding_part.strip().strip('[]')
            embedding = np.array([float(e) for e in embedding_str.split(',')])
            # Lower-case keys so lookups are case-insensitive.
            embeddings[phrase.lower()] = embedding
    return embeddings

# Function to calculate cosine similarity
def calculate_cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    The similarity is one minus SciPy's cosine distance. If the vectors have
    different lengths, NaN is returned instead of raising.
    """
    lengths_match = len(vec1) == len(vec2)
    return 1 - cosine(vec1, vec2) if lengths_match else np.nan

# Load the gene and disease embedding tables (keys are lower-cased phrases).
gene_embeddings = load_embeddings('1filteredgen_wordembeddings/BioBERT/Gene/-1/filtered_embeddings_biobert-1.txt', 'Phrase:')
disease_embeddings = load_embeddings('1filteredgen_wordembeddings/BioBERT/Disease/-1/filtered_embeddings_biobert-1.txt', 'Phrase:')

# Load the new gene-disease pairs.
df = pd.read_csv('new_entries_founddisease_genes.csv')

# Cast to str so .lower() also works on missing values (they become 'nan').
df['GeneSymbol'] = df['GeneSymbol'].astype(str)
df['DiseaseName'] = df['DiseaseName'].astype(str)

# Fixed column order, reused below so an empty result still gets a header row.
result_columns = ['Index', 'GeneSymbol', 'DiseaseName', 'CosineSimilarity']
results = []

# Score every pair for which both names have an embedding; other pairs are skipped.
for idx, row in df.iterrows():
    gene_name = row['GeneSymbol'].lower()  # embedding keys are lower-case
    disease_name = row['DiseaseName'].lower()

    if gene_name in gene_embeddings and disease_name in disease_embeddings:
        gene_vec = gene_embeddings[gene_name]
        disease_vec = disease_embeddings[disease_name]
        similarity = calculate_cosine_similarity(gene_vec, disease_vec)
        # 'Index' is the 1-based row position in the input CSV.
        results.append({'Index': idx + 1, 'GeneSymbol': gene_name, 'DiseaseName': disease_name, 'CosineSimilarity': similarity})

# Passing columns explicitly keeps the header even when no pair matched,
# so the saved CSV can always be read back.
results_df = pd.DataFrame(results, columns=result_columns)

# Save results to CSV
results_df.to_csv('cosine_new_entries_founddisease_genes.csv', index=False)

print("Cosine similarity calculations are saved in 'cosine_new_entries_founddisease_genes.csv'.")


Cosine similarity calculations are saved in 'cosine_new_entries_founddisease_genes.csv'.


In [7]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
import os

# Function to load embeddings from file
def load_embeddings(file_path, phrase_key):
    """Load phrase embeddings from a text file into a dictionary.

    Each non-empty line is expected to look like
    ``<phrase_key> <phrase>, Embedding: [v1, v2, ...]``.
    NOTE(review): this assumes the phrase is the only field before
    ``Embedding:`` -- confirm against the files that are actually generated.

    The phrase is taken as everything before the ``Embedding:`` marker, so
    phrases that themselves contain commas are kept whole instead of being
    cut at their first comma.

    Args:
        file_path: Path of the embeddings text file.
        phrase_key: Label that prefixes the phrase on each line (e.g. 'Phrase:').

    Returns:
        dict mapping the lower-cased phrase to a 1-D ``numpy`` float array.

    Raises:
        ValueError: If a non-empty line has no ``Embedding:`` marker or a
            vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(file_path, 'r') as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if not line:  # Ignore empty lines
                continue

            # Split on the last marker: the vector holds only numbers, so
            # everything before the final 'Embedding:' belongs to the phrase.
            phrase_part, marker, embedding_part = line.rpartition('Embedding:')
            if not marker:
                raise ValueError(
                    f"{file_path}:{line_number}: missing 'Embedding:' marker"
                )

            phrase = phrase_part.strip()
            if phrase.endswith(','):  # separator between phrase and marker
                phrase = phrase[:-1]
            if phrase_key and phrase.startswith(phrase_key):
                phrase = phrase[len(phrase_key):]
            phrase = phrase.strip()

            embedding_str = embedding_part.strip().strip('[]')
            embedding = np.array([float(e) for e in embedding_str.split(',')])
            # Lower-case keys so lookups are case-insensitive.
            embeddings[phrase.lower()] = embedding
    return embeddings

# Function to calculate cosine similarity
def calculate_cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    The similarity is one minus SciPy's cosine distance. If the vectors have
    different lengths, NaN is returned instead of raising.
    """
    lengths_match = len(vec1) == len(vec2)
    return 1 - cosine(vec1, vec2) if lengths_match else np.nan

# Load the chemical and gene embedding tables (keys are lower-cased phrases).
chemical_embeddings = load_embeddings('1filteredgen_wordembeddings/BioBERT/Chemical/-1/filtered_embeddings_biobert-1.txt', 'Phrase:')
gene_embeddings = load_embeddings('1filteredgen_wordembeddings/BioBERT/Gene/-1/filtered_embeddings_biobert-1.txt', 'Phrase:')

# Load the new chemical-gene pairs.
df = pd.read_csv('new_entries_foundgene_chems.csv')

# Cast to str so .lower() also works on missing values (they become 'nan').
df['ChemicalName'] = df['ChemicalName'].astype(str)
df['GeneSymbol'] = df['GeneSymbol'].astype(str)

# Fixed column order, reused below so an empty result still gets a header row.
result_columns = ['Index', 'ChemicalName', 'GeneSymbol', 'CosineSimilarity']
results = []

# Score every pair for which both names have an embedding; other pairs are skipped.
for idx, row in df.iterrows():
    chemical_name = row['ChemicalName'].lower()  # embedding keys are lower-case
    gene_name = row['GeneSymbol'].lower()

    if chemical_name in chemical_embeddings and gene_name in gene_embeddings:
        chemical_vec = chemical_embeddings[chemical_name]
        gene_vec = gene_embeddings[gene_name]
        similarity = calculate_cosine_similarity(chemical_vec, gene_vec)
        # 'Index' is the 1-based row position in the input CSV.
        results.append({'Index': idx + 1, 'ChemicalName': chemical_name, 'GeneSymbol': gene_name, 'CosineSimilarity': similarity})

# Passing columns explicitly keeps the header even when no pair matched,
# so the saved CSV can always be read back.
results_df = pd.DataFrame(results, columns=result_columns)

# Save results to CSV
results_df.to_csv('cosine_new_entries_foundchemical_genes.csv', index=False)

print("Cosine similarity calculations are saved in 'cosine_new_entries_foundchemical_genes.csv'.")


Cosine similarity calculations are saved in 'cosine_new_entries_foundchemical_genes.csv'.
