## Disease-Gene

In [None]:
import pandas as pd

# Input CSVs for the Disease-Gene comparison.
# NOTE(review): the doubled ".csv.csv" extension looks like a typo -- confirm
# the file name on disk before changing it.
true_positives_file = 'OP_New_CTD/PubMedBERT1/DG/-1/updated_CTD_disease_gene.csv.csv'
false_negatives_file = 'FilCosine/DG/PubmedBERT1/-1/high_similarity_gene_disease.csv'

# Read both tables; compute_similarity_results() reads them as globals.
true_positives_df = pd.read_csv(true_positives_file)
false_negatives_df = pd.read_csv(false_negatives_file)

# Lower-case the gene and disease names in both tables so the pair
# comparison between the two files is case-insensitive.
true_positives_df[['GeneSymbol', 'DiseaseName']] = (
    true_positives_df[['GeneSymbol', 'DiseaseName']].apply(lambda names: names.str.lower())
)
false_negatives_df[['Gene', 'Disease']] = (
    false_negatives_df[['Gene', 'Disease']].apply(lambda names: names.str.lower())
)

# Function to compute results for a given cosine similarity threshold
def compute_similarity_results(cosine_similarity_threshold):
    """Classify gene-disease pairs at one cosine-similarity threshold and save them.

    Reads the module-level ``true_positives_df`` (columns ``GeneSymbol``,
    ``DiseaseName``, ``CosineSimilarity``) and ``false_negatives_df`` (columns
    ``Gene``, ``Disease``, ``CosineSimilarity``) and sorts their rows into:

    * true positives  -- rows of ``true_positives_df`` whose (gene, disease)
      pair also occurs in ``false_negatives_df`` with a similarity strictly
      above the threshold;
    * false positives -- rows of ``true_positives_df`` with a similarity
      strictly above the threshold whose pair has no such counterpart;
    * false negatives -- rows of ``false_negatives_df`` with a similarity
      strictly above the threshold whose pair has no above-threshold row in
      ``true_positives_df``.

    The counts are printed and each list is written to
    ``true_positives_<t>.csv``, ``false_positives_<t>.csv`` and
    ``false_negatives_<t>.csv`` in the current working directory.

    NOTE(review): a true-positive row is not required to have its own
    ``CosineSimilarity`` above the threshold; only the counterpart row is
    checked. Confirm that this is intended.
    NOTE(review): the other sections of this notebook write to the same file
    names, so running them in the same directory overwrites these results.

    Args:
        cosine_similarity_threshold: Strict lower bound applied to
            ``CosineSimilarity``.

    Returns:
        None.
    """
    def pairs_above_threshold(frame, first_col, second_col):
        # Build the lookup set once instead of re-scanning the whole frame for
        # every row. Rows with a missing name are dropped because an equality
        # comparison against a missing value never matches anyway.
        above = frame.loc[
            frame['CosineSimilarity'] > cosine_similarity_threshold,
            [first_col, second_col],
        ].dropna()
        return set(zip(above[first_col], above[second_col]))

    fn_file_pairs = pairs_above_threshold(false_negatives_df, 'Gene', 'Disease')
    tp_file_pairs = pairs_above_threshold(true_positives_df, 'GeneSymbol', 'DiseaseName')

    # One pass over the first file: a row is a true positive when its pair has
    # an above-threshold counterpart, otherwise a false positive when the row
    # itself is above the threshold.
    true_positives = []
    false_positives = []
    for gene, disease, cosine_similarity in zip(
        true_positives_df['GeneSymbol'],
        true_positives_df['DiseaseName'],
        true_positives_df['CosineSimilarity'],
    ):
        if (gene, disease) in fn_file_pairs:
            true_positives.append((gene, disease, cosine_similarity))
        elif cosine_similarity > cosine_similarity_threshold:
            false_positives.append((gene, disease, cosine_similarity))

    # Above-threshold rows of the second file with no above-threshold
    # counterpart in the first file.
    false_negatives = [
        (gene, disease, cosine_similarity)
        for gene, disease, cosine_similarity in zip(
            false_negatives_df['Gene'],
            false_negatives_df['Disease'],
            false_negatives_df['CosineSimilarity'],
        )
        if (gene, disease) not in tp_file_pairs
        and cosine_similarity > cosine_similarity_threshold
    ]

    # Output the results and their counts for the given threshold
    print(f"Results for Cosine Similarity Threshold: {cosine_similarity_threshold}")
    print(f"True Positives: {len(true_positives)}")
    print(f"False Positives: {len(false_positives)}")
    print(f"False Negatives: {len(false_negatives)}")

    # Save each category to its own CSV, named after the threshold
    result_columns = ['Gene', 'Disease', 'CosineSimilarity']
    for prefix, rows in (
        ('true_positives', true_positives),
        ('false_positives', false_positives),
        ('false_negatives', false_negatives),
    ):
        pd.DataFrame(rows, columns=result_columns).to_csv(
            f'{prefix}_{cosine_similarity_threshold}.csv', index=False
        )

# Call the function for cosine similarity thresholds 0.6, 0.7, and 0.8.
# Each call prints the TP/FP/FN counts and writes three CSV files whose names
# include the threshold value.
cosine_similarity_thresholds = [0.6,0.7, 0.8]

for threshold in cosine_similarity_thresholds:
    compute_similarity_results(threshold)


## Disease-Chemical

In [None]:
import pandas as pd

# Input CSVs for the Disease-Chemical comparison.
true_positives_file = 'OP_New_CTD/PubMedBERT1/DC/-1/updated_CTD_disease_chemicals.csv'
false_negatives_file = 'FilCosine/CD/PubmedBERT1/-1/high_similarity_chemical_disease.csv'

# Read both tables; compute_similarity_results() reads them as globals.
true_positives_df = pd.read_csv(true_positives_file)
false_negatives_df = pd.read_csv(false_negatives_file)

# Lower-case the chemical and disease names in both tables so the pair
# comparison between the two files is case-insensitive.
true_positives_df[['ChemicalName', 'DiseaseName']] = (
    true_positives_df[['ChemicalName', 'DiseaseName']].apply(lambda names: names.str.lower())
)
false_negatives_df[['Chemical', 'Disease']] = (
    false_negatives_df[['Chemical', 'Disease']].apply(lambda names: names.str.lower())
)

# Cosine-similarity thresholds swept by the driver loop further down.
cosine_similarity_thresholds = [0.7, 0.8]

# Function to compute the results for a given threshold
def compute_similarity_results(threshold):
    """Classify chemical-disease pairs at one cosine-similarity threshold and save them.

    Reads the module-level ``true_positives_df`` (columns ``ChemicalName``,
    ``DiseaseName``, ``CosineSimilarity``) and ``false_negatives_df`` (columns
    ``Chemical``, ``Disease``, ``CosineSimilarity``) and sorts their rows into:

    * true positives  -- rows of ``true_positives_df`` whose (chemical,
      disease) pair also occurs in ``false_negatives_df`` with a similarity
      strictly above the threshold;
    * false positives -- rows of ``true_positives_df`` with a similarity
      strictly above the threshold whose pair has no such counterpart;
    * false negatives -- rows of ``false_negatives_df`` with a similarity
      strictly above the threshold whose pair has no above-threshold row in
      ``true_positives_df``.

    The counts are printed and each list is written to
    ``true_positives_<t>.csv``, ``false_positives_<t>.csv`` and
    ``false_negatives_<t>.csv`` in the current working directory.

    NOTE(review): a true-positive row is not required to have its own
    ``CosineSimilarity`` above the threshold; only the counterpart row is
    checked. Confirm that this is intended.
    NOTE(review): the other sections of this notebook write to the same file
    names, so running them in the same directory overwrites these results.

    Args:
        threshold: Strict lower bound applied to ``CosineSimilarity``.

    Returns:
        None.
    """
    def pairs_above_threshold(frame, first_col, second_col):
        # Build the lookup set once instead of re-scanning the whole frame for
        # every row. Rows with a missing name are dropped because an equality
        # comparison against a missing value never matches anyway.
        above = frame.loc[
            frame['CosineSimilarity'] > threshold,
            [first_col, second_col],
        ].dropna()
        return set(zip(above[first_col], above[second_col]))

    fn_file_pairs = pairs_above_threshold(false_negatives_df, 'Chemical', 'Disease')
    tp_file_pairs = pairs_above_threshold(true_positives_df, 'ChemicalName', 'DiseaseName')

    # One pass over the first file: a row is a true positive when its pair has
    # an above-threshold counterpart, otherwise a false positive when the row
    # itself is above the threshold.
    true_positives = []
    false_positives = []
    for chemical, disease, cosine_similarity in zip(
        true_positives_df['ChemicalName'],
        true_positives_df['DiseaseName'],
        true_positives_df['CosineSimilarity'],
    ):
        if (chemical, disease) in fn_file_pairs:
            true_positives.append((chemical, disease, cosine_similarity))
        elif cosine_similarity > threshold:
            false_positives.append((chemical, disease, cosine_similarity))

    # Above-threshold rows of the second file with no above-threshold
    # counterpart in the first file.
    false_negatives = [
        (chemical, disease, cosine_similarity)
        for chemical, disease, cosine_similarity in zip(
            false_negatives_df['Chemical'],
            false_negatives_df['Disease'],
            false_negatives_df['CosineSimilarity'],
        )
        if (chemical, disease) not in tp_file_pairs
        and cosine_similarity > threshold
    ]

    # Output the results and their counts for the current threshold
    print(f"Results for Cosine Similarity Threshold: {threshold}")
    print(f"True Positives: {len(true_positives)}")
    print(f"False Positives: {len(false_positives)}")
    print(f"False Negatives: {len(false_negatives)}")

    # Save each category to its own CSV, named after the threshold
    result_columns = ['Chemical', 'Disease', 'CosineSimilarity']
    for prefix, rows in (
        ('true_positives', true_positives),
        ('false_positives', false_positives),
        ('false_negatives', false_negatives),
    ):
        pd.DataFrame(rows, columns=result_columns).to_csv(
            f'{prefix}_{threshold}.csv', index=False
        )

# Iterate through the different thresholds and compute results for each.
# Each call prints the TP/FP/FN counts and writes three CSV files whose names
# include the threshold value.
for threshold in cosine_similarity_thresholds:
    compute_similarity_results(threshold)


## Chemical-Gene

In [None]:
import pandas as pd

# Input CSVs for the Chemical-Gene comparison.
true_positives_file = 'OP_New_CTD/PubMedBERT1/GC/-1/updated_CTD_gene_chemicals.csv'
false_negatives_file = 'FilCosine/CG/PubMedBERT1/-1/high_similarity_chemical_gene.csv'

# Read both tables; compute_similarity_results() reads them as globals.
true_positives_df = pd.read_csv(true_positives_file)
false_negatives_df = pd.read_csv(false_negatives_file)

# Lower-case the chemical and gene names in both tables so the pair
# comparison between the two files is case-insensitive.
true_positives_df[['ChemicalName', 'GeneSymbol']] = (
    true_positives_df[['ChemicalName', 'GeneSymbol']].apply(lambda names: names.str.lower())
)
false_negatives_df[['Chemical', 'Gene']] = (
    false_negatives_df[['Chemical', 'Gene']].apply(lambda names: names.str.lower())
)

# Function to compute results for a given cosine similarity threshold
def compute_similarity_results(cosine_similarity_threshold):
    """Classify chemical-gene pairs at one cosine-similarity threshold and save them.

    Reads the module-level ``true_positives_df`` (columns ``ChemicalName``,
    ``GeneSymbol``, ``CosineSimilarity``) and ``false_negatives_df`` (columns
    ``Chemical``, ``Gene``, ``CosineSimilarity``) and sorts their rows into:

    * true positives  -- rows of ``true_positives_df`` whose (chemical, gene)
      pair also occurs in ``false_negatives_df`` with a similarity strictly
      above the threshold;
    * false positives -- rows of ``true_positives_df`` with a similarity
      strictly above the threshold whose pair has no such counterpart;
    * false negatives -- rows of ``false_negatives_df`` with a similarity
      strictly above the threshold whose pair has no above-threshold row in
      ``true_positives_df``.

    The counts are printed and each list is written to
    ``true_positives_<t>.csv``, ``false_positives_<t>.csv`` and
    ``false_negatives_<t>.csv`` in the current working directory.

    NOTE(review): a true-positive row is not required to have its own
    ``CosineSimilarity`` above the threshold; only the counterpart row is
    checked. Confirm that this is intended.
    NOTE(review): the other sections of this notebook write to the same file
    names, so running them in the same directory overwrites these results.

    Args:
        cosine_similarity_threshold: Strict lower bound applied to
            ``CosineSimilarity``.

    Returns:
        None.
    """
    def pairs_above_threshold(frame, first_col, second_col):
        # Build the lookup set once instead of re-scanning the whole frame for
        # every row. Rows with a missing name are dropped because an equality
        # comparison against a missing value never matches anyway.
        above = frame.loc[
            frame['CosineSimilarity'] > cosine_similarity_threshold,
            [first_col, second_col],
        ].dropna()
        return set(zip(above[first_col], above[second_col]))

    fn_file_pairs = pairs_above_threshold(false_negatives_df, 'Chemical', 'Gene')
    tp_file_pairs = pairs_above_threshold(true_positives_df, 'ChemicalName', 'GeneSymbol')

    # One pass over the first file: a row is a true positive when its pair has
    # an above-threshold counterpart, otherwise a false positive when the row
    # itself is above the threshold.
    true_positives = []
    false_positives = []
    for chemical, gene, cosine_similarity in zip(
        true_positives_df['ChemicalName'],
        true_positives_df['GeneSymbol'],
        true_positives_df['CosineSimilarity'],
    ):
        if (chemical, gene) in fn_file_pairs:
            true_positives.append((chemical, gene, cosine_similarity))
        elif cosine_similarity > cosine_similarity_threshold:
            false_positives.append((chemical, gene, cosine_similarity))

    # Above-threshold rows of the second file with no above-threshold
    # counterpart in the first file.
    false_negatives = [
        (chemical, gene, cosine_similarity)
        for chemical, gene, cosine_similarity in zip(
            false_negatives_df['Chemical'],
            false_negatives_df['Gene'],
            false_negatives_df['CosineSimilarity'],
        )
        if (chemical, gene) not in tp_file_pairs
        and cosine_similarity > cosine_similarity_threshold
    ]

    # Output the results and their counts for the given threshold
    print(f"Results for Cosine Similarity Threshold: {cosine_similarity_threshold}")
    print(f"True Positives: {len(true_positives)}")
    print(f"False Positives: {len(false_positives)}")
    print(f"False Negatives: {len(false_negatives)}")

    # Save each category to its own CSV, named after the threshold
    result_columns = ['Chemical', 'Gene', 'CosineSimilarity']
    for prefix, rows in (
        ('true_positives', true_positives),
        ('false_positives', false_positives),
        ('false_negatives', false_negatives),
    ):
        pd.DataFrame(rows, columns=result_columns).to_csv(
            f'{prefix}_{cosine_similarity_threshold}.csv', index=False
        )

# Call the function for cosine similarity thresholds 0.6, 0.7, and 0.8.
# Each call prints the TP/FP/FN counts and writes three CSV files whose names
# include the threshold value.
cosine_similarity_thresholds = [0.6,0.7, 0.8]

for threshold in cosine_similarity_thresholds:
    compute_similarity_results(threshold)


Note: Once all true positives, false positives, and false negatives have been calculated, the counts can be stored in a CSV file with the columns FileName, TP, FP, and FN. From these values we can calculate precision and recall (PR) and visualize them.


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Read the per-file metrics table (adjust the path to your CSV file as necessary).
df = pd.read_csv('PR_metrics/DC/DC_0.8_metrics.csv')

# Create the folder that receives the figure; an existing folder is fine.
output_dir = 'PR_metrics/Vis'
os.makedirs(output_dir, exist_ok=True)

# The heatmap needs one label column and the two metric columns.
required_columns = {'FileName', 'Precision', 'Recall'}

if not required_columns.issubset(df.columns):
    print("Required columns are missing from the DataFrame.")
else:
    # One row per FileName, one column per metric, each cell annotated with
    # its value on a fixed 0-1 colour scale.
    plt.figure(figsize=(10, 10))
    metrics_by_file = df.set_index('FileName')[['Precision', 'Recall']]
    heatmap = sns.heatmap(
        metrics_by_file,
        annot=True,
        fmt=".4f",
        cmap="coolwarm",
        cbar_kws={'label': 'Scale'},
        vmin=0,  # anchor the colour scale at zero
        vmax=1,  # assumes the metric values do not exceed 1
    )
    plt.title('DC 0.8 - Precision and Recall')
    plt.xlabel('Metrics')
    plt.ylabel('FileName')

    # Tighten the layout so long tick labels are not cut off.
    plt.tight_layout()

    # Write the figure to disk before showing it.
    heatmap.figure.savefig(f'{output_dir}/DC_0.8_heatmap.png')
    plt.show()
