In [1]:

import os
import re
import json
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import Counter
from nltk.tokenize import RegexpTokenizer

#### Naive Bayes Classifier Summary

- For example, classify Invasive Carcinoma given a qa_answer.
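- p(Present | qa_answer) ∝ p(Present) * p(word_1 | Present) * p(word_2 | Present) * ... (proportional rather than equal: the normalising term p(qa_answer) is the same for every class, so it is dropped)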
- Take the max of p(Present | qa_answer) and p(Absent | qa_answer) and classify it.
- prior probability: p(Present) = # of Present rows in Invasive Carcinoma / # of total rows in Invasive Carcinoma
- likelihoods: p(word_1 | Present) = # of rows with word_1 in Present and Invasive Carcinoma / # of total rows in Present and Invasive Carcinoma
- OR if word_1 doesn't exist in training data, p(word_1 | Present) = 1/1000000
- The prior and the likelihoods are both estimated from the training data.

In [2]:
# Load the training rows that the priors and word likelihoods are estimated from.
train_df = pd.read_csv(
    '../data/clean/non_synoptic/train_data.csv'
)

In [3]:
import pandas as pd
import nltk
from collections import Counter

# Fetch the Punkt tokenizer data into the local nltk_data directory.
# NOTE(review): the tokenization visible in this notebook splits on a `\w+` regex
# rather than calling nltk.word_tokenize, so this download may no longer be
# needed — confirm before removing.
nltk.download('punkt')  # Download the Punkt tokenizer

def calculate_likelihood(train_df, desired_label_keys):
    """
    Estimate the word likelihoods p(word | label_value) for each requested label_key.

    For every (label_key, label_value) pair present in ``train_df``, each answer is
    split into lower-cased runs of word characters. The likelihood of a word is the
    fraction of rows in that pair whose answer contains the word at least once
    (repeated occurrences inside one answer count once).

    Parameters
    ----------
    train_df : pd.DataFrame
        Training rows; must provide the columns 'label_key', 'label_value' and 'answer'.
    desired_label_keys : iterable
        The label_key values to compute likelihoods for.

    Returns
    -------
    pd.DataFrame
        One row per (label_key, label_value, word) with the columns
        'label_key', 'label_value', 'word' and 'prob'. The columns are present even
        when no rows match, so downstream column lookups keep working.

    Notes
    -----
    Rows whose answer is missing (NaN/None) contribute no words but still count in
    the denominator, so they lower every word's likelihood for that pair instead of
    raising an error during tokenization.
    """
    columns = ['label_key', 'label_value', 'word', 'prob']

    # Compiled once; findall on `\w+` yields the same tokens as nltk's
    # RegexpTokenizer(r'\w+') without rebuilding a tokenizer for every label pair.
    word_pattern = re.compile(r'\w+')

    # Collected as plain records and converted to a DataFrame once at the end.
    likelihoods = []

    for label_key in desired_label_keys:
        # Rows belonging to the current label_key
        filtered_df = train_df[train_df['label_key'] == label_key]

        for label_value in filtered_df['label_value'].unique():
            # Rows belonging to the current (label_key, label_value) pair
            value_filtered_df = filtered_df[filtered_df['label_value'] == label_value]

            # Denominator: number of rows in the pair
            total_rows = len(value_filtered_df)

            # Document frequency: in how many answers each word appears
            word_counts = Counter()
            for answer in value_filtered_df['answer']:
                if pd.isna(answer):
                    continue  # missing answer: no words to count
                # A set so that a word repeated within one answer is counted once
                word_counts.update({token.lower() for token in word_pattern.findall(str(answer))})

            for word, count in word_counts.items():
                likelihoods.append({
                    'label_key': label_key,
                    'label_value': label_value,
                    'word': word,
                    'prob': count / total_rows
                })

    return pd.DataFrame(likelihoods, columns=columns)

# Label keys (fields of interest) for which word likelihoods are computed.
desired_label_keys = [
    "DCIS Margins",
    "ER Status",
    "Extranodal Extension",
    "HER2 Status",
    "Insitu Component",
    "Invasive Carcinoma",
    "Invasive Carcinoma Margins",
    "Lymphovascular Invasion",
    "Necrosis",
    "PR Status",
    "Tumour Focality",
]

# Word likelihood table: one row per (label_key, label_value, word).
likelihood_df = calculate_likelihood(
    train_df=train_df,
    desired_label_keys=desired_label_keys,
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\trevor.kwan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Display the likelihood table produced by calculate_likelihood
# (columns: label_key, label_value, word, prob).
likelihood_df

Unnamed: 0,label_key,label_value,word,prob
0,DCIS Margins,Negative,negative,0.746575
1,DCIS Margins,Negative,margins,0.595890
2,DCIS Margins,Negative,carcinoma,0.184932
3,DCIS Margins,Negative,all,0.246575
4,DCIS Margins,Negative,of,0.178082
...,...,...,...,...
693,Tumour Focality,Single,invasive,0.057143
694,Tumour Focality,Single,1,0.028571
695,Tumour Focality,Single,is,0.028571
696,Tumour Focality,Single,tumour,0.028571


In [5]:
import pandas as pd
import nltk

# Prior probabilities: p(label_value) within each label_key, from the training rows.

# Keep only the label keys of interest, then count rows per (label_key, label_value).
is_desired_key = train_df['label_key'].isin(desired_label_keys)
occurrences = (
    train_df[is_desired_key]
    .groupby(['label_key', 'label_value'])
    .size()
    .reset_index(name='count')
)

# Total row count of each label_key, aligned row-by-row with `occurrences`.
total_occurrences = occurrences.groupby('label_key')['count'].transform('sum')

# Prior = pair count / total count of its label_key.
occurrences['prob'] = occurrences['count'] / total_occurrences

# Final table with just the columns the classifier reads.
prior_columns = ['label_key', 'label_value', 'prob']
prior_prob = occurrences[prior_columns].drop_duplicates().reset_index(drop=True)

prior_prob

Unnamed: 0,label_key,label_value,prob
0,DCIS Margins,Can't Be Assessed,0.00625
1,DCIS Margins,Negative,0.9125
2,DCIS Margins,Positive,0.08125
3,ER Status,Can't Be Assessed,0.014706
4,ER Status,negative,0.191176
5,ER Status,positive,0.794118
6,Extranodal Extension,Absent,0.478261
7,Extranodal Extension,Present,0.521739
8,HER2 Status,equivocal,0.081967
9,HER2 Status,negative,0.721311


In [6]:
import os
import pandas as pd
import nltk

# --- Evaluation configuration -------------------------------------------------
# QA model whose predictions are being classified.
original_model_checkpoint = 'franklu/pubmed_bert_squadv2' # for qa
# Last path component of the checkpoint id (the part after the final "/").
original_model_name = original_model_checkpoint.rpartition("/")[2]
# Path pieces are concatenated as-is further down, so the trailing Windows
# separators are part of the values.
original_model_dir = "../results/trained\\"
original_model_signature = '_19827_v2\\' # for qa
version = "v2"

# Fields of interest (label keys) to evaluate.
# To debug a single field, temporarily reduce this list, e.g. ["Invasive Carcinoma"].
foiset = [
    "DCIS Margins",
    "ER Status",
    "Extranodal Extension",
    "HER2 Status",
    "Insitu Component",
    "Invasive Carcinoma",
    "Invasive Carcinoma Margins",
    "Lymphovascular Invasion",
    "Necrosis",
    "PR Status",
    "Tumour Focality",
]

def classify(qa_answer, label_key, likelihood_df, prior_prob, debug_file, train_df):
    """
    Naive Bayes classification of a single QA answer for one label_key.

    For every label_value that has a prior under ``label_key``, the score is
    prior * product of p(word | label_value) over the lower-cased word tokens of
    the answer. Words with no likelihood entry for that label_value use a fixed
    fallback of 1/1,000,000.

    Parameters
    ----------
    qa_answer : str or NaN
        The extracted answer text to classify.
    label_key : str
        The field of interest whose label values are candidates.
    likelihood_df : pd.DataFrame
        Columns 'label_key', 'label_value', 'word', 'prob'.
    prior_prob : pd.DataFrame
        Columns 'label_key', 'label_value', 'prob'.
    debug_file : writable text file-like object
        Receives a trace of the intermediate values via ``write``.
    train_df : pd.DataFrame
        Columns 'label_key', 'label_value'; only used to report the subset size
        in the debug trace.

    Returns
    -------
    tuple
        ``(predicted_label, product_likelihoods)`` where ``product_likelihoods``
        maps each candidate label_value to its (unnormalised) product score.
        Returns ``('No Mention', {})`` for a missing or blank answer and
        ``('No match', {})`` when nothing could be scored (no word tokens, or no
        priors for ``label_key``).

    Notes
    -----
    For long answers the raw products underflow to 0.0, which used to make every
    candidate tie and the first one win. The winner is still chosen by the product,
    but ties are now broken with the equivalent sum of log-probabilities, which
    does not underflow. The reported products themselves are unchanged.
    """
    if pd.isna(qa_answer) or str(qa_answer).strip() == '':
        return 'No Mention', {}

    # Same tokens as nltk's RegexpTokenizer(r'\w+'): maximal runs of word characters.
    words = [word.lower() for word in re.findall(r'\w+', str(qa_answer))]
    debug_file.write(f"Words in the qa_answer row: {words}\n")

    product_likelihoods = {}
    # Log-space twin of product_likelihoods, used only to break underflow ties.
    log_scores = {}

    def _log(probability):
        # log(0) is undefined; treat a zero probability as the worst possible score.
        return math.log(probability) if probability > 0 else float('-inf')

    # Loop-invariant filtering: restrict every table to this label_key once.
    key_priors = prior_prob[prior_prob['label_key'] == label_key]
    key_likelihoods = likelihood_df[likelihood_df['label_key'] == label_key]
    key_train_rows = train_df[train_df['label_key'] == label_key]

    for label_value in key_priors['label_value'].unique():
        prior = key_priors[key_priors['label_value'] == label_value]['prob'].iloc[0]
        debug_file.write(f"The prior probs of this row: {prior}\n")

        subset_count = key_train_rows[key_train_rows['label_value'] == label_value].shape[0]
        debug_file.write(f"The subset count is: {subset_count}\n")

        if not words:
            # No tokens (e.g. punctuation-only answer): nothing to score for this label.
            continue

        # Likelihood rows of this (label_key, label_value) pair, lower-cased once
        # instead of once per word.
        label_likelihoods = key_likelihoods[key_likelihoods['label_value'] == label_value]
        known_words = label_likelihoods['word'].str.lower()

        product = prior
        log_score = _log(prior)

        for word in words:
            entry = label_likelihoods[known_words == word]
            debug_file.write(f"The entry is:{entry}\n")

            # Word never seen with this label in training: use a small fixed
            # likelihood so a single unseen word does not zero out the product.
            if entry.empty:
                likelihood = 1/1000000
                debug_file.write(f"likelihood = 1/1,000,000\n")
            else:
                likelihood = entry['prob'].iloc[0]
                debug_file.write(f"likelihood = prob\n")

            product = product * likelihood
            log_score += _log(likelihood)
            product_likelihoods[label_value] = product
            log_scores[label_value] = log_score

            debug_file.write(f"Midpoint product likelihood: {product_likelihoods}\n")

    debug_file.write(f"Final product likelihoods: {product_likelihoods}\n")

    if not product_likelihoods:
        return 'No match', product_likelihoods

    # Product first (unchanged behaviour whenever products differ); the log score
    # decides only when products are equal, e.g. when all of them underflowed to 0.0.
    best_label = max(
        product_likelihoods,
        key=lambda candidate: (product_likelihoods[candidate], log_scores[candidate]),
    )
    return best_label, product_likelihoods


# Overall accuracy for each "foi" (label_key)
overall_accuracies = {}

# Per-label_value accuracy (a Series indexed by label_value) for each "foi"
label_value_accuracies = {}

# Root of the evaluation outputs. The path pieces are joined by plain string
# concatenation with Windows '\\' separators, matching the configuration values.
base_directory = original_model_dir + original_model_name + original_model_signature + "eval\\" + version + "\\"

# Loop over each "foi" (label_key) in foiset
for foi in foiset:
    # Per-foi input folder and the bayes output folder inside it (built once and reused).
    foi_directory = base_directory + foi + "\\"
    bayes_classification_directory = foi_directory + "bayes_classification\\"
    os.makedirs(bayes_classification_directory, exist_ok=True)

    # The debug trace written by classify() lives next to the classification outputs.
    debug_filename = bayes_classification_directory + "debug_output.txt"
    with open(debug_filename, 'w') as debug_file:
        # Load the QA predictions for this foi
        pred_foi = pd.read_csv(foi_directory + 'predictions_' + foi + '.csv')

        # Missing or blank ground-truth labels mean the field was not mentioned.
        # Assign the result back: calling fillna/replace with inplace=True on a
        # selected column is chained assignment, which is deprecated and does not
        # update the frame under pandas Copy-on-Write.
        pred_foi['label_value'] = pred_foi['label_value'].fillna('No Mention').replace('', 'No Mention')

        # classify() returns (predicted label, product likelihood dict) per answer;
        # split the pair into two columns.
        classification_results = pred_foi['qa_answer'].apply(lambda x: classify(x, foi, likelihood_df, prior_prob, debug_file, train_df))
        pred_foi['bayes_classification'] = classification_results.apply(lambda x: x[0])
        pred_foi['prod_likelihoods'] = classification_results.apply(lambda x: str(x[1]))  # Convert dictionary to string for saving

        # Save the new prediction CSV
        pred_foi.to_csv(bayes_classification_directory + 'pred_classified_' + foi + '.csv', index=False)

        # Row-wise correctness of the Bayes prediction against the ground truth
        is_correct = pred_foi['bayes_classification'] == pred_foi['label_value']

        # Overall accuracy for this "foi"
        overall_accuracy = is_correct.mean()
        overall_accuracies[foi] = overall_accuracy

        # Accuracy per ground-truth label_value for this "foi". Grouping the boolean
        # Series directly avoids groupby.apply operating on the grouping column.
        label_value_accuracy = is_correct.groupby(pred_foi['label_value']).mean()
        label_value_accuracies[foi] = label_value_accuracy

        # Save the per-label_value accuracies for this "foi"
        label_value_accuracy_df = label_value_accuracy.to_frame('accuracy')
        label_value_accuracy_df.to_csv(bayes_classification_directory + 'accuracies_' + foi + '.csv')

# Overall accuracies: one row per foi, with the foi moved from the index into a column.
overall_accuracies_df = (
    pd.DataFrame.from_dict(overall_accuracies, orient='index', columns=['accuracy'])
    .reset_index()
    .rename(columns={'index': 'foi'})
)

# Per-label_value accuracies: stack the per-foi Series into one long table.
per_foi_series = {foi_name: pd.Series(accuracies) for foi_name, accuracies in label_value_accuracies.items()}
label_value_accuracies_df = pd.concat(per_foi_series).reset_index()
label_value_accuracies_df.columns = ['foi', 'label_value', 'accuracy']

# Stack both tables; the concat key records which table a row came from and
# becomes the 'type' column.
combined_accuracies_df = (
    pd.concat(
        [label_value_accuracies_df, overall_accuracies_df],
        keys=['label_value_accuracy', 'overall_accuracy'],
        ignore_index=False,
    )
    .reset_index(level=0)
    .rename(columns={'level_0': 'type'})
)

# Mean "No Mention" accuracy across all FOIs
is_no_mention = label_value_accuracies_df['label_value'] == 'No Mention'
no_mention_mean_accuracy = label_value_accuracies_df.loc[is_no_mention, 'accuracy'].mean()

# Single summary row carrying that mean
overall_no_mention_accuracy_df = pd.DataFrame({
    'type': ['overall_accuracy'],
    'foi': ['No Mention'],
    'label_value': [None],
    'accuracy': [no_mention_mean_accuracy]
})

# Append the summary row and write the combined table next to the per-foi folders
combined_accuracies_df = pd.concat(
    [combined_accuracies_df, overall_no_mention_accuracy_df],
    ignore_index=True,
)

combined_accuracies_path = original_model_dir + original_model_name + original_model_signature + 'eval\\' + version + "\\" + 'all_foi_bayes_accuracies.csv'
combined_accuracies_df.to_csv(combined_accuracies_path, index=False)