In [27]:

import os
import re
import json
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import Counter
from nltk.tokenize import RegexpTokenizer

In [28]:
# Read the training rows (one row per labelled answer span) from CSV
train_data_path = '../data/clean/non_synoptic/train_data.csv'
train_df = pd.read_csv(train_data_path)

Unnamed: 0,report_id,text,label,start,end,label_key,label_value,question,answer
0,2,\n AGE/S...,# of Sentinel LN,1254,1255,# of Sentinel LN,,What is the Number of Sentinel Nodes Examined?,1
1,2,\n AGE/S...,# of Sentinel LN,1344,1345,# of Sentinel LN,,What is the Number of Sentinel Nodes Examined?,1
2,2,\n AGE/S...,Estimated Size of Largest LN,3924,3931,Estimated Size of Largest LN,,What is the Estimated Size of Largest Lymph no...,0.5 cm
3,2,\n AGE/S...,Histologic Type (Insitu) - Other,914,946,Histologic Type (Insitu),Other,What is the In Situ Component Type?,encapsulated papillary carcinoma
4,2,\n AGE/S...,Estimated Size (IO),957,962,Estimated Size,,What is the estimated size corresponding to an...,32 mm
...,...,...,...,...,...,...,...,...,...
4203,565,\n AGE/S...,Architectural Patterns (ID) - cribriform,1986,1997,Architectural Patterns (ID),cribriform,What are the architectural patterns correspond...,cribriform
4204,565,\n AGE/S...,Architectural Patterns (ID) - solid,2001,2007,Architectural Patterns (ID),solid,What are the architectural patterns correspond...,solid
4205,565,\n AGE/S...,Necrosis (ID) - Present,2469,2499,Necrosis,Present,What is the necrosis?,with comedonecrosis identified
4206,565,\n AGE/S...,Comedo necrosis,2474,2489,Comedo Necrosis,,What is the necrosis type?,comedonecrosis


In [29]:
from collections import Counter
import pandas as pd
import nltk

# Make sure NLTK's 'punkt' resource is available locally (no re-download if already up to date).
# NOTE(review): the only nltk.word_tokenize call in this cell is commented out and the active
# tokenization is regex-based, so this download looks unused - confirm before removing it.
nltk.download('punkt')  # Download the Punkt tokenizer

# Function to calculate weights
def calculate_weights(train_df, desired_label_keys):
    """Compute relative word frequencies of the answers per (label_key, label_value).

    For every key in ``desired_label_keys`` the rows of ``train_df`` with that
    ``label_key`` are split by ``label_value``. The ``answer`` texts of each split
    are tokenized into runs of word characters, lower-cased, and every distinct
    word receives the weight ``occurrences / total words`` within that split.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must provide the columns ``label_key``, ``label_value`` and ``answer``.
    desired_label_keys : iterable of str
        Label keys for which weights are computed.

    Returns
    -------
    pandas.DataFrame
        One row per (label_key, label_value, word) with the columns
        ``label_key``, ``label_value``, ``word`` and ``weight``. The columns are
        present even when no weights could be computed.

    Notes
    -----
    Rows whose ``label_value`` is missing contribute nothing (a missing value
    never matches an equality filter), and rows whose ``answer`` is missing are
    skipped instead of raising during tokenization. Non-string answers are
    tokenized through their ``str()`` form.
    """
    columns = ['label_key', 'label_value', 'word', 'weight']

    # Compiled once; findall on this pattern yields the same tokens as
    # nltk's RegexpTokenizer(r'\w+').tokenize.
    word_pattern = re.compile(r'\w+')

    weights = []

    for label_key in desired_label_keys:
        # Rows annotated with the current label key
        filtered_df = train_df[train_df['label_key'] == label_key]

        for label_value in filtered_df['label_value'].dropna().unique():
            # Answers belonging to this particular label value
            answers = filtered_df.loc[filtered_df['label_value'] == label_value, 'answer']

            # Count lower-cased words over all answers of this label value
            word_counts = Counter(
                word.lower()
                for answer in answers.dropna()
                for word in word_pattern.findall(str(answer))
            )
            total_words = sum(word_counts.values())

            # The weight of a word is its share of all words seen for this label value
            for word, count in word_counts.items():
                weights.append({
                    'label_key': label_key,
                    'label_value': label_value,
                    'word': word,
                    'weight': count / total_words
                })

    # Explicit columns keep the schema stable even when `weights` is empty
    return pd.DataFrame(weights, columns=columns)

# Label keys for which word weights are derived from the training answers
desired_label_keys = [
    "DCIS Margins",
    "ER Status",
    "Extranodal Extension",
    "HER2 Status",
    "Insitu Component",
    "Invasive Carcinoma",
    "Invasive Carcinoma Margins",
    "Lymphovascular Invasion",
    "Necrosis",
    "PR Status",
    "Tumour Focality",
]

# One row per (label_key, label_value, word) with its relative frequency
weights_df = calculate_weights(train_df, desired_label_keys)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\trevor.kwan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [30]:
# Spot check: weight of the word 'identified' under each label value of 'Insitu Component'
df = weights_df.loc[weights_df['label_key'] == 'Insitu Component']
df.loc[df['word'] == 'identified']

Unnamed: 0,label_key,label_value,word,weight
264,Insitu Component,Present,identified,0.003722
295,Insitu Component,Absent,identified,0.099338


In [31]:
# Spot check: list any weight rows whose word is the punctuation token '('
weights_df.loc[weights_df['word'] == '(']

Unnamed: 0,label_key,label_value,word,weight


In [32]:
import pandas as pd
import nltk

# Checkpoint identifier of the question-answering (qa) model
original_model_checkpoint = 'franklu/pubmed_bert_squadv2'
# Model name = the part of the checkpoint identifier after the last "/"
original_model_name = original_model_checkpoint.rpartition("/")[2]

# Pieces used to build the result paths: root folder, run suffix (qa run) and evaluation version
original_model_dir = "../results/trained\\"
original_model_signature = '_19827_v2\\'
version = "v2"

# Fields of interest ("foi"s) to evaluate; the commented-out list is the full set
# foiset = ["DCIS Margins", "ER Status", "Extranodal Extension", "HER2 Status", "Insitu Component", "Invasive Carcinoma", "Invasive Carcinoma Margins", "Lymphovascular Invasion", "Necrosis", "PR Status", "Tumour Focality"]
foiset = ['Insitu Component', 'Invasive Carcinoma']

# Make sure NLTK's 'punkt' resource is available locally.
# NOTE(review): the same download already runs in an earlier cell, and the only
# nltk.word_tokenize call in this cell is commented out - this call looks redundant; confirm.
nltk.download('punkt')  # Download the Punkt tokenizer

def classify(row, weights_df):
    """Pick the label value whose word weights best match a predicted answer.

    The text in ``row["qa_answer"]`` is split into lower-cased word tokens. For
    every token occurrence, the weights of all matching rows in ``weights_df``
    are added to a running total per ``label_value``. The label value with the
    largest total wins.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the key ``qa_answer``.
    weights_df : pandas.DataFrame
        Must provide the columns ``word``, ``label_value`` and ``weight``.

    Returns
    -------
    tuple
        ``(classification, sum_weights)`` where ``sum_weights`` maps each matched
        label value to its summed weight. ``classification`` is ``'No Mention'``
        when the answer is missing or blank, ``'No match'`` when none of the
        answer's words occur in ``weights_df``, and otherwise the label value
        with the highest summed weight.
    """
    answer = row["qa_answer"]
    if pd.isna(answer) or str(answer).strip() == '':
        return 'No Mention', {}

    # Tokenize the text form of the answer so that non-string cells (e.g. a
    # numeric answer) are handled the same way the blank check above treats them.
    words = [word.lower() for word in re.findall(r'\w+', str(answer))]

    # Lower-case the vocabulary once instead of once per answer word
    vocabulary = weights_df['word'].str.lower()

    # Running total of weights per label_value
    sum_weights = {}

    for word in words:
        # Every weight row for this word; a word can appear under several label values
        matches = weights_df[vocabulary == word]

        for label_value, weight in zip(matches['label_value'], matches['weight']):
            sum_weights[label_value] = sum_weights.get(label_value, 0.0) + weight

    # None of the answer's words are known for this set of weights
    if not sum_weights:
        return 'No match', sum_weights

    # Label value with the highest summed weight
    return max(sum_weights, key=sum_weights.get), sum_weights

# Overall accuracy of the weight-based classification, keyed by "foi"
overall_accuracies = {}

# Per-label_value accuracy Series, keyed by "foi"
label_value_accuracies = {}

for foi in foiset:
    # Only the weights computed for the current foi take part in the classification
    weights_foi_df = weights_df[weights_df['label_key'] == foi]

    # NOTE(review): the "\\" separators are hard-coded, so these paths look Windows-specific
    pred_foi = pd.read_csv(original_model_dir + original_model_name + original_model_signature + "eval\\" + version + "\\" + foi + "\\" + 'predictions_' + foi + '.csv')
    # .copy() detaches the two-column selection from the frame that was read,
    # so the column assignments below modify pred_foi itself
    pred_foi = pred_foi[['qa_answer', 'label_value']].copy()

    # Rows without a label_value (missing or empty string) are labelled 'No Mention'.
    # The result is assigned back rather than using fillna/replace(inplace=True) on
    # the selected column: that form is chained assignment, which pandas warns about
    # and which does not update pred_foi when Copy-on-Write is active.
    pred_foi['label_value'] = pred_foi['label_value'].fillna('No Mention').replace('', 'No Mention')

    # classify returns a (classification, summed weights) pair for every row
    classifications_and_weights = pred_foi.apply(lambda row: classify(row, weights_foi_df), axis=1)
    # Print the first pair as a quick sanity check (positional access, independent of index labels)
    zero_class = classifications_and_weights.iloc[0]
    print("classifications_and_weights:", zero_class)
    pred_foi['weights_classification'] = [x[0] for x in classifications_and_weights]
    # One row of summed weights per prediction, one column per label value
    weights_sum_df = pd.DataFrame.from_records([x[1] for x in classifications_and_weights])

    # True where the weight-based classification equals the label_value
    is_correct = pred_foi['weights_classification'] == pred_foi['label_value']

    # Overall accuracy for this "foi"
    overall_accuracy = is_correct.mean()
    overall_accuracies[foi] = overall_accuracy

    # Accuracy per label_value for this "foi". Grouping the boolean Series directly
    # avoids groupby.apply, whose callback would need the grouping column itself.
    label_value_accuracy = is_correct.groupby(pred_foi['label_value']).mean()
    label_value_accuracies[foi] = label_value_accuracy

    # Output folder for this foi; created on demand
    directory = original_model_dir + original_model_name + original_model_signature + "eval\\" + version + "\\" + foi + "\\" + "weights_classification\\"
    os.makedirs(directory, exist_ok=True)

    # Save the classified predictions for this "foi"
    pred_foi.to_csv(directory + 'pred_classified_' + foi + '.csv', index=False)

    # Save the summed weights for this "foi"
    weights_sum_df.to_csv(directory + 'weights_sum_' + foi + '.csv', index=False)

    # Save the per-label_value accuracies (index = label_value, single 'accuracy' column)
    label_value_accuracy_df = label_value_accuracy.rename('accuracy').to_frame()
    label_value_accuracy_df.to_csv(directory + 'accuracies_' + foi + '.csv')

# Overall accuracy per foi as a two-column table: foi, accuracy
overall_accuracies_df = (
    pd.DataFrame.from_dict(overall_accuracies, orient='index', columns=['accuracy'])
    .reset_index()
    .rename(columns={'index': 'foi'})
)

# Per-label_value accuracies of all fois stacked into one long table
label_value_accuracies_df = pd.concat(
    {name: pd.Series(accuracy) for name, accuracy in label_value_accuracies.items()}
).reset_index()
label_value_accuracies_df.columns = ['foi', 'label_value', 'accuracy']

# Stack both tables; the concat key of each table becomes the 'type' column
combined_accuracies_df = (
    pd.concat(
        [label_value_accuracies_df, overall_accuracies_df],
        keys=['label_value_accuracy', 'overall_accuracy'],
        ignore_index=False,
    )
    .reset_index(level=0)
    .rename(columns={'level_0': 'type'})
)

# Mean of the "No Mention" accuracies across all fois
is_no_mention = label_value_accuracies_df['label_value'] == 'No Mention'
no_mention_mean_accuracy = label_value_accuracies_df.loc[is_no_mention, 'accuracy'].mean()

# Single summary row carrying that mean, appended to the combined table
overall_no_mention_accuracy_df = pd.DataFrame({
    'type': ['overall_accuracy'],
    'foi': ['No Mention'],
    'label_value': [None],
    'accuracy': [no_mention_mean_accuracy]
})
combined_accuracies_df = pd.concat([combined_accuracies_df, overall_no_mention_accuracy_df], ignore_index=True)

# Write the combined table next to the per-foi evaluation folders
summary_csv_path = original_model_dir + original_model_name + original_model_signature + 'eval\\' + version + "\\" + 'all_foi_weight_accuracies.csv'
combined_accuracies_df.to_csv(summary_csv_path, index=False)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\trevor.kwan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


classifications_and_weights: ('No Mention', {})
classifications_and_weights: ('negative', {'negative': 0.8666666666666667})
classifications_and_weights: ('Absent', {'Present': 0.24050632911392406, 'Absent': 0.2631578947368421})
classifications_and_weights: ('negative', {'negative': 0.8723404255319148})
classifications_and_weights: ('Absent', {'Absent': 0.15894039735099336, 'Present': 0.0037220843672456576})
classifications_and_weights: ('Absent', {'Present': 0.5384615384615384, 'Absent': 0.5882352941176471})
classifications_and_weights: ('Negative', {'Negative': 0.2686945500633714, 'Positive': 0.08227848101265824})
classifications_and_weights: ('Absent', {'Present': 0.3722627737226277, 'Absent': 0.3772727272727273, 'Cannot be determined': 0.3333333333333333})
classifications_and_weights: ('No Mention', {})
classifications_and_weights: ('negative', {'negative': 0.6296296296296297})
classifications_and_weights: ('No Mention', {})
