#### Simple Regex Classification
- For each FOI, take each `qa_answer` prediction and search its text (case-insensitively, on whole words) for every unique `label_value` of that FOI.
- If exactly one `label_value` is found, classify the answer with that `label_value`.
- If more than one `label_value` is found, or none at all, classify the answer as "No Mention".

In [16]:
import pandas as pd
import os
import pandas as pd
import os
from nltk.tokenize import RegexpTokenizer

# Identifier of the question-answering model whose predictions are processed
# in this cell (looks like an "<owner>/<name>" hub id -- TODO confirm).
original_model_checkpoint = 'franklu/pubmed_bert_squadv2' # for qa
# Model name: the part after the last "/" of the checkpoint identifier.
original_model_name = original_model_checkpoint.split("/")[-1]
# The two path fragments below end in a literal backslash because later code
# builds folder paths by plain string concatenation.
# NOTE(review): this makes the resulting paths Windows-specific.
original_model_dir = "../results/trained\\"
original_model_signature = '_19827_v2\\' # for qa
# Name of the sub-folder placed under "eval" when paths are built.
version = "v2"

# Tokenizer that returns the runs of word characters (\w+) found in a string,
# so punctuation and whitespace never appear in the tokens.
tokenizer = RegexpTokenizer(r'\w+')

def classify_answers(foi, prediction_path):
    """Classify each QA answer in a prediction CSV by matching label values.

    The CSV at ``prediction_path`` must contain a ``qa_answer`` column (the
    predicted answer text) and a ``label_value`` column (the reference label).
    Missing or empty ``label_value`` cells are replaced by ``'No Mention'``.
    Answers and labels are lower-cased and split into word tokens (``\\w+``);
    a label counts as found when its tokens occur as a contiguous run inside
    the answer's tokens, so multi-word labels can match too.

    Args:
        foi: Name of the field being classified. Not used by the function;
            kept so existing callers keep working.
        prediction_path: Path of the prediction CSV to read.

    Returns:
        The DataFrame read from ``prediction_path`` with ``label_value``
        filled in and a new ``regex_classification`` column. That column holds
        the single label found in the answer, spelled as it first appears in
        ``label_value``, or ``'No Mention'`` when no label or more than one
        label is found.

    Raises:
        KeyError: If the CSV lacks a ``label_value`` or ``qa_answer`` column.
    """
    # Local import: keeps the function self-contained instead of relying on a
    # notebook-level ``tokenizer`` name that another cell could rebind.
    import re

    word_pattern = re.compile(r'\w+')

    def tokenize(text):
        # Lower-cased word tokens, as a tuple so they can be used as dict keys.
        return tuple(word_pattern.findall(str(text).lower()))

    df = pd.read_csv(prediction_path)

    # Assign the result back rather than calling ``inplace=True`` on the
    # column selection: the chained in-place form is deprecated and does not
    # modify ``df`` when pandas Copy-on-Write is active.
    df['label_value'] = df['label_value'].fillna('No Mention').replace('', 'No Mention')

    # Map each label's token sequence to its first-seen spelling. Keying on the
    # lower-cased tokens de-duplicates labels that differ only in case, which
    # would otherwise register as two matches and force 'No Mention'.
    labels_by_tokens = {}
    for label in df['label_value'].unique():
        key = tokenize(label)
        if key:  # a label without any word characters can never be found
            labels_by_tokens.setdefault(key, label)

    def contains(tokens, needle):
        # True when ``needle`` occurs as a contiguous run inside ``tokens``.
        width = len(needle)
        return any(tokens[i:i + width] == needle
                   for i in range(len(tokens) - width + 1))

    def classify(answer):
        # A missing answer has no tokens and therefore matches nothing.
        tokens = () if pd.isna(answer) else tokenize(answer)
        matches = [label for key, label in labels_by_tokens.items()
                   if contains(tokens, key)]

        # Zero matches or an ambiguous answer both fall back to 'No Mention'.
        if len(matches) != 1:
            return 'No Mention'
        return matches[0]

    # Series.map (rather than a row-wise DataFrame.apply) also yields a Series
    # when the CSV has no data rows.
    df['regex_classification'] = df['qa_answer'].map(classify)

    return df

# Folder expected to hold one sub-folder per FOI, each containing a
# "predictions_<FOI>.csv" file.
prediction_folder = original_model_dir + original_model_name + original_model_signature + "eval\\" + version + "\\"
foiset = ["DCIS Margins", "ER Status", "Extranodal Extension", "HER2 Status", "Insitu Component", "Invasive Carcinoma", "Invasive Carcinoma Margins", "Lymphovascular Invasion", "Necrosis", "PR Status", "Tumour Focality"]

# Classified CSVs are written with the same folder layout as the predictions.
output_directory = original_model_dir + original_model_name + original_model_signature + "eval\\" + version + "\\"
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(output_directory, exist_ok=True)

for foi in foiset:
    prediction_path = os.path.join(prediction_folder, foi, f"predictions_{foi}.csv")
    if not os.path.exists(prediction_path):
        # Still best-effort (other FOIs are processed), but report the gap
        # instead of skipping silently.
        print(f"Skipping {foi}: no prediction file at {prediction_path}")
        continue

    classified_df = classify_answers(foi, prediction_path)

    output_path = os.path.join(output_directory, foi, f"regex_classification_{foi}.csv")
    # The file goes into a per-FOI sub-folder; make sure that folder exists
    # rather than relying on it being the same folder the predictions came from.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    classified_df.to_csv(output_path, index=False)

# Folder that receives the per-FOI accuracy summary.
accuracy_directory = os.path.join(original_model_dir, original_model_name + original_model_signature, "eval", version)
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(accuracy_directory, exist_ok=True)

# Maps each FOI name to the fraction of rows whose regex classification equals
# the reference label (compared case-insensitively).
accuracy_results = {}

for foi in foiset:
    classification_csv_path = os.path.join(output_directory, foi, f"regex_classification_{foi}.csv")
    if not os.path.exists(classification_csv_path):
        # No classification file was produced for this FOI; leave it out.
        continue

    df = pd.read_csv(classification_csv_path)
    total_rows = len(df)
    is_match = df['regex_classification'].str.lower() == df['label_value'].str.lower()
    matching_rows = int(is_match.sum())
    # A file with no data rows has no defined accuracy; record NaN instead of
    # raising ZeroDivisionError and aborting the remaining FOIs.
    accuracy = matching_rows / total_rows if total_rows else float("nan")
    accuracy_results[foi] = accuracy

# Save accuracy results to csv
accuracy_df = pd.DataFrame(list(accuracy_results.items()), columns=['FOI', 'Accuracy'])
accuracy_csv_path = os.path.join(accuracy_directory, "all_foi_regex_classification.csv")
accuracy_df.to_csv(accuracy_csv_path, index=False)