<a href="https://colab.research.google.com/github/yzm9393/swineBRET-ICD/blob/main/01_Data_Preparation_for_Annotation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Import relevant libraries

In [None]:
import pandas as pd
import numpy as np
import spacy
import re
import time
from tqdm import tqdm

## 2. Define file paths

In [None]:
# --- Source data and column names ---
# Load the raw history export once; the constants below name the columns that
# later cells select from it (text, id and date, judging by the constant names).
df = pd.read_excel("/Users/zimoyang/Documents/swine_project/data/HistoryData.xls")
TEXT_COLUMN, ID_COLUMN, DATE_COLUMN = "HISTORY", "U_SUBMISSIONID", "CREATEDT"

# File name of the annotation workbook.
# NOTE(review): the main cell re-reads the same Excel file into `df_full` and
# hard-codes this same file name again instead of using this constant.
ANNOTATION_FILE_OUTPUT = "Enriched_Annotation_Sample_For_Dr_Poljak.xlsx"




## 3. Define configuration for anonymization

In [None]:
# --- Parameters ---
TEST_SET_FRACTION = 0.20        # share of records held out as the final test set
ANNOTATION_SAMPLE_SIZE = 2000   # number of records in the annotation sample
RANDOM_STATE = 42               # seed for reproducible sampling

# Configuration
NLP_MODEL_NAME = "en_core_web_md"  # spaCy pipeline loaded by load_nlp_model()

# Rule-based classification.
# Both lists are compared with `in` against the whole cleaned, lower-cased note,
# so every entry must be lower-case and free of punctuation.
UNKNOWN_EXACT_MATCH_RULES = [
    "no history provided", "no history given", "none given", "none", "unknown"
]
DIAGNOSTIC_EXACT_MATCH_RULES = [
    "test purpose monitoring", "monitoring", "routine monitoring", "testing for olymel", "testing for maple leaf", "vaccination",
    "vaccine", "testing", "pcr", "healthy", "normal", "booster", "surveillance", "vax", "health check", "blood test"
]

# Anonymization regex patterns.
# run_preprocessing_pipeline() applies these to text that clean_text_data() has
# already lower-cased and stripped of punctuation, so every pattern has to match
# lower-case input.
ANONYMIZATION_PATTERNS_REGEX = {
    "company": re.compile(r'\b(olymel|maple leaf|duroc|conestoga|hypor)\b', re.IGNORECASE), # For ORG_NAME
    # Optional: keep these if regex should catch specific date/ID formats BEFORE spaCy.
    # If spaCy alone should handle dates (with stricter logic), they can be commented out.
    "date": re.compile(r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},?\s+\d{4}\b', re.IGNORECASE),
    # NOTE(review): clean_text_data() removes hyphens, so this pattern can only
    # fire when anonymize_single_text() is given raw (uncleaned) text.
    "iso_date": re.compile(r'\b\d{4}-\d{2}-\d{2}\b'),
    # Case-insensitive: the pipeline lower-cases text before anonymizing, so an
    # upper-case-only pattern could never match an ID such as "ab1234".
    "submission_id": re.compile(r'\b[A-Z]{2}\d{3,6}\b', re.IGNORECASE), # Example, adjust as needed
}

## 4. Define anonymization functions

In [None]:
# Function Definitions

def load_nlp_model(model_name=NLP_MODEL_NAME):
    """Load a spaCy pipeline by name.

    Args:
        model_name: Name passed to ``spacy.load``.

    Returns:
        The loaded pipeline, or ``None`` when ``spacy.load`` raises ``OSError``
        (in which case a download hint is printed).
    """
    try:
        pipeline = spacy.load(model_name)
    except OSError:
        print(f"spaCy model '{model_name}' not found. Please download it: python -m spacy download {model_name}")
        return None
    print(f"Successfully loaded spaCy model: {model_name}")
    return pipeline

def clean_text_data(text):
    """Normalize one note to lower-case words separated by single spaces.

    The value is converted with ``str()`` first, so non-string input is cleaned
    as its string form (``None`` becomes ``"none"``, a float NaN becomes ``"nan"``).

    Args:
        text: The raw note; any object accepted by ``str()``.

    Returns:
        The lower-cased text with every character that is not a word character
        or whitespace removed, whitespace runs (including line breaks) collapsed
        to one space, and no leading or trailing whitespace.
    """
    normalized = str(text).lower()
    # Remove punctuation BEFORE collapsing whitespace: deleting a free-standing
    # symbol ("a - b", "done .") otherwise leaves a double or trailing space,
    # which breaks the exact-match rule lookups that run on this output.
    normalized = re.sub(r'[^\w\s]', '', normalized)  # \w already covers "_"
    normalized = re.sub(r'\s+', ' ', normalized)     # \s covers \r and \n too
    return normalized.strip()

def classify_text_by_rules(text_input,
                           unknown_exact_match,
                           diagnostic_exact_match):
    """Label a note by exact match against two phrase collections.

    The input is stringified, stripped and lower-cased, then looked up with
    ``in`` — first in ``unknown_exact_match``, then in ``diagnostic_exact_match``.

    Args:
        text_input: The (already cleaned) note text.
        unknown_exact_match: Phrases that mean no history was supplied.
        diagnostic_exact_match: Phrases that mark a monitoring/diagnostic note.

    Returns:
        ``"unknown_rule"``, ``"diagnostic_rule"``, or ``"normal_rule"`` when
        neither collection contains the text.
    """
    candidate = str(text_input).strip().lower()

    # Checked in priority order; the first collection containing the text wins.
    ordered_rules = (
        ("unknown_rule", unknown_exact_match),
        ("diagnostic_rule", diagnostic_exact_match),
    )
    for rule_label, phrases in ordered_rules:
        if candidate in phrases:
            return rule_label

    # Anything not matched above is treated as a normal note.
    return "normal_rule"

# Entity labels whose spans are always kept verbatim.
_PROTECTED_RULER_LABELS = frozenset([
    "DISEASE_CODE", "VET_ABBREV", "MATERIAL", "ANIMAL_GROUP_TERM",
    "INTERNAL_CODE", "PROCESS_TERM", "BIOLOGICAL_SAMPLE"
])
# Lower-cased spans that must never be treated as a person name.
_KNOWN_NON_PERSON_TERMS = frozenset([
    "prrs", "routine prrs", "rmgp3", "s", "pcr coronavirus s",
    "pedv", "routine pedv", "gilt iso", "bloodserum", "dacron"
])
# Lower-cased spans that must never be treated as a location.
_KNOWN_NON_LOCATION_TERMS = frozenset(["viro"])

# Regex keys in the order they are applied, with the placeholder each inserts.
_REGEX_PLACEHOLDERS = (
    ("company", "[ORG_NAME]"),
    ("date", "[DATE]"),
    ("iso_date", "[DATE]"),
    ("submission_id", "[SUBMISSION_ID]"),
)

# A pure duration such as "3 weeks" or "10 days old" is not a calendar date.
_DURATION_RE = re.compile(r'\d+\s+(week|day|month|year)s?(\s+old)?')
# Month name or abbreviation that is not embedded in a longer run of letters,
# so "dec" inside "decades" or "mar" inside "market" does not count as a month.
# Digits may still touch the name ("15jan").
_MONTH_NAME_RE = re.compile(
    r'(?<![a-z])(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|'
    r'aug(?:ust)?|sept?(?:ember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)(?![a-z])'
)


def _has_doctor_prefix(text, ent):
    """Return True when the span starts with, or directly follows, "Dr"/"Doctor"."""
    if re.match(r'^(dr\.?|doctor)\s+', ent.text, re.IGNORECASE):
        return True
    # Look at the 10 characters before the span for a trailing title.
    window = text[max(0, ent.start_char - 10):ent.start_char]
    return bool(re.search(r'(dr\.?|doctor)\s+$', window, re.IGNORECASE))


def _is_calendar_date(ent_text):
    """Return True when a DATE span looks like a concrete calendar date."""
    lowered = ent_text.lower()
    if _DURATION_RE.fullmatch(lowered):
        return False
    has_year_4_digits = bool(re.search(r'\b\d{4}\b', ent_text))
    has_day_number = bool(re.search(r'\b\d{1,2}\b', ent_text))
    has_month_name = bool(_MONTH_NAME_RE.search(lowered))
    is_structured = bool(
        re.fullmatch(r'\d{4}[\-\/\.]\d{1,2}[\-\/\.]\d{1,2}', ent_text)
        or re.fullmatch(r'\d{1,2}[\-\/\.]\d{1,2}[\-\/\.]\d{4}', ent_text)
    )
    return (
        is_structured
        or (has_year_4_digits and has_day_number)
        or (has_month_name and has_day_number)
    )


def _placeholder_for_entity(ent, text):
    """Return the replacement for one entity span (the span itself if kept)."""
    label = ent.label_
    lowered = ent.text.lower()

    if label in _PROTECTED_RULER_LABELS:
        return ent.text
    if label == "PERSON":
        # Only names introduced by a doctor title are replaced.
        if lowered in _KNOWN_NON_PERSON_TERMS:
            return ent.text
        return "[VET_NAME]" if _has_doctor_prefix(text, ent) else ent.text
    if label in ("GPE", "LOC"):
        if lowered in _KNOWN_NON_LOCATION_TERMS or lowered == "[location]":
            return ent.text
        return "[LOCATION]"
    if label == "DATE":
        if ent.text == "[DATE]":  # already replaced by the regex pass
            return "[DATE]"
        return "[DATE]" if _is_calendar_date(ent.text) else ent.text
    # Every other label (ORG, FAC, ...) is left untouched.
    return ent.text


def anonymize_single_text(text_input, nlp_model, regex_patterns):
    """Replace identifying spans in one note with bracketed placeholders.

    Two passes are applied:

    1. Regex pass, in this order and only for keys present in
       ``regex_patterns``: ``"company"`` -> ``[ORG_NAME]``, ``"date"`` and
       ``"iso_date"`` -> ``[DATE]``, ``"submission_id"`` -> ``[SUBMISSION_ID]``.
    2. Entity pass over ``nlp_model(text).ents``:
       - protected labels (``DISEASE_CODE``, ``VET_ABBREV``, ...) are kept;
       - ``PERSON`` becomes ``[VET_NAME]`` only when the span starts with, or is
         directly preceded by, "Dr"/"Doctor", and is not a known non-person term;
       - ``GPE``/``LOC`` become ``[LOCATION]`` unless the span is a known
         non-location term;
       - ``DATE`` becomes ``[DATE]`` only when it is a structured numeric date,
         or has a 1-2 digit number together with a 4-digit year or a month
         name; pure durations ("3 weeks old") are kept. Month names must stand
         alone, so "5 decades" is not mistaken for a December date;
       - every other label is kept.

    Args:
        text_input: The note; converted with ``str()``.
        nlp_model: A callable returning an object with ``.ents`` whose items
            expose ``text``, ``label_``, ``start_char`` and ``end_char``
            (a spaCy pipeline). ``None`` skips the entity pass.
        regex_patterns: Mapping from pattern key to compiled regex.

    Returns:
        The anonymized text.
    """
    text = str(text_input)

    # 1. High-confidence regex replacements run first.
    for key, token in _REGEX_PLACEHOLDERS:
        if key in regex_patterns:
            text = regex_patterns[key].sub(token, text)

    if nlp_model is None:
        return text

    # 2. Rebuild the string around the entity spans; this assumes they arrive
    #    in ascending, non-overlapping order (as spaCy's doc.ents does).
    doc = nlp_model(text)
    pieces = []
    cursor = 0
    for ent in doc.ents:
        pieces.append(text[cursor:ent.start_char])
        pieces.append(_placeholder_for_entity(ent, text))
        cursor = ent.end_char
    pieces.append(text[cursor:])
    return "".join(pieces)

## 5. Define preprocessing pipeline

In [None]:
# More functions
def run_preprocessing_pipeline(df, text_column, nlp_model):
    """Clean, rule-classify and anonymize the free-text column of ``df``.

    The new columns are written onto ``df`` itself (the caller's frame is
    modified in place) and the same frame is returned.

    Columns added:
        cleaned_text: output of ``clean_text_data``.
        rule_based_note_type: ``"unknown_rule"``, ``"diagnostic_rule"`` or
            ``"normal_rule"`` (``"error_nlp_missing"`` when no model is given).
        anonymized_text_for_ml: ``cleaned_text`` passed through
            ``anonymize_single_text``.
        target_ml_diagnostic / target_ml_unknown: 0/1 flags derived from
            ``rule_based_note_type``.
        is_normal_for_icd: True where ``rule_based_note_type`` is ``"normal_rule"``.

    Args:
        df: DataFrame holding the notes.
        text_column: Name of the column with the raw note text.
        nlp_model: Loaded spaCy pipeline, or ``None``.

    Returns:
        ``df`` with the columns above. When ``nlp_model`` is ``None`` the
        columns are filled with placeholder values and no processing is done.
    """
    if nlp_model is None:
        print("Error: NLP model not loaded. Cannot run preprocessing pipeline.")
        # Add placeholder columns to avoid breaking downstream if possible
        df['cleaned_text'] = ""
        df['rule_based_note_type'] = "error_nlp_missing"
        df['anonymized_text_for_ml'] = ""
        df['target_ml_diagnostic'] = 0
        df['target_ml_unknown'] = 0
        df['is_normal_for_icd'] = False
        return df

    # progress_apply only exists once tqdm has patched pandas. Register it here
    # if no earlier cell did, so the function does not depend on hidden kernel
    # state; an existing registration (and its description) is left alone.
    if not hasattr(pd.Series, "progress_apply"):
        tqdm.pandas(desc="Processing Records")

    print("Starting preprocessing pipeline...")

    # Step 1: Clean Text
    print(f"Cleaning text in column '{text_column}' for {len(df)} records...")
    df['cleaned_text'] = df[text_column].progress_apply(clean_text_data)

    # Step 2: Rule-based classification on the cleaned text, using the
    # module-level UNKNOWN_EXACT_MATCH_RULES / DIAGNOSTIC_EXACT_MATCH_RULES lists.
    print("Applying simplified rule-based classification...")
    df['rule_based_note_type'] = df['cleaned_text'].progress_apply(
        lambda x: classify_text_by_rules(x,
                                         UNKNOWN_EXACT_MATCH_RULES,
                                         DIAGNOSTIC_EXACT_MATCH_RULES)
    )

    # Step 3: Anonymize the cleaned text into a separate column for ML.
    # NOTE(review): the input here is already lower-cased and punctuation-free,
    # so regexes in ANONYMIZATION_PATTERNS_REGEX that need hyphens or upper-case
    # letters cannot match at this point.
    print("Anonymizing text for ML...")
    df['anonymized_text_for_ml'] = df['cleaned_text'].progress_apply(
        lambda x: anonymize_single_text(x, nlp_model, ANONYMIZATION_PATTERNS_REGEX)
    )

    # Step 4: Prepare target labels for "Diagnostic" and "Unknown" ML Classifiers
    print("Preparing target labels for auxiliary ML classifiers...")
    df['target_ml_diagnostic'] = (df['rule_based_note_type'] == 'diagnostic_rule').astype(int)
    df['target_ml_unknown'] = (df['rule_based_note_type'] == 'unknown_rule').astype(int)

    # Step 5: Flag "normal" text intended for ICD-11 classifiers
    df['is_normal_for_icd'] = (df['rule_based_note_type'] == 'normal_rule')

    print("Preprocessing pipeline completed.")
    return df


## 6. Prepare the 2,000-record data set for Dr. Poljak to annotate
1. Loading and Cleaning: It loads the full dataset and removes records with no text.

2. Splitting Data: It separates the data into a large 80% training set and a smaller 20% test set, which is set aside for the final evaluation.

3. Finding Clinical Cases: It uses keyword rules to pre-classify the training data into "Clinical" and "Monitoring/Unknown" groups.

4. Creating an Enriched Sample: It creates a 2,000-record sample by taking 90% from the "Clinical" group and 10% from the "Monitoring/Unknown" group to focus the expert's time on the most relevant cases.

5. Formatting and Saving: Finally, it shuffles the sample, adds empty columns for the expert's labels, and saves it as an Excel file.

In [None]:
# Main: load the raw export, drop records without text, hold out the test set.
df_full = pd.read_excel("/Users/zimoyang/Documents/swine_project/data/HistoryData.xls")
ID_COLUMN, TEXT_COLUMN, DATE_COLUMN = 'U_SUBMISSIONID', 'HISTORY', 'CREATEDT'
TARGET_ANNOTATION_SIZE = 2000
TEST_SET_FRACTION = 0.20

print("--- Starting One-Time Data Preparation ---")

# --- 1. Initial Cleaning ---
print(f"\nInitial record count: {len(df_full)}")
# Turn empty strings into NA so they are dropped together with the rows that
# are already missing. Only exact '' is replaced; whitespace-only text is kept.
df_full[TEXT_COLUMN] = df_full[TEXT_COLUMN].replace('', pd.NA)
df_full = df_full.dropna(subset=[TEXT_COLUMN])
print(f"Record count after dropping records with missing '{TEXT_COLUMN}': {len(df_full)}")


# --- 2. Perform the 80/20 Split ---
print("--- Performing 80/20 Random Split ---")
# The held-out rows are drawn at random with a fixed seed; everything not drawn
# becomes the training/development set.
# NOTE(review): a later cell loads 'final_test_set.csv', but this cell does not
# write df_test_final to disk — confirm where that file is produced.
df_test_final = df_full.sample(frac=TEST_SET_FRACTION, random_state=42)
df_train_dev = df_full.drop(index=df_test_final.index)

print(f"Training & Development Set size: {len(df_train_dev)} records.")
print(f"Final Test Set size: {len(df_test_final)} records.")

# --- Rule-based classification keywords ---
# Notes that, as a whole, say no history was supplied.
UNKNOWN_EXACT_MATCH_RULES = [
    "no history provided", "no history given", "none given", "none", "unknown"
]
# Keywords that mark a monitoring / healthy-animal submission when they occur
# anywhere in the note as whole words.
MONITORING_RULES = [
    "test purpose monitoring", "monitoring", "routine monitoring", "testing for olymel", "testing for maple leaf", "vaccination",
    "vaccine", "testing", "pcr", "healthy", "normal", "booster", "surveillance", "vax", "health check", "blood test"
]

# --- 3. Apply Rule-Based Filter to the Training Set ---
print("\n--- Applying Rule-Based Filter to Training Set ---")

def classify_text_for_sampling(text):
    """Pre-classify a raw note as 'Unknown', 'Monitoring' or 'Clinical'.

    Non-string values are 'Unknown'. A string is 'Unknown' when its lower-cased
    form equals an entry of UNKNOWN_EXACT_MATCH_RULES (no stripping is done),
    'Monitoring' when any MONITORING_RULES keyword occurs in it between word
    boundaries, and 'Clinical' otherwise.
    """
    if not isinstance(text, str):
        return 'Unknown'

    lowered = text.lower()
    if lowered in UNKNOWN_EXACT_MATCH_RULES:
        return 'Unknown'

    # Whole-word keyword search rather than an exact match of the entire note.
    for keyword in MONITORING_RULES:
        keyword_pattern = r'\b' + re.escape(keyword) + r'\b'
        if re.search(keyword_pattern, lowered) is not None:
            return 'Monitoring'

    # Neither Unknown nor Monitoring: treated as a likely clinical case.
    return 'Clinical'

# Tag every training record with its rule-based sampling class.
df_train_dev['sampling_type'] = df_train_dev[TEXT_COLUMN].apply(classify_text_for_sampling)

print("Pre-classification complete. Value counts:")
print(df_train_dev['sampling_type'].value_counts())

# --- 4. Perform Targeted, Enriched Sampling ---
print("\n--- Performing Targeted Sampling to Create Enriched Dataset ---")

# Desired composition of the annotation set:
# 90% clinical cases and 10% non-clinical (Monitoring or Unknown) cases.
clinical_pct = 0.90
non_clinical_pct = 0.10

num_clinical_to_sample = int(TARGET_ANNOTATION_SIZE * clinical_pct)
num_non_clinical_to_sample = TARGET_ANNOTATION_SIZE - num_clinical_to_sample

# Create pools of records
clinical_pool = df_train_dev[df_train_dev['sampling_type'] == 'Clinical']
non_clinical_pool = df_train_dev[df_train_dev['sampling_type'] != 'Clinical']

# min(...) caps the request at the pool size, so a small pool yields fewer
# records than the number announced in the message.
print(f"Sampling {num_clinical_to_sample} records from the 'Clinical' pool...")
clinical_sample = clinical_pool.sample(n=min(num_clinical_to_sample, len(clinical_pool)), random_state=42)

print(f"Sampling {num_non_clinical_to_sample} records from the 'Monitoring/Healthy/Unknown' pool...")
non_clinical_sample = non_clinical_pool.sample(n=min(num_non_clinical_to_sample, len(non_clinical_pool)), random_state=42)

# Combine the samples (clinical rows first, then non-clinical rows)
annotation_sample = pd.concat([clinical_sample, non_clinical_sample])

# --- 5. Prepare and Save the Final File for Dr. Poljak ---
# Shuffle so the annotator does not see all clinical rows followed by all
# non-clinical rows.
final_annotation_df = annotation_sample.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"\nCreated final enriched annotation set with {len(final_annotation_df)} records.")

# Define the final ICD-11 labels for the columns
final_icd11_labels = [
    '[01] Certain infectious or parasitic diseases', '[02] Neoplasms',
    '[03] Diseases of the blood or blood-forming organs', '[04] Diseases of the immune system',
    '[05] Endocrine, nutritional or metabolic diseases', '[06] Mental, behavioural or neurodevelopmental disorders',
    '[08] Diseases of the nervous system', '[09] Diseases of the visual system',
    '[10] Diseases of the ear or mastoid process', '[11] Diseases of the circulatory system',
    '[12] Diseases of the respiratory system', '[13] Diseases of the digestive system',
    '[14] Diseases of the skin', '[15] Diseases of the musculoskeletal system or connective tissue',
    '[16] Diseases of the genitourinary system', '[18] Pregnancy, childbirth or the puerperium',
    '[19] Certain conditions originating in the perinatal period', '[20] Developmental anomalies',
    '[22] Injury, poisoning or certain other consequences of external causes',
    'Monitoring', 'Unknown'
]

columns_to_include = [ID_COLUMN, DATE_COLUMN, TEXT_COLUMN]
# Build the workbook from the SHUFFLED frame. The unshuffled `annotation_sample`
# was used here before, so the shuffle above never reached the saved file.
df_for_annotation = final_annotation_df[columns_to_include].copy()

# One empty column per label for the expert to fill in.
for label in final_icd11_labels:
    df_for_annotation[label] = ''

# Save to Excel
output_filename_excel = 'Enriched_Annotation_Sample_For_Dr_Poljak.xlsx'
df_for_annotation.to_excel(output_filename_excel, index=False)

print(f"\nSuccessfully created '{output_filename_excel}'.")
print("This file contains an enriched sample with a higher proportion of clinical cases.")

--- Starting One-Time Data Preparation ---

Initial record count: 64923
Record count after dropping records with missing 'HISTORY': 46322
--- Performing 80/20 Random Split ---
Training & Development Set size: 37058 records.
Final Test Set size: 9264 records.

--- Applying Rule-Based Filter to Training Set ---
Pre-classification complete. Value counts:
sampling_type
Clinical      13533
Monitoring    13368
Unknown       10157
Name: count, dtype: int64

--- Performing Targeted Sampling to Create Enriched Dataset ---
Sampling 1800 records from the 'Clinical' pool...
Sampling 200 records from the 'Monitoring/Healthy/Unknown' pool...

Created final enriched annotation set with 2000 records.

Successfully created 'Enriched_Annotation_Sample_For_Dr_Poljak.xlsx'.
This file contains an enriched sample with a higher proportion of clinical cases.


## 7. Prepare the 1,000-record test sample for Dr. Poljak to annotate (same process as the 2,000-record data set)

In [None]:
# Prepare 1000 data for test
import pandas as pd
import numpy as np
import re

# --- Configuration for Test Set Generation ---
# CSV holding the held-out test records.
TEST_SET_FILE = '/Users/zimoyang/Documents/swine_project/data/final_test_set.csv'
TEXT_COLUMN, ID_COLUMN, DATE_COLUMN = 'HISTORY', 'U_SUBMISSIONID', 'CREATEDT'

# Number of test records to hand over for annotation.
FINAL_TEST_ANNOTATION_SIZE = 1000

# --- Load the untouched test set ---
print("--- Loading Final Test Set ---")
try:
    df_test_final = pd.read_csv(TEST_SET_FILE)
    print(f"Successfully loaded {len(df_test_final)} records from the test set.")
except FileNotFoundError:
    # Fall back to an empty frame so the rest of the cell is skipped.
    print(f"ERROR: The file '{TEST_SET_FILE}' was not found. Please ensure it has been created.")
    df_test_final = pd.DataFrame()

if not df_test_final.empty:
    # --- Apply the same rule-based filter to the Test Set ---
    print("\n--- Applying Rule-Based Filter to Test Set ---")

    # Same classifier as for the training sample (defined in an earlier cell).
    df_test_final['sampling_type'] = df_test_final[TEXT_COLUMN].apply(classify_text_for_sampling)

    print("Pre-classification of test set complete. Value counts:")
    print(df_test_final['sampling_type'].value_counts())

    # --- Perform Targeted Sampling on the Test Set ---
    print("\n--- Performing Targeted Sampling to Create Final Test Annotation Set ---")

    # 90% clinical; the remainder comes from the non-clinical pool.
    clinical_pct = 0.90
    num_clinical_to_sample = int(FINAL_TEST_ANNOTATION_SIZE * clinical_pct)
    num_non_clinical_to_sample = FINAL_TEST_ANNOTATION_SIZE - num_clinical_to_sample

    # Split the test set into the two sampling pools.
    clinical_pool_test = df_test_final[df_test_final['sampling_type'] == 'Clinical']
    non_clinical_pool_test = df_test_final[df_test_final['sampling_type'] != 'Clinical']

    # Each request is capped at the size of its pool.
    print(f"Sampling {num_clinical_to_sample} records from the 'Clinical' pool...")
    clinical_sample_test = clinical_pool_test.sample(
        n=min(num_clinical_to_sample, len(clinical_pool_test)),
        random_state=42,
    )

    print(f"Sampling {num_non_clinical_to_sample} records from the 'Non-Clinical' pool...")
    non_clinical_sample_test = non_clinical_pool_test.sample(
        n=min(num_non_clinical_to_sample, len(non_clinical_pool_test)),
        random_state=42,
    )

    # Combine both samples, then shuffle so the two groups are interleaved.
    final_test_annotation_sample = (
        pd.concat([clinical_sample_test, non_clinical_sample_test])
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

    # --- Prepare and Save the Final Annotation File ---
    print(f"\nCreated final enriched test annotation set with {len(final_test_annotation_sample)} records.")

    # Label columns offered to the annotator.
    # NOTE(review): this list differs from the one used for the 2,000-record
    # training sample — confirm that is intended.
    final_icd11_labels = [
        '[01] Certain infectious or parasitic diseases',
        '[08] Diseases of the nervous system',
        '[12] Diseases of the respiratory system',
        '[13] Diseases of the digestive system',
        '[14] Diseases of the skin',
        '[15] Diseases of the musculoskeletal system or connective tissue',
        '[18] Pregnancy, childbirth or the puerperium',
        '[19] Certain conditions originating in the perinatal period',
        'Monitoring',
        'Unknown',
        'Symptoms not classified elsewhere',
    ]

    # Wide format: id/date/text followed by one empty column per label.
    columns_to_include = [ID_COLUMN, DATE_COLUMN, TEXT_COLUMN]
    df_for_final_annotation = final_test_annotation_sample[columns_to_include].copy()
    for label in final_icd11_labels:
        df_for_final_annotation[label] = ''

    # Save to a new Excel file
    output_filename_excel = 'Final_Evaluation_Sample_For_Dr_Poljak.xlsx'
    df_for_final_annotation.to_excel(output_filename_excel, index=False)

    print(f"\nSuccessfully created '{output_filename_excel}'.")
    print("This file is ready for final expert annotation.")

--- Loading Final Test Set ---
Successfully loaded 9264 records from the test set.

--- Applying Rule-Based Filter to Test Set ---
Pre-classification of test set complete. Value counts:
sampling_type
Clinical      3424
Monitoring    3381
Unknown       2459
Name: count, dtype: int64

--- Performing Targeted Sampling to Create Final Test Annotation Set ---
Sampling 900 records from the 'Clinical' pool...
Sampling 100 records from the 'Non-Clinical' pool...

Created final enriched test annotation set with 1000 records.

Successfully created 'Final_Evaluation_Sample_For_Dr_Poljak.xlsx'.
This file is ready for final expert annotation.


In [None]:
# Preprocess the 1000 test data
import pandas as pd
import time
from tqdm import tqdm

# --- Initialize tqdm for pandas ---
# Adds progress_apply to pandas objects; run_preprocessing_pipeline() uses it.
tqdm.pandas(desc="Processing Records")

# --- 1. Configuration ---
# Input workbook for the test sample and output CSV for the processed result.
INPUT_TEST_FILE = '/Users/zimoyang/Documents/swine_project/data/Final_Evaluation_Sample_For_Dr_Poljak.xlsx'
OUTPUT_PROCESSED_TEST_FILE = '/Users/zimoyang/Documents/swine_project/data/final_test_set_processed.csv'

# Column holding the free text to be anonymized.
TEXT_COLUMN = 'HISTORY'

# --- Prerequisite ---
# 'load_nlp_model()' and 'run_preprocessing_pipeline()' must already be defined.

# --- 2. Load and Process the Test Data ---
print("--- Starting Preprocessing for Final Test Set ---")

# Load the NLP model once
nlp_model_global = load_nlp_model()

if not nlp_model_global:
    print("Skipping preprocessing because NLP model could not be loaded.")
else:
    print(f"Loading data from '{INPUT_TEST_FILE}'...")
    try:
        df_to_process = pd.read_excel(INPUT_TEST_FILE)

        print(f"Starting preprocessing for {len(df_to_process)} test records...")
        start_time = time.time()

        # Adds 'anonymized_text_for_ml' (and the other pipeline columns).
        df_processed = run_preprocessing_pipeline(df_to_process, TEXT_COLUMN, nlp_model_global)

        end_time = time.time()
        duration = end_time - start_time
        print(f"\nPreprocessing complete. Took {duration:.2f} seconds.")

        # Everything except the id/date/text columns is treated as a label column.
        # NOTE(review): this also keeps the helper columns the pipeline adds
        # ('cleaned_text', 'rule_based_note_type', ...); a later cell drops them.
        id_and_text_cols = ['U_SUBMISSIONID', 'CREATEDT', 'HISTORY', 'anonymized_text_for_ml']
        label_columns = [name for name in df_processed.columns if name not in id_and_text_cols]

        # Anonymized text first, then the remaining columns in their original order.
        columns_to_save = ['anonymized_text_for_ml', *label_columns]
        df_final_output = df_processed[columns_to_save]

        df_final_output.to_csv(OUTPUT_PROCESSED_TEST_FILE, index=False)
        print(f"Successfully saved final processed data to '{OUTPUT_PROCESSED_TEST_FILE}'.")

        print("\n--- Sample of Final Output DataFrame ---")
        print(df_final_output.head())

    except FileNotFoundError:
        print(f"Error: Input file not found at '{INPUT_TEST_FILE}'")
    except Exception as e:
        # Any other failure is reported as a message only; no traceback is shown.
        print(f"An error occurred: {e}")

--- Starting Preprocessing for Final Test Set ---
Successfully loaded spaCy model: en_core_web_md
Loading data from '/Users/zimoyang/Documents/swine_project/data/Final_Evaluation_Sample_For_Dr_Poljak.xlsx'...
Starting preprocessing for 1000 test records...
Starting preprocessing pipeline...
Cleaning text in column 'HISTORY' for 1000 records...


Processing Records: 100%|████████████████| 1000/1000 [00:00<00:00, 84839.68it/s]


Applying simplified rule-based classification...


Processing Records: 100%|███████████████| 1000/1000 [00:00<00:00, 590331.32it/s]


Anonymizing text for ML...


Processing Records: 100%|██████████████████| 1000/1000 [00:04<00:00, 203.85it/s]


Preparing target labels for auxiliary ML classifiers...
Preprocessing pipeline completed.

Preprocessing complete. Took 4.93 seconds.
Successfully saved final processed data to '/Users/zimoyang/Documents/swine_project/data/final_test_set_processed.csv'.

--- Sample of Final Output DataFrame ---
                              anonymized_text_for_ml  \
0  pigs falling back in nursing prev dx of swine ...   
1  porcine feces for rotavirus sequencing group c...   
2                                       feed samples   
3  cough throughout barn suspect influenza but ha...   
4  swelling in joints difficulty walking can star...   

   [01] Certain infectious or parasitic diseases  \
0                                            NaN   
1                                            NaN   
2                                            NaN   
3                                            NaN   
4                                            NaN   

   [08] Diseases of the nervous system  \
0           

In [None]:
import pandas as pd

# Input: the processed test CSV. Output: a separate cleaned copy, so the
# processed file is not overwritten.
input_file_path = "/Users/zimoyang/Documents/swine_project/data/final_test_set_processed.csv"
output_file_path = "/Users/zimoyang/Documents/swine_project/data/final_test_set_cleaned.csv"

try:
    df_processed = pd.read_csv(input_file_path)
    print(f"Successfully loaded '{input_file_path}'. Found {len(df_processed)} records.")

    # Intermediate columns added by the preprocessing pipeline.
    columns_to_drop = [
        'cleaned_text',
        'rule_based_note_type',
        'target_ml_diagnostic',
        'target_ml_unknown',
        'is_normal_for_icd',
    ]

    # errors='ignore' skips any listed column that is absent, so the message
    # below is printed whether or not a column was actually removed.
    original_columns = list(df_processed.columns)
    df_cleaned = df_processed.drop(columns=columns_to_drop, errors='ignore')

    print("\nColumns successfully dropped.")

    # index=False keeps the DataFrame index out of the CSV.
    df_cleaned.to_csv(output_file_path, index=False)

    print(f"Cleaned data has been successfully saved to: '{output_file_path}'")
    print("\nRemaining columns in the new file:")
    print(list(df_cleaned.columns))

except FileNotFoundError:
    print(f"Error: The file was not found at the specified path: {input_file_path}")

Successfully loaded '/Users/zimoyang/Documents/swine_project/data/final_test_set_processed.csv'. Found 1000 records.

Columns successfully dropped.
Cleaned data has been successfully saved to: '/Users/zimoyang/Documents/swine_project/data/final_test_set_cleaned.csv'

Remaining columns in the new file:
['anonymized_text_for_ml', '[01] Certain infectious or parasitic diseases', '[08] Diseases of the nervous system', '[12] Diseases of the respiratory system', '[13] Diseases of the digestive system', '[14] Diseases of the skin', '[15] Diseases of the musculoskeletal system or connective tissue', '[18] Pregnancy, childbirth or the puerperium', '[19] Certain conditions originating in the perinatal period', 'Monitoring', 'Unknown', 'Symptoms not classified elsewhere']
