<a href="https://colab.research.google.com/github/yzm9393/swineBRET-ICD/blob/main/02_Annotation_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Import relevant libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

## 2. Preprocess the annotated 2000 data set

In [None]:
# The submission ID column was removed from the workbook Dr. Poljak returned,
# so it is restored here by copying it across from the original sample by row
# position. This is only valid if the row order was not changed.
# --- 1. File Paths ---
# The original file sent to Dr. Poljak (it has the Submission ID)
ORIGINAL_ANNOTATION_FILE = '/Users/zimoyang/Documents/swine_project/2000_Annotation_Sample.xlsx'

# The new file Dr. Poljak sent back (it is missing the ID column)
RETURNED_ANNOTATED_FILE = '/Users/zimoyang/Documents/swine_project/2000_Annotation_Sample_annotated.xlsx' # <--- IMPORTANT: UPDATE THIS PATH

ID_COLUMN = 'U_SUBMISSIONID' # Or 'record_id' if you created one
TEXT_COLUMN = 'HISTORY'


# --- 2. Load Both Datasets ---
print("Loading both the original and the returned annotated files...")
try:
    df_original = pd.read_excel(ORIGINAL_ANNOTATION_FILE)
    df_returned = pd.read_excel(RETURNED_ANNOTATED_FILE)
except FileNotFoundError as e:
    print(f"Error: Could not find one of the files. Please check the paths. Details: {e}")
    # Stop execution if files aren't found
    raise

# --- 3. Crucial Assumption Check ---
# A positional copy is meaningless if rows were added or removed, so a row
# count mismatch aborts the cell.
print("\nVerifying that both files have the same number of rows...")
if len(df_original) != len(df_returned):
    print("STOPPING: The number of rows in the files do not match!")
    print(f"Original file has {len(df_original)} rows.")
    print(f"Returned file has {len(df_returned)} rows.")
    # You must stop here and investigate what happened.
    raise ValueError("Row count mismatch between original and returned files.")
print("Row counts match. Proceeding with merge based on row order.")


# --- 4. Re-add the ID Column and Combine ---
# Copy by position (not by index label): .to_numpy() drops the index so pandas
# cannot silently re-align the IDs if the two frames ever carry different indexes.
df_returned[ID_COLUMN] = df_original[ID_COLUMN].to_numpy()


# --- 5. Validation and Sanity Check ---
# Compare the HISTORY text of EVERY row, position by position. Spot-checking a
# handful of random rows can easily miss a partial reorder, and the full
# comparison is cheap for a sample of this size.
print("\nPerforming sanity check by comparing text across all rows...")
mismatched_positions = [
    position
    for position, (original_text, returned_text) in enumerate(
        zip(df_original[TEXT_COLUMN], df_returned[TEXT_COLUMN])
    )
    if str(original_text).strip() != str(returned_text).strip()
]
all_match = not mismatched_positions

# Report the first few offending rows individually, then summarise the rest.
MAX_MISMATCHES_TO_PRINT = 10
for position in mismatched_positions[:MAX_MISMATCHES_TO_PRINT]:
    print(f"WARNING: Mismatch found at index {position}!")
if len(mismatched_positions) > MAX_MISMATCHES_TO_PRINT:
    print(f"... and {len(mismatched_positions) - MAX_MISMATCHES_TO_PRINT} more mismatched rows.")

if all_match:
    print("Sanity check passed! The text content appears to be correctly aligned.")
else:
    print(f"Mismatched rows: {len(mismatched_positions)} of {len(df_returned)}.")
    print("WARNING: Sanity check failed. The row order may have changed. Proceed with caution.")

# --- 6. Save the Final, Corrected Data ---
# df_returned now carries the restored ID column next to Dr. Poljak's labels.
# The file is written even when the sanity check warns, so read the output above
# before trusting it as the "gold standard" source.
# NOTE(review): the label-consolidation step later in this notebook reads
# '/Users/zimoyang/Documents/swine_project/data/annotated_results_corrected.csv'
# (inside 'data/'), while this cell writes one directory up — confirm whether the
# file is moved by hand or one of the two paths should change.
output_filename = '/Users/zimoyang/Documents/swine_project/annotated_results_corrected.csv'

print(f"\nSaving the corrected and validated data to '{output_filename}'...")

# We save the df_returned dataframe, which now contains the ID and the labels from Dr. Poljak.
df_returned.to_csv(output_filename, index=False)

print("Save complete! You can now use this new file for the next steps of your analysis.")

Loading both the original and the returned annotated files...
Error: Could not find one of the files. Please check the paths. Details: [Errno 2] No such file or directory: '/Users/zimoyang/Documents/swine_project/2000_Annotation_Sample.xlsx'


FileNotFoundError: [Errno 2] No such file or directory: '/Users/zimoyang/Documents/swine_project/2000_Annotation_Sample.xlsx'

## 3. Label consolidation
Based on label prevalence in the 2,000-record annotated data set, all rare labels (those with fewer than 10 cases) are consolidated into a single `Symptoms not classified elsewhere` label.

In [None]:
import pandas as pd
import numpy as np

# --- 1. Configuration ---
ANNOTATED_FILE_PATH = '/Users/zimoyang/Documents/swine_project/data/annotated_results_corrected.csv'
FINAL_PROCESSED_FILE = '/Users/zimoyang/Documents/swine_project/data/annotated_data_for_modeling_consolidated.csv'
ID_COLUMN = 'U_SUBMISSIONID'
TEXT_COLUMN = 'HISTORY'
DATE_COLUMN = 'CREATEDT'

# --- 2. Load and Prepare Annotated Data ---
print(f"Loading expert annotations from: '{ANNOTATED_FILE_PATH}'")
df_annotated = pd.read_csv(ANNOTATED_FILE_PATH)

# Convert every non-ID/date/text column to a numerical 0/1-style format.
# errors='ignore' lets the drop succeed even if one of the three is absent.
potential_labels = df_annotated.columns.drop([ID_COLUMN, DATE_COLUMN, TEXT_COLUMN], errors='ignore').tolist()
for col in potential_labels:
    if pd.api.types.is_numeric_dtype(df_annotated[col]):
        # Assign the filled column back instead of calling
        # df[col].fillna(..., inplace=True): the in-place form operates on a
        # column selection (chained assignment), which warns on recent pandas
        # and leaves the DataFrame unchanged under Copy-on-Write.
        df_annotated[col] = df_annotated[col].fillna(0)
    else:
        # Non-numeric column: any non-missing mark counts as 1, missing as 0.
        df_annotated[col] = df_annotated[col].notna().astype(int)

# --- 3. Define Labels to Keep vs. Consolidate (Explicitly) ---

# Being explicit about which labels are kept and which are consolidated avoids
# accidentally including non-label columns like 'Note' or 'Unnamed: ...'.

labels_to_keep_as_is = [
    '[01] Certain infectious or parasitic diseases',
    '[08] Diseases of the nervous system',
    '[12] Diseases of the respiratory system',
    '[13] Diseases of the digestive system',
    '[14] Diseases of the skin',
    '[15] Diseases of the musculoskeletal system or connective tissue',
    '[18] Pregnancy, childbirth or the puerperium',
    '[19] Certain conditions originating in the perinatal period',
    'Monitoring',
    'Unknown',
]

# This is the label we will consolidate the rare ones INTO
consolidation_label = 'Symptoms not classified elsewhere'

# This is the explicit list of RARE CLINICAL LABELS to consolidate.
# The last entry is spelled exactly as written here because it is matched
# against the column names of the loaded file.
explicit_rare_labels = [
    '[02] Neoplasms',
    '[03] Diseases of the blood or blood-forming organs',
    '[04] Diseases of the immune system',
    '[05] Endocrine, nutritional or metabolic diseases',
    '[06] Mental, behavioural or neurodevelopmental disorders',
    '[09] Diseases of the visual system',
    '[10] Diseases of the ear or mastoid process',
    '[11] Diseases of the circulatory system',
    '[16] Diseases of the genitourinary system',
    '[20] Developmental anomalies',
    '[22] Injury, poisoning or certain other consequences of external causes',
    'Symptoms_not_classified_elswhere'
]

# Only try to consolidate labels that actually exist in the loaded dataframe
labels_to_consolidate = [label for label in explicit_rare_labels if label in df_annotated.columns]

print(f"\nKeeping {len(labels_to_keep_as_is)} labels as individual targets.")
print(f"Consolidating {len(labels_to_consolidate)} rare labels into '{consolidation_label}'.")


# --- 4. Perform the Consolidation ---

# Create the consolidation column (all zeros) if the file does not already have
# one, so the OR below never hits a KeyError.
if consolidation_label not in df_annotated.columns:
    print(f"Creating new column for consolidation: '{consolidation_label}'")
    df_annotated[consolidation_label] = 0

# For any row where one of the rare labels is set, ensure the consolidation
# label is also set. Both sides are combined as booleans and converted back to
# 0/1: a bitwise OR on a float column (which a pre-existing column becomes after
# fillna) would raise, and boolean logic keeps the result strictly 0/1.
is_any_rare_disease = df_annotated[labels_to_consolidate].any(axis=1)
df_annotated[consolidation_label] = (
    df_annotated[consolidation_label].astype(bool) | is_any_rare_disease
).astype(int)

# --- 5. Create the Final DataFrame for Modeling ---
# Add the consolidation label to our list of keepers to define the final set,
# dropping any kept label that is absent from the loaded file.
final_modeling_labels = labels_to_keep_as_is + [consolidation_label]
final_modeling_labels = [label for label in final_modeling_labels if label in df_annotated.columns]


# Select only the necessary columns
columns_to_export = [ID_COLUMN, DATE_COLUMN, TEXT_COLUMN] + final_modeling_labels
df_modeling = df_annotated[columns_to_export].copy()

# Create the multi-hot vector: one list per row, ordered like final_modeling_labels.
print("\nCreating multi-hot vector from the final modeling labels...")
df_modeling['expert_labels_vector'] = df_modeling[final_modeling_labels].values.tolist()

print("\nFinal number of labels for modeling:", len(final_modeling_labels))

# --- 6. Save the Final Processed Data ---
df_modeling.to_csv(FINAL_PROCESSED_FILE, index=False)
print(f"\nSuccessfully created your final modeling dataset: '{FINAL_PROCESSED_FILE}'")

# --- 7. Count the Final Number of Records ---
# Summing a column of 0s and 1s gives the number of rows where the label is set.
final_count = df_annotated[consolidation_label].sum()

total_records = len(df_annotated)
# Guard against an empty file so the percentage is well defined.
percentage = (final_count / total_records) * 100 if total_records else 0.0
print("\n--- FINAL COUNT ---")
print(f"The total number of records classified as '{consolidation_label}' is: {final_count}")
print(f"This represents {percentage:.2f}% of the {total_records} annotated records.")


## 4. Merge the 2000 annotated data set (gold-standard data set) with the original training data set

In [None]:
# --- 1. Configuration: input/output locations and the join key ---
ANNOTATED_DATA_FILE = '/Users/zimoyang/Documents/swine_project/data/annotated_data_for_modeling_consolidated.csv'
TRAINING_SET_FILE = '/Users/zimoyang/Documents/swine_project/data/training_development_set.csv'
FINAL_OUTPUT_FILE = '/Users/zimoyang/Documents/swine_project/data/training_set_with_gold_labels.csv'

# Column present in both files; rows are matched on it.
ID_COLUMN = 'U_SUBMISSIONID'

# --- 2. Read both CSV files ---
print("Loading datasets...")
try:
    df_train_dev = pd.read_csv(TRAINING_SET_FILE)
    df_annotated = pd.read_csv(ANNOTATED_DATA_FILE)
except FileNotFoundError as e:
    print(f"Error: A required file was not found. Please check filenames. Details: {e}")
    # Nothing below can run without both inputs, so let the error propagate.
    raise
else:
    print("Files loaded successfully.")

# --- 3. Reduce the annotated frame to what the merge needs ---
# Only the key and the multi-hot label vector are carried over.
columns_to_merge = [ID_COLUMN, 'expert_labels_vector']
annotations_to_merge = df_annotated[columns_to_merge]

# --- 4. Attach the gold labels to the training set ---
print("\nMerging annotated labels into the main training set...")
# A left join keeps every training row; rows without an annotation get NaN
# in 'expert_labels_vector'.
df_final_merged = df_train_dev.merge(annotations_to_merge, how='left', on=ID_COLUMN)

print("Merge complete.")
print(f"The final merged DataFrame has {len(df_final_merged)} rows.")


# --- 5. Report how many rows did / did not receive gold labels ---
has_gold_labels = df_final_merged['expert_labels_vector'].notna()
annotated_count = has_gold_labels.sum()
unannotated_count = (~has_gold_labels).sum()
print("\nVerification:")
print(f"Number of rows with gold-standard labels: {annotated_count}")
print(f"Number of rows without gold-standard labels: {unannotated_count}")


# --- 6. Write the merged dataset to disk ---
df_final_merged.to_csv(FINAL_OUTPUT_FILE, index=False)
print(f"\nSuccessfully created your master training file: '{FINAL_OUTPUT_FILE}'")

Loading datasets...
Files loaded successfully.

Merging annotated labels into the main training set...
Merge complete.
The final merged DataFrame has 37058 rows.

Verification:
Number of rows with gold-standard labels: 2000
Number of rows without gold-standard labels: 35058

Successfully created your master training file: '/Users/zimoyang/Documents/swine_project/data/training_set_with_gold_labels.csv'


## 5. Preprocess the merged data set

In [None]:
# Preprocessing configuration
# Constants shared by the cleaning, rule-based classification and anonymization
# functions defined in the following cells.
import re
NLP_MODEL_NAME = "en_core_web_md" # Default spaCy model name used by load_nlp_model()

# Rule-based classification templates (SIMPLIFIED) - These are for classify_text_by_rules.
# Matching is by exact equality against the stripped, lowercased text, so every
# entry is written in lowercase without punctuation.
UNKNOWN_EXACT_MATCH_RULES = [
    "no history provided", "no history given", "none given", "none", "unknown"
]
DIAGNOSTIC_EXACT_MATCH_RULES = [
    "test purpose monitoring", "monitoring", "routine monitoring", "testing for olymel", "testing for maple leaf", "vaccination",
    "vaccine", "testing", "pcr", "healthy", "normal", "booster", "surveillance", "vax", "health check", "blood test"
]
# Anonymization regex patterns, keyed by the names anonymize_single_text looks up.
# NOTE(review): run_preprocessing_pipeline passes the output of clean_text_data
# (lowercased, punctuation removed) to anonymize_single_text. "iso_date" needs
# hyphens and "submission_id" needs uppercase letters, so these two patterns look
# unable to match in that pipeline — confirm whether that is intended.
ANONYMIZATION_PATTERNS_REGEX = {
    "company": re.compile(r'\b(olymel|maple leaf|duroc|conestoga|hypor)\b', re.IGNORECASE), # Replaced with [ORG_NAME]
    # Optional: Keep these if you want regex to catch specific date/ID formats BEFORE spaCy.
    # If spaCy alone should handle dates (with stricter logic), you can comment these out.
    # "date": month name (abbreviated or full), day, optional comma, 4-digit year.
    "date": re.compile(r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},?\s+\d{4}\b', re.IGNORECASE),
    # "iso_date": YYYY-MM-DD.
    "iso_date": re.compile(r'\b\d{4}-\d{2}-\d{2}\b'),
    # "submission_id": two uppercase letters followed by 3-6 digits.
    "submission_id": re.compile(r'\b[A-Z]{2}\d{3,6}\b'), # Example, adjust as needed
}

### 5.1 Define anonymization functions

In [None]:
# Function Definitions

def load_nlp_model(model_name=NLP_MODEL_NAME):
    """Load a spaCy pipeline by name.

    Args:
        model_name: Name of the installed spaCy model to load.

    Returns:
        The loaded pipeline, or None when spaCy raises OSError for the
        requested model (a download hint is printed in that case).
    """
    try:
        loaded_pipeline = spacy.load(model_name)
    except OSError:
        print(f"spaCy model '{model_name}' not found. Please download it: python -m spacy download {model_name}")
        return None
    else:
        print(f"Successfully loaded spaCy model: {model_name}")
        return loaded_pipeline

def clean_text_data(text):
    """Normalize a single free-text value for rule matching.

    The value is converted with ``str()``, lowercased, stripped of every
    character that is neither a word character (letters, digits, underscore)
    nor whitespace, and finally has all whitespace runs collapsed to a single
    space with leading/trailing whitespace removed.

    Punctuation is removed BEFORE whitespace is collapsed so that inputs such
    as ``"a - b"`` or ``"done ."`` do not leave double or trailing spaces
    behind.

    Args:
        text: Any value; non-strings are converted with ``str()``. Note that
            this means a missing value such as NaN becomes the text "nan".

    Returns:
        The cleaned, lowercased string.
    """
    text = str(text).lower()
    # Underscore is already part of \w, so it survives this substitution.
    text = re.sub(r'[^\w\s]', '', text)
    # \s covers \r and \n as well, so one pass normalizes line breaks too.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

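# classify_text_by_rules receives its rule lists as arguments. The pipeline in
# this notebook passes the two lists defined in the configuration cell:
# UNKNOWN_EXACT_MATCH_RULES
# DIAGNOSTIC_EXACT_MATCH_RULES
# (DIAGNOSTIC_WEAK_PREFIX_RULES and UNKNOWN_LOGISTICS_TERMS_RULES were listed
# here previously; the simplified classifier below does not take them.)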

def classify_text_by_rules(text_input,
                           unknown_exact_match,
                           diagnostic_exact_match):
    """Assign a rule label to a text by exact match against two rule lists.

    The input is converted to ``str``, stripped and lowercased before the
    lookup, so the rule collections are expected to hold lowercase entries.
    The "unknown" rules are consulted before the "diagnostic" rules.

    Args:
        text_input: Text to classify (any value; converted with ``str()``).
        unknown_exact_match: Collection of texts meaning "no information".
        diagnostic_exact_match: Collection of texts meaning routine testing.

    Returns:
        "unknown_rule", "diagnostic_rule", or "normal_rule" when neither
        collection contains the text.
    """
    normalized = str(text_input).strip().lower()

    # Checked in priority order; the first collection containing the text wins.
    ordered_rules = (
        (unknown_exact_match, "unknown_rule"),
        (diagnostic_exact_match, "diagnostic_rule"),
    )
    for rule_collection, verdict in ordered_rules:
        if normalized in rule_collection:
            return verdict

    # No exact match: treat as ordinary clinical text.
    return "normal_rule"

def anonymize_single_text(text_input, nlp_model, regex_patterns):
    """
    Replace identifying spans in a text with bracketed placeholders.

    Two passes are applied in order:

    1. Regex pass. For each of the keys "company", "date", "iso_date" and
       "submission_id" that is present in ``regex_patterns``, every match is
       replaced with "[ORG_NAME]", "[DATE]", "[DATE]" and "[SUBMISSION_ID]"
       respectively.
    2. spaCy pass (skipped when ``nlp_model`` is None). The regex-processed
       text is run through ``nlp_model`` and each entity in ``doc.ents`` is
       either kept verbatim or replaced, depending on its label:
        - Labels in the protected list (e.g. "DISEASE_CODE"): always kept.
        - PERSON: replaced with "[VET_NAME]" only when the entity starts with
          "dr"/"doctor", or "dr"/"doctor" plus whitespace ends the 10
          characters immediately before it. Entities whose lowercased text is
          in the known non-person list are kept.
        - GPE, LOC: replaced with "[LOCATION]", except for entries of the
          known non-location list and text that is already "[location]".
        - DATE: kept when it is already "[DATE]" or a pure duration such as
          "3 weeks old"; otherwise replaced with "[DATE]" only when it is a
          numeric Y-M-D / D-M-Y style date, or has a 4-digit year plus a
          1-2 digit number, or a month name plus a 1-2 digit number.
        - Any other label (including ORG and FAC): kept verbatim.

    Args:
        text_input: Text to anonymize; converted with ``str()``.
        nlp_model: Callable returning a document with an ``ents`` sequence
            whose items expose ``text``, ``label_``, ``start_char`` and
            ``end_char``, or None to run the regex pass only.
        regex_patterns: Mapping from pattern name to a compiled regex.

    Returns:
        The text with the selected spans replaced by placeholders.
    """
    text = str(text_input)

    # 1. Apply the explicit regex patterns first; each key is optional.
    text_after_regex = text # Accumulates the result of the regex pass
    if "company" in regex_patterns:
        text_after_regex = regex_patterns["company"].sub("[ORG_NAME]", text_after_regex)
    if "date" in regex_patterns: # "Month Day, Year" style dates
        text_after_regex = regex_patterns["date"].sub("[DATE]", text_after_regex)
    if "iso_date" in regex_patterns: # "YYYY-MM-DD" style dates
        text_after_regex = regex_patterns["iso_date"].sub("[DATE]", text_after_regex)
    if "submission_id" in regex_patterns:
        text_after_regex = regex_patterns["submission_id"].sub("[SUBMISSION_ID]", text_after_regex)

    text = text_after_regex # Entity offsets below refer to this regex-processed text

    # Without a model only the regex pass is possible.
    if nlp_model is None:
        return text

    # 2. Rebuild the text piece by piece: the untouched stretch before each
    # entity, then either the entity itself or its placeholder.
    doc = nlp_model(text)
    new_text_parts = []
    current_pos = 0

    # Entity labels that are never replaced.
    # NOTE(review): presumably produced by an EntityRuler added to the pipeline
    # elsewhere — no such component is configured in the visible code; confirm.
    protected_labels_from_ruler = [
        "DISEASE_CODE", "VET_ABBREV", "MATERIAL", "ANIMAL_GROUP_TERM",
        "INTERNAL_CODE", "PROCESS_TERM", "BIOLOGICAL_SAMPLE"
    ]
    # Lowercased entity texts that must not be treated as a person even when
    # the model labels them PERSON.
    known_non_person_terms = [
        "prrs", "routine prrs", "rmgp3", "s", "pcr coronavirus s",
        "pedv", "routine pedv", "gilt iso", "bloodserum", "dacron"
    ]
    # Lowercased entity texts that must not be treated as a location.
    known_non_location_terms = ["viro"]

    for ent in doc.ents:
        # Copy the text between the previous entity and this one unchanged.
        new_text_parts.append(text[current_pos:ent.start_char])
        placeholder = ent.text # Default: keep the entity as written
        ent_text_lower = ent.text.lower()

        if ent.label_ in protected_labels_from_ruler:
            placeholder = ent.text
        elif ent.label_ == "PERSON":
            if ent_text_lower in known_non_person_terms:
                placeholder = ent.text
            else:
                is_doctor_prefix = False
                # Title inside the entity itself, e.g. "Dr. Smith".
                if re.match(r'^(dr\.?|doctor)\s+', ent.text, re.IGNORECASE):
                    is_doctor_prefix = True
                else:
                    # Title just before the entity: inspect up to 10 preceding characters.
                    prefix_window_start = max(0, ent.start_char - 10)
                    text_segment_before = text[prefix_window_start:ent.start_char]
                    if re.search(r'(dr\.?|doctor)\s+$', text_segment_before, re.IGNORECASE):
                        is_doctor_prefix = True
                if is_doctor_prefix:
                    placeholder = "[VET_NAME]"
                else:
                    placeholder = ent.text
        elif ent.label_ in ("GPE", "LOC"):
            if ent_text_lower in known_non_location_terms:
                placeholder = ent.text
            elif ent_text_lower != "[location]": # Do not re-wrap an existing placeholder
                placeholder = "[LOCATION]"

        elif ent.label_ == "DATE":
            if ent.text == "[DATE]": # Already replaced by the regex pass
                placeholder = "[DATE]"
            else: # DATE entities found by spaCy that the regex pass did not catch
                # Durations / ages such as "3 weeks" or "10 days old" are kept.
                if re.fullmatch(r'\d+\s+(week|day|month|year)s?(\s+old)?', ent_text_lower): # Exclude durations
                    placeholder = ent.text
                else:
                    has_year_4_digits = bool(re.search(r'\b\d{4}\b', ent.text))
                    has_day_number = bool(re.search(r'\b\d{1,2}\b', ent.text))
                    has_month_name = bool(re.search(r'(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)', ent_text_lower))
                    # Fully numeric dates: YYYY-M-D or D-M-YYYY with '-', '/' or '.' separators.
                    is_structured_date_like = bool(re.fullmatch(r'\d{4}[\-\/\.]\d{1,2}[\-\/\.]\d{1,2}', ent.text)) or \
                                              bool(re.fullmatch(r'\d{1,2}[\-\/\.]\d{1,2}[\-\/\.]\d{4}', ent.text))

                    if is_structured_date_like or \
                       (has_year_4_digits and has_day_number) or \
                       (has_month_name and has_day_number):
                        placeholder = "[DATE]"
                    # Otherwise the entity is too vague to be a specific date and is kept.
        new_text_parts.append(placeholder)
        current_pos = ent.end_char
    # Append whatever follows the last entity.
    new_text_parts.append(text[current_pos:])
    return "".join(new_text_parts)

### 5.2 Define preprocessing pipeline

In [None]:
# More functions
def run_preprocessing_pipeline(df, text_column, nlp_model):
    """Clean, rule-classify and anonymize one text column of a DataFrame.

    The frame is modified in place (and also returned) with these columns:
    'cleaned_text', 'rule_based_note_type', 'anonymized_text_for_ml',
    'target_ml_diagnostic', 'target_ml_unknown' and 'is_normal_for_icd'.

    Relies on ``Series.progress_apply``, which exists only after
    ``tqdm.pandas()`` has been called.

    NOTE(review): anonymization runs on 'cleaned_text' (lowercased, punctuation
    removed), not on the raw column — confirm this is intended, since some of
    the anonymization patterns depend on case and punctuation.

    Args:
        df: DataFrame holding the text to process.
        text_column: Name of the raw text column.
        nlp_model: Loaded NLP model, or None.

    Returns:
        The same DataFrame. When ``nlp_model`` is None the new columns are
        filled with placeholder values and no processing is performed.
    """
    if nlp_model is None:
        print("Error: NLP model not loaded. Cannot run preprocessing pipeline.")
        # Placeholder values keep the expected columns present for downstream code.
        fallback_columns = (
            ('cleaned_text', ""),
            ('rule_based_note_type', "error_nlp_missing"),
            ('anonymized_text_for_ml', ""),
            ('target_ml_diagnostic', 0),
            ('target_ml_unknown', 0),
            ('is_normal_for_icd', False),
        )
        for column_name, fill_value in fallback_columns:
            df[column_name] = fill_value
        return df

    def _label_by_rules(cleaned):
        # Exact-match classification against the module-level rule lists.
        return classify_text_by_rules(cleaned,
                                      UNKNOWN_EXACT_MATCH_RULES,
                                      DIAGNOSTIC_EXACT_MATCH_RULES)

    def _anonymize(cleaned):
        # Regex + NER anonymization using the model handed to the pipeline.
        return anonymize_single_text(cleaned, nlp_model, ANONYMIZATION_PATTERNS_REGEX)

    print("Starting preprocessing pipeline...")

    # Step 1: normalize the raw text.
    print(f"Cleaning text in column '{text_column}' for {len(df)} records...")
    df['cleaned_text'] = df[text_column].progress_apply(clean_text_data)

    # Step 2: rule-based note type, computed from the cleaned text.
    print("Applying simplified rule-based classification...")
    df['rule_based_note_type'] = df['cleaned_text'].progress_apply(_label_by_rules)

    # Step 3: anonymized copy of the cleaned text for ML use.
    print("Anonymizing text for ML...")
    df['anonymized_text_for_ml'] = df['cleaned_text'].progress_apply(_anonymize)

    # Step 4: 0/1 targets for the auxiliary "diagnostic" and "unknown" classifiers.
    print("Preparing target labels for auxiliary ML classifiers...")
    note_type = df['rule_based_note_type']
    df['target_ml_diagnostic'] = note_type.eq('diagnostic_rule').astype(int)
    df['target_ml_unknown'] = note_type.eq('unknown_rule').astype(int)

    # Step 5: boolean flag for rows that matched neither rule list.
    df['is_normal_for_icd'] = note_type.eq('normal_rule')

    print("Preprocessing pipeline completed.")
    return df


### 5.3 Run preprocess on merged training dataset

In [None]:
# This cell loads the clean training data, runs the full preprocessing and
# anonymization pipeline, and saves the result to a new file.
# This is a one-time, computationally intensive step.

import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time

# --- Progress bars for pandas ---
# tqdm.pandas() registers .progress_apply() on pandas objects; the pipeline
# below depends on it.
tqdm.pandas(desc="Processing Records")

# --- Configuration ---
# Training set with gold labels merged in (written by the merge step above).
INPUT_DATA_PATH = '/Users/zimoyang/Documents/swine_project/data/training_set_with_gold_labels.csv'
# Destination for the fully processed training set.
OUTPUT_PROCESSED_PATH = '/Users/zimoyang/Documents/swine_project/data/training_set_processed.csv'
TEXT_COLUMN = 'HISTORY'

# --- Prerequisite ---
# load_nlp_model() and run_preprocessing_pipeline() must already be defined
# by earlier cells.

print("--- Starting Data Preprocessing ---")

# The model is loaded a single time and reused for every record.
nlp_model_global = load_nlp_model()

if not nlp_model_global:
    print("Skipping preprocessing because NLP model could not be loaded.")
else:
    print(f"Loading data from '{INPUT_DATA_PATH}'...")
    try:
        df_to_process = pd.read_csv(INPUT_DATA_PATH)

        print(f"Starting preprocessing for {len(df_to_process)} records...")
        start_time = time.time()

        # Clean, classify and anonymize the whole training set.
        df_processed = run_preprocessing_pipeline(df_to_process, TEXT_COLUMN, nlp_model_global)

        end_time = time.time()
        duration = end_time - start_time
        print(f"\nPreprocessing complete. Took {duration:.2f} seconds ({duration/60:.2f} minutes).")

        # Persist the result so later steps can load it instead of recomputing.
        df_processed.to_csv(OUTPUT_PROCESSED_PATH, index=False)
        print(f"Successfully saved fully processed data to '{OUTPUT_PROCESSED_PATH}'.")

        print("\n--- Sample of Final Processed DataFrame ---")
        # First rows, including the columns added by the pipeline.
        print(df_processed.head())

    except FileNotFoundError:
        print(f"Error: Input file not found at '{INPUT_DATA_PATH}'")
    except Exception as e:
        print(f"An error occurred: {e}")

--- Starting Data Preprocessing ---
Successfully loaded spaCy model: en_core_web_md
Loading data from '/Users/zimoyang/Documents/swine_project/data/training_set_with_gold_labels.csv'...
Starting preprocessing for 37058 records...
Starting preprocessing pipeline...
Cleaning text in column 'HISTORY' for 37058 records...


Processing Records: 100%|█████████████| 37058/37058 [00:00<00:00, 125380.66it/s]


Applying simplified rule-based classification...


Processing Records: 100%|█████████████| 37058/37058 [00:00<00:00, 949345.36it/s]


Anonymizing text for ML...


Processing Records: 100%|████████████████| 37058/37058 [02:40<00:00, 230.86it/s]


Preparing target labels for auxiliary ML classifiers...
Preprocessing pipeline completed.

Preprocessing complete. Took 160.91 seconds (2.68 minutes).
Successfully saved fully processed data to '/Users/zimoyang/Documents/swine_project/data/training_set_processed.csv'.

--- Sample of Final Processed DataFrame ---
  U_SUBMISSIONID             CREATEDT  \
0      14-000027  2014-01-02 09:21:24   
1      14-000112  2014-01-02 12:51:30   
2      14-000156  2014-01-02 17:03:48   
3      14-000208  2014-01-03 09:07:31   
4      14-000215  2014-01-03 09:15:18   

                                             HISTORY  SPECIES  \
0  Batch IDs for PRRS PCR testing are as follows:...  Porcine   
1                            kmitch09: Routine check  Porcine   
2  PRRS testing for AI Entry\nSamples IDs for PRR...  Porcine   
3                                   No history given  Porcine   
4  Routine monitor for PRRS\n30 Sacron swabs - po...  Porcine   

  expert_labels_vector                          

### 5.4 Run preprocessing on the 2000 annotated data set

In [None]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time

# --- Progress bars for pandas ---
# tqdm.pandas() registers .progress_apply() on pandas objects; the pipeline
# below depends on it.
tqdm.pandas(desc="Processing Records")

print("Libraries imported and tqdm is ready for pandas.")
# --- 1. Configuration ---
# Consolidated expert annotations (written by the label-consolidation step).
ANNOTATED_DATA_FILE = '/Users/zimoyang/Documents/swine_project/data/annotated_data_for_modeling_consolidated.csv'
# Destination for the processed gold-standard set.
OUTPUT_PROCESSED_FILE = '/Users/zimoyang/Documents/swine_project/data/annotated_gold_standard_processed.csv'
TEXT_COLUMN = 'HISTORY'

# --- Prerequisite ---
# load_nlp_model() and run_preprocessing_pipeline() must already be defined
# by earlier cells.

# --- 2. Load and Process Data ---
print("--- Starting Preprocessing for Gold-Standard Data ---")

# The model is loaded a single time and reused for every record.
nlp_model_global = load_nlp_model()

if not nlp_model_global:
    print("Skipping preprocessing because NLP model could not be loaded.")
else:
    print(f"Loading data from '{ANNOTATED_DATA_FILE}'...")
    try:
        df_to_process = pd.read_csv(ANNOTATED_DATA_FILE)

        print(f"Starting preprocessing for {len(df_to_process)} records...")
        start_time = time.time()

        # Clean, classify and anonymize the annotated set.
        df_processed = run_preprocessing_pipeline(df_to_process, TEXT_COLUMN, nlp_model_global)

        end_time = time.time()
        duration = end_time - start_time
        print(f"\nPreprocessing complete. Took {duration:.2f} seconds ({duration/60:.2f} minutes).")

        # Persist the result for the modeling steps.
        df_processed.to_csv(OUTPUT_PROCESSED_FILE, index=False)
        print(f"Successfully saved processed data to '{OUTPUT_PROCESSED_FILE}'.")

        print("\n--- Sample of Final Processed DataFrame ---")
        print(df_processed.head())

    except FileNotFoundError:
        print(f"Error: Input file not found at '{ANNOTATED_DATA_FILE}'")
    except Exception as e:
        print(f"An error occurred: {e}")

Libraries imported and tqdm is ready for pandas.
--- Starting Preprocessing for Gold-Standard Data ---
Successfully loaded spaCy model: en_core_web_md
Loading data from '/Users/zimoyang/Documents/swine_project/data/annotated_data_for_modeling_consolidated.csv'...
Starting preprocessing for 2000 records...
Starting preprocessing pipeline...
Cleaning text in column 'HISTORY' for 2000 records...


Processing Records: 100%|████████████████| 2000/2000 [00:00<00:00, 81376.43it/s]


Applying simplified rule-based classification...


Processing Records: 100%|███████████████| 2000/2000 [00:00<00:00, 709456.02it/s]


Anonymizing text for ML...


Processing Records: 100%|██████████████████| 2000/2000 [00:09<00:00, 211.30it/s]


Preparing target labels for auxiliary ML classifiers...
Preprocessing pipeline completed.

Preprocessing complete. Took 9.53 seconds (0.16 minutes).
Successfully saved processed data to '/Users/zimoyang/Documents/swine_project/data/annotated_gold_standard_processed.csv'.

--- Sample of Final Processed DataFrame ---
  U_SUBMISSIONID             CREATEDT  \
0      15-034583  2015-05-13 08:17:22   
1      19-078217  2019-10-04 13:15:18   
2      18-008292  2018-01-30 14:22:28   
3      14-021155  2014-03-19 14:41:25   
4      17-091269  2017-11-09 11:03:20   

                                             HISTORY  \
0  Truck tested positive for Delta Corona virus @...   
1                                      NV on samples   
2  5-1 to 5-5 are 10 weeks old\n8-1 to 8-5 are 9 ...   
3  Suckling piglets becoming hairy/untriffty at d...   
4  Pet pig, indoor/outdoor, 1 other pig in househ...   

   [01] Certain infectious or parasitic diseases  \
0                                            0.