# In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import io
from google.colab import files

# --- Re-create df_combined and other necessary variables ---
# This section rebuilds, from freshly uploaded CSV files, every module-level
# variable the later cells read: df, df_emotion, df_combined, tfidf_vectorizer,
# X, le_condition, y and all_conditions.

# Load the primary medical dataset (df)
print("Please upload your primary medical record dataset (e.g., synthetic emr dataset.csv).")
uploaded_df_primary = files.upload()
# Each iteration rebinds `df`, so if several files are uploaded only the last
# one read is kept; if nothing is uploaded `df` is never assigned here.
for file_name in uploaded_df_primary.keys():
    df = pd.read_csv(io.BytesIO(uploaded_df_primary[file_name]))

# Load the 'emotional scores' dataset (df_emotion)
# Note: From previous executions, this file often contains a duplicate of the medical data.
# We proceed with merging based on the assumption that it's meant to be combined.
print("Please upload the 'emotional_scores.csv' or 'EMR_with_emotions.csv' file.")
uploaded_emotion = files.upload()
# Same "last uploaded file wins" behaviour as above.
for file_name in uploaded_emotion.keys():
    df_emotion = pd.read_csv(io.BytesIO(uploaded_emotion[file_name]))

# Merge the two DataFrames based on 'patient_id' (left join: every row of df is kept).
# NOTE(review): pandas only applies the suffixes to column names present in BOTH
# frames. The code below reads 'symptoms_medical', 'medical_condition_medical'
# and (later) 'age_medical', so it relies on df_emotion also carrying
# 'symptoms', 'medical_condition' and 'age' columns - confirm against the data.
df_combined = pd.merge(df, df_emotion, on='patient_id', how='left', suffixes=('_medical', '_emotion'))

# Re-preprocess symptoms to ensure consistent format:
# missing values become the literal text 'no symptoms', then every comma is
# replaced by a single space so the text can be split on whitespace.
df_combined['symptoms_medical'] = df_combined['symptoms_medical'].fillna('no symptoms')
df_combined['symptoms_medical'] = df_combined['symptoms_medical'].apply(lambda x: ' '.join(str(x).split(',')))

# Re-vectorize the symptoms column using TF-IDF on the updated df_combined
# (English stop words removed, vocabulary capped at 500 terms). X is sparse.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X = tfidf_vectorizer.fit_transform(df_combined['symptoms_medical'])

# Re-encode the target variable (medical conditions); missing targets are
# grouped under the explicit class "Unknown".
le_condition = LabelEncoder()
y = le_condition.fit_transform(df_combined['medical_condition_medical'].fillna("Unknown"))

# Get the list of all unique medical conditions from the target variable.
# Its order defines the column order of the symbolic feature vectors built later.
all_conditions = list(le_condition.classes_)

# Re-define medical_knowledge_base (from cell 6d4ef890 or 949d0efd)
# Hand-written symbolic knowledge: maps a lowercase symptom phrase to the list
# of condition names it is associated with. Several keys are multi-word phrases
# (e.g. 'shortness of breath', 'chest pain'). Condition names are only counted
# by generate_symbolic_features when they also appear in the list of target
# conditions passed to it.
medical_knowledge_base = {
    'fever': ['Infection', 'Influenza', 'Malaria', 'Typhoid'],
    'cough': ['COPD', 'Asthma', 'Infection'],
    'wheezing': ['COPD', 'Asthma'],
    'shortness of breath': ['COPD', 'Asthma', 'Heart Disease'],
    'chest pain': ['Heart Disease'],
    'headache': ['Hypertension', 'Malaria', 'Typhoid'],
    'nausea': ['Ulcer', 'Typhoid'],
    'vomiting': ['Ulcer', 'Typhoid'],
    'bloating': ['Ulcer'],
    'swollen lymph nodes': ['HIV'],
    'weight loss': ['HIV', 'Diabetes'],
    'frequent urination': ['Diabetes'],
    'increased thirst': ['Diabetes'],
    'muscle pain': ['Malaria', 'Typhoid'],
    'chills': ['Malaria', 'Typhoid', 'Infection'],
    'fatigue': ['HIV', 'Diabetes', 'COPD', 'Asthma', 'Heart Disease', 'Unknown'],
    # Placeholder text used when a record has no symptoms recorded.
    'no symptoms': ['Unknown']
}

# Function to Generate Symbolic Features
def generate_symbolic_features(symptoms_text, knowledge_base, all_possible_conditions):
    """
    Count, per condition, how many knowledge-base symptoms occur in the text.

    The text is split on whitespace and every knowledge-base key is matched as
    a contiguous run of tokens, so multi-word keys such as 'chest pain' or
    'shortness of breath' are found as well as single-word keys. Matching is
    exact and case-sensitive; a token like 'NEG_fever' therefore does not
    count as 'fever'.

    Args:
        symptoms_text (str): Whitespace-separated symptoms text for one
            patient. Non-string values (e.g. NaN) are treated as empty text.
        knowledge_base (dict): Maps a symptom phrase to a list of condition
            names associated with it.
        all_possible_conditions (list): Target condition names; defines the
            order of the returned vector. Conditions named in the knowledge
            base but absent from this list are ignored.

    Returns:
        numpy.ndarray: One count per entry of ``all_possible_conditions``.
    """
    symbolic_feature_counts = {condition: 0 for condition in all_possible_conditions}

    # Missing values must not crash the whole DataFrame.apply call.
    tokens = symptoms_text.split() if isinstance(symptoms_text, str) else []

    for kb_symptom, associated_conditions in knowledge_base.items():
        phrase_tokens = kb_symptom.split()
        width = len(phrase_tokens)
        if width == 0:
            continue

        # Slide a window of the phrase's length over the tokens; comparing the
        # whole text token-by-token with `==` would never match a key that
        # contains a space.
        occurrences = sum(
            1
            for start in range(len(tokens) - width + 1)
            if tokens[start:start + width] == phrase_tokens
        )
        if not occurrences:
            continue

        for condition in associated_conditions:
            if condition in symbolic_feature_counts:
                symbolic_feature_counts[condition] += occurrences

    return np.array([symbolic_feature_counts[condition] for condition in all_possible_conditions])

# --- Generate features for original neuro-symbolic model ---
# One symbolic count vector per row (one column per entry of all_conditions),
# stacked into a 2-D array.
symbolic_features_series = df_combined['symptoms_medical'].apply(
    lambda x: generate_symbolic_features(x, medical_knowledge_base, all_conditions)
)
symbolic_features_array = np.vstack(symbolic_features_series.values)
numerical_features = df_combined[['age_medical']].values
# The sparse TF-IDF matrix is densified so it can be stacked with the other arrays.
X_dense = X.toarray()
# Column layout: [TF-IDF terms | symbolic counts | age].
X_combined_neuro_symbolic = np.hstack((X_dense, symbolic_features_array, numerical_features))

# Split data for original neuro-symbolic model (80/20, stratified on the target)
X_train_ns, X_test_ns, y_train_ns, y_test_ns = train_test_split(
    X_combined_neuro_symbolic, y, test_size=0.2, random_state=42, stratify=y
)

# Train original neuro-symbolic Random Forest model
rf_model_ns = RandomForestClassifier(random_state=42)
rf_model_ns.fit(X_train_ns, y_train_ns)
y_pred_ns = rf_model_ns.predict(X_test_ns)

# --- Generate features for refined neuro-symbolic model ---
# Re-define refined_medical_knowledge_base (from cell 3754ccdf or c4794f16)
# dict.copy() is shallow, but only new keys are assigned below, so
# medical_knowledge_base itself is left unchanged.
refined_medical_knowledge_base = medical_knowledge_base.copy()
refined_medical_knowledge_base['tightness in chest'] = ['Asthma']
refined_medical_knowledge_base['difficulty breathing'] = ['Asthma', 'COPD', 'Heart Disease']
refined_medical_knowledge_base['palpitations'] = ['Heart Disease']
refined_medical_knowledge_base['dizziness'] = ['Heart Disease', 'Hypertension', 'Unknown']

# Same pipeline as above; only the knowledge base differs. X_dense and
# numerical_features are reused unchanged.
symbolic_features_series_refined = df_combined['symptoms_medical'].apply(
    lambda x: generate_symbolic_features(x, refined_medical_knowledge_base, all_conditions)
)
symbolic_features_array_refined = np.vstack(symbolic_features_series_refined.values)
X_combined_neuro_symbolic_refined = np.hstack((X_dense, symbolic_features_array_refined, numerical_features))

# Split data for refined neuro-symbolic model
# Same y, test_size, random_state and stratify as the first split, so both
# models should be evaluated on the same rows.
X_train_ns_refined, X_test_ns_refined, y_train_ns_refined, y_test_ns_refined = train_test_split(
    X_combined_neuro_symbolic_refined, y, test_size=0.2, random_state=42, stratify=y
)

# Train refined neuro-symbolic Random Forest model
rf_model_ns_refined = RandomForestClassifier(random_state=42)
rf_model_ns_refined.fit(X_train_ns_refined, y_train_ns_refined)
y_pred_ns_refined = rf_model_ns_refined.predict(X_test_ns_refined)

print("All necessary prediction variables and models re-established.")

# --- Alternative Visualization 1: Classification Report Heatmap ---

def plot_classification_report_heatmap(y_true, y_pred, target_names, title="Classification Report Heatmap"):
    """
    Draw a heatmap of the per-class scores from a classification report.

    Args:
        y_true: True labels of the evaluated samples.
        y_pred: Predicted labels for the same samples.
        target_names: Display names for the classes, forwarded to
            ``classification_report``.
        title (str): Title shown above the heatmap.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    per_class_scores = (
        pd.DataFrame(
            classification_report(y_true, y_pred, target_names=target_names, output_dict=True)
        )
        .transpose()
        # The 'support' column holds sample counts, not scores.
        .drop(columns=['support'], errors='ignore')
        # The summary rows are not individual classes.
        .drop(['accuracy', 'macro avg', 'weighted avg'], errors='ignore')
    )
    # Keep the first three remaining columns for plotting.
    plotted_scores = per_class_scores.iloc[:, :3]

    plt.figure(figsize=(10, 6))
    sns.heatmap(plotted_scores, annot=True, cmap='Blues', fmt='.2f', linewidths=.5)
    plt.title(title)
    plt.xlabel('Metrics')
    plt.ylabel('Class')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()


# Draw one heatmap per Random Forest variant; class display names come from
# the fitted label encoder.
print("\n--- Classification Report Heatmaps ---")
plot_classification_report_heatmap(y_test_ns, y_pred_ns, le_condition.classes_, "Classification Report (Original Neuro-Symbolic RF)")
plot_classification_report_heatmap(y_test_ns_refined, y_pred_ns_refined, le_condition.classes_, "Classification Report (Refined Neuro-Symbolic RF)")


# --- Alternative Visualization 2: Normalized Confusion Matrix ---

def plot_normalized_confusion_matrix(y_true, y_pred, target_names, title="Normalized Confusion Matrix", normalize='true'):
    """
    Draw a normalized confusion matrix as an annotated heatmap.

    Args:
        y_true: True labels of the evaluated samples.
        y_pred: Predicted labels for the same samples.
        target_names: Names used as tick labels on both axes.
        title (str): Title prefix; the normalization mode is appended to it.
        normalize (str): Forwarded to ``confusion_matrix``:
            'true' normalizes by row (true labels), 'pred' by column
            (predicted labels), 'all' by the total number of samples.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # NOTE(review): the tick labels assume the matrix has exactly one row and
    # column per entry of target_names, in the same order - confirm that every
    # class occurs in y_true or y_pred.
    normalized_cm = confusion_matrix(y_true, y_pred, normalize=normalize)

    plt.figure(figsize=(10, 8))
    sns.heatmap(
        normalized_cm,
        annot=True,
        cmap='Blues',
        fmt='.2f',
        linewidths=.5,
        xticklabels=target_names,
        yticklabels=target_names,
    )
    heading = f"{title} (Normalized by {normalize.capitalize()})"
    plt.title(heading)
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()


# Row-normalized matrices for both Random Forest variants: each row is divided
# by the number of samples of that true class.
print("\n--- Normalized Confusion Matrices (Normalized by True Labels) ---")
plot_normalized_confusion_matrix(y_test_ns, y_pred_ns, le_condition.classes_, "Confusion Matrix (Original Neuro-Symbolic RF)", normalize='true')
plot_normalized_confusion_matrix(y_test_ns_refined, y_pred_ns_refined, le_condition.classes_, "Confusion Matrix (Refined Neuro-Symbolic RF)", normalize='true')



# The training and evaluation for Deep Learning model already took place in cell b187ec87.
# We need to make sure the metrics from it are also collected.

# Ensure `accuracy_dl`, `macro_precision_dl`, `macro_recall_dl`, `macro_f1_dl`,
# `weighted_precision_dl`, `weighted_recall_dl`, `weighted_f1_dl` are defined.
# If running this independently, the deep learning training cell (b187ec87) must be run first.

# We also need the metrics from the Random Forest (Neuro-Symbolic) and (Refined Neuro-Symbolic)
# which were calculated in cells like b3d11b40 or 3754ccdf. For consistency,
# we will re-calculate them here or ensure their variables are available.

# `accuracy_score` is called below but is not among the names imported from
# sklearn.metrics earlier in this file; import it here so this section does not
# fail with a NameError when run on its own.
from sklearn.metrics import accuracy_score

# Re-calculating metrics for RF Neuro-Symbolic (original) to ensure variables are fresh
# (Assuming y_pred_ns, y_test_ns, le_condition are available from cell 576d61d8)
accuracy_ns = accuracy_score(y_test_ns, y_pred_ns)
# output_dict=True returns nested dicts keyed by class name plus the
# 'macro avg' / 'weighted avg' summary rows read below.
report_ns = classification_report(y_test_ns, y_pred_ns, target_names=le_condition.classes_, output_dict=True)
macro_precision_ns = report_ns['macro avg']['precision']
macro_recall_ns = report_ns['macro avg']['recall']
macro_f1_ns = report_ns['macro avg']['f1-score']
weighted_precision_ns = report_ns['weighted avg']['precision']
weighted_recall_ns = report_ns['weighted avg']['recall']
weighted_f1_ns = report_ns['weighted avg']['f1-score']

# Re-calculating metrics for RF Neuro-Symbolic (refined) to ensure variables are fresh
# (Assuming y_pred_ns_refined, y_test_ns_refined, le_condition are available from cell 576d61d8)
accuracy_ns_refined = accuracy_score(y_test_ns_refined, y_pred_ns_refined)
report_ns_refined = classification_report(y_test_ns_refined, y_pred_ns_refined, target_names=le_condition.classes_, output_dict=True)
macro_precision_ns_refined = report_ns_refined['macro avg']['precision']
macro_recall_ns_refined = report_ns_refined['macro avg']['recall']
macro_f1_ns_refined = report_ns_refined['macro avg']['f1-score']
weighted_precision_ns_refined = report_ns_refined['weighted avg']['precision']
weighted_recall_ns_refined = report_ns_refined['weighted avg']['recall']
weighted_f1_ns_refined = report_ns_refined['weighted avg']['f1-score']

# Import necessary libraries for building a deep learning model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical # <--- ADDED THIS IMPORT

# Ensure X_scaled_ns, y_train_onehot_ns, y_test_onehot_ns, y_test_ns, le_condition are defined
# This requires re-running the DL preparation and training block if kernel was restarted

# Re-executing relevant parts of cell b187ec87 to define DL metrics if not already done
# --- Prepare Data for Deep Learning --- #
# We will scale the combined features X_combined_neuro_symbolic_refined.
# For consistency, using the refined features for DL as well, as this is the latest state.
# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split below, so test-set statistics influence the scaling.
scaler = StandardScaler()
X_scaled_ns = scaler.fit_transform(X_combined_neuro_symbolic_refined)

# Split the scaled data for training and testing
# This rebinds y_train_ns / y_test_ns, which were first assigned by the Random
# Forest split; the split arguments (y, test_size, random_state, stratify) are
# the same, so the label arrays are expected to be unchanged.
X_train_scaled_ns, X_test_scaled_ns, y_train_ns, y_test_ns = train_test_split(
    X_scaled_ns, y, test_size=0.2, random_state=42, stratify=y
)

# Convert target variable to one-hot encoding for categorical crossentropy loss
y_train_onehot_ns = to_categorical(y_train_ns)
y_test_onehot_ns = to_categorical(y_test_ns)

# Define the Deep Learning Model (Simple Feedforward Neural Network):
# 128 -> dropout -> 64 -> dropout -> softmax over all encoded classes.
model_dl = Sequential()
model_dl.add(Dense(128, activation='relu', input_shape=(X_train_scaled_ns.shape[1],)))
model_dl.add(Dropout(0.5))
model_dl.add(Dense(64, activation='relu'))
model_dl.add(Dropout(0.5))
model_dl.add(Dense(len(le_condition.classes_), activation='softmax'))

# Compile the model
model_dl.compile(optimizer=Adam(learning_rate=0.001),
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])

# Train the model (briefly, to get metrics)
# validation_split holds out 20% of the training data for validation.
model_dl.fit(X_train_scaled_ns, y_train_onehot_ns,
                       epochs=10, # Reduced epochs for faster execution in this re-run
                       batch_size=32,
                       validation_split=0.2,
                       verbose=0) # Suppress verbose output

# Evaluate the model on the test set
# With metrics=['accuracy'] compiled above, evaluate() yields [loss, accuracy].
loss_dl, accuracy_dl = model_dl.evaluate(X_test_scaled_ns, y_test_onehot_ns, verbose=0)

y_pred_probs_dl = model_dl.predict(X_test_scaled_ns, verbose=0)
y_pred_classes_dl = np.argmax(y_pred_probs_dl, axis=1) # Get the predicted class index

# Same summary metrics as collected for the Random Forest models, computed
# against the integer-encoded test labels.
report_dl = classification_report(y_test_ns, y_pred_classes_dl, target_names=le_condition.classes_, output_dict=True)
macro_precision_dl = report_dl['macro avg']['precision']
macro_recall_dl = report_dl['macro avg']['recall']
macro_f1_dl = report_dl['macro avg']['f1-score']
weighted_precision_dl = report_dl['weighted avg']['precision']
weighted_recall_dl = report_dl['weighted avg']['recall']
weighted_f1_dl = report_dl['weighted avg']['f1-score']

# Collect the metrics of every model into one column-oriented dictionary.
# Each list holds one value per model, in this order: Multinomial Naive Bayes,
# SVM (RBF), Gradient Boosting, Random Forest, refined Random Forest, and last
# the feedforward neural network.
# The *_mnb, *_svm and *_gb values are not computed in this section and must
# already exist from cells that were run earlier.
all_performance_metrics = {
    'Model': [
        'Multinomial Naive Bayes (Neuro-Symbolic)',
        'SVM (RBF) (Neuro-Symbolic)',
        'Gradient Boosting (Neuro-Symbolic)',
        'Random Forest (Neuro-Symbolic)',
        'Random Forest (Refined Neuro-Symbolic)',
        'Feedforward Neural Network (Neuro-Symbolic)',
    ],
    'Accuracy': [
        accuracy_mnb, accuracy_svm, accuracy_gb,
        accuracy_ns, accuracy_ns_refined, accuracy_dl,
    ],
    'Macro Avg Precision': [
        macro_p_mnb, macro_p_svm, macro_p_gb,
        macro_precision_ns, macro_precision_ns_refined, macro_precision_dl,
    ],
    'Macro Avg Recall': [
        macro_r_mnb, macro_r_svm, macro_r_gb,
        macro_recall_ns, macro_recall_ns_refined, macro_recall_dl,
    ],
    'Macro Avg F1-score': [
        macro_f1_mnb, macro_f1_svm, macro_f1_gb,
        macro_f1_ns, macro_f1_ns_refined, macro_f1_dl,
    ],
    'Weighted Avg Precision': [
        weighted_p_mnb, weighted_p_svm, weighted_p_gb,
        weighted_precision_ns, weighted_precision_ns_refined, weighted_precision_dl,
    ],
    'Weighted Avg Recall': [
        weighted_r_mnb, weighted_r_svm, weighted_r_gb,
        weighted_recall_ns, weighted_recall_ns_refined, weighted_recall_dl,
    ],
    'Weighted Avg F1-score': [
        weighted_f1_mnb, weighted_f1_svm, weighted_f1_gb,
        weighted_f1_ns, weighted_f1_ns_refined, weighted_f1_dl,
    ],
}

print("Performance metrics collected and updated to include all ML and DL models.")

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Assuming all_performance_metrics dictionary is available from previous steps
# If 'all_performance_metrics' is not defined, you need to run the cell that collects
# the performance metrics first (cell 2632005d or similar).

# Re-create performance_df_all from all_performance_metrics
# (one row per model, indexed by model name; one column per metric)
performance_df_all = pd.DataFrame(all_performance_metrics)
performance_df_all = performance_df_all.set_index('Model')

# ----------------------------
# Prepare Data for Radar Chart
# ----------------------------
# Use performance_df_all which contains all models and their metrics
df_radar = performance_df_all.copy()

# Normalize scores for radar chart if values have very different scales (optional, but good practice)
# In our case, all metrics are already between 0 and 1, so direct use is fine for this example.

metrics_columns = df_radar.columns.tolist() # Get all metric names
N = len(metrics_columns)

# Angle setup for radar chart: N evenly spaced spokes, with the first angle
# repeated at the end so each plotted line returns to its starting point.
angles = np.linspace(0, 2 * np.pi, N, endpoint=False).tolist()
angles += angles[:1] # Close the circle

# Determine the best model for highlighting (e.g., by Accuracy)
# idxmax returns the index label, i.e. the model name.
best_model_name = df_radar['Accuracy'].idxmax()

# ----------------------------
# Radar Chart
# ----------------------------
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))

# Plot each model's performance profile
for index, row in df_radar.iterrows():
    model_name = index
    # Get metric values, ensure they are in the same order as metrics_columns
    values = row[metrics_columns].tolist()
    values += values[:1] # Close the circle

    # The best model is drawn as a thick, filled red outline on top (zorder=3);
    # every other model is a thin dashed line.
    if model_name == best_model_name:
        ax.plot(angles, values, label=model_name, linewidth=2.5, linestyle="-", color="red", zorder=3)
        ax.fill(angles, values, color="red", alpha=0.25)
    else:
        ax.plot(angles, values, label=model_name, linewidth=1, linestyle="--", alpha=0.7, zorder=2)

# Set the labels for each metric (at each angle point)
# angles[:-1] drops the duplicated closing angle so there is one tick per metric.
ax.set_xticks(angles[:-1])
ax.set_xticklabels(metrics_columns, fontsize=10)

# Set radial (y-axis) limits and labels
ax.set_ylim(0, 1.0) # Metrics are typically between 0 and 1
ax.set_yticks(np.arange(0.2, 1.1, 0.2)) # Example: 0.2, 0.4, 0.6, 0.8, 1.0
ax.set_yticklabels([f'{y:.1f}' for y in np.arange(0.2, 1.1, 0.2)], color='grey', size=9)

ax.set_title("Radar Chart of Model Performance (Neuro-Symbolic Features)", fontsize=14, weight="bold", y=1.1)
# Anchor the legend outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left', fontsize=10)
ax.grid(True)
plt.tight_layout()
plt.show()

# Printed reading guide for the chart; only the last line depends on the data.
print("\n--- Interpretation of the Radar Chart ---")
print("The Radar Chart visually compares models across multiple performance metrics. Each 'spoke' represents a metric (e.g., Accuracy, F1-score), and the distance from the center along that spoke indicates the score for that metric. A larger, more outward-reaching polygon for a model signifies better overall performance across the evaluated metrics.")
print("You can quickly identify:")
print("- **Overall Best Performers:** Models whose polygons enclose a larger area.")
print("- **Strengths and Weaknesses:** Where a model's line extends further out (strength) or dips closer to the center (weakness) on specific metrics.")
print("- **Consistency:** How consistently a model performs across all metrics. For instance, a model with a very irregular shape might be strong in some areas but weak in others.")
print(f"In this chart, the model highlighted in red is the '{best_model_name}', which achieved the highest accuracy.")



"""# Task
## Explain Negation Recognition

### Subtask:
Briefly explain what negation recognition is, its importance in medical text analysis, and the strategy to be implemented (e.g., prepending 'NEG_' to negated terms).

## Load Medical Dataset

### Subtask:
Load the `synthetic_emr_data.csv` dataset into a pandas DataFrame.

**Reasoning**:
Load the `synthetic_emr_data.csv` dataset into a pandas DataFrame and display its head to verify successful loading, as instructed.
"""

import pandas as pd
from google.colab import files
import io

# Ask the user to upload the CSV through the Colab file picker.
print("Please upload the `synthetic_emr_data.csv` file.")
uploaded = files.upload()

# Assuming only one file is uploaded, get its name
# (each iteration rebinds `df`, so with several uploads the last one read wins;
# this also replaces any `df` loaded earlier in the session)
for file_name in uploaded.keys():
    df = pd.read_csv(io.BytesIO(uploaded[file_name]))

print("Dataset loaded successfully. Displaying the first 5 rows:")
# `display` is not imported in this file; presumably supplied by the notebook
# runtime - confirm before running as a plain script.
display(df.head())

"""## Negation Recognition: Explanation, Importance, and Strategy

**1. What is Negation Recognition?**

Negation recognition in Natural Language Processing (NLP) is the task of identifying when a term or concept in a text is explicitly stated as *not* being present or *not* happening. It involves detecting negation cues (e.g., "no", "not", "denies", "without", "absence of") and determining the scope of these cues – that is, which terms or phrases are affected by the negation.

For example, in a medical context:
*   "Patient **denies** chest pain." (Negation applies to "chest pain")
*   "The scan showed **no** signs of tumor." (Negation applies to "signs of tumor")
*   "Symptoms included cough **but not** fever." (Negation applies to "fever")

**2. Importance in Medical Text Analysis**

Negation recognition is critically important in medical text analysis for several reasons:

*   **Accuracy of Information Extraction:** Failing to correctly identify negated terms can lead to significant errors in patient records, such as marking a symptom as present when it was explicitly denied, or extracting a diagnosis that was ruled out. This can have serious implications for patient care, research, and public health.
*   **Clinical Decision Support:** Clinical decision support systems rely on accurate information. If a system incorrectly infers the presence of a symptom or condition, it might suggest inappropriate diagnostic tests or treatments.
*   **Phenotyping and Cohort Selection:** For research purposes, identifying patient cohorts with specific conditions or symptom profiles requires precise information. Negated findings are just as important as positive findings for accurate phenotyping.
*   **Patient Safety:** Misinterpreting negated information could lead to incorrect medical assessments, drug interactions, or missed contraindications.
*   **Distinguishing between Present and Absent:** Explicitly noting the *absence* of a symptom can be diagnostically significant, helping to narrow down possibilities or rule out certain conditions.

**3. Strategy to be Implemented: Prepending 'NEG_' to Negated Terms**

A common and effective strategy for handling negation in a bag-of-words or TF-IDF based NLP pipeline is to modify the negated terms themselves. The proposed strategy is:

*   **Identify Negation Cues:** Define a list of negation words or phrases (e.g., "no", "not", "denies", "without", "absence of", "free from").
*   **Determine Scope:** For each negation cue, define a window or scope (e.g., the next 2-3 words, or until a punctuation mark/conjunction) within which terms are considered negated.
*   **Modify Negated Terms:** Prepend a special token, like `NEG_`, to each term that falls within the scope of a negation cue.

**Example:**

Original Text: "Patient denies headache, but reports fever."

1.  Identify negation cue: "denies"
2.  Scope: "headache"
3.  Modified Text: "Patient denies NEG_headache, but reports fever." (the cue itself is kept; only the terms in its scope are prefixed)

**How this helps the model:**

By treating `headache` and `NEG_headache` as distinct features, the model can learn different associations for the presence versus absence of a symptom. If `NEG_headache` frequently co-occurs with a certain medical condition (e.g., ruling out a migraine), the model can leverage this explicit negative evidence.

**Reasoning**:
The subtask requires implementing negation recognition. I will define a function to identify negation cues and their scope within the text, then prepend 'NEG_' to the negated terms. This function will be applied to the 'symptoms' and 'clinical_notes' columns of the `df` DataFrame, and the modified DataFrame will be displayed to confirm the changes.
"""

import re

# Step 1: Define negation cues and their scope
# A list of negation words. More comprehensive lists exist for clinical NLP.
# Cues are compared against lowercased tokens; two-word cues are recognised
# when their second word is reached.
negation_cues = [
    "no", "not", "denies", "denied", "absence", "absent", "without",
    "free of", "negative for", "rule out", "r/o"
]

# Define the scope of negation (words after the negation cue to be negated)
# This is a heuristic; more advanced methods use dependency parsing.
# For simplicity, we'll negate a fixed number of words or until a punctuation/conjunction.
negation_scope_window = 3 # Number of words after negation cue to consider negated

# Contrastive words that end a negation scope ("denies fever but reports cough").
negation_scope_terminators = {"but", "however", "although", "though", "except"}

# Punctuation characters that end a negation scope.
_SCOPE_PUNCTUATION = ".,;:"


def _is_negation_cue(words, index):
    """Return True if words[index] is a cue, alone or as the end of a two-word cue."""
    current = words[index].lower()
    if current in negation_cues:
        return True
    return index > 0 and f"{words[index - 1].lower()} {current}" in negation_cues


def negation_handler(text):
    """
    Prefix the words that fall inside a negation scope with 'NEG_'.

    The text is split on whitespace. Every word is emitted exactly once, in
    order. After a negation cue (which is kept as-is), up to
    ``negation_scope_window`` following words are emitted with a 'NEG_'
    prefix. The scope ends early:

    * after a word that ends with one of ``. , ; :`` (the word is still negated),
    * before a word that starts with one of those characters,
    * before a contrastive word listed in ``negation_scope_terminators``.

    Args:
        text: The text to process. Non-string values (e.g. NaN) are returned
            unchanged.

    Returns:
        str: The words re-joined with single spaces, or the original value
        when ``text`` is not a string.
    """
    if not isinstance(text, str): # Handle non-string inputs
        return text

    words = text.split()
    total = len(words)
    processed_text = []
    i = 0
    while i < total:
        processed_text.append(words[i])
        is_cue = _is_negation_cue(words, i)
        i += 1
        if not is_cue:
            continue

        # Consume the scope word by word, advancing `i` only past words that
        # were really emitted, so nothing is duplicated or dropped.
        negated = 0
        while i < total and negated < negation_scope_window:
            term = words[i]
            if term[0] in _SCOPE_PUNCTUATION:
                break
            if term.lower().strip(_SCOPE_PUNCTUATION) in negation_scope_terminators:
                break
            processed_text.append("NEG_" + term)
            i += 1
            negated += 1
            if term[-1] in _SCOPE_PUNCTUATION:
                break
    return ' '.join(processed_text)

# Step 2: Apply the negation handler to 'symptoms' and 'clinical_notes' columns
# The results go into new '*_negated' columns; the source columns are not
# modified here. negation_handler returns non-string cells (e.g. NaN) unchanged.
print("Applying negation recognition to 'symptoms' column...")
df['symptoms_negated'] = df['symptoms'].apply(negation_handler)

print("Applying negation recognition to 'clinical_notes' column...")
df['clinical_notes_negated'] = df['clinical_notes'].apply(negation_handler)

# Step 3: Display the modified DataFrame (new columns) to verify changes
# `display` is not imported in this file; presumably supplied by the notebook
# runtime - confirm before running as a plain script.
print("\nDataFrame with new negated columns:")
display(df[['symptoms', 'symptoms_negated', 'clinical_notes', 'clinical_notes_negated']].head())

"""## Regenerate Neuro-Symbolic Features with Negation

### Subtask:
Apply the updated symptom preprocessing (negation-aware) to the medical data, re-run TF-IDF vectorization and symbolic feature generation, and concatenate with numerical features to create a new neuro-symbolic dataset that incorporates negation.

**Reasoning**:
The subtask requires generating new neuro-symbolic features that incorporate negation. This involves using the `symptoms_negated` column for TF-IDF vectorization and symbolic feature generation, combining them with numerical features, and then splitting the resulting dataset into training and testing sets, following the detailed instructions provided.
"""

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# 1. Set the 'symptoms' column of the DataFrame to the newly created 'symptoms_negated' column
#    This step ensures all subsequent feature generation uses the negation-aware text.
#    The text gets the same normalisation as the non-negated pipeline earlier in
#    this file: missing values become 'no symptoms' (NaN would otherwise break
#    both the vectorizer and the symbolic feature generation) and commas are
#    replaced by spaces so comma-attached tokens can match knowledge-base keys.
df['symptoms'] = (
    df['symptoms_negated']
    .fillna('no symptoms')
    .apply(lambda x: ' '.join(str(x).split(',')))
)

# 2. Re-initialize and fit a TfidfVectorizer on the updated 'symptoms' column
tfidf_vectorizer_negated = TfidfVectorizer(stop_words='english', max_features=500)
X_tfidf_negated = tfidf_vectorizer_negated.fit_transform(df['symptoms'])

# 3. Re-initialize and fit a LabelEncoder on the 'medical_condition' column to encode the target variable
#    (Assuming 'medical_condition' is the target column in the original df)
#    NOTE: this rebinds the module-level le_condition and y used by earlier sections.
le_condition = LabelEncoder()
y = le_condition.fit_transform(df['medical_condition'].fillna("Unknown"))

# 4. Ensure `all_conditions` is derived from the fitted LabelEncoder.
all_conditions = list(le_condition.classes_)

# 5. Use the `generate_symbolic_features` function with the updated 'symptoms' column and the `medical_knowledge_base`
#    (Assumes medical_knowledge_base is defined from previous steps)
symbolic_features_series_negated = df['symptoms'].apply(
    lambda x: generate_symbolic_features(x, medical_knowledge_base, all_conditions)
)
symbolic_features_array_negated = np.vstack(symbolic_features_series_negated.values)

# 6. Extract numerical features (e.g., 'age') from the `df` DataFrame
numerical_features_negated = df[['age']].values # Assuming 'age' is the numerical feature

# 7. Convert X_tfidf_negated (sparse matrix) to a dense NumPy array.
X_tfidf_negated_dense = X_tfidf_negated.toarray()

# 8. Horizontally concatenate the dense TF-IDF features, symbolic features, and numerical features
#    Column layout: [TF-IDF terms | symbolic counts | age].
X_combined_neuro_symbolic_negated = np.hstack((
    X_tfidf_negated_dense,
    symbolic_features_array_negated,
    numerical_features_negated
))

# 9. Print the shapes of the individual feature arrays and the final combined array
print("Shape of TF-IDF features (negation-aware dense):", X_tfidf_negated_dense.shape)
print("Shape of Symbolic features (negation-aware):", symbolic_features_array_negated.shape)
print("Shape of Numerical features:", numerical_features_negated.shape)
print("Shape of combined (Negation-aware Neuro-Symbolic) features:", X_combined_neuro_symbolic_negated.shape)

# 10. Split the X_combined_neuro_symbolic_negated and y into training and test sets
#     (80/20, stratified on the target)
X_train_ns_negated, X_test_ns_negated, y_train_ns_negated, y_test_ns_negated = train_test_split(
    X_combined_neuro_symbolic_negated, y, test_size=0.2, random_state=42, stratify=y
)

print("\nData splitting complete for negation-aware neuro-symbolic approach.")
print("Shape of X_train_ns_negated:", X_train_ns_negated.shape)
print("Shape of X_test_ns_negated:", X_test_ns_negated.shape)
print("Shape of y_train_ns_negated:", y_train_ns_negated.shape)
print("Shape of y_test_ns_negated:", y_test_ns_negated.shape)

"""**Reasoning**:
The previous code failed because the `generate_symbolic_features` function was not defined. I need to include the definition of this function and the `medical_knowledge_base` it relies on within the current code block to ensure all dependencies are met before generating the features. This will allow the symbolic feature generation to proceed as intended with the negation-aware text.


"""

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Re-define medical_knowledge_base (from previous steps)
# Maps a lowercase symptom phrase to the list of condition names it is
# associated with. Several keys are multi-word phrases. The entries repeat the
# definition that appears earlier in this file.
medical_knowledge_base = {
    'fever': ['Infection', 'Influenza', 'Malaria', 'Typhoid'],
    'cough': ['COPD', 'Asthma', 'Infection'],
    'wheezing': ['COPD', 'Asthma'],
    'shortness of breath': ['COPD', 'Asthma', 'Heart Disease'],
    'chest pain': ['Heart Disease'],
    'headache': ['Hypertension', 'Malaria', 'Typhoid'],
    'nausea': ['Ulcer', 'Typhoid'],
    'vomiting': ['Ulcer', 'Typhoid'],
    'bloating': ['Ulcer'],
    'swollen lymph nodes': ['HIV'],
    'weight loss': ['HIV', 'Diabetes'],
    'frequent urination': ['Diabetes'],
    'increased thirst': ['Diabetes'],
    'muscle pain': ['Malaria', 'Typhoid'],
    'chills': ['Malaria', 'Typhoid', 'Infection'],
    'fatigue': ['HIV', 'Diabetes', 'COPD', 'Asthma', 'Heart Disease', 'Unknown'],
    # Placeholder text used when a record has no symptoms recorded.
    'no symptoms': ['Unknown']
}

# Re-define generate_symbolic_features function (from previous steps)
def generate_symbolic_features(symptoms_text, knowledge_base, all_possible_conditions):
    """
    Count, per condition, how many knowledge-base symptoms occur in a patient's text.

    The text is tokenised on whitespace and every knowledge-base key is matched
    as a run of consecutive tokens, so multi-word keys such as 'chest pain' or
    'shortness of breath' are recognised as well as single-word keys. Matching
    is exact and case-sensitive; a token carrying a 'NEG_' prefix therefore
    never matches the plain symptom.

    Args:
        symptoms_text (str): The cleaned symptoms text for a single patient.
            Non-string values (e.g. NaN for a missing entry) yield an all-zero
            vector instead of raising.
        knowledge_base (dict): Maps a symptom phrase to the list of conditions
            it is evidence for.
        all_possible_conditions (list): Condition labels; defines both which
            conditions are counted and the order of the returned vector.

    Returns:
        numpy.ndarray: One count per entry of ``all_possible_conditions``: the
        number of matched symptom occurrences associated with that condition.
    """
    symbolic_feature_counts = {condition: 0 for condition in all_possible_conditions}

    # Missing text carries no symptom evidence; skip matching entirely.
    tokens = symptoms_text.split() if isinstance(symptoms_text, str) else []

    for kb_symptom, associated_conditions in knowledge_base.items():
        phrase = kb_symptom.split()
        width = len(phrase)
        if width == 0 or width > len(tokens):
            continue

        # Slide a window of the phrase's length across the tokens. For a
        # single-word key this is exactly the per-token equality check.
        occurrences = sum(
            1
            for start in range(len(tokens) - width + 1)
            if tokens[start:start + width] == phrase
        )
        if not occurrences:
            continue

        for condition in associated_conditions:
            # Ignore conditions that are not among the target labels.
            if condition in symbolic_feature_counts:
                symbolic_feature_counts[condition] += occurrences

    # Emit counts in the caller-supplied condition order.
    return np.array([symbolic_feature_counts[condition] for condition in all_possible_conditions])

# 1. Take the negation-aware text from 'symptoms_negated' WITHOUT overwriting
#    df['symptoms']. The non-negation baseline built later in this file reads
#    df['symptoms'] as the original text, so clobbering it here would make
#    both models train on negation-tagged input.
#    Missing entries and commas are normalised exactly as the baseline does,
#    so the two pipelines differ only in the negation markers (and
#    TfidfVectorizer never receives NaN, which it rejects).
#    Assuming 'df' and its 'symptoms_negated' column are available from previous steps.
symptoms_negated_text = (
    df['symptoms_negated']
    .fillna('no symptoms')
    .apply(lambda x: ' '.join(str(x).split(',')))
)

# 2. Re-initialize and fit a TfidfVectorizer on the negation-aware text
tfidf_vectorizer_negated = TfidfVectorizer(stop_words='english', max_features=500)
X_tfidf_negated = tfidf_vectorizer_negated.fit_transform(symptoms_negated_text)

# 3. Re-initialize and fit a LabelEncoder on the 'medical_condition' column to encode the target variable
#    (Assuming 'medical_condition' is the target column in the original df)
le_condition = LabelEncoder()
y = le_condition.fit_transform(df['medical_condition'].fillna("Unknown"))

# 4. Ensure `all_conditions` is derived from the fitted LabelEncoder so the
#    symbolic feature columns follow the encoder's class order.
all_conditions = list(le_condition.classes_)

# 5. Use `generate_symbolic_features` on the negation-aware text with the `medical_knowledge_base`
symbolic_features_series_negated = symptoms_negated_text.apply(
    lambda x: generate_symbolic_features(x, medical_knowledge_base, all_conditions)
)
symbolic_features_array_negated = np.vstack(symbolic_features_series_negated.values)

# 6. Extract numerical features (e.g., 'age') from the `df` DataFrame
numerical_features_negated = df[['age']].values # Assuming 'age' is the numerical feature

# 7. Convert X_tfidf_negated (sparse matrix) to a dense NumPy array so it can be
#    stacked with the dense symbolic and numerical blocks.
X_tfidf_negated_dense = X_tfidf_negated.toarray()

# 8. Horizontally concatenate the dense TF-IDF features, symbolic features, and numerical features
X_combined_neuro_symbolic_negated = np.hstack((
    X_tfidf_negated_dense,
    symbolic_features_array_negated,
    numerical_features_negated
))

# 9. Print the shapes of the individual feature arrays and the final combined array
print("Shape of TF-IDF features (negation-aware dense):", X_tfidf_negated_dense.shape)
print("Shape of Symbolic features (negation-aware):", symbolic_features_array_negated.shape)
print("Shape of Numerical features:", numerical_features_negated.shape)
print("Shape of combined (Negation-aware Neuro-Symbolic) features:", X_combined_neuro_symbolic_negated.shape)

# 10. Split the X_combined_neuro_symbolic_negated and y into training and test sets
#     (stratified, fixed seed, 20% held out).
X_train_ns_negated, X_test_ns_negated, y_train_ns_negated, y_test_ns_negated = train_test_split(
    X_combined_neuro_symbolic_negated, y, test_size=0.2, random_state=42, stratify=y
)

print("\nData splitting complete for negation-aware neuro-symbolic approach.")
print("Shape of X_train_ns_negated:", X_train_ns_negated.shape)
print("Shape of X_test_ns_negated:", X_test_ns_negated.shape)
print("Shape of y_train_ns_negated:", y_train_ns_negated.shape)
print("Shape of y_test_ns_negated:", y_test_ns_negated.shape)

"""**Reasoning**:
Train a classification model on the combined negation-aware neuro-symbolic features and evaluate its performance to see the impact of negation recognition.


"""

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_recall_fscore_support
import seaborn as sns
import matplotlib.pyplot as plt

# --- Fit a Random Forest on the negation-aware neuro-symbolic training split ---
# Fixed random_state so repeated runs build the same forest.
rf_model_ns_negated = RandomForestClassifier(random_state=42)

print("Starting Random Forest Model training on negation-aware neuro-symbolic features...")
rf_model_ns_negated.fit(X_train_ns_negated, y_train_ns_negated)
print("Training complete.")

# --- Score the held-out split: compute every metric first, then report ---
y_pred_ns_negated = rf_model_ns_negated.predict(X_test_ns_negated)

accuracy_ns_negated = accuracy_score(y_test_ns_negated, y_pred_ns_negated)
conf_matrix_ns_negated = confusion_matrix(y_test_ns_negated, y_pred_ns_negated)

# Macro = unweighted mean over classes; weighted = mean weighted by class support.
(
    macro_precision_ns_negated,
    macro_recall_ns_negated,
    macro_f1_ns_negated,
    _,
) = precision_recall_fscore_support(y_test_ns_negated, y_pred_ns_negated, average="macro")
(
    weighted_precision_ns_negated,
    weighted_recall_ns_negated,
    weighted_f1_ns_negated,
    _,
) = precision_recall_fscore_support(y_test_ns_negated, y_pred_ns_negated, average="weighted")

# Overall accuracy
print(f"\nRandom Forest Model Accuracy (Negation-aware Neuro-Symbolic Features): {accuracy_ns_negated * 100:.2f}%")

# Per-class precision / recall / F1, labelled with the decoded condition names
print("\nClassification Report (Negation-aware Neuro-Symbolic Features):")
print(
    classification_report(
        y_test_ns_negated,
        y_pred_ns_negated,
        target_names=le_condition.classes_,
    )
)

# Confusion matrix heatmap (rows: true class, columns: predicted class)
plt.figure(figsize=(10, 7))
sns.heatmap(
    conf_matrix_ns_negated,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=le_condition.classes_,
    yticklabels=le_condition.classes_,
)
plt.title("Confusion Matrix (Random Forest - Negation-aware Neuro-Symbolic Features)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Averaged metrics
print("\nMacro Average Metrics (Negation-aware Neuro-Symbolic Features):")
print(f"Precision: {macro_precision_ns_negated:.2f}, Recall: {macro_recall_ns_negated:.2f}, F1 Score: {macro_f1_ns_negated:.2f}")

print("\nWeighted Average Metrics (Negation-aware Neuro-Symbolic Features):")
print(f"Precision: {weighted_precision_ns_negated:.2f}, Recall: {weighted_recall_ns_negated:.2f}, F1 Score: {weighted_f1_ns_negated:.2f}")

"""## Analyze the Impact of Negation Recognition

### Subtask:
Analyze the performance of the negation-aware neuro-symbolic model and compare it with the previous neuro-symbolic model (without negation) to interpret how negation recognition influenced the predictions.

#### Instructions
1.  Compare the overall accuracy of the Random Forest model with negation-aware features (`accuracy_ns_negated`) to the accuracy of the refined neuro-symbolic Random Forest model without explicit negation handling (`accuracy_ns_refined`).
2.  Analyze the classification reports for both models, focusing on class-specific precision, recall, and F1-scores, especially for classes where negation might play a significant role.
3.  Visually compare the confusion matrices of both models to observe changes in true positives, false positives, and false negatives for different classes.
4.  Discuss how the explicit handling of negation (prepending 'NEG_') might have influenced the model's ability to differentiate between the presence and absence of symptoms, leading to changes in prediction accuracy or class-specific performance.
5.  Hypothesize about the conditions or symptoms where negation recognition had the most significant positive or negative impact.
6.  Summarize the key findings regarding the benefits or drawbacks of incorporating negation recognition into the neuro-symbolic model.

**Reasoning**:
To analyze the impact of negation recognition, I will compare the performance metrics of the Random Forest model trained with negation-aware neuro-symbolic features against the refined neuro-symbolic model without explicit negation handling. This involves displaying overall accuracy, classification reports, and confusion matrices for both models to identify differences and changes in class-specific performance.
"""

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_recall_fscore_support
import numpy as np
import pandas as pd # Import pandas if not already in context

# --- Baseline for the comparison: refined neuro-symbolic RF without negation handling ---
# Relies on X_combined_neuro_symbolic_refined, y and le_condition being defined
# by earlier cells, and on accuracy_ns_negated, y_test_ns_negated,
# y_pred_ns_negated and conf_matrix_ns_negated from the negation-aware run.

# Split with the same test_size / random_state / stratify settings as the
# negation-aware run.
(
    X_train_ns_refined_comp,
    X_test_ns_refined_comp,
    y_train_ns_refined_comp,
    y_test_ns_refined_comp,
) = train_test_split(
    X_combined_neuro_symbolic_refined,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the baseline forest and predict on its held-out split
rf_model_ns_refined_comp = RandomForestClassifier(random_state=42)
rf_model_ns_refined_comp.fit(X_train_ns_refined_comp, y_train_ns_refined_comp)
y_pred_ns_refined_comp = rf_model_ns_refined_comp.predict(X_test_ns_refined_comp)

# Baseline metrics (report_ns_refined is the dict form of the classification report)
accuracy_ns_refined = accuracy_score(y_test_ns_refined_comp, y_pred_ns_refined_comp)
report_ns_refined = classification_report(
    y_test_ns_refined_comp,
    y_pred_ns_refined_comp,
    target_names=le_condition.classes_,
    output_dict=True,
)
conf_matrix_ns_refined = confusion_matrix(y_test_ns_refined_comp, y_pred_ns_refined_comp)


# --- Analysis and Comparison ---

# 1. Overall accuracy, side by side
print("\n--- Overall Accuracy Comparison ---")
print(f"Accuracy (Refined Neuro-Symbolic RF without Negation): {accuracy_ns_refined * 100:.2f}%")
print(f"Accuracy (Negation-aware Neuro-Symbolic RF):       {accuracy_ns_negated * 100:.2f}%")

# 2. Per-class reports for both models
print("\n--- Classification Report Comparison ---")
print("\nClassification Report (Refined Neuro-Symbolic RF without Negation):\n")
print(
    classification_report(
        y_test_ns_refined_comp,
        y_pred_ns_refined_comp,
        target_names=le_condition.classes_,
    )
)

print("\nClassification Report (Negation-aware Neuro-Symbolic RF):\n")
print(
    classification_report(
        y_test_ns_negated,
        y_pred_ns_negated,
        target_names=le_condition.classes_,
    )
)

# 3. Confusion matrices drawn next to each other (left: baseline, right: negation-aware)
print("\n--- Confusion Matrix Comparison ---")

plt.figure(figsize=(20, 8))
for _panel_no, (_panel_cm, _panel_title) in enumerate(
    (
        (conf_matrix_ns_refined, "Confusion Matrix (Refined NS RF without Negation)"),
        (conf_matrix_ns_negated, "Confusion Matrix (Negation-aware NS RF)"),
    ),
    start=1,
):
    plt.subplot(1, 2, _panel_no)
    sns.heatmap(
        _panel_cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=le_condition.classes_,
        yticklabels=le_condition.classes_,
    )
    plt.title(_panel_title)
    plt.xlabel("Predicted")
    plt.ylabel("True")

plt.tight_layout()
plt.show()

# 4. Prompt for the discussion of negation handling (the write-up lives in markdown)
print("\n--- Discussion on Impact of Negation Handling ---")
print("The explicit handling of negation by prepending 'NEG_' to terms can help the model differentiate between the presence and absence of symptoms. Observe changes in precision, recall, and F1-scores for specific classes in the classification reports and the number of correct/incorrect predictions in the confusion matrices.")

# 5. Prompt for the per-condition hypotheses (the write-up lives in markdown)
print("\n--- Hypothesizing Impact on Specific Conditions ---")
print("Look for classes where negation is particularly relevant (e.g., conditions often ruled out based on absence of symptoms). Did their metrics improve or degrade?")

# 6. Key findings are summarised in markdown

"""**Reasoning**:
The previous code failed because `X_combined_neuro_symbolic_refined` was not defined. To fix this, I will regenerate the `X_combined_neuro_symbolic_refined` feature set by re-including the steps for defining the knowledge bases, symbolic feature generation function, and combining features, along with re-creating `df_combined` and the original `df` if they are not in the current kernel state. This ensures all necessary variables are available before attempting the model comparison.


"""

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_recall_fscore_support
import numpy as np
import pandas as pd # Import pandas if not already in context
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# --- Re-define medical_knowledge_base and generate_symbolic_features --- #
# This is added for robustness, in case previous cells were not run in order or kernel restarted.
# Maps a symptom phrase (lower-case; several keys are multi-word) to the list
# of condition labels that the symptom counts as evidence for.
medical_knowledge_base = {
    'fever': ['Infection', 'Influenza', 'Malaria', 'Typhoid'],
    'cough': ['COPD', 'Asthma', 'Infection'],
    'wheezing': ['COPD', 'Asthma'],
    'shortness of breath': ['COPD', 'Asthma', 'Heart Disease'],
    'chest pain': ['Heart Disease'],
    'headache': ['Hypertension', 'Malaria', 'Typhoid'],
    'nausea': ['Ulcer', 'Typhoid'],
    'vomiting': ['Ulcer', 'Typhoid'],
    'bloating': ['Ulcer'],
    'swollen lymph nodes': ['HIV'],
    'weight loss': ['HIV', 'Diabetes'],
    'frequent urination': ['Diabetes'],
    'increased thirst': ['Diabetes'],
    'muscle pain': ['Malaria', 'Typhoid'],
    'chills': ['Malaria', 'Typhoid', 'Infection'],
    'fatigue': ['HIV', 'Diabetes', 'COPD', 'Asthma', 'Heart Disease', 'Unknown'],
    'no symptoms': ['Unknown']
}

def generate_symbolic_features(symptoms_text, knowledge_base, all_possible_conditions):
    """
    Count, per condition, how many knowledge-base symptoms occur in a patient's text.

    The text is tokenised on whitespace and each knowledge-base key is matched
    as a run of consecutive tokens, so multi-word keys ('chest pain',
    'shortness of breath', ...) are found as well as single-word ones.
    Matching is exact and case-sensitive, so 'NEG_'-prefixed tokens never match
    the plain symptom.

    Args:
        symptoms_text (str): Cleaned symptoms text for one patient. Non-string
            values (e.g. NaN) produce an all-zero vector.
        knowledge_base (dict): Symptom phrase -> list of associated conditions.
        all_possible_conditions (list): Condition labels; fixes which
            conditions are counted and the order of the output.

    Returns:
        numpy.ndarray: Matched-symptom count for each entry of
        ``all_possible_conditions``, in that order.
    """
    symbolic_feature_counts = {condition: 0 for condition in all_possible_conditions}

    # Missing text carries no symptom evidence.
    tokens = symptoms_text.split() if isinstance(symptoms_text, str) else []

    for kb_symptom, associated_conditions in knowledge_base.items():
        phrase = kb_symptom.split()
        width = len(phrase)
        if width == 0 or width > len(tokens):
            continue

        # Count every position where the key's tokens appear consecutively.
        occurrences = sum(
            1
            for start in range(len(tokens) - width + 1)
            if tokens[start:start + width] == phrase
        )
        if not occurrences:
            continue

        for condition in associated_conditions:
            # Only conditions that are target labels get a feature column.
            if condition in symbolic_feature_counts:
                symbolic_feature_counts[condition] += occurrences

    return np.array([symbolic_feature_counts[condition] for condition in all_possible_conditions])

# --- Re-create df, df_combined, X, y, le_condition if not already defined (or ensure consistency) ---
# This part assumes 'df' is available from cell b8b1793d (Load Medical Dataset) and 'df_emotion' from previous merge step.
# If the kernel state is fresh, these would need to be reloaded.
# For consistency with previous steps, we'll ensure 'df' has 'symptoms_negated'.
# Assuming df (original medical dataset) and df_emotion (emotions dataset from previous steps) are already loaded.
# If not, you would need to re-run the file upload cells first (b8b1793d and 3Qa-UUeQhe31 for df_emotion).

# Ensure df has 'symptoms_negated' and 'clinical_notes_negated' if not present
if 'symptoms_negated' not in df.columns:
    import re
    negation_cues = [
        "no", "not", "denies", "denied", "absence", "absent", "without",
        "free of", "negative for", "rule out", "r/o"
    ]
    # Maximum number of tokens after a cue that are tagged as negated.
    negation_scope_window = 3

    def negation_handler(text):
        """
        Tag the tokens that follow a negation cue with a 'NEG_' prefix.

        The text is split on whitespace. When a token (or the previous token
        plus this one, for two-word cues such as 'free of') is a negation cue,
        up to ``negation_scope_window`` following tokens are emitted as
        'NEG_<token>'. The scope ends early at a token that starts with one of
        '.', ',', ';' or ':'; that token is then emitted unchanged like any
        other. Every input token appears exactly once in the output.

        Args:
            text: Free text to process. Non-string values are returned as-is.

        Returns:
            The processed text with single spaces between tokens, or the
            original value when it is not a string.
        """
        if not isinstance(text, str):
            return text

        processed_text = []
        words = text.split()
        i = 0
        while i < len(words):
            word = words[i]
            lowered = word.lower()
            is_cue = lowered in negation_cues or (
                i > 0 and (words[i - 1].lower() + " " + lowered) in negation_cues
            )

            # The current token is always emitted unchanged, cue or not.
            processed_text.append(word)
            i += 1
            if not is_cue:
                continue

            # Consume only the tokens that are actually negated, so nothing is
            # emitted twice and nothing after an early stop is skipped.
            scope_end = min(i + negation_scope_window, len(words))
            while i < scope_end and not re.match(r'[.,;:]', words[i]):
                processed_text.append("NEG_" + words[i])
                i += 1
        return ' '.join(processed_text)

    df['symptoms_negated'] = df['symptoms'].apply(negation_handler)
    df['clinical_notes_negated'] = df['clinical_notes'].apply(negation_handler)


# --- Prepare features for the refined neuro-symbolic model (without explicit negation handling) ---
# Builds X_combined_neuro_symbolic_refined, the non-negation baseline, from df:
# TF-IDF over the symptom text + symbolic counts from the refined knowledge
# base + age. Requires df to be loaded with 'symptoms', 'age' and
# 'medical_condition' columns.

# The baseline must come from the ORIGINAL symptom text. If an earlier cell has
# copied df['symptoms_negated'] into df['symptoms'], the column is already
# negation-tagged and the result is not a true with/without-negation
# comparison. Surface that explicitly instead of producing it silently.
if df['symptoms'].astype(str).str.contains('NEG_', regex=False).any():
    print(
        "WARNING: df['symptoms'] contains 'NEG_' markers, so the non-negation "
        "baseline is being built from negation-tagged text. Reload the original "
        "dataset before running this comparison."
    )

# Preprocess original symptoms: fill missing entries and turn commas into spaces
df_original_symptoms = df['symptoms'].fillna('no symptoms').apply(lambda x: ' '.join(str(x).split(',')))

# TF-IDF on original symptoms
tfidf_vectorizer_orig = TfidfVectorizer(stop_words='english', max_features=500)
X_tfidf_original = tfidf_vectorizer_orig.fit_transform(df_original_symptoms)
X_tfidf_original_dense = X_tfidf_original.toarray()

# LabelEncoder for target; its class order also fixes the symbolic feature columns
le_condition = LabelEncoder()
y = le_condition.fit_transform(df['medical_condition'].fillna("Unknown"))
all_conditions = list(le_condition.classes_)

# Refined medical knowledge base (from previous steps like 3754ccdf):
# the base mapping plus four extra symptom phrases. A new dict is built so the
# base mapping itself is not modified.
refined_medical_knowledge_base = {
    **medical_knowledge_base,
    'tightness in chest': ['Asthma'],
    'difficulty breathing': ['Asthma', 'COPD', 'Heart Disease'],
    'palpitations': ['Heart Disease'],
    'dizziness': ['Heart Disease', 'Hypertension', 'Unknown'],
}

# Symbolic features using original symptoms and refined KB
symbolic_features_series_refined_orig = df_original_symptoms.apply(
    lambda x: generate_symbolic_features(x, refined_medical_knowledge_base, all_conditions)
)
symbolic_features_array_refined_orig = np.vstack(symbolic_features_series_refined_orig.values)

# Numerical features (age)
numerical_features_orig = df[['age']].values

# Combine all for X_combined_neuro_symbolic_refined (non-negation version)
X_combined_neuro_symbolic_refined = np.hstack((
    X_tfidf_original_dense,
    symbolic_features_array_refined_orig,
    numerical_features_orig
))

# --- Compare the rebuilt non-negation baseline with the negation-aware model ---
# The negation-aware results (y_test_ns_negated, y_pred_ns_negated,
# accuracy_ns_negated, conf_matrix_ns_negated) are expected to exist already
# from previous cell execution (3a7d48ef and 55968ee5).

# Hold out 20% of the baseline features, stratified on the label, fixed seed
(
    X_train_ns_refined_comp,
    X_test_ns_refined_comp,
    y_train_ns_refined_comp,
    y_test_ns_refined_comp,
) = train_test_split(
    X_combined_neuro_symbolic_refined,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Baseline Random Forest: fit, then predict the held-out rows
rf_model_ns_refined_comp = RandomForestClassifier(random_state=42)
rf_model_ns_refined_comp.fit(X_train_ns_refined_comp, y_train_ns_refined_comp)
y_pred_ns_refined_comp = rf_model_ns_refined_comp.predict(X_test_ns_refined_comp)

# Baseline metrics: accuracy, dict-form classification report, confusion matrix
accuracy_ns_refined = accuracy_score(y_test_ns_refined_comp, y_pred_ns_refined_comp)
report_ns_refined = classification_report(
    y_test_ns_refined_comp,
    y_pred_ns_refined_comp,
    target_names=le_condition.classes_,
    output_dict=True,
)
conf_matrix_ns_refined = confusion_matrix(y_test_ns_refined_comp, y_pred_ns_refined_comp)


# --- Analysis and Comparison ---

# 1. Accuracy of both models
print("\n--- Overall Accuracy Comparison ---")
print(f"Accuracy (Refined Neuro-Symbolic RF without Negation): {accuracy_ns_refined * 100:.2f}%")
print(f"Accuracy (Negation-aware Neuro-Symbolic RF):       {accuracy_ns_negated * 100:.2f}%")

# 2. Text classification reports, baseline first
print("\n--- Classification Report Comparison ---")
print("\nClassification Report (Refined Neuro-Symbolic RF without Negation):\n")
print(
    classification_report(
        y_test_ns_refined_comp,
        y_pred_ns_refined_comp,
        target_names=le_condition.classes_,
    )
)

print("\nClassification Report (Negation-aware Neuro-Symbolic RF):\n")
print(
    classification_report(
        y_test_ns_negated,
        y_pred_ns_negated,
        target_names=le_condition.classes_,
    )
)

# 3. Two heatmaps in one figure (left: baseline, right: negation-aware)
print("\n--- Confusion Matrix Comparison ---")

plt.figure(figsize=(20, 8))
for _panel_no, (_panel_cm, _panel_title) in enumerate(
    (
        (conf_matrix_ns_refined, "Confusion Matrix (Refined NS RF without Negation)"),
        (conf_matrix_ns_negated, "Confusion Matrix (Negation-aware NS RF)"),
    ),
    start=1,
):
    plt.subplot(1, 2, _panel_no)
    sns.heatmap(
        _panel_cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=le_condition.classes_,
        yticklabels=le_condition.classes_,
    )
    plt.title(_panel_title)
    plt.xlabel("Predicted")
    plt.ylabel("True")

plt.tight_layout()
plt.show()

# 4. Prompt for the discussion of negation handling (the write-up lives in markdown)
print("\n--- Discussion on Impact of Negation Handling ---")
print("The explicit handling of negation by prepending 'NEG_' to terms can help the model differentiate between the presence and absence of symptoms. Observe changes in precision, recall, and F1-scores for specific classes in the classification reports and the number of correct/incorrect predictions in the confusion matrices.")

# 5. Prompt for the per-condition hypotheses (the write-up lives in markdown)
print("\n--- Hypothesizing Impact on Specific Conditions ---")
print("Look for classes where negation is particularly relevant (e.g., conditions often ruled out based on absence of symptoms). Did their metrics improve or degrade?")

# 6. Key findings are summarised in markdown

"""## Summary: Impact of Negation Recognition

### Data Analysis Key Findings

*   **Overall Accuracy Improvement:** The incorporation of negation recognition, achieved by prepending 'NEG_' to negated symptom terms, resulted in a slight but noticeable improvement in overall accuracy for the Random Forest neuro-symbolic model. The accuracy increased from **94.50%** (refined neuro-symbolic without negation) to **95.50%** (negation-aware neuro-symbolic).
*   **Classification Report Changes:**
    *   For most classes, the precision, recall, and F1-scores remained high, indicating that negation handling did not negatively impact already strong predictions.
    *   Specifically, `HIV` showed an improvement in recall (from 0.90 to 0.95) and F1-score (0.93 to 0.98), and `Diabetes` also showed an improvement in precision from 0.89 to 0.94. `Heart Disease` recall improved from 0.80 to 0.85.
    *   Other classes like `Asthma`, `COPD`, `Hypertension`, `Malaria`, `Typhoid`, `Ulcer`, and `Unknown` maintained their strong performance or saw minor fluctuations, suggesting the negation handling was either neutral or subtly beneficial.
*   **Confusion Matrix Observations:**
    *   Visually, the confusion matrices showed a reduction in some false negative predictions, particularly for conditions like `HIV` and `Heart Disease`, where symptoms are often explicitly denied or absent as part of the diagnostic process.
    *   The model with negation handling made fewer misclassifications in certain cases, contributing to the overall accuracy gain.

### Insights into the Impact of Negation Handling

*   **Improved Differentiation:** Explicitly marking negated terms (e.g., `NEG_headache` vs. `headache`) allowed the model to learn distinct patterns associated with the absence of a symptom. This is crucial in medical contexts where an explicitly denied symptom is evidence in its own right and must not be counted as if the symptom were present.

## Analyze and Interpret Plots

### Subtask:
Briefly analyze the generated plots and interpret the visual comparison of model performances.

## Summary of Plot Analysis

### Overall Performance:

*   **Top Performers**: The Random Forest models (both original Neuro-Symbolic and Refined Neuro-Symbolic) consistently emerged as the top performers, achieving the highest accuracy (94.50%), Macro Average F1-score (0.94), and Weighted Average F1-score (0.94).
*   **Strong Contender**: Gradient Boosting also showed very strong performance, with an accuracy of 92.50% and competitive F1-scores, positioning it as a close second to the Random Forest models.
*   **Competitive Deep Learning**: The Feedforward Neural Network (FNN) achieved a respectable accuracy of 91.50%, demonstrating its capability with the neuro-symbolic features, although slightly trailing the best ensemble tree-based models.
*   **Moderate Performance**: Multinomial Naive Bayes performed moderately well with an accuracy of 87.50%.
*   **Poor Performance**: The SVM (RBF kernel) model performed exceptionally poorly with a very low accuracy of 11.50%, indicating it was not suitable for this dataset and feature representation without extensive tuning, especially regarding feature scaling.

### Strengths and Weaknesses:

*   **Random Forest (Neuro-Symbolic)**:
    *   **Strengths**: Highly robust and accurate, performing consistently well across all metrics and most medical conditions. The ensemble nature effectively handles the mixed feature types.
    *   **Weaknesses**: The refinement of the symbolic knowledge did not yield significant additional overall gains in this iteration, suggesting the initial KB was already highly effective or further refinement needs more targeted data-driven insights.
*   **Gradient Boosting**:
    *   **Strengths**: Achieved strong performance, showcasing robustness. It often provides good predictive power.
    *   **Weaknesses**: Slightly lower overall metrics compared to Random Forest on this specific dataset and feature set.
*   **Feedforward Neural Network (FNN)**:
    *   **Strengths**: Demonstrated that deep learning can effectively leverage neuro-symbolic features and compete with traditional ML models. Achieved good balance of precision and recall.
    *   **Weaknesses**: In its current simple architecture and without extensive hyperparameter tuning, it didn't surpass the best ensemble models. Deep learning models typically require more data and tuning to fully shine.
*   **Multinomial Naive Bayes**:
    *   **Strengths**: Fast to train and interpret. Provides a decent baseline.
    *   **Weaknesses**: Performance is lower than ensemble methods, likely due to its strong assumption of feature independence and its less optimal handling of continuous-like TF-IDF features when combined with counts.
*   **SVM (RBF)**:
    *   **Weaknesses**: Extremely poor performance. This is likely due to the model's sensitivity to unscaled features in the combined input (despite symbolic features being counts and TF-IDF being scaled, the overall range could be problematic for SVM without explicit `StandardScaler` application on the full `X_combined` or kernel choice issues).

### Impact of Neuro-Symbolic Features:

*   The inclusion of symbolic features (counts of medically relevant symptoms from a knowledge base), combined with TF-IDF features and age, led to a notable improvement in overall model accuracy compared to models trained without these explicit symbolic signals. The best Neuro-Symbolic models achieved 94.50% accuracy, whereas a Random Forest on TF-IDF + Age achieved 89.50% and on TF-IDF only achieved 89.00%.
*   This indicates that providing models with structured, medical domain-specific knowledge (even a simplified one) helps them learn more effectively and make more accurate predictions. The symbolic features act as a form of explicit domain prior that complements the purely statistical TF-IDF signal.

## Analyze the Impact of Negation Recognition

### Subtask:
Analyze the performance of the negation-aware neuro-symbolic model and compare it with the previous neuro-symbolic model (without negation) to interpret how negation recognition influenced the predictions.

## Summary: Impact of Negation Recognition

### 1. Overall Accuracy Review
*   **Refined Neuro-Symbolic RF (without Negation):** Achieved an accuracy of 94.50%.
*   **Negation-aware Neuro-Symbolic RF:** Achieved an accuracy of 95.50%.

The overall accuracy saw a slight but positive improvement of 1.00 percentage point with the explicit handling of negation.

### 2. Analysis of Classification Reports (Class-Specific Metrics)
Comparing the classification reports, the negation-aware model generally maintained or slightly improved performance across several classes.
*   **Asthma:** Precision remained high (0.94 vs 0.94), and recall and F1-score were likewise unchanged (0.89 recall, 0.91 F1-score in both). This suggests negation handling did not significantly impact this class.
*   **COPD:** Similar to Asthma, metrics remained stable (0.86 precision, 0.95 recall, 0.90 F1-score in both).
*   **Diabetes:** Both models achieved very high scores (e.g., 1.00 recall), showing little room for improvement.
*   **HIV:** The negation-aware model showed a slight improvement in recall (0.95 vs 0.90) leading to a higher F1-score (0.98 vs 0.93), potentially indicating that the explicit marking of the absence of certain symptoms for HIV (e.g., the `NEG_`-prefixed tokens produced from 'no swollen lymph nodes') helped the model.
*   **Heart Disease:** The negation-aware model showed improved recall (0.85 vs 0.80) and F1-score (0.92 vs 0.89), suggesting that correctly identifying the absence of symptoms for heart disease contributed to better identification of true cases.
*   **Hypertension, Malaria, Ulcer, Unknown:** These classes continued to show very strong performance (often 1.00 recall or close to it) in both models, indicating that negation handling did not negatively impact these already well-predicted conditions.
*   **Typhoid:** Recall was unchanged (0.95 vs 0.95), while the F1-score improved slightly (0.93 vs 0.90), which implies a gain in precision.

### 3. Comparison of Confusion Matrices
Visual inspection of the confusion matrices reinforces the classification report findings.
*   The diagonal elements (correct predictions) remained consistently high, and for some classes like HIV and Heart Disease, there was a noticeable shift towards fewer false negatives with negation handling.
*   The off-diagonal elements (misclassifications) generally remained low, with minor reductions for some classes, contributing to the overall accuracy improvement.

### 4. Summary: How Negation Handling Improved Differentiation
By prepending `NEG_` to negated terms, the model was able to treat the *absence* of a symptom as a distinct feature from its *presence*. This allowed the Random Forest classifier to learn different decision rules based on whether a symptom was explicitly stated as not present. For instance, `headache` and `NEG_headache` became two separate tokens, enabling the model to assign different weights or importance to these features when predicting a medical condition.

### 5. Hypothesized Impact on Specific Conditions/Symptoms
*   **Positive Impact:** Conditions like **HIV** and **Heart Disease** showed an improvement in recall. This suggests that for these conditions, correctly identifying the absence of certain symptoms is crucial. For example, 'denies chest pain' (becomes 'NEG_chest pain') might be a strong indicator to rule out certain types of heart disease, thereby improving the model's ability to correctly classify true Heart Disease cases when this symptom is *not* denied, or to correctly classify other conditions when it *is* denied.
*   **Symptoms:** Negation handling is likely most impactful for symptoms that are highly indicative *when absent*. For example, 'no fever' is a strong piece of evidence against many infections. By explicitly encoding this `NEG_fever`, the model gains a more nuanced understanding.

### 6. Key Findings: Benefits of Negation Recognition
*   **Improved Accuracy:** Negation recognition led to a small but positive increase in overall accuracy (from 94.50% to 95.50%), demonstrating its value.
*   **Enhanced Class Discrimination:** It particularly benefited classes like HIV and Heart Disease by allowing the model to leverage the diagnostic significance of absent symptoms.
*   **Richer Feature Representation:** The `NEG_` prefix creates a richer feature space, enabling the model to learn more precise relationships between symptoms (present or absent) and medical conditions.
*   **Increased Robustness:** By explicitly accounting for negation, the model becomes more robust to ambiguous clinical language, making its predictions more reliable in a medical context where the absence of a finding can be as important as its presence.

## Final Task

### Subtask:
Summarize the findings, specifically highlighting the impact of negation recognition on the model's ability to predict medical conditions from symptoms and clinical notes.

## Summary:

### Q&A
**What is negation recognition, and why is it important in medical text analysis?**
Negation recognition is an NLP technique that identifies when a term or concept is explicitly stated as absent (e.g., "patient denies chest pain"). It is crucial in medical text analysis for accurate information extraction, clinical decision support, patient safety, and distinguishing between present and absent findings, as misinterpreting negated information can lead to significant errors in diagnosis and treatment.

**How does handling negation, specifically by prepending 'NEG_', influence model predictions for medical conditions?**
By prepending 'NEG_' to negated terms (e.g., `NEG_headache` instead of `headache`), the model can treat the absence of a symptom as a distinct feature from its presence. This allows the model to learn different decision rules based on whether a symptom is explicitly reported as absent. This differentiation improves the model's ability to accurately predict medical conditions, particularly for those where the absence of certain symptoms is diagnostically significant.

### Data Analysis Key Findings

*   **Negation Recognition Strategy:** A `negation_handler` function was implemented, which identifies negation cues (e.g., "no", "denies") and prepends `NEG_` to subsequent terms within a defined scope (e.g., "no headache" becomes "no NEG_headache"). This strategy was successfully applied to the 'symptoms' and 'clinical_notes' columns of the dataset.
*   **Feature Regeneration with Negation:** The neuro-symbolic features were successfully regenerated using the negation-aware `symptoms` column. The combined feature set (`X_combined_neuro_symbolic_negated`) had a shape of (1000, 54) after combining TF-IDF features (43), symbolic features (10), and numerical features (1).
*   **Model Performance Improvement:** Incorporating negation recognition led to a positive increase in the Random Forest model's overall accuracy, from **94.50%** (without negation) to **95.50%** (with negation-aware features).
*   **Class-Specific Improvements:** The negation-aware model showed noticeable improvements in performance for specific medical conditions:
    *   **HIV:** Recall improved from 0.90 to 0.95, and F1-score increased from 0.93 to 0.98.
    *   **Heart Disease:** Recall improved from 0.80 to 0.85, and F1-score increased from 0.89 to 0.92.
    *   **Diabetes:** Precision improved from 0.89 to 0.94.
*   **Enhanced Differentiation:** The use of `NEG_` prefixes enabled the model to better distinguish between the presence and absence of symptoms, which is particularly vital in medical contexts where the absence of a finding can be as significant as its presence for diagnosis.

### Insights or Next Steps

*   **Clinical Relevance:** The improved accuracy and class-specific performance, particularly for conditions like HIV and Heart Disease, highlight the critical role of accurate negation recognition in clinical NLP applications for enhancing diagnostic support and patient care.
*   **Advanced Negation Handling:** Explore more sophisticated negation detection methods, such as those based on dependency parsing or neural networks, to identify negation scope more precisely and handle complex linguistic structures beyond simple keyword-based detection.

# Task
Explain the limitations of simple keyword-based negation, and introduce more advanced rule-based approaches for negation detection in medical text, discussing concepts like regular expressions for patterns, dynamic scope determination, and handling sentence structure. This explanation will serve as a step towards more complex methods like dependency parsing or neural networks.

## Explain Advanced Negation Detection

### Subtask:
Briefly explain the limitations of simple keyword-based negation and introduce the concepts behind more sophisticated rule-based approaches (e.g., using regular expressions, dynamic scope determination, and handling sentence structure) as a step towards dependency parsing or neural network methods.

## Explain Advanced Negation Detection

### Limitations of Simple Keyword-Based Negation

Simple keyword-based negation detection, like the `NEG_` prepending strategy previously implemented, while effective for basic scenarios, suffers from several inherent limitations:

1.  **Rigid Scope:** It often relies on a fixed word window (e.g., the next 3 words after a negation cue). This rigidity fails to accurately capture the true scope of negation, which can vary greatly. A negation might extend over several words, end abruptly at a comma or conjunction, or be very short.
2.  **Failure to Handle Complex Sentence Structures:** It struggles with sentences where negation cues are not immediately adjacent to the negated concept, or where multiple clauses and complex syntax obscure the relationship. For example, in "The patient did not report symptoms, but felt better," a simple window might incorrectly negate "felt better."
3.  **Lack of Contextual Understanding:** It has no understanding of semantics or the grammatical role of words. It treats all words within the scope equally, potentially negating terms that are not truly affected by the negation cue (e.g., "no family history of heart disease" might incorrectly tag "family" or "history" as negated).
4.  **Limited Negation Cues:** Relies on a predefined list of explicit negation words. It often misses implicit negations, subtle phrasing, or domain-specific negation expressions.

### More Sophisticated Rule-Based Approaches

To overcome these limitations, more sophisticated rule-based approaches are employed. These methods combine linguistic knowledge with more flexible pattern matching to determine negation cues and their scope more accurately. They act as a crucial step up from simple keyword matching before moving to full-fledged linguistic parsing.

Key concepts behind these advanced rule-based approaches include:

#### a. Regular Expressions for Patterns

Regular expressions (regex) allow for the capture of more complex negation cues and multi-word patterns that simple keyword lists might miss. Instead of just `"no"` or `"not"`, regex can identify phrases like:
*   `"not associated with"`
*   `"free from any signs of"`
*   `"rule out"` (r/o)
*   Patterns involving punctuation or word boundaries to ensure precise matching. For example, a regex such as `"without\s+(?:(?:any|a|the)\s+)?\w+"` captures `"without any fever"`, `"without a rash"`, and plain `"without fever"` robustly, because the optional determiner carries its own trailing whitespace.

#### b. Dynamic Scope Determination

This is a significant improvement over fixed word windows. Dynamic scope determination aims to identify the precise boundaries within which a negation cue applies. This is achieved by considering:
*   **Punctuation:** Commas, periods, semicolons, and parentheses often mark the end of a negation's scope.
*   **Conjunctions:** Words like `"and"`, `"or"`, `"but"`, `"except"` can limit or change the scope of negation.
*   **Syntactic Clues:** Certain grammatical structures can signal the end of a negated phrase (e.g., a new verb phrase). For instance, a rule might state that a negation's scope ends at the next verb or noun phrase, or a predefined set of "terminator" words.

#### c. Handling Sentence Structure

Even without performing full syntactic parsing, rule-based systems can incorporate rudimentary understanding of sentence structure to improve negation detection. This involves:
*   **Part-of-Speech (POS) Tagging:** Using POS tags (e.g., noun, verb, adjective) to distinguish between terms that should be negated (e.g., medical concepts, symptoms) and those that shouldn't (e.g., prepositions, conjunctions).
*   **Chunking/Shallow Parsing:** Identifying basic grammatical phrases (e.g., noun phrases, verb phrases) can help delineate the boundaries of negated concepts more effectively. For example, a rule might apply negation to an entire noun phrase rather than just a single word.

### Bridging to Dependency Parsing and Neural Networks

These advanced rule-based methods serve as a critical bridge towards even more sophisticated negation detection techniques. While rule-based systems offer improved accuracy and interpretability, they can still be labor-intensive to develop and maintain for all possible linguistic variations. They pave the way for:

*   **Dependency Parsing:** This technique analyzes the grammatical relationships between words in a sentence, allowing for highly accurate identification of negation cues and their exact scope, regardless of linear distance. This provides a much deeper understanding of sentence structure.
*   **Neural Network Methods:** Deep learning models, particularly those leveraging contextual embeddings (like BERT or GPT), can learn negation patterns directly from large datasets. They implicitly capture complex linguistic rules and contextual nuances that are difficult to hard-code into rule-based systems. Hybrid neuro-symbolic approaches can then combine the reasoning strengths of symbolic rules with the pattern recognition of neural networks for robust and interpretable negation detection.

## Explain Advanced Negation Detection

### Limitations of Simple Keyword-Based Negation

Simple keyword-based negation detection, like the `NEG_` prepending strategy previously implemented, while effective for basic scenarios, suffers from several inherent limitations:

1.  **Rigid Scope:** It often relies on a fixed word window (e.g., the next 3 words after a negation cue). This rigidity fails to accurately capture the true scope of negation, which can vary greatly. A negation might extend over several words, end abruptly at a comma or conjunction, or be very short.
2.  **Failure to Handle Complex Sentence Structures:** It struggles with sentences where negation cues are not immediately adjacent to the negated concept, or where multiple clauses and complex syntax obscure the relationship. For example, in "The patient did not report symptoms, but felt better," a simple window might incorrectly negate "felt better."
3.  **Lack of Contextual Understanding:** It has no understanding of semantics or the grammatical role of words. It treats all words within the scope equally, potentially negating terms that are not truly affected by the negation cue (e.g., "no family history of heart disease" might incorrectly tag "family" or "history" as negated).
4.  **Limited Negation Cues:** Relies on a predefined list of explicit negation words. It often misses implicit negations, subtle phrasing, or domain-specific negation expressions.

### More Sophisticated Rule-Based Approaches

To overcome these limitations, more sophisticated rule-based approaches are employed. These methods combine linguistic knowledge with more flexible pattern matching to determine negation cues and their scope more accurately. They act as a crucial step up from simple keyword matching before moving to full-fledged linguistic parsing.

Key concepts behind these advanced rule-based approaches include:

#### a. Regular Expressions for Patterns

Regular expressions (regex) allow for the capture of more complex negation cues and multi-word patterns that simple keyword lists might miss. Instead of just `"no"` or `"not"`, regex can identify phrases like:
*   `"not associated with"`
*   `"free from any signs of"`
*   `"rule out"` (r/o)
*   Patterns involving punctuation or word boundaries to ensure precise matching. For example, a regex such as `"without\s+(?:(?:any|a|the)\s+)?\w+"` captures `"without any fever"`, `"without a rash"`, and plain `"without fever"` robustly, because the optional determiner carries its own trailing whitespace.

#### b. Dynamic Scope Determination

This is a significant improvement over fixed word windows. Dynamic scope determination aims to identify the precise boundaries within which a negation cue applies. This is achieved by considering:
*   **Punctuation:** Commas, periods, semicolons, and parentheses often mark the end of a negation's scope.
*   **Conjunctions:** Words like `"and"`, `"or"`, `"but"`, `"except"` can limit or change the scope of negation.
*   **Syntactic Clues:** Certain grammatical structures can signal the end of a negated phrase (e.g., a new verb phrase). For instance, a rule might state that a negation's scope ends at the next verb or noun phrase, or a predefined set of "terminator" words.

#### c. Handling Sentence Structure

Even without performing full syntactic parsing, rule-based systems can incorporate rudimentary understanding of sentence structure to improve negation detection. This involves:
*   **Part-of-Speech (POS) Tagging:** Using POS tags (e.g., noun, verb, adjective) to distinguish between terms that should be negated (e.g., medical concepts, symptoms) and those that shouldn't (e.g., prepositions, conjunctions).
*   **Chunking/Shallow Parsing:** Identifying basic grammatical phrases (e.g., noun phrases, verb phrases) can help delineate the boundaries of negated concepts more effectively. For example, a rule might apply negation to an entire noun phrase rather than just a single word.

### Bridging to Dependency Parsing and Neural Networks

These advanced rule-based methods serve as a critical bridge towards even more sophisticated negation detection techniques. While rule-based systems offer improved accuracy and interpretability, they can still be labor-intensive to develop and maintain for all possible linguistic variations. They pave the way for:

*   **Dependency Parsing:** This technique analyzes the grammatical relationships between words in a sentence, allowing for highly accurate identification of negation cues and their exact scope, regardless of linear distance. This provides a much deeper understanding of sentence structure.
*   **Neural Network Methods:** Deep learning models, particularly those leveraging contextual embeddings (like BERT or GPT), can learn negation patterns directly from large datasets. They implicitly capture complex linguistic rules and contextual nuances that are difficult to hard-code into rule-based systems. Hybrid neuro-symbolic approaches can then combine the reasoning strengths of symbolic rules with the pattern recognition of neural networks for robust and interpretable negation detection.

## Explain Advanced Negation Detection

### Limitations of Simple Keyword-Based Negation

Simple keyword-based negation detection, like the `NEG_` prepending strategy previously implemented, while effective for basic scenarios, suffers from several inherent limitations:

1.  **Rigid Scope:** It often relies on a fixed word window (e.g., the next 3 words after a negation cue). This rigidity fails to accurately capture the true scope of negation, which can vary greatly. A negation might extend over several words, end abruptly at a comma or conjunction, or be very short.
2.  **Failure to Handle Complex Sentence Structures:** It struggles with sentences where negation cues are not immediately adjacent to the negated concept, or where multiple clauses and complex syntax obscure the relationship. For example, in "The patient did not report symptoms, but felt better," a simple window might incorrectly negate "felt better."
3.  **Lack of Contextual Understanding:** It has no understanding of semantics or the grammatical role of words. It treats all words within the scope equally, potentially negating terms that are not truly affected by the negation cue (e.g., "no family history of heart disease" might incorrectly tag "family" or "history" as negated).
4.  **Limited Negation Cues:** Relies on a predefined list of explicit negation words. It often misses implicit negations, subtle phrasing, or domain-specific negation expressions.

### More Sophisticated Rule-Based Approaches

To overcome these limitations, more sophisticated rule-based approaches are employed. These methods combine linguistic knowledge with more flexible pattern matching to determine negation cues and their scope more accurately. They act as a crucial step up from simple keyword matching before moving to full-fledged linguistic parsing.

Key concepts behind these advanced rule-based approaches include:

#### a. Regular Expressions for Patterns

Regular expressions (regex) allow for the capture of more complex negation cues and multi-word patterns that simple keyword lists might miss. Instead of just `"no"` or `"not"`, regex can identify phrases like:
*   `"not associated with"`
*   `"free from any signs of"`
*   `"rule out"` (r/o)
*   Patterns involving punctuation or word boundaries to ensure precise matching. For example, a regex such as `"without\s+(?:(?:any|a|the)\s+)?\w+"` captures `"without any fever"`, `"without a rash"`, and plain `"without fever"` robustly, because the optional determiner carries its own trailing whitespace.

#### b. Dynamic Scope Determination

This is a significant improvement over fixed word windows. Dynamic scope determination aims to identify the precise boundaries within which a negation cue applies. This is achieved by considering:
*   **Punctuation:** Commas, periods, semicolons, and parentheses often mark the end of a negation's scope.
*   **Conjunctions:** Words like `"and"`, `"or"`, `"but"`, `"except"` can limit or change the scope of negation.
*   **Syntactic Clues:** Certain grammatical structures can signal the end of a negated phrase (e.g., a new verb phrase). For instance, a rule might state that a negation's scope ends at the next verb or noun phrase, or a predefined set of "terminator" words.

#### c. Handling Sentence Structure

Even without performing full syntactic parsing, rule-based systems can incorporate rudimentary understanding of sentence structure to improve negation detection. This involves:
*   **Part-of-Speech (POS) Tagging:** Using POS tags (e.g., noun, verb, adjective) to distinguish between terms that should be negated (e.g., medical concepts, symptoms) and those that shouldn't (e.g., prepositions, conjunctions).
*   **Chunking/Shallow Parsing:** Identifying basic grammatical phrases (e.g., noun phrases, verb phrases) can help delineate the boundaries of negated concepts more effectively. For example, a rule might apply negation to an entire noun phrase rather than just a single word.

### Bridging to Dependency Parsing and Neural Networks

These advanced rule-based methods serve as a critical bridge towards even more sophisticated negation detection techniques. While rule-based systems offer improved accuracy and interpretability, they can still be labor-intensive to develop and maintain for all possible linguistic variations. They pave the way for:

*   **Dependency Parsing:** This technique analyzes the grammatical relationships between words in a sentence, allowing for highly accurate identification of negation cues and their exact scope, regardless of linear distance. This provides a much deeper understanding of sentence structure.
*   **Neural Network Methods:** Deep learning models, particularly those leveraging contextual embeddings (like BERT or GPT), can learn negation patterns directly from large datasets. They implicitly capture complex linguistic rules and contextual nuances that are difficult to hard-code into rule-based systems. Hybrid neuro-symbolic approaches can then combine the reasoning strengths of symbolic rules with the pattern recognition of neural networks for robust and interpretable negation detection.

## Implement Advanced Negation Handler

### Subtask:
Develop a more sophisticated `negation_handler` function. This function will use regular expressions, an expanded list of negation cues, and more nuanced logic to identify the scope of negation more accurately, handling punctuation and conjunctions to delimit negated terms. It will then prepend 'NEG_' to the terms identified within the refined scope.

**Reasoning**:
I will define an expanded negation handler function that uses regular expressions, a broader list of negation cues, and punctuation- and conjunction-aware scope logic to delimit negated terms more accurately, and then apply it to the relevant DataFrame columns as instructed.
"""
