In [None]:
# ===== notebooks/01_exploration.ipynb =====
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 🧪 Stroke Report NLP Exploration\n",
    "\n",
    "This notebook explores the classical (rule-based) NLP pipeline for extracting keywords from German stroke radiology reports: regex keyword rules combined with spaCy NER.\n",
    "\n",
    "## Setup and Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import sys\n",
    "import time\n",
    "from collections import Counter\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "\n",
    "sys.path.append('..')  # Add parent directory to import our modules\n",
    "\n",
    "# Our custom modules\n",
    "from extractor.preprocessing import TextPreprocessor\n",
    "from extractor.keyword_rules import KeywordExtractor\n",
    "from extractor.spacy_ner_wrapper import SpacyNERExtractor\n",
    "\n",
    "# Configure plotting\n",
    "plt.style.use('seaborn-v0_8')\n",
    "sns.set_palette(\"husl\")\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Initialize NLP Components"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the three pipeline stages once; later cells reuse these instances\n",
    "preprocessor, keyword_extractor, ner_extractor = (\n",
    "    TextPreprocessor(),\n",
    "    KeywordExtractor(),\n",
    "    SpacyNERExtractor(),\n",
    ")\n",
    "\n",
    "print(\"✅ NLP components initialized successfully!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Sample Data Creation and Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Synthetic German stroke-intervention reports, keyed by report id.\n",
    "# The texts are kept verbatim, including their surrounding whitespace.\n",
    "report_texts_by_id = {\n",
    "    'report_001': \"\"\"\n",
    "        Patient wurde in Allgemeinanästhesie behandelt. Beginn der Intervention um 08:32 Uhr.\n",
    "        Verwendung des Trevo Stentretriever Systems. rtPA wurde um 07:45 verabreicht.\n",
    "        Mechanische Thrombektomie mit SOFIA Katheter durchgeführt.\n",
    "        Finales Ergebnis: TICI 3. Keine Komplikationen aufgetreten.\n",
    "        \"\"\",\n",
    "    'report_002': \"\"\"\n",
    "        Sedierung für die Prozedur. Start: 09:15 Uhr mit Aspiration using Penumbra System.\n",
    "        Catch Mini device eingesetzt. Urokinase als Thrombolytikum verwendet.\n",
    "        Leichte Blutung nach der Intervention beobachtet. TICI 2b erreicht.\n",
    "        \"\"\",\n",
    "    'report_003': \"\"\"\n",
    "        Lokale Anästhesie für den Eingriff. Beginn: 10:20. \n",
    "        Solitaire Stentretriever verwendet für die mechanische Rekanalisation.\n",
    "        Heparin antikoagulation. Embotrap als backup device.\n",
    "        Perforation der Gefäßwand aufgetreten. TICI 1 Ergebnis.\n",
    "        \"\"\",\n",
    "    'report_004': \"\"\"\n",
    "        Vollnarkose eingeleitet um 11:45 Uhr. Tenecteplase i.v. verabreicht.\n",
    "        Mechanische Thrombektomie mit Trevo Stentretriever und SOFIA Aspiration.\n",
    "        Zusätzlich Aspirin und Heparin gegeben. TICI 2a erreicht.\n",
    "        Postinterventionelles Hämatom festgestellt.\n",
    "        \"\"\",\n",
    "    'report_005': \"\"\"\n",
    "        Patient unter Sedierung. Interventionsbeginn: 14:30.\n",
    "        Primäre Aspiration mit Penumbra System erfolgreich.\n",
    "        Keine Medikation erforderlich. Komplikationsloser Verlauf.\n",
    "        Finales TICI 3 Ergebnis nach mechanischer Embolektomie.\n",
    "        \"\"\",\n",
    "}\n",
    "\n",
    "# Same list-of-dicts shape ({'id', 'text'}) that the rest of the notebook iterates over\n",
    "sample_reports = [\n",
    "    {'id': report_id, 'text': report_text}\n",
    "    for report_id, report_text in report_texts_by_id.items()\n",
    "]\n",
    "\n",
    "print(f\"Created {len(sample_reports)} sample reports for analysis\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Text Preprocessing Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze preprocessing effects: raw vs. cleaned text for the first few reports\n",
    "PREVIEW_REPORTS = 2   # how many sample reports to show\n",
    "PREVIEW_CHARS = 100   # how many leading characters of each text to print\n",
    "\n",
    "print(\"=== PREPROCESSING ANALYSIS ===\")\n",
    "print()\n",
    "\n",
    "for report in sample_reports[:PREVIEW_REPORTS]:\n",
    "    original = report['text']\n",
    "    cleaned = preprocessor.clean_text(original)\n",
    "    \n",
    "    print(f\"📄 Report {report['id']}:\")\n",
    "    print(f\"Original length: {len(original)} chars\")\n",
    "    print(f\"Cleaned length: {len(cleaned)} chars\")\n",
    "    print(f\"Original: {original[:PREVIEW_CHARS]}...\")\n",
    "    print(f\"Cleaned: {cleaned[:PREVIEW_CHARS]}...\")\n",
    "    print(\"-\" * 50)\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Keyword Extraction Pattern Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the rule-based extractor over every sample report (one result record each)\n",
    "extraction_results = [\n",
    "    keyword_extractor.extract_all(preprocessor.clean_text(sample['text']), sample['id'])\n",
    "    for sample in sample_reports\n",
    "]\n",
    "\n",
    "# One row per report for tabular analysis\n",
    "results_df = pd.DataFrame(extraction_results)\n",
    "\n",
    "print(\"=== KEYWORD EXTRACTION RESULTS ===\")\n",
    "print(results_df.to_string())\n",
    "print()\n",
    "\n",
    "# Per-category coverage: share of reports with a non-null value in that column\n",
    "print(\"=== EXTRACTION STATISTICS ===\")\n",
    "categories = ['anesthesia', 'medication', 'treatment_method', 'device', 'tici_score', 'complications']\n",
    "n_reports = len(results_df)\n",
    "\n",
    "for category in categories:\n",
    "    hits = results_df[category].notna().sum()\n",
    "    pct = (hits / n_reports) * 100\n",
    "    label = category.replace('_', ' ').title()\n",
    "    print(f\"{label}: {hits}/{n_reports} reports ({pct:.1f}%)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Pattern Matching Deep Dive"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _show_pattern_matches(pattern, text):\n",
    "    \"\"\"Print the case-insensitive re.findall result of `pattern` on `text` and return it.\"\"\"\n",
    "    found = re.findall(pattern, text, re.IGNORECASE)\n",
    "    print(f\"  Pattern '{pattern}' → {found if found else 'No matches'}\")\n",
    "    return found\n",
    "\n",
    "\n",
    "# Analyze individual pattern performance\n",
    "print(\"=== PATTERN MATCHING ANALYSIS ===\")\n",
    "print()\n",
    "\n",
    "# Pool all sample texts into one cleaned corpus to test against\n",
    "test_text = \" \".join(report['text'] for report in sample_reports)\n",
    "cleaned_test_text = preprocessor.clean_text(test_text)\n",
    "\n",
    "# Test each category's patterns\n",
    "for category, patterns in keyword_extractor.patterns.items():\n",
    "    print(f\"🔍 {category.upper()}:\")\n",
    "    total_matches = sum(\n",
    "        len(_show_pattern_matches(pattern, cleaned_test_text)) for pattern in patterns\n",
    "    )\n",
    "    print(f\"  Total matches: {total_matches}\")\n",
    "    print()\n",
    "\n",
    "# Test time patterns separately\n",
    "print(\"⏰ TIME PATTERNS:\")\n",
    "for pattern in keyword_extractor.time_patterns:\n",
    "    _show_pattern_matches(pattern, cleaned_test_text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. spaCy NER Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze spaCy NER performance\n",
    "print(\"=== spaCy NER ANALYSIS ===\")\n",
    "print()\n",
    "\n",
    "# Accumulators across all reports\n",
    "all_entities = []\n",
    "all_noun_phrases = []\n",
    "\n",
    "for report in sample_reports:\n",
    "    cleaned_text = preprocessor.clean_text(report['text'])\n",
    "    \n",
    "    # Entities and noun phrases found in this report\n",
    "    entities = ner_extractor.extract_entities(cleaned_text)\n",
    "    noun_phrases = ner_extractor.extract_noun_phrases(cleaned_text)\n",
    "    \n",
    "    all_entities += entities\n",
    "    all_noun_phrases += noun_phrases\n",
    "    \n",
    "    entity_texts = [ent['text'] for ent in entities]\n",
    "    print(f\"📄 {report['id']}:\")\n",
    "    print(f\"  Entities: {entity_texts}\")\n",
    "    print(f\"  Noun phrases: {noun_phrases[:5]}...\")  # Show first 5\n",
    "    print()\n",
    "\n",
    "# Entity statistics: how often each label occurs over all reports\n",
    "if not all_entities:\n",
    "    print(\"No entities found by spaCy NER\")\n",
    "else:\n",
    "    entity_counts = Counter(ent['label'] for ent in all_entities)\n",
    "    \n",
    "    print(\"Entity Label Distribution:\")\n",
    "    for label, count in entity_counts.most_common():\n",
    "        print(f\"  {label}: {count}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Visualization and Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create visualizations: a 2x2 dashboard summarising the extraction results\n",
    "fig, axes = plt.subplots(2, 2, figsize=(15, 10))\n",
    "fig.suptitle('Stroke Report NLP Analysis', fontsize=16, fontweight='bold')\n",
    "\n",
    "categories = ['anesthesia', 'medication', 'treatment_method', 'device', 'tici_score', 'complications']\n",
    "\n",
    "# True where a category was extracted. DataFrame.notna() tests each cell as a whole, so a\n",
    "# list-valued cell counts once; scalar pd.notna(cell) returns an array for a list, which\n",
    "# makes `if pd.notna(cell)` raise for multi-element lists.\n",
    "found = results_df[categories].notna()\n",
    "\n",
    "# 1. Extraction coverage by category\n",
    "ax1 = axes[0, 0]\n",
    "coverages = [(found[cat].sum() / len(results_df)) * 100 for cat in categories]\n",
    "\n",
    "bars = ax1.bar(range(len(categories)), coverages, color=sns.color_palette(\"husl\", len(categories)))\n",
    "ax1.set_title('Extraction Coverage by Category')\n",
    "ax1.set_ylabel('Coverage (%)')\n",
    "ax1.set_xticks(range(len(categories)))\n",
    "ax1.set_xticklabels([cat.replace('_', '\\n') for cat in categories], rotation=45, ha='right')\n",
    "ax1.set_ylim(0, 110)  # headroom so the value label above a 100% bar stays inside the axes\n",
    "\n",
    "# Add value labels on bars\n",
    "for bar, coverage in zip(bars, coverages):\n",
    "    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 2, \n",
    "             f'{coverage:.0f}%', ha='center', va='bottom')\n",
    "\n",
    "# 2. Text length distribution (raw, uncleaned report texts)\n",
    "ax2 = axes[0, 1]\n",
    "text_lengths = [len(report['text']) for report in sample_reports]\n",
    "ax2.hist(text_lengths, bins=5, color='lightblue', edgecolor='black', alpha=0.7)\n",
    "ax2.set_title('Report Text Length Distribution')\n",
    "ax2.set_xlabel('Character Count')\n",
    "ax2.set_ylabel('Frequency')\n",
    "\n",
    "# 3. Extracted items count: number of categories found per report\n",
    "ax3 = axes[1, 0]\n",
    "items_per_report = found.sum(axis=1).tolist()\n",
    "\n",
    "ax3.bar(range(1, len(items_per_report) + 1), items_per_report, \n",
    "        color='lightgreen', edgecolor='black')\n",
    "ax3.set_title('Extracted Items per Report')\n",
    "ax3.set_xlabel('Report Number')\n",
    "ax3.set_ylabel('Number of Extracted Items')\n",
    "ax3.set_xticks(range(1, len(items_per_report) + 1))\n",
    "\n",
    "# 4. TICI score distribution\n",
    "ax4 = axes[1, 1]\n",
    "# A cell may hold a single score or a list of scores; flatten to one list\n",
    "flat_tici = []\n",
    "for score in results_df['tici_score'].dropna():\n",
    "    if isinstance(score, list):\n",
    "        flat_tici.extend(score)\n",
    "    else:\n",
    "        flat_tici.append(score)\n",
    "\n",
    "# Guard on the flattened list so cells holding only empty lists fall through to the text\n",
    "if flat_tici:\n",
    "    tici_counts = Counter(flat_tici)\n",
    "    ax4.pie(list(tici_counts.values()), labels=list(tici_counts.keys()),\n",
    "            autopct='%1.1f%%', startangle=90)\n",
    "else:\n",
    "    ax4.text(0.5, 0.5, 'No TICI scores\\nfound', ha='center', va='center', \n",
    "             transform=ax4.transAxes, fontsize=12)\n",
    "ax4.set_title('TICI Score Distribution')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Pattern Refinement and Testing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _collect_matches(patterns, text):\n",
    "    \"\"\"Return all case-insensitive re.findall hits of every pattern in `text`, in pattern order.\"\"\"\n",
    "    hits = []\n",
    "    for pattern in patterns:\n",
    "        hits.extend(re.findall(pattern, text, re.IGNORECASE))\n",
    "    return hits\n",
    "\n",
    "\n",
    "# Test pattern variations and improvements\n",
    "print(\"=== PATTERN REFINEMENT TESTING ===\")\n",
    "print()\n",
    "\n",
    "# Test different anesthesia pattern variations\n",
    "test_sentences = [\n",
    "    \"Patient in Allgemeinanästhesie\",\n",
    "    \"Unter Vollnarkose durchgeführt\",\n",
    "    \"Lokale Betäubung verwendet\",\n",
    "    \"Sedierung während der Prozedur\",\n",
    "    \"ITN eingeleitet\"  # Intubationsnarkose\n",
    "]\n",
    "\n",
    "# Current patterns\n",
    "current_anesthesia_patterns = keyword_extractor.patterns['anesthesia']\n",
    "\n",
    "# Enhanced patterns.\n",
    "# Groups are non-capturing (?:...) on purpose: with a capturing group re.findall returns\n",
    "# only the group's text ('allgemein', 'voll', 'e', or '' when an optional group is absent)\n",
    "# instead of the whole matched term.\n",
    "enhanced_anesthesia_patterns = [\n",
    "    r'\\b(?:allgemein)?anästhesie\\b',\n",
    "    r'\\b(?:voll)?narkose\\b',\n",
    "    r'\\blokal(?:e|anästhesie)\\b',\n",
    "    r'\\bsedierung\\b',\n",
    "    r'\\bbetäubung\\b',\n",
    "    r'\\bitn\\b',  # Intubationsnarkose\n",
    "    r'\\bintubation\\b'\n",
    "]\n",
    "\n",
    "print(\"Testing anesthesia pattern variations:\")\n",
    "for sentence in test_sentences:\n",
    "    print(f\"\\nSentence: '{sentence}'\")\n",
    "    \n",
    "    # Both pattern sets are run against the lower-cased sentence\n",
    "    current_matches = _collect_matches(current_anesthesia_patterns, sentence.lower())\n",
    "    enhanced_matches = _collect_matches(enhanced_anesthesia_patterns, sentence.lower())\n",
    "    \n",
    "    print(f\"  Current patterns: {current_matches if current_matches else 'No matches'}\")\n",
    "    print(f\"  Enhanced patterns: {enhanced_matches if enhanced_matches else 'No matches'}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. Performance Metrics and Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate performance metrics\n",
    "print(\"=== PERFORMANCE EVALUATION ===\")\n",
    "print()\n",
    "\n",
    "# Processing time analysis: time the full pipeline once per report.\n",
    "# time.perf_counter() is a monotonic high-resolution clock meant for measuring short\n",
    "# durations; `time` is imported in the notebook's import cell.\n",
    "processing_times = []\n",
    "for report in sample_reports:\n",
    "    start_time = time.perf_counter()\n",
    "    \n",
    "    # Full pipeline (outputs are discarded; only the elapsed time matters here)\n",
    "    cleaned_text = preprocessor.clean_text(report['text'])\n",
    "    keyword_extractor.extract_all(cleaned_text, report['id'])  # same call shape as the extraction cell\n",
    "    ner_extractor.extract_entities(cleaned_text)\n",
    "    \n",
    "    processing_times.append(time.perf_counter() - start_time)\n",
    "\n",
    "print(f\"Average processing time per report: {np.mean(processing_times):.3f} seconds\")\n",
    "print(f\"Total processing time for {len(sample_reports)} reports: {sum(processing_times):.3f} seconds\")\n",
    "print()\n",
    "\n",
    "# Coverage analysis\n",
    "# `found` is True where a category has a non-null value. DataFrame.notna() tests each cell\n",
    "# as a whole, so list-valued cells are handled; scalar pd.notna(cell) would return an array.\n",
    "found = results_df[categories].notna()\n",
    "n_reports = len(results_df)\n",
    "\n",
    "print(\"COVERAGE ANALYSIS:\")\n",
    "total_extractions = 0\n",
    "possible_extractions = n_reports * len(categories)\n",
    "\n",
    "for category in categories:\n",
    "    extracted = found[category].sum()\n",
    "    total_extractions += extracted\n",
    "    print(f\"{category}: {extracted}/{n_reports} ({extracted/n_reports*100:.1f}%)\")\n",
    "\n",
    "overall_coverage = (total_extractions / possible_extractions) * 100\n",
    "print(f\"\\nOverall extraction coverage: {overall_coverage:.1f}%\")\n",
    "print()\n",
    "\n",
    "# Quality assessment (manual review needed): show the first three reports\n",
    "print(\"QUALITY ASSESSMENT (Sample Review):\")\n",
    "review_rows = results_df.head(3).iterrows()\n",
    "review_flags = found.head(3).iterrows()\n",
    "for (_, row), (_, present) in zip(review_rows, review_flags):\n",
    "    print(f\"\\nReport {row['report_id']}:\")\n",
    "    for category in categories:\n",
    "        if present[category]:\n",
    "            print(f\"  {category}: {row[category]} ✓\")\n",
    "        else:\n",
    "            print(f\"  {category}: Not found\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10. Export and Summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export results for further analysis\n",
    "OUTPUT_PATH = '../output/exploration_results.csv'\n",
    "# DataFrame.to_csv does not create missing parent folders, so make sure the target exists\n",
    "os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)\n",
    "results_df.to_csv(OUTPUT_PATH, index=False)\n",
    "print(f\"✅ Results exported to '{OUTPUT_PATH}'\")\n",
    "\n",
    "# Summary statistics (timings and coverage come from the performance-evaluation cell)\n",
    "print(\"\\n=== FINAL SUMMARY ===\")\n",
    "print(f\"📊 Processed {len(sample_reports)} stroke reports\")\n",
    "print(f\"⚡ Average processing time: {np.mean(processing_times):.3f}s per report\")\n",
    "print(f\"🎯 Overall extraction coverage: {overall_coverage:.1f}%\")\n",
    "# total_extractions counts non-null (report, category) cells; it is not de-duplicated\n",
    "print(f\"📝 Total extracted fields: {total_extractions}\")\n",
    "\n",
    "print(\"\\n🏆 BEST PERFORMING CATEGORIES:\")\n",
    "# (category, coverage %) pairs, best first\n",
    "category_performance = sorted(\n",
    "    ((cat, (results_df[cat].notna().sum() / len(results_df)) * 100) for cat in categories),\n",
    "    key=lambda pair: pair[1],\n",
    "    reverse=True,\n",
    ")\n",
    "\n",
    "for cat, perf in category_performance:\n",
    "    print(f\"  {cat.replace('_', ' ').title()}: {perf:.1f}%\")\n",
    "\n",
    "print(\"\\n💡 RECOMMENDATIONS:\")\n",
    "print(\"1. Expand pattern libraries for low-performing categories\")\n",
    "print(\"2. Add fuzzy matching for common medical abbreviations\")\n",
    "print(\"3. Consider context-aware extraction for ambiguous terms\")\n",
    "print(\"4. Implement confidence scoring for manual review prioritization\")\n",
    "print(\"5. Add support for multi-language medical terms (English/German mix)\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}