In [1]:
# Report the interpreter this kernel is actually running.
# A shell lookup (`which python`) resolves against the subshell's PATH, which
# may point at a different Python than the kernel and is not portable.
import sys

print(sys.executable)

/Users/yaskhanloo/Developer/nlp-stroke/.venv-py39/bin/python


In [2]:
import spacy



In [3]:
# Example sentence written in the style of a clinical note
text = "The patient had a right-sided ischemic stroke with an NIHSS score of 8 and was treated with aspirin."

# Run spaCy's small English pipeline over the sentence
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Per-token view: surface text, POS tag, dependency label and lemma
print("🧩 Token Details:")
for token in doc:
    print(f"{token.text:<15} | POS: {token.pos_:<10} | Dep: {token.dep_:<15} | Lemma: {token.lemma_}")

# Entity spans detected by the pipeline, each with its label
print("\n🏷️ Named Entities:")
for ent in doc.ents:
    print(f"{ent.text:<30} | Label: {ent.label_}")

🧩 Token Details:
The             | POS: DET        | Dep: det             | Lemma: the
patient         | POS: NOUN       | Dep: nsubj           | Lemma: patient
had             | POS: VERB       | Dep: ROOT            | Lemma: have
a               | POS: DET        | Dep: det             | Lemma: a
right           | POS: ADV        | Dep: advmod          | Lemma: right
-               | POS: PUNCT      | Dep: punct           | Lemma: -
sided           | POS: ADJ        | Dep: amod            | Lemma: sided
ischemic        | POS: ADJ        | Dep: amod            | Lemma: ischemic
stroke          | POS: NOUN       | Dep: dobj            | Lemma: stroke
with            | POS: ADP        | Dep: prep            | Lemma: with
an              | POS: DET        | Dep: det             | Lemma: an
NIHSS           | POS: ADJ        | Dep: amod            | Lemma: nihss
score           | POS: NOUN       | Dep: pobj            | Lemma: score
of              | POS: ADP        | Dep: prep           

In [7]:
from spacy.matcher import Matcher

# Load model and create a rule-based matcher that shares its vocabulary
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Example clinical note
text = "The patient had a left-sided ischemic stroke with an NIHSS score of 7 and received aspirin."

doc = nlp(text)

# --- Pattern for NIHSS score: "NIHSS score of <number>" ---
# LIKE_NUM also accepts spelled-out or fractional numbers ("seven", "7.5"),
# so the matched value is validated again before it is converted to int below.
pattern_nihss = [
    {"LOWER": "nihss"},
    {"LOWER": "score"},
    {"LOWER": "of"},
    {"LIKE_NUM": True}
]
matcher.add("NIHSS_SCORE", [pattern_nihss])

# --- Patterns for stroke side ---
# One pattern covers both sides via IN instead of duplicating it per side.
# NOTE(review): the IS_ALPHA "*" wildcard is unbounded, so any run of alphabetic
# tokens between the side word and "stroke" is accepted -- confirm this is intended.
pattern_side_stroke = [
    {"LOWER": {"IN": ["left", "right"]}},
    {"ORTH": "-", "OP": "?"},  # Optional hyphen
    {"LOWER": "sided", "OP": "?"},  # Optional "sided"
    {"IS_ALPHA": True, "OP": "*"},  # Optional modifiers, e.g. "ischemic"
    {"LOWER": "stroke"}
]
pattern_side_hemisphere = [
    {"LOWER": {"IN": ["left", "right"]}},
    {"LOWER": "hemisphere"}
]
matcher.add("STROKE_SIDE", [pattern_side_stroke, pattern_side_hemisphere])

# --- Pattern for medication names ---
pattern_medications = [
    {"LOWER": {"IN": ["aspirin", "clopidogrel", "alteplase"]}}
]
matcher.add("MEDICATION", [pattern_medications])

# --- Apply matcher ---
matches = matcher(doc)

for match_id, start, end in matches:
    span = doc[start:end]
    print(f"🔎 {nlp.vocab.strings[match_id]:<12} | {span.text}")

# ---- Build structured result ----
results = {
    "stroke_side": None,
    "nihss": None,
    "medications": []
}

for match_id, start, end in matches:
    label = nlp.vocab.strings[match_id]
    span = doc[start:end]

    # Compare against None explicitly: an NIHSS score of 0 is a valid value
    # and must not be treated as "not found yet".
    if label == "STROKE_SIDE" and results["stroke_side"] is None:
        results["stroke_side"] = span.text

    elif label == "NIHSS_SCORE" and results["nihss"] is None:
        # The pattern fixes the number as the last token of the match.
        score_text = span[-1].text
        # Only plain decimal digits convert safely; number words or decimals
        # would make int() raise, so they are skipped and nihss stays None.
        if score_text.isdecimal():
            results["nihss"] = int(score_text)

    elif label == "MEDICATION":
        med = span.text.lower()
        if med not in results["medications"]:
            results["medications"].append(med)

# Print structured results
print("\n🧾 Extracted Data:")
print(results)

🔎 STROKE_SIDE  | left-sided ischemic stroke
🔎 NIHSS_SCORE  | NIHSS score of 7
🔎 MEDICATION   | aspirin

🧾 Extracted Data:
{'stroke_side': 'left-sided ischemic stroke', 'nihss': 7, 'medications': ['aspirin']}
