In [None]:
!pip -q install lxml symspellpy

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.6/159.6 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os, pathlib

# Working-folder layout: everything the notebook creates lives under BASE.
BASE = "/content/mesh_build"
DICT_DIR = f"{BASE}/dicts"    # generated dictionary files
TOOLS_DIR = f"{BASE}/tools"   # the helper script written by a later cell

# Create both sub-folders (and BASE itself) if they are not there yet.
for _workdir in (DICT_DIR, TOOLS_DIR):
    pathlib.Path(_workdir).mkdir(parents=True, exist_ok=True)

print("Base:", BASE)


Base: /content/mesh_build


In [None]:
import shutil
from google.colab import files, output

# Opens the browser upload dialog: choose descYYYY.xml and/or suppYYYY.xml (gz also ok).
uploaded = files.upload()
uploaded_names = list(uploaded.keys())

# The move source is the bare file name (a path relative to the current
# working directory); each uploaded file is relocated under BASE.
for name in uploaded_names:
    shutil.move(name, f"{BASE}/{name}")

print("Uploaded:", uploaded_names)


Saving supp2026.xml to supp2026.xml
Uploaded: ['supp2026.xml']


In [None]:
%%writefile {TOOLS_DIR}/mesh_to_symspell.py
import argparse, os, re, csv, gzip
from collections import Counter
from lxml import etree

# Letter runs, optionally joined by single interior hyphens (e.g. "Non-Hodgkin").
# Hyphens are only accepted *between* letters, so fragments such as "beta-"
# (from "beta-1,4-glucan") or "alpha--beta" never enter the vocabulary.
TOKEN_RE = re.compile(r"[A-Za-z]+(?:-[A-Za-z]+)*")

def open_any(path):
    """Open `path` for binary reading, decompressing on the fly for `.gz` names.

    The decision is based solely on the file-name suffix.
    """
    if path.endswith(".gz"):
        return gzip.open(path, "rb")
    return open(path, "rb")

# XPath (relative to a record) of every term string; shared by both record kinds.
_TERM_STRING_PATH = ".//ConceptList/Concept/TermList/Term/String"


def _emit_string(text, add_phrase, add_token):
    """Report one string: the stripped phrase, then its lowercased tokens of length >= 3.

    Missing (`None`), empty, or whitespace-only text is ignored.
    """
    if not text:
        return
    phrase = text.strip()
    if not phrase:
        return
    add_phrase(phrase)
    for tok in TOKEN_RE.findall(phrase):
        if len(tok) >= 3:
            add_token(tok.lower())


def _parse_records(xml_path, record_tag, name_path, add_phrase, add_token):
    """Stream `record_tag` elements from `xml_path` and report their name and term strings.

    For each record, the first element matching `name_path` is reported first,
    followed by every element matching the shared term-string path.
    """
    with open_any(xml_path) as f:
        for _, rec in etree.iterparse(f, events=("end",), tag=(record_tag,)):
            name_el = rec.find(name_path)
            if name_el is not None:
                _emit_string(name_el.text, add_phrase, add_token)
            for term in rec.findall(_TERM_STRING_PATH):
                _emit_string(term.text, add_phrase, add_token)
            # Drop the record's content and its already-processed preceding
            # siblings so the in-memory tree does not keep growing.
            rec.clear()
            while rec.getprevious() is not None:
                del rec.getparent()[0]


def parse_desc_terms(xml_path, add_phrase, add_token):
    """Extract phrases and tokens from the `DescriptorRecord` elements of a file.

    Args:
        xml_path: Path to the XML file; a `.gz` path is accepted (see `open_any`).
        add_phrase: Callback invoked with each stripped name/term string.
        add_token: Callback invoked with each lowercased token of length >= 3.
    """
    _parse_records(xml_path, "DescriptorRecord", ".//DescriptorName/String",
                   add_phrase, add_token)


def parse_supp_terms(xml_path, add_phrase, add_token):
    """Extract phrases and tokens from the `SupplementalRecord` elements of a file.

    Args:
        xml_path: Path to the XML file; a `.gz` path is accepted (see `open_any`).
        add_phrase: Callback invoked with each stripped name/term string.
        add_token: Callback invoked with each lowercased token of length >= 3.
    """
    _parse_records(xml_path, "SupplementalRecord", ".//SupplementalRecordName/String",
                   add_phrase, add_token)

def build(args):
    """Parse the input XML files and write the phrase, word and SymSpell files.

    Args:
        args: Namespace-like object with
            inputs:    iterable of XML paths to parse,
            out_dir:   folder for the output files (created if missing),
            min_count: minimum token frequency to keep (anything `int()` accepts;
                       values below 1 are treated as 1).

    Writes into `args.out_dir`:
        mesh_phrases.txt   - one unique phrase per line, sorted
        pubmed_words.txt   - one kept token per line, sorted
        medical_vocab.tsv  - "token<TAB>count" per line, sorted by token
    """
    os.makedirs(args.out_dir, exist_ok=True)
    phrases = set()
    counts = Counter()

    def add_phrase(s): phrases.add(s)
    def add_token(tok): counts[tok] += 1

    # Parse each input file
    for path in args.inputs:
        # Decide on the file name alone: a parent directory that happens to
        # contain "supp" must not send a descriptor file to the wrong parser.
        file_name = os.path.basename(path).lower()
        if "supp" in file_name:
            parse_supp_terms(path, add_phrase, add_token)
        else:
            parse_desc_terms(path, add_phrase, add_token)

    # Optional filter to shrink vocabulary size (>=1 keeps everything)
    min_count = max(1, int(args.min_count))
    if min_count > 1:
        counts = Counter({t: c for t, c in counts.items() if c >= min_count})

    # Outputs
    phrases_path = os.path.join(args.out_dir, "mesh_phrases.txt")
    words_path   = os.path.join(args.out_dir, "pubmed_words.txt")
    symspell_tsv = os.path.join(args.out_dir, "medical_vocab.tsv")

    with open(phrases_path, "w", encoding="utf-8") as f:
        for phrase in sorted(phrases):
            f.write(phrase + "\n")
    with open(words_path, "w", encoding="utf-8") as f:
        for word in sorted(counts):
            f.write(word + "\n")
    with open(symspell_tsv, "w", encoding="utf-8", newline="") as f:
        # csv.writer ends rows with "\r\n" by default; use "\n" so all three
        # output files share the same line endings.
        writer = csv.writer(f, delimiter="\t", lineterminator="\n")
        for token, cnt in sorted(counts.items()):
            writer.writerow([token, cnt])

    print(f"✔ Phrases: {phrases_path}")
    print(f"✔ Word list: {words_path}")
    print(f"✔ SymSpell TSV: {symspell_tsv}")
    print(f"Total unique tokens kept ≥{min_count}: {len(counts)}")

if __name__ == "__main__":
    # Command-line entry point: collect the options and hand them to build().
    ap = argparse.ArgumentParser(description="Build SymSpell dictionary from MeSH XML files.")
    ap.add_argument("--inputs", nargs="+", required=True, help="Paths to descYYYY.xml / suppYYYY.xml")
    ap.add_argument("--out-dir", default="dicts", help="Output folder")
    # type=int makes argparse reject a non-integer value with a usage error
    # instead of letting it fail later inside build().
    ap.add_argument("--min-count", type=int, default=1, help="Keep tokens with frequency >= this")
    args = ap.parse_args()
    build(args)

Overwriting /content/mesh_build/tools/mesh_to_symspell.py


In [None]:
import glob, os, textwrap

# Auto-pick any desc/supp files you uploaded into BASE
all_mesh = sorted(glob.glob(f"{BASE}/desc*.xml*") + glob.glob(f"{BASE}/supp*.xml*"))
print("Found:", all_mesh)

MIN_COUNT = "1"  # change to "2" or "3" to filter rare tokens

# Build
!python "{TOOLS_DIR}/mesh_to_symspell.py" \
  --inputs {" ".join(all_mesh)} \
  --out-dir "{DICT_DIR}" \
  --min-count {MIN_COUNT}


Found: ['/content/mesh_build/supp2026.xml']
✔ Phrases: /content/mesh_build/dicts/mesh_phrases.txt
✔ Word list: /content/mesh_build/dicts/pubmed_words.txt
✔ SymSpell TSV: /content/mesh_build/dicts/medical_vocab.tsv
Total unique tokens kept ≥1: 322454


In [None]:
import os, zipfile

# Bundle the generated dictionary files into a single archive under BASE.
zip_path = f"{BASE}/mesh_dicts.zip"
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
    for fn in ["mesh_phrases.txt", "pubmed_words.txt", "medical_vocab.tsv"]:
        fp = os.path.join(DICT_DIR, fn)
        if os.path.exists(fp):
            # Store under the bare file name so the archive has no folder prefix.
            z.write(fp, arcname=fn)
            print("Added:", fn)
        else:
            # Say so explicitly: otherwise an incomplete archive looks complete.
            print("Missing (skipped):", fn)

print("ZIP ready:", zip_path)


Added: mesh_phrases.txt
Added: pubmed_words.txt
Added: medical_vocab.tsv
ZIP ready: /content/mesh_build/mesh_dicts.zip


In [None]:
from google.colab import files

# Hand the archive under BASE to the browser as a download.
archive_path = f"{BASE}/mesh_dicts.zip"
files.download(archive_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>