# Acronym Finder + Dictionary Check (Notebook)

This notebook scans **one specific file** for acronyms, checks each one against an **engineering dictionary CSV**, and writes a results CSV named after your input file with `_acronyms` appended to the base name (for example, `report.docx` → `report_acronyms.csv`).

**What you get**
- Extracted acronyms (no guesses or internet lookups).
- Counts of how many times each acronym appears.
- Whether it's in the engineering dictionary.
- The dictionary definition if found.

**How to use**
1. In the **Parameters** cell below, set `INPUT_FILE` to your document path (supports `.txt`, `.md`, `.csv`, `.xlsx`, `.docx`).
2. (Optional) Set `DICT_CSV` if your dictionary CSV is elsewhere. By default it uses `/mnt/data/engineering_dictionary.csv`.
3. Run the **Run** cell to generate the CSV output. The output is saved as `<input_basename>_acronyms.csv` in the same folder as the input.

> If you see an error saying that a package is required (pandas, openpyxl, or python-docx), install the packages in a new cell:
> ```
> !pip install pandas openpyxl python-docx
> ```


In [None]:
import os, re, csv
from collections import Counter

# Optional dependencies
try:
    import pandas as pd
except Exception:
    pd = None

try:
    import docx  # python-docx
except Exception:
    docx = None

# Regex to capture acronyms like NASA, GPS-III, H2O, RFC1234 (2–10 chars, caps/digits, optional dashes)
ACRONYM_TOKEN_RE = re.compile(r"\b([A-Z][A-Z0-9-]{1,9})\b")
# Matches an acronym wrapped in parentheses, e.g. (LEO), (GPS-III).
# Kept only so existing references to this name keep working: "(" and ")" are
# non-word characters, so ACRONYM_TOKEN_RE already matches these tokens, and a
# second pass with this pattern would count every parenthesized acronym twice.
PAREN_ACRO_RE = re.compile(r"\(([A-Z][A-Z0-9-]{1,9})\)")

def extract_acronyms_from_text(text, min_len=2, max_len=10):
    """Return a Counter mapping each acronym found in ``text`` to its occurrence count.

    A token is counted when it matches ``ACRONYM_TOKEN_RE``, its length lies in
    ``[min_len, max_len]`` (inclusive), and it contains at least two uppercase
    ASCII letters (so tokens such as ``A1`` are rejected).

    Args:
        text: The text to scan. Empty or ``None`` yields an empty Counter.
        min_len: Minimum token length to accept.
        max_len: Maximum token length to accept. ``ACRONYM_TOKEN_RE`` matches
            at most 10 characters, so values above 10 have no further effect.

    Returns:
        collections.Counter keyed by the acronym exactly as it appears in the
        text (the pattern only admits uppercase letters, digits and dashes).
    """
    counts = Counter()
    if not text:
        return counts

    # Single pass: every occurrence, parenthesized or not, is matched exactly
    # once by the token pattern, so each appearance contributes one count.
    for m in ACRONYM_TOKEN_RE.finditer(text):
        tok = m.group(1)
        if not min_len <= len(tok) <= max_len:
            continue
        uppercase_letters = sum(1 for ch in tok if "A" <= ch <= "Z")
        if uppercase_letters >= 2:
            counts[tok] += 1
    return counts

def _frame_to_text(df):
    """Join every cell of ``df`` into newline-separated text; missing cells become blank."""
    # fillna must run before astype(str); the other order turns missing cells
    # into the literal string "nan".
    return "\n".join(df.fillna("").astype(str).values.flatten().tolist())

def read_text_from_file(path):
    """Read all textual content from a single supported file.

    Supported extensions (case-insensitive): ``.txt``, ``.md``, ``.csv``,
    ``.xlsx`` and ``.docx``.

    Args:
        path: Path of the file to read.

    Returns:
        The file's text as one string. For tabular inputs (CSV/XLSX) every
        cell, including the first row, is emitted on its own line.

    Raises:
        RuntimeError: If the optional package needed for the file type
            (pandas, or python-docx) is not installed.
        ValueError: If the file extension is not supported.
    """
    ext = os.path.splitext(path)[1].lower()

    if ext in {".txt", ".md"}:
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()

    if ext == ".csv":
        if pd is None:
            raise RuntimeError("pandas is required to read CSV input files.")
        # header=None keeps the first row as data; with the default header
        # handling it would become column labels and never be scanned.
        try:
            df = pd.read_csv(path, dtype=str, encoding="utf-8", engine="python", header=None)
        except UnicodeDecodeError:
            # Not valid UTF-8: latin-1 can decode any byte sequence.
            df = pd.read_csv(path, dtype=str, encoding="latin-1", engine="python", header=None)
        return _frame_to_text(df)

    if ext == ".xlsx":
        if pd is None:
            raise RuntimeError("pandas + openpyxl are required to read .xlsx input files.")
        import warnings

        parts = []
        # The context manager closes the workbook handle once all sheets are read.
        with pd.ExcelFile(path, engine="openpyxl") as xls:
            for sheet in xls.sheet_names:
                try:
                    df = pd.read_excel(xls, sheet_name=sheet, dtype=str, header=None)
                except Exception as exc:
                    # Best effort: one unreadable sheet must not abort the scan,
                    # but say so, because its acronyms will be missing.
                    warnings.warn(f"Skipping sheet {sheet!r} in {path}: {exc}")
                    continue
                parts.append(_frame_to_text(df))
        return "\n".join(parts)

    if ext == ".docx":
        if docx is None:
            raise RuntimeError("python-docx is required to read .docx files.")
        d = docx.Document(path)
        # Body paragraphs first, then the text of every table cell.
        parts = [p.text for p in d.paragraphs]
        for table in d.tables:
            for row in table.rows:
                for cell in row.cells:
                    parts.append(cell.text)
        return "\n".join(parts)

    raise ValueError(f"Unsupported input type: {ext}")

def guess_dictionary_columns(df):
    """Guess which columns of ``df`` hold the acronym and its definition.

    Column labels are compared case-insensitively against keyword lists using
    substring matching; the first matching column wins.

    Args:
        df: Object exposing a ``columns`` iterable (e.g. a pandas DataFrame).

    Returns:
        ``(acro_col, def_col)``. ``acro_col`` falls back to the first column
        when no label matches, and is ``None`` only when there are no columns.
        ``def_col`` is never the same column as ``acro_col``; it falls back to
        the first other column and is ``None`` when no other column exists.
    """
    cols = list(df.columns)
    # str() so non-string labels (e.g. integer positions) do not break .lower().
    low = [str(c).lower().strip() for c in cols]

    def pick(cands, default=None, skip=()):
        """Return the first column whose label contains a keyword, ignoring ``skip``."""
        for col, name in zip(cols, low):
            if col in skip:
                continue
            if any(key in name for key in cands):
                return col
        return default

    acro_col = pick(
        ["acronym", "abbr", "abbrev", "short", "code", "initialism"],
        cols[0] if cols else None,
    )
    # Exclude the acronym column: a single label such as "Abbreviation (full)"
    # can match both keyword lists and must not serve as its own definition.
    def_col = pick(
        ["definition", "meaning", "expansion", "description", "full", "notes"],
        None,
        skip=(acro_col,),
    )

    # No explicit definition column: use the first column that is not the acronym one.
    if def_col is None:
        def_col = next((c for c in cols if c != acro_col), None)
    return acro_col, def_col

def load_dictionary(dict_csv_path):
    """Load the dictionary CSV into an upper-cased acronym -> definition mapping.

    The acronym and definition columns are chosen by
    ``guess_dictionary_columns``. Acronyms are stripped and upper-cased, so
    lookups are case-insensitive when the caller upper-cases its key. When an
    acronym appears on several rows, its distinct non-empty definitions are
    joined with ``"; "`` in file order.

    Args:
        dict_csv_path: Path of the dictionary CSV (UTF-8, with a latin-1 fallback).

    Returns:
        ``(dmap, df)`` where ``dmap`` maps ``ACRONYM`` to its definition string
        (``""`` when no definition is available) and ``df`` is the DataFrame
        as read from the CSV.

    Raises:
        RuntimeError: If pandas is not installed, the CSV has no rows or
            columns, or no acronym column can be identified.
    """
    if pd is None:
        raise RuntimeError("pandas is required to read the dictionary CSV.")
    try:
        df = pd.read_csv(dict_csv_path, dtype=str, encoding="utf-8", engine="python")
    except UnicodeDecodeError:
        # Not valid UTF-8: latin-1 can decode any byte sequence.
        df = pd.read_csv(dict_csv_path, dtype=str, encoding="latin-1", engine="python")

    if df.empty or df.shape[1] == 0:
        raise RuntimeError("Dictionary CSV appears to be empty.")

    acro_col, def_col = guess_dictionary_columns(df)
    if acro_col is None:
        raise RuntimeError("Could not identify the acronym column in the dictionary CSV.")

    filled = df.fillna("")
    has_definitions = def_col is not None and def_col in filled.columns

    # Collect definitions per acronym as a list so duplicates are detected by
    # exact match (a substring test would drop "Orbit" after "Low Earth Orbit")
    # and an empty first definition cannot leave a leading "; " in the result.
    collected = {}
    for _, row in filled.iterrows():
        key = str(row[acro_col]).strip().upper()
        if not key:
            continue
        bucket = collected.setdefault(key, [])
        definition = str(row[def_col]).strip() if has_definitions else ""
        if definition and definition not in bucket:
            bucket.append(definition)

    dmap = {key: "; ".join(definitions) for key, definitions in collected.items()}
    return dmap, df

def make_output_csv_path(input_path):
    """Build the results path ``<input_basename>_acronyms.csv`` next to the input file.

    The input's extension is dropped; when ``input_path`` has no directory
    component the result is placed under ``"."``.
    """
    parent = os.path.dirname(input_path)
    stem = os.path.splitext(os.path.basename(input_path))[0]
    if not parent:
        parent = "."
    return os.path.join(parent, f"{stem}_acronyms.csv")


In [None]:
# === Parameters (edit these) ===
# INPUT_FILE: path of the single document to scan (.txt, .md, .csv, .xlsx or .docx).
INPUT_FILE = r"/path/to/your/file.docx"   # <-- change this to your file
# DICT_CSV: path of the dictionary CSV that acronyms are looked up in.
DICT_CSV   = r"/mnt/data/engineering_dictionary.csv"  # default dictionary path
# MIN_LEN / MAX_LEN: inclusive bounds on acronym length, passed to
# extract_acronyms_from_text. NOTE: ACRONYM_TOKEN_RE matches at most 10
# characters, so raising MAX_LEN above 10 has no effect.
MIN_LEN    = 2
MAX_LEN    = 10

# If needed, install deps in a separate cell:
# !pip install pandas openpyxl python-docx


In [None]:
# --- Run extraction & lookup ---
from pathlib import Path

# Stop immediately when the configured input document does not exist.
if not os.path.isfile(INPUT_FILE):
    raise FileNotFoundError(f"INPUT_FILE not found: {INPUT_FILE}")

# Dictionary first, then the document text and its acronym tallies.
dict_map, dict_df = load_dictionary(DICT_CSV)
text = read_text_from_file(INPUT_FILE)
counts = extract_acronyms_from_text(text, min_len=MIN_LEN, max_len=MAX_LEN)

import pandas as pd  # ensure pandas is available for DataFrame export

# One result row per distinct acronym, in alphabetical order.
rows = []
for acro in sorted(counts):
    in_dict = "Yes" if acro in dict_map else "No"
    definition = dict_map.get(acro, "")
    rows.append(
        {
            "acronym": acro,
            "count": counts[acro],
            "in_dictionary": in_dict,
            "definition": definition,
        }
    )

df_out = pd.DataFrame(rows, columns=["acronym", "count", "in_dictionary", "definition"])

# The results file sits beside the input; make sure that folder exists, then write.
OUT_CSV = make_output_csv_path(INPUT_FILE)
os.makedirs(os.path.dirname(OUT_CSV) or ".", exist_ok=True)
df_out.to_csv(OUT_CSV, index=False, encoding="utf-8")

# Summary of the run, one item per line.
print(
    f"Scanned: {INPUT_FILE}",
    f"Unique acronyms: {len(df_out)}",
    f"Wrote: {OUT_CSV}",
    sep="\n",
)

# Preview first few rows (last expression, so the notebook displays it).
df_out.head(20)
