In [1]:
#Import libraries and load the CSV
import math
import re
import tkinter as tk
from collections import Counter
from math import log
from tkinter import ttk, messagebox

import nltk
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from nltk.corpus import wordnet as wn
from numpy.linalg import norm

# Load the dataset (one row per abstract line, see the `abstract_text` column).
df = pd.read_csv("NLP_train.csv")

# Preview the first rows, then the column summary.
print(df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
df.info()


   abstract_id        line_id  \
0     24491034  24491034_0_11   
1     24491034  24491034_1_11   
2     24491034  24491034_2_11   
3     24491034  24491034_3_11   
4     24491034  24491034_4_11   

                                       abstract_text  line_number  \
0  The emergence of HIV as a chronic condition me...            0   
1  This paper describes the design and evaluation...            1   
2  This study is designed as a randomised control...            2   
3  The intervention group will participate in the...            3   
4  The program is based on self-efficacy theory a...            4   

   total_lines      target  
0           11  BACKGROUND  
1           11  BACKGROUND  
2           11     METHODS  
3           11     METHODS  
4           11     METHODS  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2211861 entries, 0 to 2211860
Data columns (total 6 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   abstract_id    int64 
 1   line_id       

In [2]:
# Build the medical corpus.
# Every abstract_text value is coerced to str and joined with single spaces
# into one big corpus string.
texts = df["abstract_text"].astype(str).tolist()
corpus = " ".join(texts)

# Tokenize: a token is a maximal run of ASCII letters and '@'; everything else
# (digits, punctuation, whitespace) acts as a separator.
# ('@' is kept because numbers are replaced by it in this dataset.)
TOK_RE = re.compile(r"[A-Za-z@]+")
tokens = TOK_RE.findall(corpus.lower())

# Cell result: total token count and a preview of the first 20 tokens.
len(tokens), tokens[:20]

(50183758,
 ['the',
  'emergence',
  'of',
  'hiv',
  'as',
  'a',
  'chronic',
  'condition',
  'means',
  'that',
  'people',
  'living',
  'with',
  'hiv',
  'are',
  'required',
  'to',
  'take',
  'more',
  'responsibility'])

In [3]:
#More than 50 million tokens in the corpus

In [4]:
# Build unigram and bigram count models over the medical token stream.
uni = Counter(tokens)
bigrams = list(zip(tokens[:-1], tokens[1:]))  # every pair of adjacent tokens
bi = Counter(bigrams)

# Totals used by the add-k smoothing below.
N_tokens = sum(uni.values())
V = len(uni)


def p_uni(w, k=1.0):
    """Add-k smoothed unigram probability of `w`."""
    numerator = uni.get(w, 0) + k
    denominator = N_tokens + k * V
    return numerator / denominator


def p_bi(w1, w2, k=1.0):
    """Add-k smoothed probability of `w2` directly following `w1`."""
    pair_count = bi.get((w1, w2), 0)
    history_count = uni.get(w1, 0)
    return (pair_count + k) / (history_count + k * V)


# Vocabulary: every distinct token counted in `uni`.
VOCAB = set(uni)


In [5]:
# Full vocabulary as (word, count) pairs: highest count first, ties in
# alphabetical order.
sorted_vocab = sorted(uni.items(), key=lambda item: (-item[1], item[0]))


def search_vocab(q, top=30):
    """Return the first `top` (word, count) pairs whose word contains `q`.

    `q` is lower-cased before matching; results keep the order of
    `sorted_vocab` (most frequent first).
    """
    needle = q.lower()
    hits = []
    for word, count in sorted_vocab:
        if needle in word:
            hits.append((word, count))
    return hits[:top]


# Example: the 20 most frequent tokens.
print(sorted_vocab[:20])


[('@', 3750167), ('the', 2173891), ('of', 1715701), ('and', 1670719), ('in', 1345241), ('to', 939291), ('with', 755421), ('a', 750779), ('were', 653116), ('was', 595127), ('patients', 524715), ('for', 488957), ('group', 427238), ('p', 359232), ('or', 338495), ('at', 293007), ('treatment', 247640), ('on', 237590), ('study', 233969), ('after', 213605)]


In [6]:
# Candidate generation by edit distance.
# Symbols an insertion or replacement may introduce.
alphabet = "abcdefghijklmnopqrstuvwxyz@"


def edits1(word):
    """Return the set of strings exactly one edit operation away from `word`.

    An edit is a single-character deletion, a swap of two adjacent characters,
    a replacement by a symbol from `alphabet`, or an insertion of such a
    symbol. Replacing a character by itself is one of the generated variants,
    so a non-empty `word` is a member of its own result.
    """
    variants = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        if tail:
            variants.add(head + tail[1:])                           # deletion
            if len(tail) > 1:
                variants.add(head + tail[1] + tail[0] + tail[2:])   # transposition
        for symbol in alphabet:
            if tail:
                variants.add(head + symbol + tail[1:])              # replacement
            variants.add(head + symbol + tail)                      # insertion
    return variants

def known(words):
    """Filter `words` down to those present in VOCAB, keeping iteration order."""
    found = []
    for candidate in words:
        if candidate in VOCAB:
            found.append(candidate)
    return found

def candidates(word, max_edits=2, max_expand=5000):
    """Generate spelling-correction candidates for `word`.

    Collects every vocabulary word one edit away from `word` and, when
    `max_edits >= 2`, every vocabulary word two edits away. `word` itself is
    included when it is in VOCAB.

    Args:
        word: the token to correct.
        max_edits: values below 2 search one edit away only; 2 or more also
            search two edits away.
        max_expand: maximum number of one-edit strings that are expanded into
            two-edit strings (previously a hard-coded 5000).

    Returns:
        A list of distinct candidate words in no particular order.
    """
    e1 = edits1(word)
    cands = set(known(e1))
    if max_edits >= 2:
        # Slicing list(set) kept an arbitrary, hash-order dependent subset, so
        # a capped search could differ between runs. Sort before truncating so
        # the cap is reproducible; skip the sort when nothing is cut off.
        frontier = e1 if len(e1) <= max_expand else sorted(e1)[:max_expand]
        for variant in frontier:
            cands.update(known(edits1(variant)))
    if word in VOCAB:
        cands.add(word)
    return list(cands)


In [7]:
# Drop the '@' placeholder from the suggestion vocabulary unless USE_AT is set.
USE_AT = False
if not USE_AT:
    VOCAB.discard('@')  # discard() is a no-op when '@' is not present


In [8]:
# Laplace (add-k) smoothing for candidate words

def p_uni_smoothed(w, k=1.0):
    """Add-k smoothed unigram probability of `w` from the `uni` counts."""
    vocab_size = len(uni)
    return (uni.get(w, 0) + k) / (N_tokens + k * vocab_size)


def p_bi_smoothed(w1, w2, k=1.0):
    """Add-k smoothed probability of `w2` directly following `w1`."""
    vocab_size = len(uni)
    return (bi.get((w1, w2), 0) + k) / (uni.get(w1, 0) + k * vocab_size)


def logscore_context(left, w, right, k=1.0, lam=0.7):
    """
    Log-score of `w` between its neighbours `left` and `right`.

    With at least one neighbour:
      score = lam*(log P(w|left) + log P(right|w)) + (1-lam)*log P(w)
    where the term for a missing (None or empty) neighbour is left out.
    With no neighbour at all the score is just log P(w).
    """
    unigram_logp = log(p_uni_smoothed(w, k))
    if not left and not right:
        return unigram_logp

    context_logp = 0.0
    if left:
        context_logp += log(p_bi_smoothed(left, w, k))
    if right:
        context_logp += log(p_bi_smoothed(w, right, k))
    return lam * context_logp + (1 - lam) * unigram_logp


In [9]:
# Edit distance between the original word and a candidate word.
def approx_edit_distance(a, b):
    """Return nltk's edit distance between `a` and `b` with transpositions on.

    Insertions, deletions and substitutions are counted, and
    `transpositions=True` asks nltk to also treat a swap of adjacent
    characters as an edit (Damerau-Levenshtein style).
    """
    distance = nltk.edit_distance(a, b, transpositions=True)
    return distance



In [10]:
# User input tokenization: same token pattern as the corpus tokenizer.
TOK_RE = re.compile(r"[A-Za-z@]+")


def tokenize_user(text):
    """Lower-case `text` and return its runs of letters/'@' as a token list."""
    lowered = text.lower()
    return [match.group(0) for match in TOK_RE.finditer(lowered)]


In [11]:
def suggest_for_token(tokens, idx, topk=5, k=1.0, lam=0.7, edit_penalty=0.75):
    """
    Rank correction candidates for `tokens[idx]` (baseline: no POS, no IDF,
    no normalization).

    Each candidate's score is its context log-score minus `edit_penalty`
    times its edit distance from the token, rounded to 3 decimals.

    Returns:
        Up to `topk` dicts {"cand", "score", "edit"}, best score first; ties
        are broken by smaller edit distance, then alphabetically. Empty when
        no candidate exists.
    """
    w = tokens[idx].lower()
    left = tokens[idx - 1].lower() if idx >= 1 else None
    right = tokens[idx + 1].lower() if idx + 1 < len(tokens) else None

    def rank_entry(cand):
        ed = approx_edit_distance(w, cand)
        # float() guards against a non-float score type before the subtraction.
        score = float(logscore_context(left, cand, right, k=k, lam=lam)) - edit_penalty * ed
        return {"cand": cand, "score": round(score, 3), "edit": ed}

    ranked = sorted(
        (rank_entry(c) for c in candidates(w, max_edits=2)),
        key=lambda entry: (-entry["score"], entry["edit"], entry["cand"]),
    )
    return ranked[:topk]



In [12]:
def detect_and_suggest(text, topk=5, ctx_margin=1.0):
    """
    Tokenize `text` and flag tokens that look misspelled.

    A token outside VOCAB is reported as "NON-WORD" when at least one
    suggestion exists. A token inside VOCAB is reported as "REAL-WORD?" only
    when the top suggestion is a different word whose score beats the token's
    own context score by at least `ctx_margin`. '@' placeholder tokens are
    skipped.

    Returns:
        (toks, results): the token list and the list of result dicts.
    """
    toks = tokenize_user(text)
    results = []

    for i, tok in enumerate(toks):
        w = tok.lower()
        if w == '@':   # placeholder token, never checked
            continue

        suggestions = suggest_for_token(toks, i, topk=topk)
        if not suggestions:
            # Neither kind of issue is reported without something to suggest.
            continue

        if w not in VOCAB:
            results.append({
                "index": i,
                "word": w,
                "type": "NON-WORD",
                "suggestions": suggestions,
            })
            continue

        # Real word: compare its own context score with the best alternative.
        left = toks[i - 1].lower() if i >= 1 else None
        right = toks[i + 1].lower() if i + 1 < len(toks) else None
        orig_score = float(logscore_context(left, w, right))

        top = suggestions[0]
        delta = top["score"] - orig_score
        if top["cand"] != w and delta >= ctx_margin:
            results.append({
                "index": i,
                "word": w,
                "type": "REAL-WORD?",
                "orig_score": round(orig_score, 3),
                "best_delta": round(delta, 3),
                "suggestions": suggestions,
            })

    return toks, results

                   


In [13]:
# Smoke test of the medical-corpus detector on sentences that each contain one
# planted misspelling; the trailing comment names the intended word.
tests = [
    "The patints were given aspirin daily.",      # patients
    "The study focused on diabtes treatment.",    # diabetes
    "He suffered from a hearth attack.",          # heart
    "The drug aspitin was administered.",         # aspirin
]

# Print the tokens and every flagged record for each test sentence.
for t in tests:
    toks, res = detect_and_suggest(t, topk=5, ctx_margin=0.8) # 0.8 is below the default ctx_margin of 1.0, so real-word flags trigger more easily
    print("\nINPUT:", t)
    print("TOKENS:", toks)
    for r in res:
        print(r)



INPUT: The patints were given aspirin daily.
TOKENS: ['the', 'patints', 'were', 'given', 'aspirin', 'daily']
{'index': 1, 'word': 'patints', 'type': 'REAL-WORD?', 'orig_score': -23.673, 'best_delta': 16.593, 'suggestions': [{'cand': 'patients', 'score': -7.08, 'edit': 1}, {'cand': 'patient', 'score': -12.032, 'edit': 2}, {'cand': 'points', 'score': -14.245, 'edit': 2}, {'cand': 'parents', 'score': -14.328, 'edit': 2}, {'cand': 'ratings', 'score': -15.768, 'edit': 2}]}

INPUT: The study focused on diabtes treatment.
TOKENS: ['the', 'study', 'focused', 'on', 'diabtes', 'treatment']
{'index': 4, 'word': 'diabtes', 'type': 'NON-WORD', 'suggestions': [{'cand': 'diabetes', 'score': -13.741, 'edit': 1}, {'cand': 'diaries', 'score': -20.89, 'edit': 2}, {'cand': 'dates', 'score': -22.662, 'edit': 2}, {'cand': 'diabtel', 'score': -22.882, 'edit': 1}, {'cand': 'debates', 'score': -23.248, 'edit': 2}]}

INPUT: He suffered from a hearth attack.
TOKENS: ['he', 'suffered', 'from', 'a', 'hearth', 'at

In [14]:
def apply_corrections(text, results, ctx_only=True):
    """
    Rebuild `text` with every flagged token replaced by its top suggestion.

    The text is re-tokenized with `tokenize_user`, so the result consists of
    the lower-cased tokens joined by single spaces; punctuation and the
    original casing are not preserved.

    Args:
        text: the input that `results` was computed from.
        results: detection records with "index", "type" and "suggestions".
        ctx_only: kept for backward compatibility and currently unused;
            real-word records are only emitted after passing the margin
            filter, so they are always applied.

    Returns:
        The corrected, space-joined token string.
    """
    # detect_and_suggest labels real-word errors "REAL-WORD?" while
    # detect_and_suggest_hybrid labels them "REAL-WORD"; only the first
    # spelling was recognised, so hybrid real-word fixes were silently dropped.
    correctable = {"NON-WORD", "REAL-WORD?", "REAL-WORD"}

    toks = tokenize_user(text)
    idx_to_best = {
        r["index"]: r["suggestions"][0]["cand"]
        for r in results
        if r.get("suggestions") and r.get("type") in correctable
    }
    return " ".join(idx_to_best.get(i, t) for i, t in enumerate(toks))

In [15]:
# Testing detect_and_suggest for the medical corpus: detect issues in one
# sentence (default topk and ctx_margin), print them, then apply the top
# suggestion of every flagged token.
text = "The patints suffered a hearth attack."
toks, res = detect_and_suggest(text)
print("TOKENS:", toks)
for r in res:
    print(r)

# apply_corrections returns the lower-cased tokens joined by spaces.
corrected = apply_corrections(text, res)
print("\nCorrected:", corrected)




TOKENS: ['the', 'patints', 'suffered', 'a', 'hearth', 'attack']
{'index': 1, 'word': 'patints', 'type': 'REAL-WORD?', 'orig_score': -23.673, 'best_delta': 12.572, 'suggestions': [{'cand': 'patients', 'score': -11.101, 'edit': 1}, {'cand': 'patient', 'score': -13.414, 'edit': 2}, {'cand': 'parents', 'score': -18.068, 'edit': 2}, {'cand': 'points', 'score': -19.623, 'edit': 2}, {'cand': 'ratings', 'score': -19.975, 'edit': 2}]}
{'index': 4, 'word': 'hearth', 'type': 'REAL-WORD?', 'orig_score': -22.731, 'best_delta': 8.645, 'suggestions': [{'cand': 'heart', 'score': -14.086, 'edit': 1}, {'cand': 'health', 'score': -16.241, 'edit': 1}, {'cand': 'healthy', 'score': -17.504, 'edit': 2}, {'cand': 'heat', 'score': -19.692, 'edit': 2}, {'cand': 'death', 'score': -19.743, 'edit': 2}]}

Corrected: the patients suffered a heart attack


In [16]:
#Adding a general English corpus to boost the performance of the spelling correction (Loading and tokenization of general English corpus)

# Load the general English corpus (whole file read into one string).
with open("sentences.txt", "r", encoding="utf-8") as f:
    general_text = f.read()

# Tokenize with the same pattern as the medical corpus: runs of letters and '@'
# from the lower-cased text.
general_tokens = TOK_RE.findall(general_text.lower())
print("General corpus size:", len(general_tokens))
print("Sample:", general_tokens[:20])


General corpus size: 9076
Sample: ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', 'my', 'mum', 'tries', 'to', 'be', 'cool', 'by', 'saying', 'that', 'she', 'likes']


In [17]:
# Unigram and bigram counts for the general English corpus.
uni_general = Counter(general_tokens)
# zip() stops at the shorter input, so each token is paired with its successor.
bi_general = Counter(zip(general_tokens, general_tokens[1:]))

print("Unique words in general corpus:", len(uni_general))


Unique words in general corpus: 2563


In [18]:
# Building the hybrid corpus

# Merge the PubMed counts (uni, bi) with the general English counts
# (uni_general, bi_general); Counter addition sums the counts per key.
uni_hybrid = uni + uni_general
bi_hybrid = bi + bi_general

# Hybrid totals used by the hybrid smoothing functions.
N_tokens_hybrid = sum(uni_hybrid.values())
V_hybrid = len(uni_hybrid)

# Hybrid vocabulary: every word counted in the merged unigrams.
VOCAB_HYBRID = set(uni_hybrid)


In [19]:
# Semantic context scoring
def get_context_vector(tokens, idx, window=2):
    """Average the embeddings of the neighbours of `tokens[idx]`.

    Neighbours are the tokens at most `window` positions to the left or right
    of `idx` (the token itself excluded) that have an entry in `word_vectors`.

    Returns:
        The element-wise mean of the neighbour vectors, or None when no
        neighbour has a vector.
    """
    first = max(0, idx - window)
    stop = min(len(tokens), idx + window + 1)
    neighbour_vecs = [
        word_vectors[tokens[pos]]
        for pos in range(first, stop)
        if pos != idx and tokens[pos] in word_vectors
    ]
    if not neighbour_vecs:
        return None
    return np.mean(neighbour_vecs, axis=0)

def semantic_similarity(candidate, context_vec):
    """Cosine similarity between `candidate`'s embedding and `context_vec`.

    Returns 0.0 when there is no context vector or `candidate` has no entry
    in `word_vectors`.
    NOTE(review): a later cell defines `semantic_similarity` again, which
    replaces this definition once that cell has run.
    """
    if context_vec is None or candidate not in word_vectors:
        return 0.0
    cand_vec = word_vectors[candidate]
    cosine = np.dot(cand_vec, context_vec) / (norm(cand_vec) * norm(context_vec))
    return float(cosine)


In [20]:
# Filter out words that appear fewer than MIN_HYBRID_COUNT times in the hybrid
# corpus (very rare words are likely misspellings).
MIN_HYBRID_COUNT = 15

# The filter used to recount the PubMed-only `tokens`, which replaced the
# hybrid vocabulary with a medical-only one and ignored the general English
# counts merged into `uni_hybrid`. Use the merged counts instead; the old
# variable name is kept for any cell that still reads it.
token_counts = uni_hybrid

VOCAB_HYBRID = {w for w, c in token_counts.items() if c >= MIN_HYBRID_COUNT}

# '@' is by far the most frequent token, so it always passes the count filter;
# honour the USE_AT switch here as well so it is never offered as a suggestion.
if not USE_AT:
    VOCAB_HYBRID.discard('@')



In [21]:
# Build hybrid "documents": every PubMed abstract line plus every general
# English sentence line.
pubmed_docs = df["abstract_text"].astype(str).tolist()
# split("\n") also yields empty strings for blank lines and for a trailing
# newline; those contain no tokens but were counted as documents, inflating
# N_docs. Keep only lines with visible content.
general_docs = [line for line in general_text.split("\n") if line.strip()]

docs = pubmed_docs + general_docs

# Document frequency: in how many documents each token occurs at least once.
doc_freqs = Counter()
for doc in docs:
    unique_words = set(TOK_RE.findall(doc.lower()))
    doc_freqs.update(unique_words)

N_docs = len(docs)
print("Docs counted:", N_docs)
print("Sample df:", list(doc_freqs.items())[:10])


Docs counted: 2212585
Sample df: [('responsibility', 228), ('to', 733327), ('their', 40928), ('take', 2127), ('with', 606982), ('a', 597165), ('for', 407227), ('the', 1264429), ('self', 24094), ('management', 18978)]


In [22]:
# Probability functions over the hybrid counts.

def p_uni_hybrid(w, k=1.0):
    """Add-k smoothed unigram probability of `w` from the hybrid counts."""
    smoothed_count = uni_hybrid.get(w, 0) + k
    smoothed_total = N_tokens_hybrid + k * V_hybrid
    return smoothed_count / smoothed_total


def p_bi_hybrid(w1, w2, k=1.0):
    """Add-k smoothed probability of `w2` following `w1` from the hybrid counts."""
    pair_count = bi_hybrid.get((w1, w2), 0)
    history_count = uni_hybrid.get(w1, 0)
    return (pair_count + k) / (history_count + k * V_hybrid)


In [23]:
# Hybrid vocab browsing: (word, count) pairs, highest count first, ties in
# alphabetical order.
sorted_vocab_hybrid = sorted(uni_hybrid.items(), key=lambda item: (-item[1], item[0]))


def search_vocab_hybrid(q, top=30):
    """Return the first `top` hybrid (word, count) pairs whose word contains `q`.

    `q` is lower-cased before matching; results keep the order of
    `sorted_vocab_hybrid`.
    """
    needle = q.lower()
    hits = []
    for word, count in sorted_vocab_hybrid:
        if needle in word:
            hits.append((word, count))
    return hits[:top]


# --- Aliases so code that looks these names up from now on gets the hybrid versions
VOCAB = VOCAB_HYBRID
p_uni = p_uni_hybrid
p_bi = p_bi_hybrid



In [24]:
# Inverse Document Frequency

def idf_score(word, doc_freqs, N_docs):
    """Smoothed inverse document frequency of `word`.

    Computes log((N_docs + 1) / (n + 1)) + 1, where n is the word's entry in
    `doc_freqs` (1 when the word is missing). The value shrinks as the word
    occurs in more documents, so very common words are penalized.
    """
    containing_docs = doc_freqs.get(word, 1)
    ratio = (N_docs + 1) / (containing_docs + 1)
    return math.log(ratio) + 1


In [25]:
# Cosine similarity for contextuality
def semantic_similarity(w, ctx_vec):
    """
    Cosine similarity between a candidate word and the context vector.

    Returns a float in [-1, 1]. Returns 0.0 when `w` has no entry in
    `word_vectors`, when `ctx_vec` is None, or when either vector has zero
    length.
    """
    if ctx_vec is None or w not in word_vectors:
        return 0.0
    v = word_vectors[w]
    denom = norm(v) * norm(ctx_vec)
    if denom == 0:
        # A zero vector has no direction; dividing by zero here used to yield
        # nan (plus a RuntimeWarning), which poisons any score it is added to.
        return 0.0
    return float(np.dot(v, ctx_vec) / denom)


In [26]:
# POS tagging

nltk.download("averaged_perceptron_tagger")

def same_pos(word1, word2):
    """
    Return True if word1 and word2 share the same POS tag.

    Each word is tagged on its own, as a one-token sentence. If tagging
    raises, the function returns True so that a tagger problem never rejects
    a candidate.
    """
    try:
        pos1 = nltk.pos_tag([word1])[0][1]
        pos2 = nltk.pos_tag([word2])[0][1]
    except Exception:
        # Best-effort fallback. This was a bare `except:`, which also swallowed
        # KeyboardInterrupt and SystemExit and made the cell impossible to
        # interrupt while tagging.
        return True
    return pos1 == pos2


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/elenawachter/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [27]:
# Preparation and loading of pre-trained word embeddings (GloVe text format).
glove_input_file = "glove.6B.50d.txt"
word2vec_output_file = "glove.6B.50d.word2vec.txt"  # only written by the fallback below

try:
    # gensim >= 4.0 reads header-less GloVe text directly; this avoids the
    # deprecated glove2word2vec step, which rewrote the whole embedding file
    # on every run of this cell.
    word_vectors = KeyedVectors.load_word2vec_format(
        glove_input_file, binary=False, no_header=True
    )
except TypeError:
    # Older gensim does not accept `no_header`: convert to word2vec text
    # format first, then load the converted file as before.
    glove2word2vec(glove_input_file, word2vec_output_file)
    word_vectors = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)



  glove2word2vec(glove_input_file, word2vec_output_file)


In [28]:
# Suggest-for-token used by the hybrid detector.

def suggest_for_token(tokens, idx, topk=5, k=1.0, lam=0.7, edit_penalty=0.75):
    """
    Rank correction candidates for `tokens[idx]`.

    Candidates come from `candidates(w, max_edits=2)`; each one is scored with
    `logscore_context` minus `edit_penalty` times its edit distance from the
    token, rounded to 3 decimals.
    NOTE(review): `logscore_context` is built on `p_uni_smoothed` /
    `p_bi_smoothed`, which read the medical-only `uni`/`bi` counts, so the
    scores here are not based on the hybrid counts - confirm this is intended.

    Returns:
        Up to `topk` dicts {"cand", "score", "edit"}, best score first; ties
        are broken by smaller edit distance, then alphabetically.
    """
    w = tokens[idx].lower()
    has_left = idx - 1 >= 0
    has_right = idx + 1 < len(tokens)
    left = tokens[idx - 1].lower() if has_left else None
    right = tokens[idx + 1].lower() if has_right else None

    scored = []
    for cand in candidates(w, max_edits=2):
        distance = approx_edit_distance(w, cand)
        context_score = float(logscore_context(left, cand, right, k=k, lam=lam))
        penalised = context_score - edit_penalty * distance
        scored.append({"cand": cand, "score": round(penalised, 3), "edit": distance})

    scored.sort(key=lambda entry: (-entry["score"], entry["edit"], entry["cand"]))
    return scored[:topk]


In [29]:
# Detect and suggest (hybrid vocabulary)
def detect_and_suggest_hybrid(text, topk=5, ctx_margin=0.8, k=3, lam=0.8, edit_penalty=0.5,
                              max_realword_edit=1):
    """
    Tokenize `text` and flag tokens that look misspelled, using VOCAB_HYBRID.

    A token outside VOCAB_HYBRID is always reported as "NON-WORD", even when
    no suggestion exists (its "suggestions" list is then empty). A token inside
    VOCAB_HYBRID is reported as "REAL-WORD" only when the top suggestion is a
    different word, lies within `max_realword_edit` edits, and scores at least
    `ctx_margin` better than the token's own context score. '@' placeholder
    tokens are skipped.

    Args:
        text: raw user input.
        topk: maximum number of suggestions per token.
        ctx_margin: minimum score advantage required to flag a real word.
        k, lam, edit_penalty: forwarded to the scoring functions.
        max_realword_edit: largest edit distance accepted for a real-word
            correction (previously a constant of 1 re-bound inside the loop).

    Returns:
        (toks, results): the token list and the list of result dicts.
    """
    toks = tokenize_user(text)
    results = []

    for i, w in enumerate(toks):
        w_norm = w.lower()
        if w_norm == '@':   # skip placeholder tokens
            continue

        suggestions = suggest_for_token(toks, i, topk=topk, k=k, lam=lam, edit_penalty=edit_penalty)

        # --- NON-WORD: always flag, even if no suggestions ---
        if w_norm not in VOCAB_HYBRID:
            results.append({
                "index": i,
                "word": w_norm,
                "type": "NON-WORD",
                "suggestions": suggestions,
            })
            continue

        # --- REAL-WORD: only flag if best candidate clearly better ---
        if not suggestions:
            continue

        left = toks[i-1].lower() if i-1 >= 0 else None
        right = toks[i+1].lower() if i+1 < len(toks) else None
        orig_score = float(logscore_context(left, w_norm, right, k=k, lam=lam))

        best = suggestions[0]
        delta = best["score"] - orig_score
        if (best["edit"] <= max_realword_edit
            and best["cand"] != w_norm
            and delta >= ctx_margin
        ):
            results.append({
                "index": i,
                "word": w_norm,
                "type": "REAL-WORD",
                "orig_score": round(orig_score, 3),
                "best_delta": round(delta, 3),
                "suggestions": suggestions,
            })

    return toks, results
      

In [30]:
# Apply corrections for results produced by the hybrid detector.
def apply_corrections_hybrid(text, results, ctx_only=True):
    """
    Rebuild `text` with every flagged token replaced by its top suggestion.

    The text is re-tokenized with `tokenize_user`, so the result consists of
    the lower-cased tokens joined by single spaces; punctuation and the
    original casing are not preserved.

    Args:
        text: the input that `results` was computed from.
        results: detection records with "index", "type" and "suggestions".
        ctx_only: kept for backward compatibility and currently unused;
            real-word records are only emitted after passing the margin
            filter, so they are always applied.

    Returns:
        The corrected, space-joined token string.
    """
    # detect_and_suggest_hybrid labels real-word errors "REAL-WORD" (no
    # question mark). Only "REAL-WORD?" was matched here, so hybrid real-word
    # corrections were never applied. The old label stays accepted as well.
    correctable = {"NON-WORD", "REAL-WORD", "REAL-WORD?"}

    toks = tokenize_user(text)
    idx_to_best = {
        r["index"]: r["suggestions"][0]["cand"]
        for r in results
        if r.get("suggestions") and r.get("type") in correctable
    }
    return " ".join(idx_to_best.get(i, t) for i, t in enumerate(toks))


In [31]:
# Smoke test of the hybrid detector on sentences containing planted
# misspellings.
tests = [
    "The patints were given aspitin to reduce pain.",
    "High blodsugar levels are dangerous for diabtes patients.",
    "The doctor monitored the insuline dosage carefully.",
    "He sufferd an atack yesterday."
]


# For every sentence print the tokens, then each flagged word with its type
# and its ranked candidates (candidate, score, edit distance).
for t in tests:
    toks, res = detect_and_suggest_hybrid(t, topk=5, ctx_margin=0.8)
    print("\nINPUT:", t)
    print("TOKENS:", toks)

    for r in res:  # r is a result dict with "word", "type" and "suggestions"
        print(f" -> Word: {r['word']} ({r['type']})")
        for s in r["suggestions"]:
            print(f"    cand={s['cand']}, score={s['score']}, edit={s['edit']}")






INPUT: The patints were given aspitin to reduce pain.
TOKENS: ['the', 'patints', 'were', 'given', 'aspitin', 'to', 'reduce', 'pain']
 -> Word: patints (NON-WORD)
    cand=patients, score=-7.459, edit=1
    cand=patient, score=-12.865, edit=2
    cand=parents, score=-15.299, edit=2
    cand=points, score=-15.31, edit=2
    cand=ratings, score=-16.886, edit=2
 -> Word: aspitin (NON-WORD)
    cand=aspirin, score=-17.063, edit=1
    cand=ascitic, score=-22.759, edit=2

INPUT: High blodsugar levels are dangerous for diabtes patients.
TOKENS: ['high', 'blodsugar', 'levels', 'are', 'dangerous', 'for', 'diabtes', 'patients']
 -> Word: blodsugar (NON-WORD)
 -> Word: diabtes (NON-WORD)
    cand=diabetes, score=-13.752, edit=1
    cand=diaries, score=-21.628, edit=2
    cand=dates, score=-22.527, edit=2
    cand=debates, score=-23.539, edit=2

INPUT: The doctor monitored the insuline dosage carefully.
TOKENS: ['the', 'doctor', 'monitored', 'the', 'insuline', 'dosage', 'carefully']
 -> Word: insu

In [32]:
%gui tk

In [None]:
import tkinter as tk
from tkinter import ttk, messagebox

MAX_CHARS = 500  # maximum number of characters kept in the editor

# Regex for tokenization: reuse the notebook's TOK_RE when it already exists,
# otherwise define it here.
if "TOK_RE" not in globals():
    TOK_RE = re.compile(r"[A-Za-z@]+")


# --- tokenize with spans (for highlighting & click mapping)
def tokenize_with_spans(text):
    """Return (token, start, end) triples for every token of the lower-cased `text`.

    `start`/`end` are character offsets into the lower-cased text, with `end`
    exclusive.
    """
    return [(m.group(0), m.start(), m.end()) for m in TOK_RE.finditer(text.lower())]

class SpellApp(tk.Tk):
    """Tk window for interactive spell checking with the hybrid detector.

    The left pane is a text editor limited to MAX_CHARS characters. The right
    pane lists the tokens flagged by `detect_and_suggest_hybrid` and, for the
    selected one, its ranked candidates. Flagged tokens can be replaced one at
    a time or all at once ("Auto-correct Top").
    """

    def __init__(self):
        """Build the window, load a demo sentence and run a first check."""
        super().__init__()
        self.title("MedSpell Studio — Hybrid Corpus")
        self.geometry("980x620")
        self.configure(bg="#3E2723")  # dark brown background
        self._build_style()
        self._build_layout()
        self._bind_keys()

        # state
        self.spans = []            # (token, start, end) for the checked text
        self.tokens = []           # tokens of the checked text
        self.detect_results = []   # records returned by the detector
        self.issue_indices = []    # token indices that are flagged
        self.issue_cursor = 0      # position used by Prev/Next Issue

        # demo text
        demo = "The doctor prescribed insuline for the patient."
        self.txt.insert("1.0", demo)
        self._enforce_limit()
        self.run_check()

    # ---------- UI ----------
    def _build_style(self):
        """Configure the ttk theme and the brown colour scheme."""
        style = ttk.Style(self)
        style.theme_use("clam")
        style.configure(".", background="#3E2723", foreground="#FFF3E0")
        style.configure("TButton", padding=6, background="#5D4037", foreground="#FFF3E0")
        style.map("TButton", background=[("active", "#6D4C41")])
        style.configure("TLabel", background="#3E2723", foreground="#FFF3E0")
        style.configure("TFrame", background="#3E2723")
        style.configure("Treeview", background="#4E342E", fieldbackground="#4E342E",
                        foreground="#FFF3E0", rowheight=24)
        style.configure("Treeview.Heading", background="#5D4037", foreground="#FFF3E0")

    def _build_layout(self):
        """Create the toolbar, the editor, the inspector trees and the bindings."""
        # Top toolbar
        bar = ttk.Frame(self)
        bar.pack(fill="x", padx=10, pady=10)
        ttk.Button(bar, text="Check (Ctrl/Cmd+Enter)", command=self.run_check).pack(side="left", padx=4)
        ttk.Button(bar, text="Auto-correct Top", command=self.auto_correct).pack(side="left", padx=4)
        ttk.Button(bar, text="Prev Issue", command=lambda: self.jump_issue(-1)).pack(side="left", padx=4)
        ttk.Button(bar, text="Next Issue", command=lambda: self.jump_issue(+1)).pack(side="left", padx=4)
        ttk.Button(bar, text="Corpus Word List", command=self.open_vocab_browser).pack(side="left", padx=4)
        self.lbl_status = ttk.Label(bar, text="Ready.")
        self.lbl_status.pack(side="right")

        # Split view
        split = ttk.Panedwindow(self, orient="horizontal")
        split.pack(fill="both", expand=True, padx=10, pady=(0,10))

        # Left: editor
        left = ttk.Frame(split)
        split.add(left, weight=3)
        self.txt = tk.Text(left, wrap="word", height=20, undo=True,
                           bg="#4E342E", fg="#FFF3E0", insertbackground="#FFF3E0",
                           selectbackground="#6D4C41", font=("Menlo", 12))
        self.txt.pack(fill="both", expand=True, side="left")
        sb = ttk.Scrollbar(left, command=self.txt.yview)
        sb.pack(side="right", fill="y")
        self.txt.configure(yscrollcommand=sb.set)

        # Char counter
        meta = ttk.Frame(self)
        meta.pack(fill="x", padx=10, pady=(0,10))
        self.lbl_count = ttk.Label(meta, text=f"0/{MAX_CHARS} chars")
        self.lbl_count.pack(side="right")

        # Right: Inspector
        right = ttk.Frame(split)
        split.add(right, weight=2)
        ttk.Label(right, text="Suggestion Inspector", font=("TkDefaultFont", 11, "bold")).pack(anchor="w", padx=4, pady=(0,6))
        cols = ("Index", "Word", "Type", "Best", "ΔScore", "Edit")
        self.tree = ttk.Treeview(right, columns=cols, show="headings", height=16)
        for c, w in zip(cols, (60, 120, 100, 140, 80, 60)):
            self.tree.heading(c, text=c)
            self.tree.column(c, width=w, anchor="center")
        self.tree.pack(fill="both", expand=True)
        self.tree.bind("<<TreeviewSelect>>", self._on_tree_select)

        # Suggestions detail + actions
        act = ttk.Frame(right)
        act.pack(fill="x", pady=6)
        self.btn_apply_best = ttk.Button(act, text="Replace Best", command=self.replace_best, state="disabled")
        self.btn_apply_best.pack(side="left", padx=4)
        self.btn_apply_sel  = ttk.Button(act, text="Replace Selected…", command=self.replace_selected, state="disabled")
        self.btn_apply_sel.pack(side="left", padx=4)

        ttk.Label(right, text="Candidates for selected word:").pack(anchor="w", padx=4, pady=(6,2))
        s_cols = ("Candidate", "Score", "Edit")
        self.tree_sug = ttk.Treeview(right, columns=s_cols, show="headings", height=6)
        for c, w in zip(s_cols, (160, 100, 60)):
            self.tree_sug.heading(c, text=c)
            self.tree_sug.column(c, width=w, anchor="center")
        self.tree_sug.pack(fill="x", padx=0, pady=(0,6))

        # highlight tag styles
        self.txt.tag_config("miss_nonword", underline=True, foreground="#FF7043")
        self.txt.tag_config("miss_real", underline=True, foreground="#FFB74D")
        self.txt.tag_config("focus_word", background="#6D4C41")

        # bind events: any edit re-applies the length limit and drops the
        # now possibly stale highlights
        self.txt.bind("<KeyRelease>", lambda e: (self._enforce_limit(), self._clear_highlights()))
        self.txt.bind("<Button-1>", self._click_in_text)

    def _bind_keys(self):
        """Bind Ctrl+Enter and Cmd+Enter to a re-check."""
        self.bind_all("<Control-Return>", lambda e: self.run_check())
        self.bind_all("<Command-Return>", lambda e: self.run_check())

    # ---------- logic ----------
    def _enforce_limit(self):
        """Truncate the editor content to MAX_CHARS and refresh the counter label."""
        content = self.txt.get("1.0", "end-1c")
        if len(content) > MAX_CHARS:
            self.txt.delete("1.0", "end")
            self.txt.insert("1.0", content[:MAX_CHARS])
            content = self.txt.get("1.0", "end-1c")
        self.lbl_count.config(text=f"{len(content)}/{MAX_CHARS} chars")

    def _clear_highlights(self):
        """Remove every misspelling and focus highlight from the editor."""
        for tag in self.txt.tag_names():
            if tag.startswith("miss_") or tag == "focus_word":
                self.txt.tag_remove(tag, "1.0", "end")

    def _highlight_results(self):
        """Underline every flagged token according to its issue type."""
        self._clear_highlights()
        flagged = {r["index"]: r for r in self.detect_results}
        self.issue_indices = list(flagged.keys())
        for i, (_, s, e) in enumerate(self.spans):
            if i in flagged:
                kind = flagged[i]["type"]
                tag = "miss_nonword" if kind == "NON-WORD" else "miss_real"
                self.txt.tag_add(tag, f"1.0+{s}c", f"1.0+{e}c")

    def run_check(self):
        """Run the hybrid detector on the editor content and refresh the UI."""
        content = self.txt.get("1.0", "end-1c")
        self.spans = tokenize_with_spans(content)
        self.tokens = [t for t, _, _ in self.spans]
        try:
            _, results = detect_and_suggest_hybrid(content, topk=5, ctx_margin=1.8)
        except Exception as e:
            messagebox.showerror("Error", f"detect_and_suggest failed:\n{e}")
            return
        self.detect_results = results or []
        self._highlight_results()
        self._refresh_inspector()
        n = len(self.detect_results)
        self.lbl_status.config(text=("No issues found ✅" if n == 0 else f"Found {n} potential issue(s)."))

    def _refresh_inspector(self):
        """Rebuild the issue list and reset the candidate list and buttons."""
        self.tree.delete(*self.tree.get_children())
        for r in self.detect_results:
            idx = r["index"]
            word = r["word"]
            typ = r["type"]
            best = r["suggestions"][0]["cand"] if r.get("suggestions") else "-"
            delta = r.get("best_delta", "")
            editd = r["suggestions"][0]["edit"] if r.get("suggestions") else ""
            # The token index doubles as the row id so rows can be found by index.
            self.tree.insert("", "end", iid=str(idx), values=(idx, word, typ, best, delta, editd))
        self.tree_sug.delete(*self.tree_sug.get_children())
        self.btn_apply_best.config(state="disabled")
        self.btn_apply_sel.config(state="disabled")

    def _on_tree_select(self, event=None):
        """Show the candidates of the selected issue and focus its token."""
        sel = self.tree.selection()
        self.tree_sug.delete(*self.tree_sug.get_children())
        self._clear_focus_word()
        if not sel:
            self.btn_apply_best.config(state="disabled")
            self.btn_apply_sel.config(state="disabled")
            return
        idx = int(sel[0])
        self._focus_token(idx)
        rec = next((r for r in self.detect_results if r["index"] == idx), None)
        if not rec:
            self.btn_apply_best.config(state="disabled")
            self.btn_apply_sel.config(state="disabled")
            return
        for s in rec.get("suggestions", []):
            self.tree_sug.insert("", "end", values=(s["cand"], s["score"], s["edit"]))
        state = "normal" if rec.get("suggestions") else "disabled"
        self.btn_apply_best.config(state=state)
        self.btn_apply_sel.config(state=state)

    def _focus_token(self, idx):
        """Highlight token `idx` in the editor and scroll it into view."""
        if 0 <= idx < len(self.spans):
            _, s, e = self.spans[idx]
            self.txt.tag_add("focus_word", f"1.0+{s}c", f"1.0+{e}c")
            self.txt.see(f"1.0+{s}c")

    def _clear_focus_word(self):
        """Remove the focus highlight."""
        self.txt.tag_remove("focus_word", "1.0", "end")

    def _click_in_text(self, event):
        """Select the issue row that belongs to the token under the mouse click."""
        index = self.txt.index(f"@{event.x},{event.y}")
        upto = self.txt.get("1.0", index)
        pos = len(upto)  # character offset of the click from the start of the text
        clicked = None
        for i, (_, s, e) in enumerate(self.spans):
            if s <= pos <= e:
                clicked = i
                break
        if clicked is None:
            return
        if str(clicked) in self.tree.get_children():
            self.tree.selection_set(str(clicked))
            self.tree.focus(str(clicked))
            self._on_tree_select()

    def replace_best(self):
        """Replace the selected issue's token with its top suggestion."""
        sel = self.tree.selection()
        if not sel:
            return
        idx = int(sel[0])
        rec = next((r for r in self.detect_results if r["index"] == idx), None)
        if not rec or not rec.get("suggestions"):
            return
        best = rec["suggestions"][0]["cand"]
        self._replace_token(idx, best)

    def replace_selected(self):
        """Replace the selected issue's token with the candidate picked in the list."""
        sel = self.tree.selection()
        sel2 = self.tree_sug.selection()
        if not sel or not sel2:
            return
        idx = int(sel[0])
        # str(): Tk may hand a cell value back as a non-string object.
        cand = str(self.tree_sug.item(sel2[0])["values"][0])
        self._replace_token(idx, cand)

    def _replace_token(self, idx, cand):
        """Overwrite token `idx` in the editor with `cand` and re-check."""
        if not (0 <= idx < len(self.spans)):
            return
        _, s, e = self.spans[idx]
        self.txt.delete(f"1.0+{s}c", f"1.0+{e}c")
        self.txt.insert(f"1.0+{s}c", cand)
        self._enforce_limit()  # keeps the char counter in sync after the edit
        self.run_check()

    @staticmethod
    def _apply_best_in_place(content, spans, results):
        """Return `content` with each flagged token span replaced by its top suggestion.

        Only the flagged spans change; casing, punctuation and spacing of the
        rest of the text are kept. Records without suggestions are ignored.
        NOTE(review): `spans` are offsets into the lower-cased text; this
        assumes lower-casing does not change the text length - confirm for
        non-ASCII input.
        """
        best_by_index = {
            r["index"]: r["suggestions"][0]["cand"]
            for r in results
            if r.get("suggestions")
        }
        # Work from the last token backwards so earlier offsets stay valid.
        for idx in sorted(best_by_index, reverse=True):
            if 0 <= idx < len(spans):
                _, s, e = spans[idx]
                content = content[:s] + best_by_index[idx] + content[e:]
        return content

    def auto_correct(self):
        """Replace every flagged token with its top suggestion, then re-check.

        The replacement is done in place on the original text. Going through
        `apply_corrections` instead rebuilt the text from lower-cased tokens
        (losing punctuation and capitals) and skipped the detector's
        "REAL-WORD" records because of a label mismatch.
        """
        content = self.txt.get("1.0", "end-1c")
        try:
            _, results = detect_and_suggest_hybrid(content, topk=5, ctx_margin=1.8)
            corrected = self._apply_best_in_place(content, tokenize_with_spans(content), results)
        except Exception as e:
            messagebox.showerror("Error", f"auto-correct failed:\n{e}")
            return
        self.txt.delete("1.0", "end")
        self.txt.insert("1.0", corrected)
        self._enforce_limit()  # keeps the char counter in sync after the edit
        self.run_check()

    def jump_issue(self, step):
        """Move the issue cursor by `step` (wrapping around) and select that issue."""
        if not self.detect_results:
            return
        self.issue_cursor = (self.issue_cursor + step) % len(self.detect_results)
        idx = self.detect_results[self.issue_cursor]["index"]
        if str(idx) in self.tree.get_children():
            self.tree.selection_set(str(idx))
            self.tree.focus(str(idx))
            self._on_tree_select()

    def open_vocab_browser(self):
        """Open a window for browsing and searching the hybrid vocabulary."""
        vb = tk.Toplevel(self)
        vb.title("Corpus Vocabulary — Hybrid")
        vb.geometry("560x520")
        vb.configure(bg="#3E2723")
        ttk.Label(vb, text="Search vocabulary (Hybrid Corpus):", font=("TkDefaultFont", 10, "bold")).pack(anchor="w", padx=10, pady=(10,4))

        frm = ttk.Frame(vb)
        frm.pack(fill="x", padx=10, pady=6)
        qvar = tk.StringVar()
        entry = ttk.Entry(frm, textvariable=qvar, width=30)
        entry.pack(side="left", padx=(0,6))
        entry.configure(foreground="black", background="white")

        def refresh():
            """Fill the list with the search hits, or the top 600 words when the query is empty."""
            lb.delete(*lb.get_children())
            q = (qvar.get() or "").lower().strip()
            try:
                items = search_vocab_hybrid(q, top=600) if q else sorted_vocab_hybrid[:600]
            except Exception:
                # Best effort: show an empty list if the vocabulary is unavailable.
                items = []
            for w, c in items:
                lb.insert("", "end", values=(w, c))

        ttk.Button(frm, text="Search", command=refresh).pack(side="left")
        ttk.Button(frm, text="Top 600", command=lambda:(qvar.set(""), refresh())).pack(side="left", padx=6)

        cols = ("Word", "Count")
        lb = ttk.Treeview(vb, columns=cols, show="headings", height=18)
        for c, w in zip(cols, (300, 120)):
            lb.heading(c, text=c)
            lb.column(c, width=w, anchor="w")
        lb.pack(fill="both", expand=True, padx=10, pady=(0,10))

        refresh()
        entry.bind("<Return>", lambda e: refresh())

# --- Launch app ---
# Only when this code runs as the main module: create the window (which also
# runs a first check on the demo text) and enter the Tk event loop.
if __name__ == "__main__":
    app = SpellApp()
    app.mainloop()
