In [6]:

!pip install pyspellchecker


<class 'OSError'>: Not available

In [7]:
import os
from pathlib import Path
from spellchecker import SpellChecker  # from pyspellchecker
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from tkinter import filedialog
import tkinter as tk
import csv
from datetime import datetime
from collections import Counter


# ----------------------------
# NLTK setup
# ----------------------------
def ensure_nltk():
    """
    Ensure the NLTK tokenizer data needed by ``word_tokenize`` is installed.

    Each resource is looked up locally first and downloaded (quietly) only
    when that lookup raises ``LookupError``.  Both ``punkt`` and ``punkt_tab``
    are checked: recent NLTK releases load the Punkt model from ``punkt_tab``,
    so having only ``punkt`` would still leave ``word_tokenize`` failing there.
    """
    for resource in ('punkt', 'punkt_tab'):
        try:
            nltk.data.find(f'tokenizers/{resource}')
        except LookupError:
            nltk.download(resource, quiet=True)


# ----------------------------
# Folder selection helpers
# ----------------------------
def select_folder_dialog(title):
    """
    Ask the user to pick a directory and return the chosen path.

    A throwaway Tk root window is created and hidden just for this dialog and
    is destroyed afterwards, even if the dialog raises.  Returns ``None`` when
    the dialog yields an empty/falsy result (e.g. the user cancelled).
    """
    hidden_root = tk.Tk()
    hidden_root.withdraw()
    try:
        chosen = filedialog.askdirectory(title=title)
        return chosen if chosen else None
    finally:
        hidden_root.destroy()


# ----------------------------
# Tokenization and filtering
# ----------------------------
def tokenize_text(text):
    """Return the tokens that NLTK's ``word_tokenize`` produces for ``text``."""
    tokens = word_tokenize(text)
    return tokens


def is_candidate_word(tok):
    """
    Tell whether ``tok`` is worth sending to the spell checker.

    A token qualifies only if it is a string made purely of letters, is at
    least three characters long, and is not written entirely in upper case
    (all-caps tokens are treated as probable acronyms).
    """
    return (
        isinstance(tok, str)
        and tok.isalpha()
        and len(tok) > 2
        and not tok.isupper()
    )


def count_real_words(text):
    """
    Return how many tokens of ``text`` consist solely of letters.

    This is the denominator used when an error rate is computed, so
    punctuation, numbers and mixed tokens are left out of the count.
    """
    alphabetic_tokens = [
        tok for tok in tokenize_text(text)
        if isinstance(tok, str) and tok.isalpha()
    ]
    return len(alphabetic_tokens)


# ----------------------------
# Spelling analysis and correction
# ----------------------------
def analyze_spelling(text, spell_checker: SpellChecker):
    """
    Find misspelled candidate tokens in ``text`` and suggest corrections.

    Args:
        text: The text to analyse.
        spell_checker: Object providing ``unknown(words)`` and
            ``correction(word)``; both are called with lower-cased words.

    Returns:
        (corrections_dict, error_count).
        corrections_dict maps each original surface form to its suggested
        correction (or to the surface form itself when there is no
        suggestion); only the first suggestion per surface form is kept.
        error_count is the number of misspelled token instances (not unique
        types).
    """
    tokens = tokenize_text(text)
    # Pair every spell-checkable token with the lower-cased form that is
    # actually looked up in the dictionary.
    candidates = [(tok, tok.lower()) for tok in tokens if is_candidate_word(tok)]
    misspelled = spell_checker.unknown([lowered for _, lowered in candidates])

    # correction() is the expensive call, so ask only once per distinct word
    # no matter how many times the word occurs in the text.
    suggestion_cache = {}
    corrections = {}
    error_count = 0
    for surface, lowered in candidates:
        if lowered not in misspelled:
            continue
        if lowered not in suggestion_cache:
            suggestion_cache[lowered] = spell_checker.correction(lowered)
        # Fall back to the surface form when the checker has no suggestion.
        corrections.setdefault(surface, suggestion_cache[lowered] or surface)
        error_count += 1

    return corrections, error_count


def correct_spelling(text, spell_checker: SpellChecker):
    """
    Return ``text`` with misspelled candidate tokens replaced by suggestions.

    The text is processed one line at a time and each original line ending is
    re-attached afterwards, so the line and paragraph layout of the input is
    kept (tokenizing and detokenizing the whole text at once would drop the
    line breaks).  Lines that are empty or whitespace-only are passed through
    untouched.  Inside a line, spacing is whatever the detokenizer produces.

    Args:
        text: The text to correct.
        spell_checker: Object providing ``unknown(words)`` and
            ``correction(word)``; both are called with lower-cased words.

    Returns:
        The corrected text.  A token is left unchanged when the checker
        offers no usable (non-empty string) suggestion.
    """
    detok = TreebankWordDetokenizer()
    # correction() is the expensive call; remember the answer per word.
    suggestion_cache = {}

    def _suggest(lowered):
        if lowered not in suggestion_cache:
            suggestion_cache[lowered] = spell_checker.correction(lowered)
        return suggestion_cache[lowered]

    def _fix_line(line):
        tokens = tokenize_text(line)
        candidate_indices = [i for i, t in enumerate(tokens) if is_candidate_word(t)]
        candidate_words = [tokens[i].lower() for i in candidate_indices]
        misspelled = spell_checker.unknown(candidate_words)

        for i, lw in zip(candidate_indices, candidate_words):
            if lw not in misspelled:
                continue
            orig = tokens[i]
            suggestion = _suggest(lw)
            if not isinstance(suggestion, str) or not suggestion:
                # No safe suggestion: keep the original token.
                continue
            # Carry the original capitalisation over to the replacement.
            if orig.istitle():
                suggestion = suggestion.capitalize()
            elif orig.isupper():
                suggestion = suggestion.upper()
            tokens[i] = suggestion

        # Final guard: every token must be a string for the detokenizer.
        tokens = [t if isinstance(t, str) else "" for t in tokens]
        return detok.detokenize(tokens)

    pieces = []
    # splitlines() with and without keepends yields the same lines, so the
    # difference in length is exactly the line terminator to restore.
    for bare, full in zip(text.splitlines(), text.splitlines(keepends=True)):
        ending = full[len(bare):]
        fixed = _fix_line(bare) if bare.strip() else bare
        pieces.append(fixed + ending)
    return "".join(pieces)


# ----------------------------
# CSV export and report
# ----------------------------
def export_to_csv(error_summary: dict, output_folder: Path):
    """
    Write a per-file summary CSV and a per-word detail CSV to ``output_folder``.

    error_summary structure:
      filename: {
        'error_count': int,
        'errors': {surface: suggestion},
        'original_text': str
      }

    Both file names carry the same ``YYYYMMDD_HHMMSS`` timestamp taken when
    the function is called.

    Returns:
        (summary_csv_path, detailed_csv_path) as strings.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    summary_csv_path = output_folder / f"spelling_error_summary_{timestamp}.csv"
    with summary_csv_path.open('w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Filename', 'Total Word Count', 'Error Count', 'Error Rate (%)'])
        for filename, data in error_summary.items():
            total_words = count_real_words(data['original_text'])
            # Guard against division by zero for files without any words.
            error_rate = (data['error_count'] / total_words * 100) if total_words > 0 else 0
            writer.writerow([filename, total_words, data['error_count'], f"{error_rate:.2f}"])

    detailed_csv_path = output_folder / f"spelling_error_details_{timestamp}.csv"
    with detailed_csv_path.open('w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Filename', 'Misspelled Word (surface)', 'Correction'])
        for filename, data in error_summary.items():
            # Case-insensitive alphabetical order of the misspelled words.
            for word, suggestion in sorted(data['errors'].items(), key=lambda kv: kv[0].lower()):
                # Second column is the misspelled word itself, third its correction.
                writer.writerow([filename, word, suggestion])

    return str(summary_csv_path), str(detailed_csv_path)


def write_text_report(error_summary: dict, output_folder: Path):
    """
    Create ``_spelling_error_report.txt`` in ``output_folder`` and return its path.

    The report has a title, then one section per file with the file name, the
    error count and every "word -> correction" pair sorted case-insensitively.
    An existing report with the same name is overwritten.
    """
    rule = "-" * 30

    def _render():
        # Yield the report piece by piece so writing proceeds in file order.
        yield "Spelling Error Summary Report\n"
        yield "=" * 30 + "\n\n"
        for filename, data in error_summary.items():
            found = data['errors']
            yield f"File: {filename}\n"
            yield f"Total errors: {data['error_count']}\n"
            yield "Corrections:\n"
            for word in sorted(found, key=str.lower):
                yield f"  {word} -> {found[word]}\n"
            yield "\n" + rule + "\n\n"

    report_path = output_folder / "_spelling_error_report.txt"
    with report_path.open('w', encoding='utf-8') as report:
        report.writelines(_render())
    return str(report_path)


# ----------------------------
# Main processing
# ----------------------------
def process_folder(input_folder: Path, output_folder: Path):
    """
    Spell-check every ``.txt`` file directly inside ``input_folder``.

    For each file (visited in sorted order) the text is analysed, a corrected
    copy with the same file name is written to ``output_folder`` and a short
    per-file summary is printed.  Afterwards the CSV exports and the text
    report are written to ``output_folder`` and an overall summary is printed.
    """
    checker = SpellChecker(language='en')
    error_summary = {}
    divider = "-" * 50

    for txt_path in sorted(input_folder.iterdir()):
        # Only regular files with a .txt suffix (any letter case) are handled.
        if not (txt_path.is_file() and txt_path.suffix.lower() == '.txt'):
            continue

        try:
            content = txt_path.read_text(encoding='utf-8')
        except UnicodeDecodeError:
            # Fall back to a lenient decode instead of aborting the whole run.
            content = txt_path.read_text(encoding='utf-8', errors='replace')

        errors, error_count = analyze_spelling(content, checker)
        error_summary[txt_path.name] = dict(
            error_count=error_count,
            errors=errors,
            original_text=content,
        )

        fixed_text = correct_spelling(content, checker)
        (output_folder / txt_path.name).write_text(fixed_text, encoding='utf-8')

        print(f"Processed: {txt_path.name}")
        print(f"Number of spelling errors found: {error_count}")
        if errors:
            print("Misspelled words and their corrections:")
            for surface, fix in sorted(errors.items(), key=lambda kv: kv[0].lower()):
                print(f"  {surface} -> {fix}")
        print(divider)

    summary_csv, detailed_csv = export_to_csv(error_summary, output_folder)
    report_path = write_text_report(error_summary, output_folder)

    # Tally lower-cased misspellings over all files.  The tally is built from
    # each file's distinct surface forms, not from every single occurrence.
    tally = Counter(
        surface.lower()
        for record in error_summary.values()
        for surface in record["errors"]
    )

    if tally:
        print("\nTop 5 Most Frequent Misspelled Words:")
        for word, freq in tally.most_common(5):
            print(f"  {word} (appeared {freq} times across files)")
    else:
        print("\nNo spelling errors detected in the processed files.")
    print(divider)

    print("\nSpelling correction completed!")
    print(f"Corrected files and reports are in: {output_folder}")
    for label, produced in (
        ("Summary CSV", summary_csv),
        ("Detailed CSV", detailed_csv),
        ("Text report", report_path),
    ):
        print(f"{label}: {Path(produced).name}")


# ----------------------------
# Entrypoint
# ----------------------------
def main():
    """
    Interactive entry point: choose folders, then run the spell check.

    Makes sure the NLTK data is present, asks for an input folder and then an
    output folder through directory dialogs, creates the output folder if it
    is missing and hands both to ``process_folder``.  If either dialog yields
    no folder, a message is printed and nothing is processed.
    """
    ensure_nltk()

    prompts = (
        ("Please select input folder...",
         "Select Input Folder",
         "No input folder selected. Exiting..."),
        ("Please select output folder...",
         "Select Output Folder",
         "No output folder selected. Exiting..."),
    )

    picked = []
    for announcement, dialog_title, abort_message in prompts:
        print(announcement)
        choice = select_folder_dialog(dialog_title)
        if not choice:
            print(abort_message)
            return
        picked.append(Path(choice))

    source_dir, target_dir = picked
    target_dir.mkdir(parents=True, exist_ok=True)

    process_folder(source_dir, target_dir)


# Start the interactive workflow only when this code is executed directly;
# importing it as a module just defines the functions above.
if __name__ == "__main__":
    main()




<class 'ModuleNotFoundError'>: No module named 'spellchecker'

In [None]:
pip uninstall spellchecker

Found existing installation: spellchecker 0.4
,Uninstalling spellchecker-0.4:
,  Would remove:
,    /opt/anaconda3/lib/python3.11/site-packages/spellchecker-0.4.dist-info/*
,    /opt/anaconda3/lib/python3.11/site-packages/spellchecker/*
,Proceed (Y/n)? 

In [None]:
Y