In [1]:
# Canonisation / Normalisation demo in NLP
# Works in Google Colab

# Install required libraries
!pip install nltk spacy contractions pyspellchecker

import re
import string
import nltk
import spacy
import contractions
from spellchecker import SpellChecker

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting pyspellchecker
  Downloading pyspellchecker-0.8.3-py3-none-any.whl.metadata (9.5 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.3-py3-none-any.whl.metadata (1.6 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading pyspellchecker-0.8.3-py3-none-any.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.3-py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━

In [2]:
# Download resources
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [3]:
spell = SpellChecker()

In [4]:
def text_canonisation(text):
    # 1. Lowercasing
    text = text.lower()

    # 2. Expand contractions
    text = contractions.fix(text)

    # 3. Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # 4. Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # 5. Tokenize
    tokens = nltk.word_tokenize(text)

    # 6. Spelling correction
    corrected = [spell.correction(word) if word not in spell else word for word in tokens]

    # 7. Lemmatization
    doc = nlp(" ".join(corrected))
    lemmatized = [token.lemma_ for token in doc]

    # 8. Entity Canonisation (manual dictionary example)
    entity_map = {
        "nyc": "new york city",
        "usa": "united states",
        "u.s.": "united states",
        "colour": "color"
    }
    final_tokens = [entity_map.get(word, word) for word in lemmatized]

    return " ".join(final_tokens)


In [5]:
# ---- Example Usage ---- #
sample_texts = [
    "I’m running to NYC!!!",
    "He don’t like COLOURS, but loves the U.S.",
    "Thiss is a testt sentnce with mispelling."
]

In [7]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [8]:
for txt in sample_texts:
    print(f"\nOriginal: {txt}")
    print(f"Canonised: {text_canonisation(txt)}")



Original: I’m running to NYC!!!
Canonised: I be run to nyx

Original: He don’t like COLOURS, but loves the U.S.
Canonised: he do not like color but love the yous

Original: Thiss is a testt sentnce with mispelling.
Canonised: this be be a test sentence with dispelling
