In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources if you haven't
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Shared NLP resources: created once at module load and reused by every
# call to process_document.
stop_words = set(stopwords.words('english'))  # set gives fast membership tests
lemmatizer = WordNetLemmatizer()

def process_document(text):
    """Normalize a raw document into a space-separated string of lemmas.

    Pipeline, in order:
      1. Lowercase the text.
      2. Drop every character that is not a-z or whitespace.
      3. Split on whitespace (this also normalizes runs of whitespace).
      4. Drop link tokens (anything containing 'http' or 'www').
      5. Drop tokens of 17 or more characters.
      6. Collapse runs of 3+ identical characters to a single character.
      7. Drop stop words, lemmatize, then drop stop words again.
      8. Drop single-character tokens.

    Args:
        text: The raw document as a string.

    Returns:
        The cleaned tokens joined by single spaces; an empty string when
        nothing survives the filters.
    """
    # 1-2. Lowercase, then strip digits, punctuation and any non a-z symbol.
    text = re.sub(r'[^a-z\s]', '', text.lower())

    # 3. Tokenize; str.split() discards leading/trailing/repeated whitespace.
    tokens = text.split()

    # 4. Punctuation is already gone, so a URL has collapsed into one token
    #    (e.g. 'httpswwwexamplecom'); a substring test is enough to spot it.
    tokens = [tok for tok in tokens if 'http' not in tok and 'www' not in tok]

    # 5. Length filter runs BEFORE repeat-collapsing, so an over-long
    #    elongated word is dropped rather than normalized.
    tokens = [tok for tok in tokens if len(tok) < 17]

    # 6. 'wordsssss' -> 'words', 'isss' -> 'is'.
    tokens = [re.sub(r'(.)\1{2,}', r'\1', tok) for tok in tokens]

    # 7a. Remove stop words on their surface form first. The default (noun)
    #     lemmatization mangles some of them ('was' -> 'wa', 'has' -> 'ha'),
    #     and the mangled stems would otherwise slip past the stop list.
    tokens = [tok for tok in tokens if tok not in stop_words]

    # 7b. Lemmatize what is left.
    tokens = [lemmatizer.lemmatize(tok) for tok in tokens]

    # 7c + 8. Drop lemmas that are themselves stop words, and 1-char tokens.
    tokens = [tok for tok in tokens if tok not in stop_words and len(tok) > 1]

    return ' '.join(tokens)

# Demo inputs for process_document. Each sentence targets one or more of the
# cleaning steps: punctuation/digits, links, casing, whitespace, over-long
# words, stop words, single-letter tokens and repeated characters.
test_samples = [
    "The quick brown fox jumps over the lazy dog. 1234567890!",
    "Check out this link: https://www.example.com, it's amazing!",
    "HELLO WORLD! THIS IS A TEST SENTENCE.",
    "Spaces    and     tabs should be   normalized properly.",
    "Some random wordsssss with extremelengthwordthatexceedsseventeencharacters.",
    "I went to the park yesterday, and I saw 10 dogs playing!",
    "Supercalifragilisticexpialidocious is a very long word.",
    "The weather is really nice today, but it's quite windy.",
    "Cats and dogs are common pets, but some prefer birds.",
    "Why did the chicken cross the road? To get to the other side!",
    "Visit www.somesite.com for more details about the product.",
    "Error 404: Page not found. Please try again later.",
    "This document is filled with various characters like #, $, and %!",
    "Short example: A B C D E F G.",
    "Another longwordtoberemovedfromthissentence just for testing.",
    "This isss a TESTTT documentttttt with repeatedddd letterssss.",
    "I went to the mallllllllllllllllll yesterday.",
    "Shhhhhhhhhhhhh be quietttttttttttttttt!",
    "Multipleeeeeeeee character repetitionnnnnnnnnn should stopppppp here.",
    # NOTE(review): identical to the fifth sample above - confirm the
    # duplicate is intentional.
    "Some random wordsssss with extremelengthwordthatexceedsseventeencharacters."
]

# Show each raw sample followed by its cleaned form, separated by a blank line.
for raw_text in test_samples:
    print(f"Original: {raw_text}")
    cleaned_text = process_document(raw_text)
    print(f"Processed: {cleaned_text}\n")

Original: The quick brown fox jumps over the lazy dog. 1234567890!
Processed: quick brown fox jump lazy dog

Original: Check out this link: https://www.example.com, it's amazing!
Processed: check link amazing

Original: HELLO WORLD! THIS IS A TEST SENTENCE.
Processed: hello world test sentence

Original: Spaces    and     tabs should be   normalized properly.
Processed: space tab normalized properly

Original: Some random wordsssss with extremelengthwordthatexceedsseventeencharacters.
Processed: random word

Original: I went to the park yesterday, and I saw 10 dogs playing!
Processed: went park yesterday saw dog playing

Original: Supercalifragilisticexpialidocious is a very long word.
Processed: long word

Original: The weather is really nice today, but it's quite windy.
Processed: weather really nice today quite windy

Original: Cats and dogs are common pets, but some prefer birds.
Processed: cat dog common pet prefer bird

Original: Why did the chicken cross the road? To get to the 