# Deep Learning for Business Applications course

## TOPIC 10: Text data preprocessing

This notebook was generated with the help of DeepSeek using the following prompt:
```
Give me example of Python code for Jupyter notebook to demonstrate NLP techniques for text preprocessing
(1) `re` regexp library
(2) `nltk` library
(3) `spaCy` library
Show how these libraries work with the same piece of text
```

### 1. Libraries and parameters

In [None]:
# Install spaCy and download the `en_core_web_sm` pipeline that is loaded
# later in this notebook with `spacy.load("en_core_web_sm")`.
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
import re
import nltk
import spacy
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

# Download required NLTK data.
# Newer NLTK releases (3.9+) look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead of the legacy 'punkt' and
# 'averaged_perceptron_tagger' resources, so both generations are requested
# to keep the notebook working on old and new NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

### 2. Preprocessing demo

#### 2.1. Sample document

In [None]:
# Shared sample document passed to each of the three preprocessing demos.
# It mixes an email address, a URL, a phone number, digits, punctuation,
# upper-case words, inflected word forms and contractions.
sample_text = """
Hello! This is a sample text for NLP preprocessing demonstration. 
It contains various elements: emails like john.doe@email.com, 
URLs like https://www.example.com, and phone numbers like (555) 123-4567.
We'll also process some SPECIAL characters & numbers like 42! 
The quick brown foxes are jumping over 10 lazy dogs. 
Running, jumped, and runs will be stemmed/lemmatized.
Let's test contractions: don't, can't, we're going to handle them!
"""

#### 2.2. Regex library

In [None]:
def preprocess_with_regex(text):
    """
    Demonstrates text cleaning with the `re` library, printing each step.

    The text is lowercased, then URLs, email addresses and phone numbers
    are removed, every non-letter character is replaced by a space,
    whitespace is collapsed, and the result is split into word tokens.

    Args:
      :text: text to process

    Returns:
      list of word tokens extracted from the cleaned text

    """
    print("Original Text:")
    print(text)
    print("\n" + "-" * 40)

    # 1. Convert to lowercase
    current = text.lower()
    print("1. Lowercase:")
    print(current)

    # 2-5. Substitution steps, applied in order: (heading, pattern, replacement)
    substitutions = (
        ("\n2. Remove URLs:", r'https?://\S+|www\.\S+', ''),
        ("\n3. Remove Emails:", r'\S+@\S+', ''),
        ("\n4. Remove Phone Numbers:", r'\(\d{3}\)\s*\d{3}-\d{4}', ''),
        # Anything that is not a letter or whitespace becomes a space
        ("\n5. Remove Special Characters & Numbers:", r'[^a-zA-Z\s]', ' '),
    )
    for heading, pattern, replacement in substitutions:
        current = re.sub(pattern, replacement, current)
        print(heading)
        print(current)

    # 6. Collapse whitespace runs and trim both ends
    current = re.sub(r'\s+', ' ', current).strip()
    print("\n6. Remove Extra Whitespace:")
    print(current)

    # 7. Simple tokenization using regex
    tokens = re.findall(r'\b\w+\b', current)
    print("\n7. Tokens:")
    print(tokens)

    return tokens

In [None]:
# Print the section banner, then run the regex demo on the sample document.
print("=" * 60, "Text preprocessing with `re` library", sep="\n")
print("=" * 60, '\n')

re_tokens = preprocess_with_regex(sample_text)

#### 2.3. NLTK library

In [None]:
def preprocess_with_nltk(text):
    """
    Demonstrates text preprocessing with the `NLTK` library, printing each step.

    Steps: sentence tokenization, word tokenization, lowercasing, keeping
    alphabetic tokens only, stopword removal, stemming, POS-aware
    lemmatization and POS tagging.

    Args:
      :text: text to process

    Returns:
      tuple of three lists of equal length:
      (tokens without stopwords, stemmed tokens, lemmatized tokens)

    """
    print("Original Text:")
    print(text)
    print("\n" + "-" * 40)

    # 1. Sentence tokenization
    sentences = sent_tokenize(text)
    print("1. Sentence Tokenization:")
    for i, sent in enumerate(sentences, 1):
        print(f"   {i}. {sent}")

    # 2. Word tokenization
    words = word_tokenize(text)
    print(f"\n2. Word Tokenization ({len(words)} tokens):")
    print(words[:15])  # Show first 15 tokens

    # 3. Convert to lowercase
    words_lower = [word.lower() for word in words]
    print("\n3. Lowercase Tokens:")
    print(words_lower[:15])

    # 4. Remove punctuation and numbers
    words_alpha = [word for word in words_lower if word.isalpha()]
    print("\n4. Alphabetical Tokens Only:")
    print(words_alpha[:15])

    # 5. Remove stopwords
    stop_words = set(stopwords.words('english'))
    words_no_stopwords = [word for word in words_alpha if word not in stop_words]
    print("\n5. Remove Stopwords:")
    print(f"   Stopwords removed: {[word for word in words_alpha if word in stop_words]}")
    print(f"   Remaining tokens: {words_no_stopwords}")

    # 6. Stemming
    stemmer = PorterStemmer()
    words_stemmed = [stemmer.stem(word) for word in words_no_stopwords]
    print("\n6. Stemming:")
    for original, stemmed in zip(words_no_stopwords[:10], words_stemmed[:10]):
        print(f"   {original} -> {stemmed}")

    # 7. Lemmatization
    # WordNetLemmatizer treats every word as a noun unless a POS is given,
    # which typically leaves verb forms such as "jumped" unchanged. Tag the
    # tokens once and translate the first letter of each Penn Treebank tag
    # to the matching WordNet POS code (noun is the fallback).
    tagged = pos_tag(words_no_stopwords)
    penn_to_wordnet = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
    lemmatizer = WordNetLemmatizer()
    words_lemmatized = [
        lemmatizer.lemmatize(word, penn_to_wordnet.get(tag[:1], 'n'))
        for word, tag in tagged
    ]
    print("\n7. Lemmatization:")
    for original, lemma in zip(words_no_stopwords[:10], words_lemmatized[:10]):
        print(f"   {original} -> {lemma}")

    # 8. Part-of-Speech Tagging (reuses the tags computed for lemmatization)
    print("\n8. POS Tagging (first 15 tokens):")
    for word, pos in tagged[:15]:
        print(f"   {word}: {pos}")

    return words_no_stopwords, words_stemmed, words_lemmatized

In [None]:
# Print the section banner, then run the NLTK demo on the sample document.
print("=" * 60, "Text preprocessing with `NLTK` library", sep="\n")
print("=" * 60, '\n')

nltk_tokens, nltk_stemmed, nltk_lemmatized = preprocess_with_nltk(sample_text)

#### 2.4. SpaCy library

In [None]:
def preprocess_with_spacy(text):
    """
    Demonstrates text preprocessing with the `spaCy` library, printing each step.

    Steps: sentence segmentation, per-token linguistic features, lemmatized
    clean tokens, named entities, noun phrases and a dependency sample for
    the first sentence.

    Args:
      :text: text to process

    Returns:
      tuple of three lists:
      (clean lemmatized tokens, (entity text, label) pairs, noun phrases)

    """
    print("Original Text:")
    print(text)
    print("\n" + "-" * 40)

    # Process text with spaCy
    doc = nlp(text)

    # 1. Sentence segmentation
    print("1. Sentence Segmentation:")
    for i, sent in enumerate(doc.sents, 1):
        print(f"   {i}. {sent.text}")

    # 2. Tokenization with linguistic features
    print("\n2. Detailed Token Analysis (first 20 tokens):")
    print(
        f"{'Token':<15} {'Lemma':<15} {'POS':<10}",
        f"{'Tag':<10} {'Stopword':<10} {'Alpha':<8}"
    )
    print("-" * 80)
    for token in doc[:20]:
        # Whitespace tokens (e.g. a leading newline) would break the table
        # row, so show them in escaped form.
        if token.is_space:
            text_col, lemma_col = repr(token.text), repr(token.lemma_)
        else:
            text_col, lemma_col = token.text, token.lemma_
        # A bool formatted with a width spec goes through int formatting and
        # prints 1/0; convert to str first so the table shows True/False.
        print(
            f"{text_col:<15} {lemma_col:<15} {token.pos_:<10}",
            f"{token.tag_:<10} {str(token.is_stop):<10} {str(token.is_alpha):<8}"
        )

    # 3. Extract various components
    # Clean tokens (no stopwords, punctuation, or spaces)
    clean_tokens = [
        token.lemma_.lower() for token in doc
        if not token.is_stop and not token.is_punct and not token.is_space
    ]

    print("\n3. Clean Tokens (lemmatized, no stopwords/punctuation):")
    print(clean_tokens)

    # 4. Named Entity Recognition
    print("\n4. Named Entity Recognition:")
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    if entities:
        for entity, label in entities:
            print(f"   {entity} -> {label}")
    else:
        print("   No named entities detected in this text.")

    # 5. Noun phrases
    print("\n5. Noun Phrases:")
    noun_phrases = [chunk.text for chunk in doc.noun_chunks]
    for i, phrase in enumerate(noun_phrases[:10], 1):
        print(f"   {i}. {phrase}")

    # 6. Dependency parsing visualization (first sentence)
    # Use a default so a doc without sentences (e.g. empty text) does not
    # raise StopIteration.
    first_sent = next(iter(doc.sents), None)
    print("\n6. Dependency Parsing (first sentence - sample):")
    if first_sent is None:
        print("   No sentences detected in this text.")
    else:
        print(f"   Sentence: {first_sent}")
        print(f"   Root token: {first_sent.root.text} (POS: {first_sent.root.pos_})")
        print("   Sample dependencies:")
        for token in first_sent[:8]:  # Show first 8 tokens
            print(f"     {token.text:<10} -> {token.dep_:<12} -> {token.head.text}")

    return clean_tokens, entities, noun_phrases

In [None]:
# Print the section banner, then run the spaCy demo on the sample document.
print("=" * 60, "Text preprocessing with `spaCy` library", sep="\n")
print("=" * 60, '\n')

spacy_tokens, spacy_entities, spacy_noun_phrases = preprocess_with_spacy(sample_text)