# Text Cleaning Toolbox Demo

In [None]:
import re
import pandas as pd
from nluztoolbox import TextCleaning, get_contractions_dict, download_nltk_data

print("✅ Imports successful!")
print("📦 TextCleaning module loaded")

✅ Imports successful!
📦 TextCleaning module loaded


## Part 1: Single Text String Examples


In [2]:
# Sample text with various issues
sample_text = "  Hello WORLD! I can't believe it's 2024. Visit https://example.com or email test@example.com. <p>HTML tags here</p>  "
print(f"\nOriginal text:\n{repr(sample_text)}")


Original text:
"  Hello WORLD! I can't believe it's 2024. Visit https://example.com or email test@example.com. <p>HTML tags here</p>  "


In [3]:
# Example 1: Lowercase conversion
print("\n--- Example 1: Lowercase ---")
result = TextCleaning(sample_text).lowercase().get()
print(f"Result: {repr(result)}")

# Example 2: Uppercase conversion
print("\n--- Example 2: Uppercase ---")
result = TextCleaning(sample_text).uppercase().get()
print(f"Result: {repr(result)}")

# Example 3: Remove punctuation
print("\n--- Example 3: Remove Punctuation ---")
result = TextCleaning(sample_text).remove_punctuation().get()
print(f"Result: {repr(result)}")

# Example 4: Remove punctuation but keep some
print("\n--- Example 4: Remove Punctuation (Keep some) ---")
result = TextCleaning(sample_text).remove_punctuation(keep=".,!?").get()
print(f"Result: {repr(result)}")

# Example 5: Expand contractions
print("\n--- Example 5: Expand Contractions ---")
result = TextCleaning(sample_text).expand_contractions().get()
print(f"Result: {repr(result)}")

# Example 6: Remove URLs and emails
print("\n--- Example 6: Remove URLs and Emails ---")
result = TextCleaning(sample_text).remove_urls().remove_emails().get()
print(f"Result: {repr(result)}")

# Example 7: Remove HTML tags
print("\n--- Example 7: Remove HTML Tags ---")
result = TextCleaning(sample_text).remove_html_tags().get()
print(f"Result: {repr(result)}")

# Example 8: Remove extra whitespace
print("\n--- Example 8: Remove Extra Whitespace ---")
result = TextCleaning(sample_text).remove_whitespace(mode="extra").get()
print(f"Result: {repr(result)}")


--- Example 1: Lowercase ---
Result: "  hello world! i can't believe it's 2024. visit https://example.com or email test@example.com. <p>html tags here</p>  "

--- Example 2: Uppercase ---
Result: "  HELLO WORLD! I CAN'T BELIEVE IT'S 2024. VISIT HTTPS://EXAMPLE.COM OR EMAIL TEST@EXAMPLE.COM. <P>HTML TAGS HERE</P>  "

--- Example 3: Remove Punctuation ---
Result: '  Hello WORLD I cant believe its 2024 Visit httpsexamplecom or email testexamplecom pHTML tags herep  '

--- Example 4: Remove Punctuation (Keep some) ---
Result: '  Hello WORLD! I cant believe its 2024. Visit httpsexample.com or email testexample.com. pHTML tags herep  '

--- Example 5: Expand Contractions ---
Result: '  Hello WORLD! I cannot believe it is 2024. Visit https://example.com or email test@example.com. <p>HTML tags here</p>  '

--- Example 6: Remove URLs and Emails ---
Result: "  Hello WORLD! I can't believe it's 2024. Visit  or email . <p>HTML tags here</p>  "

--- Example 7: Remove HTML Tags ---
Result: "  Hello

In [4]:
# Example 9: Chaining multiple operations
print("\n--- Example 9: Chaining Multiple Operations ---")
result = (TextCleaning(sample_text)
          .lowercase()
          .expand_contractions()
          .remove_urls()
          .remove_emails()
          .remove_html_tags()
          .remove_punctuation()
          .remove_whitespace(mode="extra")
          .get())
print(f"Result: {repr(result)}")


--- Example 9: Chaining Multiple Operations ---
Result: 'hello world i cannot believe it is 2024 visit or email html tags here'


In [5]:
# Example 10: Split text
print("\n--- Example 10: Split Text ---")
text = "apple,banana,orange,grape"
result = TextCleaning(text).split_text(delimiter=',')
print(f"Result: {result}")

# Example 11: Simple tokenization (no NLTK required)
print("\n--- Example 11: Simple Tokenization ---")
text = "This is a simple sentence for tokenization"
result = TextCleaning(text).tokenize(method="simple")
print(f"Result: {result}")


--- Example 10: Split Text ---
Result: ['apple', 'banana', 'orange', 'grape']

--- Example 11: Simple Tokenization ---
Result: ['This', 'is', 'a', 'simple', 'sentence', 'for', 'tokenization']


## Part 2: Dataframe Column Examples

In [6]:
df = pd.DataFrame({
    'text': [
        "I can't believe it's working!",
        "Visit https://example.com for more INFO.",
        "  Extra   spaces   everywhere  ",
        "Remove <b>HTML</b> tags and email@test.com",
        "DON'T SHOUT AT ME!!!"
    ],
    'id': [1, 2, 3, 4, 5]
})
df

Unnamed: 0,text,id
0,I can't believe it's working!,1
1,Visit https://example.com for more INFO.,2
2,Extra spaces everywhere,3
3,Remove <b>HTML</b> tags and email@test.com,4
4,DON'T SHOUT AT ME!!!,5


In [7]:
# Example 12: Lowercase on DataFrame column
print("\n--- Example 12: Lowercase on DataFrame ---")
df['text_lowercase'] = TextCleaning(df, text_column='text').lowercase().get()['text']
df


--- Example 12: Lowercase on DataFrame ---


Unnamed: 0,text,id,text_lowercase
0,I can't believe it's working!,1,i can't believe it's working!
1,Visit https://example.com for more INFO.,2,visit https://example.com for more info.
2,Extra spaces everywhere,3,extra spaces everywhere
3,Remove <b>HTML</b> tags and email@test.com,4,remove <b>html</b> tags and email@test.com
4,DON'T SHOUT AT ME!!!,5,don't shout at me!!!


In [9]:
# Example 13: Multiple operations on DataFrame
print("\n--- Example 13: Multiple Operations on DataFrame ---")
df['text_cleaned'] = (TextCleaning(df, text_column='text')
                      .lowercase()
                      .expand_contractions()
                      .remove_urls()
                      .remove_emails()
                      .remove_html_tags()
                      .remove_punctuation()
                      .remove_whitespace(mode="extra")
                      .get()['text'])
df


--- Example 13: Multiple Operations on DataFrame ---


Unnamed: 0,text,id,text_lowercase,text_cleaned
0,I can't believe it's working!,1,i can't believe it's working!,i cannot believe it is working
1,Visit https://example.com for more INFO.,2,visit https://example.com for more info.,visit for more info
2,Extra spaces everywhere,3,extra spaces everywhere,extra spaces everywhere
3,Remove <b>HTML</b> tags and email@test.com,4,remove <b>html</b> tags and email@test.com,remove html tags and
4,DON'T SHOUT AT ME!!!,5,don't shout at me!!!,do not shout at me


In [None]:
# Example 14: Custom function processing
print("\n--- Example 14: Custom Function Processing ---")
def custom_cleanup(text):
    return re.sub(r'\d+', '[NUM]', text)

df_test = pd.DataFrame({
    'text': ["I have 5 apples and 10 oranges", "Call me at 123-456-7890"]
})
df['text_function'] = TextCleaning(df_test, text_column='text').process_text(custom_cleanup).get()
df['text_function']


--- Example 14: Custom Function Processing ---


0    I have [NUM] apples and [NUM] oranges
1             Call me at [NUM]-[NUM]-[NUM]
2                                      NaN
3                                      NaN
4                                      NaN
Name: text_function, dtype: object

## Part 3: NLTK-Based Features

In [11]:
try:
    import nltk
    
    print("\nNOTE: These examples require NLTK data to be downloaded.")
    print("If you get errors, run: download_nltk_data()")
    print("Or manually: import nltk; nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet')")
    
    # Example 15: Word tokenization with NLTK
    print("\n--- Example 15: Word Tokenization (NLTK) ---")
    text = "Hello! How are you doing today? I'm doing great."
    try:
        result = TextCleaning(text).tokenize(method="word")
        print(f"Result: {result}")
    except LookupError as e:
        print(f"Error: {e}")
    
    # Example 16: Sentence tokenization
    print("\n--- Example 16: Sentence Tokenization ---")
    text = "Hello! How are you? I'm doing great. What about you?"
    try:
        result = TextCleaning(text).tokenize(method="sentence")
        print(f"Result: {result}")
    except LookupError as e:
        print(f"Error: {e}")
    
    # Example 17: Remove stopwords
    print("\n--- Example 17: Remove Stopwords (English) ---")
    text = "This is a sample sentence with some common words"
    try:
        result = TextCleaning(text).remove_stopwords(language="english").get()
        print(f"Result: {result}")
    except LookupError as e:
        print(f"Error: {e}")
    
    # Example 18: Porter Stemming
    print("\n--- Example 18: Porter Stemming ---")
    text = "running runs runner ran easily fairly"
    result = TextCleaning(text).stem(method="porter").get()
    print(f"Result: {result}")
    
    # Example 19: Snowball Stemming (supports multiple languages)
    print("\n--- Example 19: Snowball Stemming ---")
    text = "running runs runner ran easily fairly"
    result = TextCleaning(text).stem(method="snowball", language="english").get()
    print(f"Result: {result}")
    
    # Example 20: Lemmatization
    print("\n--- Example 20: Lemmatization (Noun) ---")
    text = "running runs runner ran easily fairly"
    try:
        result = TextCleaning(text).lemmatize(pos="n").get()
        print(f"Result: {result}")
    except LookupError as e:
        print(f"Error: {e}")
    
    # Example 21: Lemmatization (Verb)
    print("\n--- Example 21: Lemmatization (Verb) ---")
    text = "running runs runner ran"
    try:
        result = TextCleaning(text).lemmatize(pos="v").get()
        print(f"Result: {result}")
    except LookupError as e:
        print(f"Error: {e}")
    
    # Example 22: Complete pipeline with NLTK features
    print("\n--- Example 22: Complete NLP Pipeline ---")
    df_nlp = pd.DataFrame({
        'text': [
            "I'm running quickly through the forest!",
            "The cats were playing with their toys.",
            "She doesn't like swimming in cold water."
        ]
    })
    print("\nOriginal DataFrame:")
    print(df_nlp)
    
    try:
        df_nlp_cleaned = (TextCleaning(df_nlp, text_column='text')
                         .lowercase()
                         .expand_contractions()
                         .remove_punctuation()
                         .remove_stopwords(language="english")
                         .stem(method="porter")
                         .get())
        print("\nCleaned DataFrame (with stemming):")
        print(df_nlp_cleaned)
    except LookupError as e:
        print(f"Error: {e}")
        print("Please download NLTK data using: download_nltk_data()")

except ImportError:
    print("\nNLTK is not installed. Install it with: pip install nltk")
    print("Then run: download_nltk_data() to download required data")


NOTE: These examples require NLTK data to be downloaded.
If you get errors, run: download_nltk_data()
Or manually: import nltk; nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet')

--- Example 15: Word Tokenization (NLTK) ---
Result: ['Hello', '!', 'How', 'are', 'you', 'doing', 'today', '?', 'I', "'m", 'doing', 'great', '.']

--- Example 16: Sentence Tokenization ---
Result: ['Hello!', 'How are you?', "I'm doing great.", 'What about you?']

--- Example 17: Remove Stopwords (English) ---
Result: sample sentence common words

--- Example 18: Porter Stemming ---
Result: run run runner ran easili fairli

--- Example 19: Snowball Stemming ---
Result: run run runner ran easili fair

--- Example 20: Lemmatization (Noun) ---
Result: running run runner ran easily fairly

--- Example 21: Lemmatization (Verb) ---
Result: run run runner run

--- Example 22: Complete NLP Pipeline ---

Original DataFrame:
                                       text
0   I'm running quickly t

## Part 4: Contractions Dictionary

In [12]:
contractions = get_contractions_dict()
print(f"\nTotal contractions available: {len(contractions)}")
print("\nSample contractions:")
sample_keys = list(contractions.keys())[:10]
for key in sample_keys:
    print(f"  {key:15} -> {contractions[key]}")

# Example 23: Using custom contractions
print("\n--- Example 23: Custom Contractions ---")
custom_contractions = {
    "gonna": "going to",
    "wanna": "want to",
    "gotta": "got to"
}
text = "I'm gonna wanna do this"
result = TextCleaning(text).expand_contractions(custom_contractions=custom_contractions).get()
print(f"Original: {text}")
print(f"Result: {result}")


Total contractions available: 117

Sample contractions:
  ain't           -> am not
  aren't          -> are not
  can't           -> cannot
  can't've        -> cannot have
  'cause          -> because
  could've        -> could have
  couldn't        -> could not
  couldn't've     -> could not have
  didn't          -> did not
  doesn't         -> does not

--- Example 23: Custom Contractions ---
Original: I'm gonna wanna do this
Result: I am going to want to do this


## Part 5 - Operation Logging

In [13]:
# Example 24: View operation log
print("\n--- Example 24: Operation Log ---")
cleaner = (TextCleaning("Hello World!")
           .lowercase()
           .remove_punctuation()
           .remove_whitespace(mode="extra"))
result = cleaner.get()
log = cleaner.get_log()

print(f"Result: {result}")
print("\nOperations performed:")
for i, op in enumerate(log, 1):
    print(f"  {i}. {op['operation']}: {op['details']}")


--- Example 24: Operation Log ---
Result: hello world

Operations performed:
  1. lowercase: Converted text to lowercase
  2. remove_punctuation: Removed punctuation (kept: 'none')
  3. remove_whitespace: Removed whitespace using mode 'extra'
