In [1]:
from nltk.tokenize import RegexpTokenizer
import pandas as pd

def clean_text_with_regexp(text):
    """Reduce *text* to its ASCII alphanumeric words.

    The text is split into maximal runs matching ``[a-zA-Z0-9]+``; every
    other character (punctuation, whitespace, underscores, ...) acts as a
    separator and is discarded.

    Args:
        text: The string to clean.

    Returns:
        A ``(cleaned_text, analysis)`` tuple. ``cleaned_text`` is the
        tokens joined by single spaces. ``analysis`` is a dict with the
        keys ``original_text``, ``cleaned_text``, ``token_count`` and
        ``tokens``.
    """
    words = RegexpTokenizer(r'[a-zA-Z0-9]+').tokenize(text)
    joined = ' '.join(words)

    # Keyword form yields the same keys, in the same order, as a literal.
    report = dict(
        original_text=text,
        cleaned_text=joined,
        token_count=len(words),
        tokens=words,
    )

    return joined, report

# Demo inputs: a short sentence salted with punctuation and symbols.
test_cases = [
    "Remove, this from .? ()the sentence !!!! !\"#&'()*+,-./:;<=>_",
]

print("Text Processing Results:\n")
for i, test in enumerate(test_cases, start=1):
    cleaned, analysis = clean_text_with_regexp(test)

    # One print call per case: sep='\n' puts each field on its own line,
    # and the trailing '\n' on the rule leaves a blank line between cases.
    print(
        f"Test Case {i}:",
        f"Original: {test}",
        f"Cleaned:  {cleaned}",
        f"Tokens:   {analysis['tokens']}",
        f"Token count: {analysis['token_count']}",
        '-' * 70 + '\n',
        sep='\n',
    )

# Build the summary table, one row per input text.
# Each text is processed exactly once; the cleaned string and the token
# count both come from that single result rather than from two separate
# calls to clean_text_with_regexp for the same text.
results = [clean_text_with_regexp(text) for text in test_cases]

comparison = pd.DataFrame([
    {
        'Original Text': text,
        'Cleaned Text': cleaned_text,
        # The function already reports the count; no need to re-measure.
        'Token Count': details['token_count'],
    }
    for text, (cleaned_text, details) in zip(test_cases, results)
])

print("Comparison Summary:")
print(comparison.to_string(index=False))

Text Processing Results:

Test Case 1:
Original: Remove, this from .? ()the sentence !!!! !"#&'()*+,-./:;<=>_
Cleaned:  Remove this from the sentence
Tokens:   ['Remove', 'this', 'from', 'the', 'sentence']
Token count: 5
----------------------------------------------------------------------

Comparison Summary:
                                               Original Text                  Cleaned Text  Token Count
Remove, this from .? ()the sentence !!!! !"#&'()*+,-./:;<=>_ Remove this from the sentence            5
