In [13]:
from nltk.tokenize import RegexpTokenizer
import pandas as pd

def analyze_text(text):
    """Tokenize ``text`` into words and sentences and report their counts.

    Words are runs of word characters; each run of non-space punctuation or
    symbol characters is kept as a separate token. Sentences are split with
    a plain regular expression on ``.``, ``!`` and ``?``, so abbreviations
    and decimals (``Dr.``, ``3.14``) are still split as if they ended a
    sentence.

    Args:
        text: The string to analyze.

    Returns:
        A dict with the keys ``original_text``, ``word_tokens``,
        ``sentence_tokens``, ``word_count`` and ``sentence_count``.
        Sentence tokens carry no leading or trailing whitespace.
    """
    word_tokenizer = RegexpTokenizer(r'\w+|[^\w\s]+')
    word_tokens = word_tokenizer.tokenize(text)

    # The terminator is optional and may repeat: a final sentence without
    # closing punctuation is kept instead of being dropped, and runs such as
    # "..." or "?!" stay attached to their sentence.
    sent_tokenizer = RegexpTokenizer(r'[^.!?]+[.!?]*')
    # Strip the whitespace that separates sentences and discard fragments
    # that were only whitespace (e.g. a newline after the last terminator).
    sent_tokens = [
        sentence
        for sentence in (raw.strip() for raw in sent_tokenizer.tokenize(text))
        if sentence
    ]

    analysis = {
        'original_text': text,
        'word_tokens': word_tokens,
        'sentence_tokens': sent_tokens,
        'word_count': len(word_tokens),
        'sentence_count': len(sent_tokens),
    }

    return analysis

# Sample passages for the tokenizer demo; every entry in this list is
# analyzed and printed by the loop below.
test_cases = [
    """Bitcoin is a cryptocurrency invented in 2008 by an unknown person or group of people using the name Satoshi Nakamoto. The currency began use in 2009 when its implementation was released as open-source software."""
]

print("Text Processing Results:\n")
for i, test in enumerate(test_cases, start=1):
    analysis = analyze_text(test)
    sentences = analysis['sentence_tokens']
    words = analysis['word_tokens']

    # Numbered header, the two token lists, then a 70-dash divider.
    print(f"Test Case {i}:")
    print(f"Sentences: {sentences}")
    print(f"Words: {words}")
    print('-' * 70 + '\n')

# One row per test case with its word and sentence counts. Each text is
# analyzed exactly once and both columns are read from that single result,
# rather than re-tokenizing the text for every column.
comparison = pd.DataFrame(
    [
        {
            'Word Count': result['word_count'],
            'Sentence Count': result['sentence_count'],
        }
        for result in map(analyze_text, test_cases)
    ]
)


Text Processing Results:

Test Case 1:
Sentences: ['Bitcoin is a cryptocurrency invented in 2008 by an unknown person or group of people using the name Satoshi Nakamoto.', ' The currency began use in 2009 when its implementation was released as open-source software.']
Words: ['Bitcoin', 'is', 'a', 'cryptocurrency', 'invented', 'in', '2008', 'by', 'an', 'unknown', 'person', 'or', 'group', 'of', 'people', 'using', 'the', 'name', 'Satoshi', 'Nakamoto', '.', 'The', 'currency', 'began', 'use', 'in', '2009', 'when', 'its', 'implementation', 'was', 'released', 'as', 'open', '-', 'source', 'software', '.']
----------------------------------------------------------------------

