## NarrativeNexus Project: Text Cleaning Implementation

**Objectives:**
- Remove special characters, punctuation, and stop words
- Apply preprocessing to BBC, CNN/DailyMail, and IMDB datasets
- Save cleaned datasets

In [9]:
# Import required libraries
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Initialize tools
stop_words = set(stopwords.words('english'))
print(f"‚úÖ Setup complete. Loaded {len(stop_words)} stop words.")

‚úÖ Setup complete. Loaded 198 stop words.


In [10]:
# Define text cleaning functions
def clean_special_characters(text):
    """Remove special characters, keep only letters, numbers, and spaces"""
    if pd.isna(text):
        return ""
    text = str(text)
    # Remove special characters
    cleaned = re.sub(r'[^\w\s]', ' ', text)
    # Remove extra whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()

def remove_stop_words(text, keep_negations=True):
    """Remove stop words while preserving negations"""
    if pd.isna(text):
        return ""
    text = str(text)
    
    # Keep important negation words
    stop_words_filtered = stop_words.copy()
    if keep_negations:
        important_words = {'not', 'no', 'never', 'none', 'neither', 'nobody', 'nothing'}
        stop_words_filtered = stop_words_filtered - important_words
    
    # Tokenize and filter
    tokens = word_tokenize(text.lower())
    filtered_tokens = [word for word in tokens if word not in stop_words_filtered]
    return ' '.join(filtered_tokens)

def clean_text_pipeline(text):
    """Complete text cleaning pipeline"""
    if pd.isna(text):
        return ""
    
    # Step 1: Convert to lowercase
    cleaned = str(text).lower()
    
    # Step 2: Remove special characters
    cleaned = clean_special_characters(cleaned)
    
    # Step 3: Remove stop words
    cleaned = remove_stop_words(cleaned, keep_negations=True)
    
    return cleaned

print("‚úÖ Text cleaning functions defined.")

‚úÖ Text cleaning functions defined.


In [11]:
# Load datasets
data_dir = "../data"
datasets = {}

# Load BBC News Dataset
try:
    bbc_df = pd.read_csv(f"{data_dir}/bbc-text.csv")
    datasets['BBC'] = bbc_df
    print(f"‚úÖ BBC Dataset: {len(bbc_df)} articles loaded")
except Exception as e:
    print(f"‚ùå Error loading BBC dataset: {e}")

# Load CNN/DailyMail Dataset
try:
    cnn_df = pd.read_csv(f"{data_dir}/cnn_dailymail.csv")
    datasets['CNN'] = cnn_df
    print(f"‚úÖ CNN Dataset: {len(cnn_df)} articles loaded")
except Exception as e:
    print(f"‚ùå Error loading CNN dataset: {e}")

# Load IMDB Dataset (subset for demo)
try:
    imdb_df = pd.read_csv(f"{data_dir}/imdb-dataset.csv", nrows=1000)
    datasets['IMDB'] = imdb_df
    print(f"‚úÖ IMDB Dataset: {len(imdb_df)} reviews loaded")
except Exception as e:
    print(f"‚ùå Error loading IMDB dataset: {e}")

print(f"\nüìä Total datasets loaded: {len(datasets)}")

‚úÖ BBC Dataset: 2225 articles loaded
‚úÖ CNN Dataset: 5000 articles loaded
‚úÖ IMDB Dataset: 1000 reviews loaded

üìä Total datasets loaded: 3


In [12]:
# Clean BBC News Dataset
if 'BBC' in datasets:
    print("üßπ Cleaning BBC News Dataset...")
    bbc_df = datasets['BBC'].copy()
    
    # Apply cleaning
    tqdm.pandas(desc="Processing BBC")
    bbc_df['text_cleaned'] = bbc_df['text'].progress_apply(clean_text_pipeline)
    
    # Calculate metrics
    original_avg = bbc_df['text'].str.len().mean()
    cleaned_avg = bbc_df['text_cleaned'].str.len().mean()
    reduction = ((original_avg - cleaned_avg) / original_avg * 100)
    
    print(f"   ‚Ä¢ Original avg length: {original_avg:.0f} characters")
    print(f"   ‚Ä¢ Cleaned avg length: {cleaned_avg:.0f} characters")
    print(f"   ‚Ä¢ Reduction: {reduction:.1f}%")
    
    datasets['BBC_cleaned'] = bbc_df
    print("‚úÖ BBC cleaning completed")
else:
    print("‚ùå BBC dataset not available")

üßπ Cleaning BBC News Dataset...


Processing BBC: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2225/2225 [00:02<00:00, 1064.84it/s]

   ‚Ä¢ Original avg length: 2263 characters
   ‚Ä¢ Cleaned avg length: 1584 characters
   ‚Ä¢ Reduction: 30.0%
‚úÖ BBC cleaning completed





In [13]:
# Clean CNN/DailyMail Dataset
if 'CNN' in datasets:
    print("üßπ Cleaning CNN/DailyMail Dataset...")
    cnn_df = datasets['CNN'].copy()
    
    # Identify text column
    text_column = 'article' if 'article' in cnn_df.columns else 'text'
    
    # Apply cleaning
    tqdm.pandas(desc="Processing CNN")
    cnn_df['text_cleaned'] = cnn_df[text_column].progress_apply(clean_text_pipeline)
    
    # Calculate metrics
    original_avg = cnn_df[text_column].str.len().mean()
    cleaned_avg = cnn_df['text_cleaned'].str.len().mean()
    reduction = ((original_avg - cleaned_avg) / original_avg * 100)
    
    print(f"   ‚Ä¢ Original avg length: {original_avg:.0f} characters")
    print(f"   ‚Ä¢ Cleaned avg length: {cleaned_avg:.0f} characters")
    print(f"   ‚Ä¢ Reduction: {reduction:.1f}%")
    
    datasets['CNN_cleaned'] = cnn_df
    print("‚úÖ CNN cleaning completed")
else:
    print("‚ùå CNN dataset not available")

üßπ Cleaning CNN/DailyMail Dataset...


Processing CNN: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5000/5000 [00:04<00:00, 1006.23it/s]

   ‚Ä¢ Original avg length: 2518 characters
   ‚Ä¢ Cleaned avg length: 2518 characters
   ‚Ä¢ Reduction: 0.0%
‚úÖ CNN cleaning completed





In [14]:
# Clean IMDB Dataset
if 'IMDB' in datasets:
    print("üßπ Cleaning IMDB Reviews Dataset...")
    imdb_df = datasets['IMDB'].copy()
    
    # Apply cleaning
    tqdm.pandas(desc="Processing IMDB")
    imdb_df['review_cleaned'] = imdb_df['review'].progress_apply(clean_text_pipeline)
    
    # Calculate metrics
    original_avg = imdb_df['review'].str.len().mean()
    cleaned_avg = imdb_df['review_cleaned'].str.len().mean()
    reduction = ((original_avg - cleaned_avg) / original_avg * 100)
    
    print(f"   ‚Ä¢ Original avg length: {original_avg:.0f} characters")
    print(f"   ‚Ä¢ Cleaned avg length: {cleaned_avg:.0f} characters")
    print(f"   ‚Ä¢ Reduction: {reduction:.1f}%")
    
    datasets['IMDB_cleaned'] = imdb_df
    print("‚úÖ IMDB cleaning completed")
else:
    print("‚ùå IMDB dataset not available")

üßπ Cleaning IMDB Reviews Dataset...


Processing IMDB: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [00:00<00:00, 1660.28it/s]


   ‚Ä¢ Original avg length: 1311 characters
   ‚Ä¢ Cleaned avg length: 839 characters
   ‚Ä¢ Reduction: 36.0%
‚úÖ IMDB cleaning completed


In [15]:
# Save cleaned datasets
import os
import json
import time

# Create cleaned data directory
cleaned_dir = "../data/cleaned"
os.makedirs(cleaned_dir, exist_ok=True)

saved_files = []

# Save BBC cleaned dataset
if 'BBC_cleaned' in datasets:
    filepath = os.path.join(cleaned_dir, "bbc_news_cleaned.csv")
    datasets['BBC_cleaned'].to_csv(filepath, index=False)
    saved_files.append(f"BBC: {filepath}")
    print(f"‚úÖ BBC dataset saved: {len(datasets['BBC_cleaned'])} articles")

# Save CNN cleaned dataset
if 'CNN_cleaned' in datasets:
    filepath = os.path.join(cleaned_dir, "cnn_dailymail_cleaned.csv")
    datasets['CNN_cleaned'].to_csv(filepath, index=False)
    saved_files.append(f"CNN: {filepath}")
    print(f"‚úÖ CNN dataset saved: {len(datasets['CNN_cleaned'])} articles")

# Save IMDB cleaned dataset
if 'IMDB_cleaned' in datasets:
    filepath = os.path.join(cleaned_dir, "imdb_reviews_cleaned.csv")
    datasets['IMDB_cleaned'].to_csv(filepath, index=False)
    saved_files.append(f"IMDB: {filepath}")
    print(f"‚úÖ IMDB dataset saved: {len(datasets['IMDB_cleaned'])} reviews")

# Save metadata
metadata = {
    'cleaning_date': time.strftime('%Y-%m-%d %H:%M:%S'),
    'day': 'Day 8-9',
    'objective': 'Text cleaning: remove special characters, punctuation, stop words',
    'files_created': saved_files,
    'next_step': 'Week 3: Topic modeling with LDA/NMF'
}

metadata_path = os.path.join(cleaned_dir, "preprocessing_metadata.json")
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)

‚úÖ BBC dataset saved: 2225 articles
‚úÖ CNN dataset saved: 5000 articles
‚úÖ IMDB dataset saved: 1000 reviews


## Day 10-11: Text Normalization with Stemming and Lemmatization

**Objectives:**
- Implement stemming using Porter Stemmer
- Implement lemmatization using WordNet Lemmatizer
- Compare performance between stemming and lemmatization
- Apply normalization to all cleaned datasets
- Analyze the impact of normalization on text data

In [16]:
# Import additional libraries for stemming and lemmatization
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
import time

# Download additional NLTK data
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('maxent_ne_chunker', quiet=True)
nltk.download('words', quiet=True)

# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

print("‚úÖ Stemming and Lemmatization tools initialized")
print(f"üìù Porter Stemmer ready")
print(f"üìù WordNet Lemmatizer ready")

‚úÖ Stemming and Lemmatization tools initialized
üìù Porter Stemmer ready
üìù WordNet Lemmatizer ready


In [17]:
# Define normalization functions
def get_wordnet_pos(treebank_tag):
    """Convert TreeBank POS tags to WordNet POS tags for better lemmatization"""
    if treebank_tag.startswith('J'):
        return 'a'  # adjective
    elif treebank_tag.startswith('V'):
        return 'v'  # verb
    elif treebank_tag.startswith('N'):
        return 'n'  # noun
    elif treebank_tag.startswith('R'):
        return 'r'  # adverb
    else:
        return 'n'  # default to noun

def stem_text(text):
    """Apply Porter Stemming to text"""
    if pd.isna(text) or text == "":
        return ""
    
    text = str(text)
    tokens = word_tokenize(text.lower())
    stemmed_tokens = [stemmer.stem(token) for token in tokens if token.isalpha()]
    return ' '.join(stemmed_tokens)

def lemmatize_text(text):
    """Apply Lemmatization with POS tagging to text"""
    if pd.isna(text) or text == "":
        return ""
    
    text = str(text)
    tokens = word_tokenize(text.lower())
    
    # Filter only alphabetic tokens
    alpha_tokens = [token for token in tokens if token.isalpha()]
    
    # Get POS tags
    pos_tags = pos_tag(alpha_tokens)
    
    # Lemmatize with appropriate POS tags
    lemmatized_tokens = [
        lemmatizer.lemmatize(token, get_wordnet_pos(pos)) 
        for token, pos in pos_tags
    ]
    
    return ' '.join(lemmatized_tokens)

def normalize_text_pipeline(text, method='lemma'):
    """Complete text normalization pipeline with stemming or lemmatization"""
    if pd.isna(text) or text == "":
        return ""
    
    if method == 'stem':
        return stem_text(text)
    elif method == 'lemma':
        return lemmatize_text(text)
    else:
        raise ValueError("Method must be 'stem' or 'lemma'")

print("‚úÖ Text normalization functions defined")
print("üìã Available methods: 'stem' (Porter Stemmer), 'lemma' (WordNet Lemmatizer)")

‚úÖ Text normalization functions defined
üìã Available methods: 'stem' (Porter Stemmer), 'lemma' (WordNet Lemmatizer)


In [18]:
# Demonstrate stemming vs lemmatization
sample_text = "The children were running and jumping happily in the beautiful gardens while their parents were watching"

print("üîç Stemming vs Lemmatization Comparison:")
print(f"Original text: {sample_text}")
print()

# Apply stemming
stemmed = stem_text(sample_text)
print(f"Stemmed text: {stemmed}")

# Apply lemmatization
lemmatized = lemmatize_text(sample_text)
print(f"Lemmatized text: {lemmatized}")

print()
print("üìä Key Differences:")
print("‚Ä¢ Stemming: Faster, rule-based, may create non-words")
print("‚Ä¢ Lemmatization: Slower, dictionary-based, preserves valid words")
print("‚Ä¢ Lemmatization with POS: Most accurate, context-aware")

üîç Stemming vs Lemmatization Comparison:
Original text: The children were running and jumping happily in the beautiful gardens while their parents were watching

Stemmed text: the children were run and jump happili in the beauti garden while their parent were watch
Lemmatized text: the child be run and jump happily in the beautiful garden while their parent be watch

üìä Key Differences:
‚Ä¢ Stemming: Faster, rule-based, may create non-words
‚Ä¢ Lemmatization: Slower, dictionary-based, preserves valid words
‚Ä¢ Lemmatization with POS: Most accurate, context-aware


In [19]:
# Apply normalization to BBC News Dataset
if 'BBC_cleaned' in datasets:
    print("üîÑ Normalizing BBC News Dataset...")
    bbc_df = datasets['BBC_cleaned'].copy()
    
    # Apply both stemming and lemmatization
    print("   üìù Applying stemming...")
    tqdm.pandas(desc="BBC Stemming")
    bbc_df['text_stemmed'] = bbc_df['text_cleaned'].progress_apply(
        lambda x: normalize_text_pipeline(x, method='stem')
    )
    
    print("   üìù Applying lemmatization...")
    tqdm.pandas(desc="BBC Lemmatization")
    bbc_df['text_lemmatized'] = bbc_df['text_cleaned'].progress_apply(
        lambda x: normalize_text_pipeline(x, method='lemma')
    )
    
    # Calculate metrics
    cleaned_avg = bbc_df['text_cleaned'].str.len().mean()
    stemmed_avg = bbc_df['text_stemmed'].str.len().mean()
    lemmatized_avg = bbc_df['text_lemmatized'].str.len().mean()
    
    stem_reduction = ((cleaned_avg - stemmed_avg) / cleaned_avg * 100)
    lemma_reduction = ((cleaned_avg - lemmatized_avg) / cleaned_avg * 100)
    
    print(f"   ‚Ä¢ Cleaned avg length: {cleaned_avg:.0f} characters")
    print(f"   ‚Ä¢ Stemmed avg length: {stemmed_avg:.0f} characters (‚Üì{stem_reduction:.1f}%)")
    print(f"   ‚Ä¢ Lemmatized avg length: {lemmatized_avg:.0f} characters (‚Üì{lemma_reduction:.1f}%)")
    
    datasets['BBC_normalized'] = bbc_df
    print("‚úÖ BBC normalization completed")
else:
    print("‚ùå BBC cleaned dataset not available")

üîÑ Normalizing BBC News Dataset...
   üìù Applying stemming...


BBC Stemming: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2225/2225 [00:07<00:00, 278.82it/s]


   üìù Applying lemmatization...


BBC Lemmatization: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2225/2225 [00:21<00:00, 101.23it/s]

   ‚Ä¢ Cleaned avg length: 1584 characters
   ‚Ä¢ Stemmed avg length: 1368 characters (‚Üì13.6%)
   ‚Ä¢ Lemmatized avg length: 1471 characters (‚Üì7.1%)
‚úÖ BBC normalization completed





In [20]:
# Apply normalization to CNN/DailyMail Dataset
if 'CNN_cleaned' in datasets:
    print("üîÑ Normalizing CNN/DailyMail Dataset...")
    cnn_df = datasets['CNN_cleaned'].copy()
    
    # Apply both stemming and lemmatization
    print("   üìù Applying stemming...")
    tqdm.pandas(desc="CNN Stemming")
    cnn_df['text_stemmed'] = cnn_df['text_cleaned'].progress_apply(
        lambda x: normalize_text_pipeline(x, method='stem')
    )
    
    print("   üìù Applying lemmatization...")
    tqdm.pandas(desc="CNN Lemmatization")
    cnn_df['text_lemmatized'] = cnn_df['text_cleaned'].progress_apply(
        lambda x: normalize_text_pipeline(x, method='lemma')
    )
    
    # Calculate metrics
    cleaned_avg = cnn_df['text_cleaned'].str.len().mean()
    stemmed_avg = cnn_df['text_stemmed'].str.len().mean()
    lemmatized_avg = cnn_df['text_lemmatized'].str.len().mean()
    
    stem_reduction = ((cleaned_avg - stemmed_avg) / cleaned_avg * 100)
    lemma_reduction = ((cleaned_avg - lemmatized_avg) / cleaned_avg * 100)
    
    print(f"   ‚Ä¢ Cleaned avg length: {cleaned_avg:.0f} characters")
    print(f"   ‚Ä¢ Stemmed avg length: {stemmed_avg:.0f} characters (‚Üì{stem_reduction:.1f}%)")
    print(f"   ‚Ä¢ Lemmatized avg length: {lemmatized_avg:.0f} characters (‚Üì{lemma_reduction:.1f}%)")
    
    datasets['CNN_normalized'] = cnn_df
    print("‚úÖ CNN normalization completed")
else:
    print("‚ùå CNN cleaned dataset not available")

üîÑ Normalizing CNN/DailyMail Dataset...
   üìù Applying stemming...


CNN Stemming: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5000/5000 [00:28<00:00, 172.69it/s]


   üìù Applying lemmatization...


CNN Lemmatization: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5000/5000 [01:12<00:00, 69.02it/s]

   ‚Ä¢ Cleaned avg length: 2518 characters
   ‚Ä¢ Stemmed avg length: 2210 characters (‚Üì12.2%)
   ‚Ä¢ Lemmatized avg length: 2387 characters (‚Üì5.2%)
‚úÖ CNN normalization completed





In [21]:
# Apply normalization to IMDB Reviews Dataset
if 'IMDB_cleaned' in datasets:
    print("üîÑ Normalizing IMDB Reviews Dataset...")
    imdb_df = datasets['IMDB_cleaned'].copy()
    
    # Apply both stemming and lemmatization
    print("   üìù Applying stemming...")
    tqdm.pandas(desc="IMDB Stemming")
    imdb_df['review_stemmed'] = imdb_df['review_cleaned'].progress_apply(
        lambda x: normalize_text_pipeline(x, method='stem')
    )
    
    print("   üìù Applying lemmatization...")
    tqdm.pandas(desc="IMDB Lemmatization")
    imdb_df['review_lemmatized'] = imdb_df['review_cleaned'].progress_apply(
        lambda x: normalize_text_pipeline(x, method='lemma')
    )
    
    # Calculate metrics
    cleaned_avg = imdb_df['review_cleaned'].str.len().mean()
    stemmed_avg = imdb_df['review_stemmed'].str.len().mean()
    lemmatized_avg = imdb_df['review_lemmatized'].str.len().mean()
    
    
    stem_reduction = ((cleaned_avg - stemmed_avg) / cleaned_avg * 100)
    lemma_reduction = ((cleaned_avg - lemmatized_avg) / cleaned_avg * 100)
    
    print(f"   ‚Ä¢ Cleaned avg length: {cleaned_avg:.0f} characters")
    print(f"   ‚Ä¢ Stemmed avg length: {stemmed_avg:.0f} characters (‚Üì{stem_reduction:.1f}%)")
    print(f"   ‚Ä¢ Lemmatized avg length: {lemmatized_avg:.0f} characters (‚Üì{lemma_reduction:.1f}%)")
    
    datasets['IMDB_normalized'] = imdb_df
    print("‚úÖ IMDB normalization completed")
else:
    print("‚ùå IMDB cleaned dataset not available")

üîÑ Normalizing IMDB Reviews Dataset...
   üìù Applying stemming...


IMDB Stemming: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [00:01<00:00, 520.21it/s]


   üìù Applying lemmatization...


IMDB Lemmatization: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [00:05<00:00, 187.51it/s]

   ‚Ä¢ Cleaned avg length: 839 characters
   ‚Ä¢ Stemmed avg length: 740 characters (‚Üì11.7%)
   ‚Ä¢ Lemmatized avg length: 796 characters (‚Üì5.1%)
‚úÖ IMDB normalization completed





In [22]:
# Comparative Analysis of Normalization Results
print("üìä Normalization Performance Summary")
print("=" * 60)

analysis_results = []

# Analyze each dataset
for dataset_name in ['BBC', 'CNN', 'IMDB']:
    normalized_key = f"{dataset_name}_normalized"
    
    if normalized_key in datasets:
        df = datasets[normalized_key]
        
        if dataset_name == 'IMDB':
            text_col = 'review_cleaned'
            stem_col = 'review_stemmed'
            lemma_col = 'review_lemmatized'
        else:
            text_col = 'text_cleaned'
            stem_col = 'text_stemmed'
            lemma_col = 'text_lemmatized'
        
        # Calculate vocabulary reduction
        def get_vocab_size(series):
            all_words = set()
            for text in series:
                if pd.notna(text) and text != "":
                    all_words.update(text.split())
            return len(all_words)
        
        # Get sample for vocabulary analysis (first 100 entries for performance)
        sample_df = df.head(100)
        
        original_vocab = get_vocab_size(sample_df[text_col])
        stemmed_vocab = get_vocab_size(sample_df[stem_col])
        lemmatized_vocab = get_vocab_size(sample_df[lemma_col])
        
        stem_vocab_reduction = ((original_vocab - stemmed_vocab) / original_vocab * 100)
        lemma_vocab_reduction = ((original_vocab - lemmatized_vocab) / original_vocab * 100)
        
        # Average lengths
        clean_len = df[text_col].str.len().mean()
        stem_len = df[stem_col].str.len().mean()
        lemma_len = df[lemma_col].str.len().mean()
        
        result = {
            'dataset': dataset_name,
            'original_vocab': original_vocab,
            'stemmed_vocab': stemmed_vocab,
            'lemmatized_vocab': lemmatized_vocab,
            'stem_vocab_reduction': stem_vocab_reduction,
            'lemma_vocab_reduction': lemma_vocab_reduction,
            'clean_avg_length': clean_len,
            'stem_avg_length': stem_len,
            'lemma_avg_length': lemma_len
        }
        
        analysis_results.append(result)
        
        print(f"\nüìã {dataset_name} Dataset Analysis:")
        print(f"   Vocabulary Size (sample):")
        print(f"     ‚Ä¢ Original: {original_vocab:,} unique words")
        print(f"     ‚Ä¢ Stemmed: {stemmed_vocab:,} unique words (‚Üì{stem_vocab_reduction:.1f}%)")
        print(f"     ‚Ä¢ Lemmatized: {lemmatized_vocab:,} unique words (‚Üì{lemma_vocab_reduction:.1f}%)")
        print(f"   Average Text Length:")
        print(f"     ‚Ä¢ Cleaned: {clean_len:.0f} characters")
        print(f"     ‚Ä¢ Stemmed: {stem_len:.0f} characters")
        print(f"     ‚Ä¢ Lemmatized: {lemma_len:.0f} characters")

print(f"\n‚úÖ Normalization analysis completed for {len(analysis_results)} datasets")

üìä Normalization Performance Summary

üìã BBC Dataset Analysis:
   Vocabulary Size (sample):
     ‚Ä¢ Original: 5,947 unique words
     ‚Ä¢ Stemmed: 4,235 unique words (‚Üì28.8%)
     ‚Ä¢ Lemmatized: 4,669 unique words (‚Üì21.5%)
   Average Text Length:
     ‚Ä¢ Cleaned: 1584 characters
     ‚Ä¢ Stemmed: 1368 characters
     ‚Ä¢ Lemmatized: 1471 characters

üìã CNN Dataset Analysis:
   Vocabulary Size (sample):
     ‚Ä¢ Original: 8,999 unique words
     ‚Ä¢ Stemmed: 6,506 unique words (‚Üì27.7%)
     ‚Ä¢ Lemmatized: 7,408 unique words (‚Üì17.7%)
   Average Text Length:
     ‚Ä¢ Cleaned: 2518 characters
     ‚Ä¢ Stemmed: 2210 characters
     ‚Ä¢ Lemmatized: 2387 characters

üìã IMDB Dataset Analysis:
   Vocabulary Size (sample):
     ‚Ä¢ Original: 4,648 unique words
     ‚Ä¢ Stemmed: 3,641 unique words (‚Üì21.7%)
     ‚Ä¢ Lemmatized: 3,914 unique words (‚Üì15.8%)
   Average Text Length:
     ‚Ä¢ Cleaned: 839 characters
     ‚Ä¢ Stemmed: 740 characters
     ‚Ä¢ Lemmatized: 796 chara

In [23]:
# Save normalized datasets
print("üíæ Saving normalized datasets...")

# Create normalized data directory
normalized_dir = "../data/normalized"
os.makedirs(normalized_dir, exist_ok=True)

saved_normalized_files = []

# Save BBC normalized dataset
if 'BBC_normalized' in datasets:
    filepath = os.path.join(normalized_dir, "bbc_news_normalized.csv")
    datasets['BBC_normalized'].to_csv(filepath, index=False)
    saved_normalized_files.append(f"BBC: {filepath}")
    print(f"‚úÖ BBC normalized dataset saved: {len(datasets['BBC_normalized'])} articles")

# Save CNN normalized dataset
if 'CNN_normalized' in datasets:
    filepath = os.path.join(normalized_dir, "cnn_dailymail_normalized.csv")
    datasets['CNN_normalized'].to_csv(filepath, index=False)
    saved_normalized_files.append(f"CNN: {filepath}")
    print(f"‚úÖ CNN normalized dataset saved: {len(datasets['CNN_normalized'])} articles")

# Save IMDB normalized dataset
if 'IMDB_normalized' in datasets:
    filepath = os.path.join(normalized_dir, "imdb_reviews_normalized.csv")
    datasets['IMDB_normalized'].to_csv(filepath, index=False)
    saved_normalized_files.append(f"IMDB: {filepath}")
    print(f"‚úÖ IMDB normalized dataset saved: {len(datasets['IMDB_normalized'])} reviews")

# Update metadata with normalization information
normalization_metadata = {
    'normalization_date': time.strftime('%Y-%m-%d %H:%M:%S'),
    'day': 'Day 10-11',
    'objective': 'Text normalization: stemming and lemmatization',
    'methods_used': {
        'stemming': 'Porter Stemmer',
        'lemmatization': 'WordNet Lemmatizer with POS tagging'
    },
    'files_created': saved_normalized_files,
    'analysis_results': analysis_results,
    'next_step': 'Topic modeling and sentiment analysis with normalized text',
    'recommendations': {
        'stemming': 'Faster processing, good for large-scale analysis',
        'lemmatization': 'Better semantic preservation, recommended for accuracy'
    }
}

# Save normalization metadata
metadata_path = os.path.join(normalized_dir, "normalization_metadata.json")
with open(metadata_path, 'w') as f:
    json.dump(normalization_metadata, f, indent=2)

print(f"‚úÖ Normalization metadata saved: {metadata_path}")
print(f"üéØ Day 10-11 objectives completed successfully!")
print(f"üìÅ Files ready for next phase: Topic modeling and sentiment analysis")

üíæ Saving normalized datasets...
‚úÖ BBC normalized dataset saved: 2225 articles
‚úÖ CNN normalized dataset saved: 5000 articles
‚úÖ IMDB normalized dataset saved: 1000 reviews
‚úÖ Normalization metadata saved: ../data/normalized\normalization_metadata.json
üéØ Day 10-11 objectives completed successfully!
üìÅ Files ready for next phase: Topic modeling and sentiment analysis


## NEXT STEP IS TO TOKENIZE THE DATASET
**TOKENIZING EACH DATASET ONE BY ONE AND THEN ADDING THE SAME INTO A FOLDER NAMED TOKENIZED WITH AT LAST THERE SUMMARY INTO A JSON FILE FORMAT.**

In [24]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# download punkt for nltk if not already
nltk.download("punkt")

# huggingface optional (for subword tokenization)
try:
    from transformers import AutoTokenizer
    hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
except Exception as e:
    hf_tokenizer = None
    print("‚ö†Ô∏è transformers not installed or no internet, skipping subword tokenization")


‚ö†Ô∏è transformers not installed or no internet, skipping subword tokenization


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\subod\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
def word_level_tokens(text: str):
    if not isinstance(text, str) or not text.strip():
        return []
    return [t for t in word_tokenize(text) if t.strip()]

def hf_encode(texts, tokenizer, max_len=256):
    enc = tokenizer(
        texts,
        max_length=max_len,
        truncation=True,
        padding=False,
        return_attention_mask=True,
        add_special_tokens=True,
    )
    out = []
    for i in range(len(texts)):
        out.append({
            "input_ids": enc["input_ids"][i],
            "attention_mask": enc["attention_mask"][i],
        })
    return out


In [34]:
bbc_df = datasets["BBC_normalized"].copy()

# word-level
bbc_df["tokens_clean"]      = bbc_df["text_cleaned"].apply(word_level_tokens)
bbc_df["tokens_stemmed"]    = bbc_df["text_stemmed"].apply(word_level_tokens)
bbc_df["tokens_lemmatized"] = bbc_df["text_lemmatized"].apply(word_level_tokens)

# optional: subword (only clean + lemma, not stemmed)
if hf_tokenizer:
    enc_clean = hf_encode(bbc_df["text_cleaned"].fillna("").astype(str).tolist(), hf_tokenizer)
    enc_lemma = hf_encode(bbc_df["text_lemmatized"].fillna("").astype(str).tolist(), hf_tokenizer)

    bbc_df["hf_clean_ids"]       = [e["input_ids"] for e in enc_clean]
    bbc_df["hf_clean_mask"]      = [e["attention_mask"] for e in enc_clean]
    bbc_df["hf_lemmatized_ids"]  = [e["input_ids"] for e in enc_lemma]
    bbc_df["hf_lemmatized_mask"] = [e["attention_mask"] for e in enc_lemma]

# save
bbc_df.to_parquet("./tokenized/bbc_news_tokenized.parquet", index=False)


In [35]:
cnn = datasets["CNN_normalized"].copy()

cnn["tokens_clean"]      = cnn["text_cleaned"].apply(word_level_tokens)
cnn["tokens_stemmed"]    = cnn["text_stemmed"].apply(word_level_tokens)
cnn["tokens_lemmatized"] = cnn["text_lemmatized"].apply(word_level_tokens)

if hf_tokenizer:
    enc_clean = hf_encode(cnn["text_cleaned"].fillna("").astype(str).tolist(), hf_tokenizer)
    enc_lemma = hf_encode(cnn["text_lemmatized"].fillna("").astype(str).tolist(), hf_tokenizer)

    cnn["hf_clean_ids"]       = [e["input_ids"] for e in enc_clean]
    cnn["hf_clean_mask"]      = [e["attention_mask"] for e in enc_clean]
    cnn["hf_lemmatized_ids"]  = [e["input_ids"] for e in enc_lemma]
    cnn["hf_lemmatized_mask"] = [e["attention_mask"] for e in enc_lemma]

cnn.to_parquet("./tokenized/cnn_dailymail_tokenized.parquet", index=False)


In [36]:
imdb = datasets["IMDB_normalized"].copy()

imdb["tokens_clean"]      = imdb["review_cleaned"].apply(word_level_tokens)
imdb["tokens_stemmed"]    = imdb["review_stemmed"].apply(word_level_tokens)
imdb["tokens_lemmatized"] = imdb["review_lemmatized"].apply(word_level_tokens)

if hf_tokenizer:
    enc_clean = hf_encode(imdb["review_cleaned"].fillna("").astype(str).tolist(), hf_tokenizer)
    enc_lemma = hf_encode(imdb["review_lemmatized"].fillna("").astype(str).tolist(), hf_tokenizer)

    imdb["hf_clean_ids"]       = [e["input_ids"] for e in enc_clean]
    imdb["hf_clean_mask"]      = [e["attention_mask"] for e in enc_clean]
    imdb["hf_lemmatized_ids"]  = [e["input_ids"] for e in enc_lemma]
    imdb["hf_lemmatized_mask"] = [e["attention_mask"] for e in enc_lemma]

imdb.to_parquet("./tokenized/imdb_reviews_tokenized.parquet", index=False)


In [37]:
import pandas as pd

df = pd.read_parquet("./tokenized/bbc_news_tokenized.parquet")
print(df.columns)
print(df[["text_cleaned","tokens_lemmatized"]].head(3))


Index(['category', 'text', 'text_cleaned', 'text_stemmed', 'text_lemmatized',
       'tokens_clean', 'tokens_stemmed', 'tokens_lemmatized'],
      dtype='object')
                                        text_cleaned  \
0  tv future hands viewers home theatre systems p...   
1  worldcom boss left books alone former worldcom...   
2  tigers wary farrell gamble leicester say not r...   

                                   tokens_lemmatized  
0  [tv, future, hand, viewer, home, theatre, syst...  
1  [worldcom, bos, leave, book, alone, former, wo...  
2  [tiger, wary, farrell, gamble, leicester, say,...  


In [39]:
import json

def summarize_token_lengths(df, cols):
    summary = {}
    for c in cols:
        if c in df.columns:
            lengths = df[c].apply(
                lambda x: len(x) if isinstance(x, (list, tuple)) else 0
            )
            summary[c] = {
                "count": int(lengths.shape[0]),
                "avg_len": float(lengths.mean()) if len(lengths) else 0.0,
                "p50_len": float(lengths.quantile(0.50)) if len(lengths) else 0.0,
                "p90_len": float(lengths.quantile(0.90)) if len(lengths) else 0.0,
                "p95_len": float(lengths.quantile(0.95)) if len(lengths) else 0.0,
                "max_len": int(lengths.max()) if len(lengths) else 0,
            }
    return summary

report = {}

# --- BBC ---
bbc_loaded = pd.read_parquet("./tokenized/bbc_news_tokenized.parquet")
report["BBC"] = summarize_token_lengths(
    bbc_loaded,
    ["tokens_clean","tokens_stemmed","tokens_lemmatized",
     "hf_clean_ids","hf_lemmatized_ids"]
)
print("üìä BBC:", report["BBC"])

# --- CNN ---
cnn_loaded = pd.read_parquet("./tokenized/cnn_dailymail_tokenized.parquet")
report["CNN"] = summarize_token_lengths(
    cnn_loaded,
    ["tokens_clean","tokens_stemmed","tokens_lemmatized",
     "hf_clean_ids","hf_lemmatized_ids"]
)
print("üìä CNN:", report["CNN"])

# --- IMDB ---
imdb_loaded = pd.read_parquet("./tokenized/imdb_reviews_tokenized.parquet")
report["IMDB"] = summarize_token_lengths(
    imdb_loaded,
    ["tokens_clean","tokens_stemmed","tokens_lemmatized",
     "hf_clean_ids","hf_lemmatized_ids"]
)
print("üìä IMDB:", report["IMDB"])

# --- save all summaries into JSON ---
with open("./tokenized/tokenization_report.json", "w") as f:
    json.dump(report, f, indent=4)

print("\n‚úÖ tokenization report saved to ./tokenized/tokenization_report.json")


üìä BBC: {'tokens_clean': {'count': 2225, 'avg_len': 0.0, 'p50_len': 0.0, 'p90_len': 0.0, 'p95_len': 0.0, 'max_len': 0}, 'tokens_stemmed': {'count': 2225, 'avg_len': 0.0, 'p50_len': 0.0, 'p90_len': 0.0, 'p95_len': 0.0, 'max_len': 0}, 'tokens_lemmatized': {'count': 2225, 'avg_len': 0.0, 'p50_len': 0.0, 'p90_len': 0.0, 'p95_len': 0.0, 'max_len': 0}}
üìä CNN: {'tokens_clean': {'count': 5000, 'avg_len': 0.0, 'p50_len': 0.0, 'p90_len': 0.0, 'p95_len': 0.0, 'max_len': 0}, 'tokens_stemmed': {'count': 5000, 'avg_len': 0.0, 'p50_len': 0.0, 'p90_len': 0.0, 'p95_len': 0.0, 'max_len': 0}, 'tokens_lemmatized': {'count': 5000, 'avg_len': 0.0, 'p50_len': 0.0, 'p90_len': 0.0, 'p95_len': 0.0, 'max_len': 0}}
üìä IMDB: {'tokens_clean': {'count': 1000, 'avg_len': 0.0, 'p50_len': 0.0, 'p90_len': 0.0, 'p95_len': 0.0, 'max_len': 0}, 'tokens_stemmed': {'count': 1000, 'avg_len': 0.0, 'p50_len': 0.0, 'p90_len': 0.0, 'p95_len': 0.0, 'max_len': 0}, 'tokens_lemmatized': {'count': 1000, 'avg_len': 0.0, 'p50_len'