## NarrativeNexus Project: Text Cleaning Implementation

**Objectives:**
- Remove special characters, punctuation, and stop words
- Apply preprocessing to BBC, CNN/DailyMail, and IMDB datasets
- Save cleaned datasets

In [21]:
# Import required libraries
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Initialize tools
stop_words = set(stopwords.words('english'))
print(f"✅ Setup complete. Loaded {len(stop_words)} stop words.")

✅ Setup complete. Loaded 198 stop words.


In [22]:
# Define text cleaning functions
def clean_special_characters(text):
    """Remove special characters, keep only letters, numbers, and spaces"""
    if pd.isna(text):
        return ""
    text = str(text)
    # Remove special characters
    cleaned = re.sub(r'[^\w\s]', ' ', text)
    # Remove extra whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()

def remove_stop_words(text, keep_negations=True):
    """Remove stop words while preserving negations"""
    if pd.isna(text):
        return ""
    text = str(text)
    
    # Keep important negation words
    stop_words_filtered = stop_words.copy()
    if keep_negations:
        important_words = {'not', 'no', 'never', 'none', 'neither', 'nobody', 'nothing'}
        stop_words_filtered = stop_words_filtered - important_words
    
    # Tokenize and filter
    tokens = word_tokenize(text.lower())
    filtered_tokens = [word for word in tokens if word not in stop_words_filtered]
    return ' '.join(filtered_tokens)

def clean_text_pipeline(text):
    """Complete text cleaning pipeline"""
    if pd.isna(text):
        return ""
    
    # Step 1: Convert to lowercase
    cleaned = str(text).lower()
    
    # Step 2: Remove special characters
    cleaned = clean_special_characters(cleaned)
    
    # Step 3: Remove stop words
    cleaned = remove_stop_words(cleaned, keep_negations=True)
    
    return cleaned

print("✅ Text cleaning functions defined.")

✅ Text cleaning functions defined.


In [23]:
# Load datasets
data_dir = "../data"
datasets = {}

# Load BBC News Dataset
try:
    bbc_df = pd.read_csv(f"{data_dir}/bbc-text.csv")
    datasets['BBC'] = bbc_df
    print(f"✅ BBC Dataset: {len(bbc_df)} articles loaded")
except Exception as e:
    print(f"❌ Error loading BBC dataset: {e}")

# Load CNN/DailyMail Dataset
try:
    cnn_df = pd.read_csv(f"{data_dir}/cnn_dailymail.csv")
    datasets['CNN'] = cnn_df
    print(f"✅ CNN Dataset: {len(cnn_df)} articles loaded")
except Exception as e:
    print(f"❌ Error loading CNN dataset: {e}")

# Load IMDB Dataset (subset for demo)
try:
    imdb_df = pd.read_csv(f"{data_dir}/imdb-dataset.csv", nrows=1000)
    datasets['IMDB'] = imdb_df
    print(f"✅ IMDB Dataset: {len(imdb_df)} reviews loaded")
except Exception as e:
    print(f"❌ Error loading IMDB dataset: {e}")

print(f"\n📊 Total datasets loaded: {len(datasets)}")

✅ BBC Dataset: 2225 articles loaded
✅ CNN Dataset: 5000 articles loaded
✅ IMDB Dataset: 1000 reviews loaded

📊 Total datasets loaded: 3


In [26]:
# Clean BBC News Dataset
if 'BBC' in datasets:
    print("🧹 Cleaning BBC News Dataset...")
    bbc_df = datasets['BBC'].copy()
    
    # Apply cleaning
    tqdm.pandas(desc="Processing BBC")
    bbc_df['text_cleaned'] = bbc_df['text'].progress_apply(clean_text_pipeline)
    
    # Calculate metrics
    original_avg = bbc_df['text'].str.len().mean()
    cleaned_avg = bbc_df['text_cleaned'].str.len().mean()
    reduction = ((original_avg - cleaned_avg) / original_avg * 100)
    
    print(f"   • Original avg length: {original_avg:.0f} characters")
    print(f"   • Cleaned avg length: {cleaned_avg:.0f} characters")
    print(f"   • Reduction: {reduction:.1f}%")
    
    datasets['BBC_cleaned'] = bbc_df
    print("✅ BBC cleaning completed")
else:
    print("❌ BBC dataset not available")

🧹 Cleaning BBC News Dataset...


Processing BBC: 100%|██████████| 2225/2225 [00:02<00:00, 1091.59it/s]


   • Original avg length: 2263 characters
   • Cleaned avg length: 1584 characters
   • Reduction: 30.0%
✅ BBC cleaning completed


In [27]:
# Clean CNN/DailyMail Dataset
if 'CNN' in datasets:
    print("🧹 Cleaning CNN/DailyMail Dataset...")
    cnn_df = datasets['CNN'].copy()
    
    # Identify text column
    text_column = 'article' if 'article' in cnn_df.columns else 'text'
    
    # Apply cleaning
    tqdm.pandas(desc="Processing CNN")
    cnn_df['text_cleaned'] = cnn_df[text_column].progress_apply(clean_text_pipeline)
    
    # Calculate metrics
    original_avg = cnn_df[text_column].str.len().mean()
    cleaned_avg = cnn_df['text_cleaned'].str.len().mean()
    reduction = ((original_avg - cleaned_avg) / original_avg * 100)
    
    print(f"   • Original avg length: {original_avg:.0f} characters")
    print(f"   • Cleaned avg length: {cleaned_avg:.0f} characters")
    print(f"   • Reduction: {reduction:.1f}%")
    
    datasets['CNN_cleaned'] = cnn_df
    print("✅ CNN cleaning completed")
else:
    print("❌ CNN dataset not available")

🧹 Cleaning CNN/DailyMail Dataset...


Processing CNN: 100%|██████████| 5000/5000 [00:04<00:00, 1041.26it/s]

   • Original avg length: 2518 characters
   • Cleaned avg length: 2518 characters
   • Reduction: 0.0%
✅ CNN cleaning completed





In [28]:
# Clean IMDB Dataset
if 'IMDB' in datasets:
    print("🧹 Cleaning IMDB Reviews Dataset...")
    imdb_df = datasets['IMDB'].copy()
    
    # Apply cleaning
    tqdm.pandas(desc="Processing IMDB")
    imdb_df['review_cleaned'] = imdb_df['review'].progress_apply(clean_text_pipeline)
    
    # Calculate metrics
    original_avg = imdb_df['review'].str.len().mean()
    cleaned_avg = imdb_df['review_cleaned'].str.len().mean()
    reduction = ((original_avg - cleaned_avg) / original_avg * 100)
    
    print(f"   • Original avg length: {original_avg:.0f} characters")
    print(f"   • Cleaned avg length: {cleaned_avg:.0f} characters")
    print(f"   • Reduction: {reduction:.1f}%")
    
    datasets['IMDB_cleaned'] = imdb_df
    print("✅ IMDB cleaning completed")
else:
    print("❌ IMDB dataset not available")

🧹 Cleaning IMDB Reviews Dataset...


Processing IMDB: 100%|██████████| 1000/1000 [00:00<00:00, 1759.02it/s]

   • Original avg length: 1311 characters
   • Cleaned avg length: 839 characters
   • Reduction: 36.0%
✅ IMDB cleaning completed





In [29]:
# Save cleaned datasets
import os
import json
import time

# Create cleaned data directory
cleaned_dir = "../data/cleaned"
os.makedirs(cleaned_dir, exist_ok=True)

saved_files = []

# Save BBC cleaned dataset
if 'BBC_cleaned' in datasets:
    filepath = os.path.join(cleaned_dir, "bbc_news_cleaned.csv")
    datasets['BBC_cleaned'].to_csv(filepath, index=False)
    saved_files.append(f"BBC: {filepath}")
    print(f"✅ BBC dataset saved: {len(datasets['BBC_cleaned'])} articles")

# Save CNN cleaned dataset
if 'CNN_cleaned' in datasets:
    filepath = os.path.join(cleaned_dir, "cnn_dailymail_cleaned.csv")
    datasets['CNN_cleaned'].to_csv(filepath, index=False)
    saved_files.append(f"CNN: {filepath}")
    print(f"✅ CNN dataset saved: {len(datasets['CNN_cleaned'])} articles")

# Save IMDB cleaned dataset
if 'IMDB_cleaned' in datasets:
    filepath = os.path.join(cleaned_dir, "imdb_reviews_cleaned.csv")
    datasets['IMDB_cleaned'].to_csv(filepath, index=False)
    saved_files.append(f"IMDB: {filepath}")
    print(f"✅ IMDB dataset saved: {len(datasets['IMDB_cleaned'])} reviews")

# Save metadata
metadata = {
    'cleaning_date': time.strftime('%Y-%m-%d %H:%M:%S'),
    'day': 'Day 8-9',
    'objective': 'Text cleaning: remove special characters, punctuation, stop words',
    'files_created': saved_files,
    'next_step': 'Week 3: Topic modeling with LDA/NMF'
}

metadata_path = os.path.join(cleaned_dir, "preprocessing_metadata.json")
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)

✅ BBC dataset saved: 2225 articles
✅ CNN dataset saved: 5000 articles
✅ IMDB dataset saved: 1000 reviews


## Day 10-11: Text Normalization with Stemming and Lemmatization

**Objectives:**
- Implement stemming using Porter Stemmer
- Implement lemmatization using WordNet Lemmatizer
- Compare performance between stemming and lemmatization
- Apply normalization to all cleaned datasets
- Analyze the impact of normalization on text data

In [30]:
# Import additional libraries for stemming and lemmatization
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
import time

# Download additional NLTK data
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('maxent_ne_chunker', quiet=True)
nltk.download('words', quiet=True)

# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

print("✅ Stemming and Lemmatization tools initialized")
print(f"📝 Porter Stemmer ready")
print(f"📝 WordNet Lemmatizer ready")

✅ Stemming and Lemmatization tools initialized
📝 Porter Stemmer ready
📝 WordNet Lemmatizer ready


In [34]:
# Define normalization functions
def get_wordnet_pos(treebank_tag):
    """Convert TreeBank POS tags to WordNet POS tags for better lemmatization"""
    if treebank_tag.startswith('J'):
        return 'a'  # adjective
    elif treebank_tag.startswith('V'):
        return 'v'  # verb
    elif treebank_tag.startswith('N'):
        return 'n'  # noun
    elif treebank_tag.startswith('R'):
        return 'r'  # adverb
    else:
        return 'n'  # default to noun

def stem_text(text):
    """Apply Porter Stemming to text"""
    if pd.isna(text) or text == "":
        return ""
    
    text = str(text)
    tokens = word_tokenize(text.lower())
    stemmed_tokens = [stemmer.stem(token) for token in tokens if token.isalpha()]
    return ' '.join(stemmed_tokens)

def lemmatize_text(text):
    """Apply Lemmatization with POS tagging to text"""
    if pd.isna(text) or text == "":
        return ""
    
    text = str(text)
    tokens = word_tokenize(text.lower())
    
    # Filter only alphabetic tokens
    alpha_tokens = [token for token in tokens if token.isalpha()]
    
    # Get POS tags
    pos_tags = pos_tag(alpha_tokens)
    
    # Lemmatize with appropriate POS tags
    lemmatized_tokens = [
        lemmatizer.lemmatize(token, get_wordnet_pos(pos)) 
        for token, pos in pos_tags
    ]
    
    return ' '.join(lemmatized_tokens)

def normalize_text_pipeline(text, method='lemma'):
    """Complete text normalization pipeline with stemming or lemmatization"""
    if pd.isna(text) or text == "":
        return ""
    
    if method == 'stem':
        return stem_text(text)
    elif method == 'lemma':
        return lemmatize_text(text)
    else:
        raise ValueError("Method must be 'stem' or 'lemma'")

print("✅ Text normalization functions defined")
print("📋 Available methods: 'stem' (Porter Stemmer), 'lemma' (WordNet Lemmatizer)")

✅ Text normalization functions defined
📋 Available methods: 'stem' (Porter Stemmer), 'lemma' (WordNet Lemmatizer)


In [35]:
# Demonstrate stemming vs lemmatization
sample_text = "The children were running and jumping happily in the beautiful gardens while their parents were watching"

print("🔍 Stemming vs Lemmatization Comparison:")
print(f"Original text: {sample_text}")
print()

# Apply stemming
stemmed = stem_text(sample_text)
print(f"Stemmed text: {stemmed}")

# Apply lemmatization
lemmatized = lemmatize_text(sample_text)
print(f"Lemmatized text: {lemmatized}")

print()
print("📊 Key Differences:")
print("• Stemming: Faster, rule-based, may create non-words")
print("• Lemmatization: Slower, dictionary-based, preserves valid words")
print("• Lemmatization with POS: Most accurate, context-aware")

🔍 Stemming vs Lemmatization Comparison:
Original text: The children were running and jumping happily in the beautiful gardens while their parents were watching

Stemmed text: the children were run and jump happili in the beauti garden while their parent were watch
Lemmatized text: the child be run and jump happily in the beautiful garden while their parent be watch

📊 Key Differences:
• Stemming: Faster, rule-based, may create non-words
• Lemmatization: Slower, dictionary-based, preserves valid words
• Lemmatization with POS: Most accurate, context-aware


In [36]:
# Apply normalization to BBC News Dataset
if 'BBC_cleaned' in datasets:
    print("🔄 Normalizing BBC News Dataset...")
    bbc_df = datasets['BBC_cleaned'].copy()
    
    # Apply both stemming and lemmatization
    print("   📝 Applying stemming...")
    tqdm.pandas(desc="BBC Stemming")
    bbc_df['text_stemmed'] = bbc_df['text_cleaned'].progress_apply(
        lambda x: normalize_text_pipeline(x, method='stem')
    )
    
    print("   📝 Applying lemmatization...")
    tqdm.pandas(desc="BBC Lemmatization")
    bbc_df['text_lemmatized'] = bbc_df['text_cleaned'].progress_apply(
        lambda x: normalize_text_pipeline(x, method='lemma')
    )
    
    # Calculate metrics
    cleaned_avg = bbc_df['text_cleaned'].str.len().mean()
    stemmed_avg = bbc_df['text_stemmed'].str.len().mean()
    lemmatized_avg = bbc_df['text_lemmatized'].str.len().mean()
    
    stem_reduction = ((cleaned_avg - stemmed_avg) / cleaned_avg * 100)
    lemma_reduction = ((cleaned_avg - lemmatized_avg) / cleaned_avg * 100)
    
    print(f"   • Cleaned avg length: {cleaned_avg:.0f} characters")
    print(f"   • Stemmed avg length: {stemmed_avg:.0f} characters (↓{stem_reduction:.1f}%)")
    print(f"   • Lemmatized avg length: {lemmatized_avg:.0f} characters (↓{lemma_reduction:.1f}%)")
    
    datasets['BBC_normalized'] = bbc_df
    print("✅ BBC normalization completed")
else:
    print("❌ BBC cleaned dataset not available")

🔄 Normalizing BBC News Dataset...
   📝 Applying stemming...


BBC Stemming: 100%|██████████| 2225/2225 [00:15<00:00, 143.43it/s]


   📝 Applying lemmatization...


BBC Lemmatization: 100%|██████████| 2225/2225 [00:31<00:00, 69.60it/s]

   • Cleaned avg length: 1584 characters
   • Stemmed avg length: 1368 characters (↓13.6%)
   • Lemmatized avg length: 1471 characters (↓7.1%)
✅ BBC normalization completed





In [37]:
# Apply normalization to CNN/DailyMail Dataset
if 'CNN_cleaned' in datasets:
    print("🔄 Normalizing CNN/DailyMail Dataset...")
    cnn_df = datasets['CNN_cleaned'].copy()
    
    # Apply both stemming and lemmatization
    print("   📝 Applying stemming...")
    tqdm.pandas(desc="CNN Stemming")
    cnn_df['text_stemmed'] = cnn_df['text_cleaned'].progress_apply(
        lambda x: normalize_text_pipeline(x, method='stem')
    )
    
    print("   📝 Applying lemmatization...")
    tqdm.pandas(desc="CNN Lemmatization")
    cnn_df['text_lemmatized'] = cnn_df['text_cleaned'].progress_apply(
        lambda x: normalize_text_pipeline(x, method='lemma')
    )
    
    # Calculate metrics
    cleaned_avg = cnn_df['text_cleaned'].str.len().mean()
    stemmed_avg = cnn_df['text_stemmed'].str.len().mean()
    lemmatized_avg = cnn_df['text_lemmatized'].str.len().mean()
    
    stem_reduction = ((cleaned_avg - stemmed_avg) / cleaned_avg * 100)
    lemma_reduction = ((cleaned_avg - lemmatized_avg) / cleaned_avg * 100)
    
    print(f"   • Cleaned avg length: {cleaned_avg:.0f} characters")
    print(f"   • Stemmed avg length: {stemmed_avg:.0f} characters (↓{stem_reduction:.1f}%)")
    print(f"   • Lemmatized avg length: {lemmatized_avg:.0f} characters (↓{lemma_reduction:.1f}%)")
    
    datasets['CNN_normalized'] = cnn_df
    print("✅ CNN normalization completed")
else:
    print("❌ CNN cleaned dataset not available")

🔄 Normalizing CNN/DailyMail Dataset...
   📝 Applying stemming...


CNN Stemming: 100%|██████████| 5000/5000 [01:00<00:00, 82.51it/s] 


   📝 Applying lemmatization...


CNN Lemmatization: 100%|██████████| 5000/5000 [01:57<00:00, 42.62it/s]

   • Cleaned avg length: 2518 characters
   • Stemmed avg length: 2210 characters (↓12.2%)
   • Lemmatized avg length: 2387 characters (↓5.2%)
✅ CNN normalization completed





In [38]:
# Apply normalization to IMDB Reviews Dataset
if 'IMDB_cleaned' in datasets:
    print("🔄 Normalizing IMDB Reviews Dataset...")
    imdb_df = datasets['IMDB_cleaned'].copy()
    
    # Apply both stemming and lemmatization
    print("   📝 Applying stemming...")
    tqdm.pandas(desc="IMDB Stemming")
    imdb_df['review_stemmed'] = imdb_df['review_cleaned'].progress_apply(
        lambda x: normalize_text_pipeline(x, method='stem')
    )
    
    print("   📝 Applying lemmatization...")
    tqdm.pandas(desc="IMDB Lemmatization")
    imdb_df['review_lemmatized'] = imdb_df['review_cleaned'].progress_apply(
        lambda x: normalize_text_pipeline(x, method='lemma')
    )
    
    # Calculate metrics
    cleaned_avg = imdb_df['review_cleaned'].str.len().mean()
    stemmed_avg = imdb_df['review_stemmed'].str.len().mean()
    lemmatized_avg = imdb_df['review_lemmatized'].str.len().mean()
    
    
    stem_reduction = ((cleaned_avg - stemmed_avg) / cleaned_avg * 100)
    lemma_reduction = ((cleaned_avg - lemmatized_avg) / cleaned_avg * 100)
    
    print(f"   • Cleaned avg length: {cleaned_avg:.0f} characters")
    print(f"   • Stemmed avg length: {stemmed_avg:.0f} characters (↓{stem_reduction:.1f}%)")
    print(f"   • Lemmatized avg length: {lemmatized_avg:.0f} characters (↓{lemma_reduction:.1f}%)")
    
    datasets['IMDB_normalized'] = imdb_df
    print("✅ IMDB normalization completed")
else:
    print("❌ IMDB cleaned dataset not available")

🔄 Normalizing IMDB Reviews Dataset...
   📝 Applying stemming...


IMDB Stemming: 100%|██████████| 1000/1000 [00:03<00:00, 250.11it/s]


   📝 Applying lemmatization...


IMDB Lemmatization: 100%|██████████| 1000/1000 [00:07<00:00, 126.38it/s]

   • Cleaned avg length: 839 characters
   • Stemmed avg length: 740 characters (↓11.7%)
   • Lemmatized avg length: 796 characters (↓5.1%)
✅ IMDB normalization completed





In [40]:
# Comparative Analysis of Normalization Results
print("📊 Normalization Performance Summary")
print("=" * 60)

analysis_results = []

# Analyze each dataset
for dataset_name in ['BBC', 'CNN', 'IMDB']:
    normalized_key = f"{dataset_name}_normalized"
    
    if normalized_key in datasets:
        df = datasets[normalized_key]
        
        if dataset_name == 'IMDB':
            text_col = 'review_cleaned'
            stem_col = 'review_stemmed'
            lemma_col = 'review_lemmatized'
        else:
            text_col = 'text_cleaned'
            stem_col = 'text_stemmed'
            lemma_col = 'text_lemmatized'
        
        # Calculate vocabulary reduction
        def get_vocab_size(series):
            all_words = set()
            for text in series:
                if pd.notna(text) and text != "":
                    all_words.update(text.split())
            return len(all_words)
        
        # Get sample for vocabulary analysis (first 100 entries for performance)
        sample_df = df.head(100)
        
        original_vocab = get_vocab_size(sample_df[text_col])
        stemmed_vocab = get_vocab_size(sample_df[stem_col])
        lemmatized_vocab = get_vocab_size(sample_df[lemma_col])
        
        stem_vocab_reduction = ((original_vocab - stemmed_vocab) / original_vocab * 100)
        lemma_vocab_reduction = ((original_vocab - lemmatized_vocab) / original_vocab * 100)
        
        # Average lengths
        clean_len = df[text_col].str.len().mean()
        stem_len = df[stem_col].str.len().mean()
        lemma_len = df[lemma_col].str.len().mean()
        
        result = {
            'dataset': dataset_name,
            'original_vocab': original_vocab,
            'stemmed_vocab': stemmed_vocab,
            'lemmatized_vocab': lemmatized_vocab,
            'stem_vocab_reduction': stem_vocab_reduction,
            'lemma_vocab_reduction': lemma_vocab_reduction,
            'clean_avg_length': clean_len,
            'stem_avg_length': stem_len,
            'lemma_avg_length': lemma_len
        }
        
        analysis_results.append(result)
        
        print(f"\n📋 {dataset_name} Dataset Analysis:")
        print(f"   Vocabulary Size (sample):")
        print(f"     • Original: {original_vocab:,} unique words")
        print(f"     • Stemmed: {stemmed_vocab:,} unique words (↓{stem_vocab_reduction:.1f}%)")
        print(f"     • Lemmatized: {lemmatized_vocab:,} unique words (↓{lemma_vocab_reduction:.1f}%)")
        print(f"   Average Text Length:")
        print(f"     • Cleaned: {clean_len:.0f} characters")
        print(f"     • Stemmed: {stem_len:.0f} characters")
        print(f"     • Lemmatized: {lemma_len:.0f} characters")

print(f"\n✅ Normalization analysis completed for {len(analysis_results)} datasets")

📊 Normalization Performance Summary

📋 BBC Dataset Analysis:
   Vocabulary Size (sample):
     • Original: 5,947 unique words
     • Stemmed: 4,235 unique words (↓28.8%)
     • Lemmatized: 4,669 unique words (↓21.5%)
   Average Text Length:
     • Cleaned: 1584 characters
     • Stemmed: 1368 characters
     • Lemmatized: 1471 characters

📋 CNN Dataset Analysis:
   Vocabulary Size (sample):
     • Original: 8,999 unique words
     • Stemmed: 6,506 unique words (↓27.7%)
     • Lemmatized: 7,408 unique words (↓17.7%)
   Average Text Length:
     • Cleaned: 2518 characters
     • Stemmed: 2210 characters
     • Lemmatized: 2387 characters

📋 IMDB Dataset Analysis:
   Vocabulary Size (sample):
     • Original: 4,648 unique words
     • Stemmed: 3,641 unique words (↓21.7%)
     • Lemmatized: 3,914 unique words (↓15.8%)
   Average Text Length:
     • Cleaned: 839 characters
     • Stemmed: 740 characters
     • Lemmatized: 796 characters

✅ Normalization analysis completed for 3 datasets


In [41]:
# Save normalized datasets
print("💾 Saving normalized datasets...")

# Create normalized data directory
normalized_dir = "../data/normalized"
os.makedirs(normalized_dir, exist_ok=True)

saved_normalized_files = []

# Save BBC normalized dataset
if 'BBC_normalized' in datasets:
    filepath = os.path.join(normalized_dir, "bbc_news_normalized.csv")
    datasets['BBC_normalized'].to_csv(filepath, index=False)
    saved_normalized_files.append(f"BBC: {filepath}")
    print(f"✅ BBC normalized dataset saved: {len(datasets['BBC_normalized'])} articles")

# Save CNN normalized dataset
if 'CNN_normalized' in datasets:
    filepath = os.path.join(normalized_dir, "cnn_dailymail_normalized.csv")
    datasets['CNN_normalized'].to_csv(filepath, index=False)
    saved_normalized_files.append(f"CNN: {filepath}")
    print(f"✅ CNN normalized dataset saved: {len(datasets['CNN_normalized'])} articles")

# Save IMDB normalized dataset
if 'IMDB_normalized' in datasets:
    filepath = os.path.join(normalized_dir, "imdb_reviews_normalized.csv")
    datasets['IMDB_normalized'].to_csv(filepath, index=False)
    saved_normalized_files.append(f"IMDB: {filepath}")
    print(f"✅ IMDB normalized dataset saved: {len(datasets['IMDB_normalized'])} reviews")

# Update metadata with normalization information
normalization_metadata = {
    'normalization_date': time.strftime('%Y-%m-%d %H:%M:%S'),
    'day': 'Day 10-11',
    'objective': 'Text normalization: stemming and lemmatization',
    'methods_used': {
        'stemming': 'Porter Stemmer',
        'lemmatization': 'WordNet Lemmatizer with POS tagging'
    },
    'files_created': saved_normalized_files,
    'analysis_results': analysis_results,
    'next_step': 'Topic modeling and sentiment analysis with normalized text',
    'recommendations': {
        'stemming': 'Faster processing, good for large-scale analysis',
        'lemmatization': 'Better semantic preservation, recommended for accuracy'
    }
}

# Save normalization metadata
metadata_path = os.path.join(normalized_dir, "normalization_metadata.json")
with open(metadata_path, 'w') as f:
    json.dump(normalization_metadata, f, indent=2)

print(f"✅ Normalization metadata saved: {metadata_path}")
print(f"🎯 Day 10-11 objectives completed successfully!")
print(f"📁 Files ready for next phase: Topic modeling and sentiment analysis")

💾 Saving normalized datasets...
✅ BBC normalized dataset saved: 2225 articles
✅ CNN normalized dataset saved: 5000 articles
✅ IMDB normalized dataset saved: 1000 reviews
✅ Normalization metadata saved: ../data/normalized\normalization_metadata.json
🎯 Day 10-11 objectives completed successfully!
📁 Files ready for next phase: Topic modeling and sentiment analysis


In [53]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# download punkt for nltk if not already
nltk.download("punkt")

# huggingface optional (for subword tokenization)
try:
    from transformers import AutoTokenizer
    hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
except Exception as e:
    hf_tokenizer = None
    print("⚠️ transformers not installed or no internet, skipping subword tokenization")


⚠️ transformers not installed or no internet, skipping subword tokenization


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\subod\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [54]:
def word_level_tokens(text: str):
    if not isinstance(text, str) or not text.strip():
        return []
    return [t for t in word_tokenize(text) if t.strip()]

def hf_encode(texts, tokenizer, max_len=256):
    enc = tokenizer(
        texts,
        max_length=max_len,
        truncation=True,
        padding=False,
        return_attention_mask=True,
        add_special_tokens=True,
    )
    out = []
    for i in range(len(texts)):
        out.append({
            "input_ids": enc["input_ids"][i],
            "attention_mask": enc["attention_mask"][i],
        })
    return out


In [56]:
bbc = datasets["BBC_normalized"].copy()

# word-level
bbc["tokens_clean"]      = bbc["text_cleaned"].apply(word_level_tokens)
bbc["tokens_stemmed"]    = bbc["text_stemmed"].apply(word_level_tokens)
bbc["tokens_lemmatized"] = bbc["text_lemmatized"].apply(word_level_tokens)

# optional: subword (only clean + lemma, not stemmed)
if hf_tokenizer:
    enc_clean = hf_encode(bbc["text_cleaned"].fillna("").astype(str).tolist(), hf_tokenizer)
    enc_lemma = hf_encode(bbc["text_lemmatized"].fillna("").astype(str).tolist(), hf_tokenizer)

    bbc["hf_clean_ids"]       = [e["input_ids"] for e in enc_clean]
    bbc["hf_clean_mask"]      = [e["attention_mask"] for e in enc_clean]
    bbc["hf_lemmatized_ids"]  = [e["input_ids"] for e in enc_lemma]
    bbc["hf_lemmatized_mask"] = [e["attention_mask"] for e in enc_lemma]

# save
bbc.to_parquet("./tokenized/bbc_news_tokenized.parquet", index=False)


OSError: Cannot save file into a non-existent directory: 'tokenized'

In [5]:
# Tokenization Function
def tokenize_text(text, method="word"):
    """Tokenize text into words or sentences"""
    if pd.isna(text) or text == "":
        return []
    
    text = str(text)
    
    if method == "word":
        return word_tokenize(text.lower())   # Word-level tokens
    elif method == "sentence":
        from nltk.tokenize import sent_tokenize
        return sent_tokenize(text)           # Sentence-level tokens
    else:
        raise ValueError("Method must be 'word' or 'sentence'")


KeyboardInterrupt: 