# Text Preprocessing - Day 8-9
## NarrativeNexus Project: Text Cleaning Implementation

**Objectives:**
- Remove special characters, punctuation, and stop words
- Apply preprocessing to BBC, CNN/DailyMail, and IMDB datasets
- Save cleaned datasets for Week 3 topic modeling

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Initialize tools
stop_words = set(stopwords.words('english'))
print(f"✅ Setup complete. Loaded {len(stop_words)} stop words.")

✅ Setup complete. Loaded 198 stop words.


In [2]:
# Define text cleaning functions
def clean_special_characters(text):
    """Remove special characters, keep only letters, numbers, and spaces"""
    if pd.isna(text):
        return ""
    text = str(text)
    # Remove special characters
    cleaned = re.sub(r'[^\w\s]', ' ', text)
    # Remove extra whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()

def remove_stop_words(text, keep_negations=True):
    """Remove stop words while preserving negations"""
    if pd.isna(text):
        return ""
    text = str(text)
    
    # Keep important negation words
    stop_words_filtered = stop_words.copy()
    if keep_negations:
        important_words = {'not', 'no', 'never', 'none', 'neither', 'nobody', 'nothing'}
        stop_words_filtered = stop_words_filtered - important_words
    
    # Tokenize and filter
    tokens = word_tokenize(text.lower())
    filtered_tokens = [word for word in tokens if word not in stop_words_filtered]
    return ' '.join(filtered_tokens)

def clean_text_pipeline(text):
    """Complete text cleaning pipeline"""
    if pd.isna(text):
        return ""
    
    # Step 1: Convert to lowercase
    cleaned = str(text).lower()
    
    # Step 2: Remove special characters
    cleaned = clean_special_characters(cleaned)
    
    # Step 3: Remove stop words
    cleaned = remove_stop_words(cleaned, keep_negations=True)
    
    return cleaned

print("✅ Text cleaning functions defined.")

✅ Text cleaning functions defined.


In [3]:
# Load datasets
data_dir = "../data"
datasets = {}

# Load BBC News Dataset
try:
    bbc_df = pd.read_csv(f"{data_dir}/bbc-text.csv")
    datasets['BBC'] = bbc_df
    print(f"✅ BBC Dataset: {len(bbc_df)} articles loaded")
except Exception as e:
    print(f"❌ Error loading BBC dataset: {e}")

# Load CNN/DailyMail Dataset
try:
    cnn_df = pd.read_csv(f"{data_dir}/cnn_dailymail.csv")
    datasets['CNN'] = cnn_df
    print(f"✅ CNN Dataset: {len(cnn_df)} articles loaded")
except Exception as e:
    print(f"❌ Error loading CNN dataset: {e}")

# Load IMDB Dataset (subset for demo)
try:
    imdb_df = pd.read_csv(f"{data_dir}/imdb-dataset.csv", nrows=1000)
    datasets['IMDB'] = imdb_df
    print(f"✅ IMDB Dataset: {len(imdb_df)} reviews loaded")
except Exception as e:
    print(f"❌ Error loading IMDB dataset: {e}")

print(f"\n📊 Total datasets loaded: {len(datasets)}")

✅ BBC Dataset: 2225 articles loaded
✅ CNN Dataset: 5000 articles loaded
✅ IMDB Dataset: 1000 reviews loaded

📊 Total datasets loaded: 3


In [4]:
# Clean BBC News Dataset
if 'BBC' in datasets:
    print("🧹 Cleaning BBC News Dataset...")
    bbc_df = datasets['BBC'].copy()
    
    # Apply cleaning
    tqdm.pandas(desc="Processing BBC")
    bbc_df['text_cleaned'] = bbc_df['text'].progress_apply(clean_text_pipeline)
    
    # Calculate metrics
    original_avg = bbc_df['text'].str.len().mean()
    cleaned_avg = bbc_df['text_cleaned'].str.len().mean()
    reduction = ((original_avg - cleaned_avg) / original_avg * 100)
    
    print(f"   • Original avg length: {original_avg:.0f} characters")
    print(f"   • Cleaned avg length: {cleaned_avg:.0f} characters")
    print(f"   • Reduction: {reduction:.1f}%")
    
    datasets['BBC_cleaned'] = bbc_df
    print("✅ BBC cleaning completed")
else:
    print("❌ BBC dataset not available")

🧹 Cleaning BBC News Dataset...


Processing BBC: 100%|██████████| 2225/2225 [00:02<00:00, 1108.66it/s]

   • Original avg length: 2263 characters
   • Cleaned avg length: 1584 characters
   • Reduction: 30.0%
✅ BBC cleaning completed





In [5]:
# Clean CNN/DailyMail Dataset
if 'CNN' in datasets:
    print("🧹 Cleaning CNN/DailyMail Dataset...")
    cnn_df = datasets['CNN'].copy()
    
    # Identify text column
    text_column = 'article' if 'article' in cnn_df.columns else 'text'
    
    # Apply cleaning
    tqdm.pandas(desc="Processing CNN")
    cnn_df['text_cleaned'] = cnn_df[text_column].progress_apply(clean_text_pipeline)
    
    # Calculate metrics
    original_avg = cnn_df[text_column].str.len().mean()
    cleaned_avg = cnn_df['text_cleaned'].str.len().mean()
    reduction = ((original_avg - cleaned_avg) / original_avg * 100)
    
    print(f"   • Original avg length: {original_avg:.0f} characters")
    print(f"   • Cleaned avg length: {cleaned_avg:.0f} characters")
    print(f"   • Reduction: {reduction:.1f}%")
    
    datasets['CNN_cleaned'] = cnn_df
    print("✅ CNN cleaning completed")
else:
    print("❌ CNN dataset not available")

🧹 Cleaning CNN/DailyMail Dataset...


Processing CNN: 100%|██████████| 5000/5000 [00:04<00:00, 1072.88it/s]

   • Original avg length: 2518 characters
   • Cleaned avg length: 2518 characters
   • Reduction: 0.0%
✅ CNN cleaning completed





In [6]:
# Clean IMDB Dataset
if 'IMDB' in datasets:
    print("🧹 Cleaning IMDB Reviews Dataset...")
    imdb_df = datasets['IMDB'].copy()
    
    # Apply cleaning
    tqdm.pandas(desc="Processing IMDB")
    imdb_df['review_cleaned'] = imdb_df['review'].progress_apply(clean_text_pipeline)
    
    # Calculate metrics
    original_avg = imdb_df['review'].str.len().mean()
    cleaned_avg = imdb_df['review_cleaned'].str.len().mean()
    reduction = ((original_avg - cleaned_avg) / original_avg * 100)
    
    print(f"   • Original avg length: {original_avg:.0f} characters")
    print(f"   • Cleaned avg length: {cleaned_avg:.0f} characters")
    print(f"   • Reduction: {reduction:.1f}%")
    
    datasets['IMDB_cleaned'] = imdb_df
    print("✅ IMDB cleaning completed")
else:
    print("❌ IMDB dataset not available")

🧹 Cleaning IMDB Reviews Dataset...


Processing IMDB: 100%|██████████| 1000/1000 [00:00<00:00, 1818.58it/s]

   • Original avg length: 1311 characters
   • Cleaned avg length: 839 characters
   • Reduction: 36.0%
✅ IMDB cleaning completed





In [7]:
# Save cleaned datasets
import os
import json
import time

# Create cleaned data directory
cleaned_dir = "../data/cleaned"
os.makedirs(cleaned_dir, exist_ok=True)

saved_files = []

# Save BBC cleaned dataset
if 'BBC_cleaned' in datasets:
    filepath = os.path.join(cleaned_dir, "bbc_news_cleaned.csv")
    datasets['BBC_cleaned'].to_csv(filepath, index=False)
    saved_files.append(f"BBC: {filepath}")
    print(f"✅ BBC dataset saved: {len(datasets['BBC_cleaned'])} articles")

# Save CNN cleaned dataset
if 'CNN_cleaned' in datasets:
    filepath = os.path.join(cleaned_dir, "cnn_dailymail_cleaned.csv")
    datasets['CNN_cleaned'].to_csv(filepath, index=False)
    saved_files.append(f"CNN: {filepath}")
    print(f"✅ CNN dataset saved: {len(datasets['CNN_cleaned'])} articles")

# Save IMDB cleaned dataset
if 'IMDB_cleaned' in datasets:
    filepath = os.path.join(cleaned_dir, "imdb_reviews_cleaned.csv")
    datasets['IMDB_cleaned'].to_csv(filepath, index=False)
    saved_files.append(f"IMDB: {filepath}")
    print(f"✅ IMDB dataset saved: {len(datasets['IMDB_cleaned'])} reviews")

# Save metadata
metadata = {
    'cleaning_date': time.strftime('%Y-%m-%d %H:%M:%S'),
    'day': 'Day 8-9',
    'objective': 'Text cleaning: remove special characters, punctuation, stop words',
    'files_created': saved_files,
    'next_step': 'Week 3: Topic modeling with LDA/NMF'
}

metadata_path = os.path.join(cleaned_dir, "preprocessing_metadata.json")
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)

✅ BBC dataset saved: 2225 articles
✅ CNN dataset saved: 5000 articles
✅ IMDB dataset saved: 1000 reviews
