# NarrativeNexus: Text Preprocessing Pipeline

**Objective:** Clean, normalize, and tokenize BBC News, CNN/DailyMail, and IMDB datasets for NLP tasks.

**Pipeline:** Text Cleaning → Normalization → Tokenization

In [None]:
# Essential imports
import os
import re
import warnings
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Register `progress_apply` on pandas objects; the processing cell relies on it.
tqdm.pandas()

# Download NLTK data.
# `word_tokenize` raises LookupError asking for 'punkt_tab' when only the legacy
# 'punkt' package is present, so both are fetched. The same applies to the POS
# tagger ('averaged_perceptron_tagger_eng' vs. 'averaged_perceptron_tagger').
# The legacy names are kept so older NLTK installations keep working.
NLTK_RESOURCES = [
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
]
# `nltk.download` returns False on failure; collect those instead of ignoring them.
failed_downloads = [item for item in NLTK_RESOURCES if not nltk.download(item, quiet=True)]
if failed_downloads:
    print(f"⚠️ Could not download NLTK resources: {', '.join(failed_downloads)}")

# Initialize tools
stop_words = set(stopwords.words('english'))  # set for O(1) membership tests
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

print("✅ Setup complete")

✅ Setup complete


In [None]:
# Core preprocessing functions
def clean_text(text):
    """Lowercase `text`, strip punctuation, and drop stop words.

    Missing values (as judged by ``pd.isna``) yield an empty string. Every
    character that is neither a word character nor whitespace is replaced by a
    space, the result is split with ``word_tokenize``, and a token survives
    only if it is purely alphabetic and is not a stop word. The negations
    'not', 'no' and 'never' are never treated as stop words. Surviving tokens
    are returned joined by single spaces.
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()
    without_punct = re.sub(r'[^\w\s]', ' ', lowered)

    # Negations carry meaning, so they are exempt from stop-word removal.
    negations = {'not', 'no', 'never'}

    kept = []
    for token in word_tokenize(without_punct):
        is_removable_stop = token in stop_words and token not in negations
        if is_removable_stop or not token.isalpha():
            continue
        kept.append(token)

    return ' '.join(kept)

def lemmatize_text(text):
    """Lemmatize each token of `text`, using its POS tag to pick the lemma.

    Missing or empty input yields an empty string. Otherwise the text is
    tokenized with ``word_tokenize``, tagged with ``pos_tag``, and each word is
    passed to the module-level ``lemmatizer`` together with a POS code derived
    from the first letter of its tag. Lemmas are returned joined by spaces.
    """
    if pd.isna(text) or text == "":
        return ""

    # First letter of the tag from `pos_tag` -> POS code handed to the lemmatizer.
    # Any other tag falls back to 'n'.
    pos_by_prefix = {'V': 'v', 'N': 'n', 'R': 'r', 'J': 'a'}

    lemmas = []
    for word, tag in pos_tag(word_tokenize(str(text))):
        pos_code = pos_by_prefix.get(tag[:1], 'n')
        lemmas.append(lemmatizer.lemmatize(word, pos_code))

    return ' '.join(lemmas)

def tokenize_text(text):
    """Split `text` into word tokens; missing or empty input gives an empty list."""
    has_content = not (pd.isna(text) or text == "")
    return word_tokenize(str(text)) if has_content else []

# Visible confirmation that this cell ran and the helper functions above exist.
print("✅ Functions defined")

✅ Functions defined


In [None]:
# Load datasets
print("📂 Loading datasets...")

# Dataset name -> (CSV path, extra keyword arguments for `pd.read_csv`).
# IMDB is limited to its first 1,000 rows; the other two files are read in full.
dataset_sources = {
    'BBC': ("../data/bbc-text.csv", {}),
    'CNN': ("../data/cnn_dailymail.csv", {}),
    'IMDB': ("../data/imdb-dataset.csv", {'nrows': 1000}),
}

datasets = {}
for name, (path, read_kwargs) in dataset_sources.items():
    # Each file is loaded on its own so that one missing or malformed CSV is
    # reported by name and does not prevent the remaining datasets from loading.
    try:
        datasets[name] = pd.read_csv(path, **read_kwargs)
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; ValueError covers pandas
        # parser, empty-data and text-decoding errors.
        print(f"❌ Error loading {name} ({path}): {e}")
        continue
    print(f"✅ {name}: {len(datasets[name]):,} documents")

print(f"\n📊 Total: {sum(len(df) for df in datasets.values()):,} documents")

📂 Loading datasets...
✅ BBC: 2,225 documents
✅ CNN: 5,000 documents
✅ IMDB: 1,000 documents

📊 Total: 8,225 documents


In [None]:
# Run every loaded dataset through clean -> lemmatize -> tokenize
print("🔄 Processing datasets...\n")

processed = {}

for name, df in datasets.items():
    print(f"Processing {name}...")

    # Pick the column holding the raw text: IMDB uses 'review'; the others use
    # 'article' when that column exists and fall back to 'text'.
    if name == 'IMDB':
        text_col = 'review'
    elif 'article' in df.columns:
        text_col = 'article'
    else:
        text_col = 'text'

    # Work on a copy so the frame stored in `datasets` stays untouched.
    df_processed = df.copy()
    df_processed['cleaned'] = df[text_col].progress_apply(clean_text)
    df_processed['lemmatized'] = df_processed['cleaned'].progress_apply(lemmatize_text)
    df_processed['tokens'] = df_processed['lemmatized'].apply(tokenize_text)
    df_processed['token_count'] = df_processed['tokens'].apply(len)

    # Per-dataset statistics: mean document length and number of distinct tokens.
    avg_tokens = df_processed['token_count'].mean()
    vocabulary = {token for doc_tokens in df_processed['tokens'] for token in doc_tokens}
    vocab_size = len(vocabulary)

    print(f"  ✅ Avg tokens: {avg_tokens:.0f}, Vocabulary: {vocab_size:,}\n")

    processed[name] = df_processed

print("✅ All datasets processed")

🔄 Processing datasets...

Processing BBC...


  0%|          | 1/2225 [00:00<00:09, 226.19it/s]


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\PRIYATAM/nltk_data'
    - 'c:\\Users\\PRIYATAM\\AppData\\Local\\Programs\\Python\\Python313\\nltk_data'
    - 'c:\\Users\\PRIYATAM\\AppData\\Local\\Programs\\Python\\Python313\\share\\nltk_data'
    - 'c:\\Users\\PRIYATAM\\AppData\\Local\\Programs\\Python\\Python313\\lib\\nltk_data'
    - 'C:\\Users\\PRIYATAM\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [None]:
# Summary and export
print("📋 PREPROCESSING SUMMARY")
print("=" * 40)

total_docs = 0
total_vocab = set()  # union of every dataset's vocabulary

for name, df in processed.items():
    docs = len(df)
    avg_tokens = df['token_count'].mean()
    vocab = {token for tokens in df['tokens'] for token in tokens}

    total_docs += docs
    total_vocab.update(vocab)

    print(f"\n{name}:")
    print(f"  • Documents: {docs:,}")
    print(f"  • Avg tokens: {avg_tokens:.0f}")
    print(f"  • Vocabulary: {len(vocab):,}")
    print("  • Status: ✅ Preprocessed")

print("\n🎯 TOTALS:")
print(f"  • Total documents: {total_docs:,}")
print(f"  • Total vocabulary: {len(total_vocab):,}")
print("  • Ready for: Week 3 Topic Modeling")

# Quick export (`os` is imported with the other imports at the top of the notebook)
os.makedirs("../data/final", exist_ok=True)

for name, df in processed.items():
    # Keep only essential columns for Week 2 completion.
    # NOTE(review): `tokens` holds Python lists, which CSV stores as their string
    # form - confirm the Week 3 loader parses them back (or reads `lemmatized`).
    export_df = df[['cleaned', 'lemmatized', 'tokens', 'token_count']]
    export_df.to_csv(f"../data/final/{name.lower()}.csv", index=False)


📋 PREPROCESSING SUMMARY

BBC:
  • Documents: 2,225
  • Avg tokens: 220
  • Vocabulary: 23,122
  • Status: ✅ Preprocessed

CNN:
  • Documents: 5,000
  • Avg tokens: 345
  • Vocabulary: 59,444
  • Status: ✅ Preprocessed

IMDB:
  • Documents: 1,000
  • Avg tokens: 124
  • Vocabulary: 14,619
  • Status: ✅ Preprocessed

🎯 TOTALS:
  • Total documents: 8,225
  • Total vocabulary: 69,131
  • Ready for: Week 3 Topic Modeling

CNN:
  • Documents: 5,000
  • Avg tokens: 345
  • Vocabulary: 59,444
  • Status: ✅ Preprocessed

IMDB:
  • Documents: 1,000
  • Avg tokens: 124
  • Vocabulary: 14,619
  • Status: ✅ Preprocessed

🎯 TOTALS:
  • Total documents: 8,225
  • Total vocabulary: 69,131
  • Ready for: Week 3 Topic Modeling
