# NarrativeNexus

**Pipeline:** Text Cleaning → Normalization → Tokenization → Topic Modelling

In [2]:
# Essential imports
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
tqdm.pandas()

# Download NLTK data
for item in ['punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger']:
    nltk.download(item, quiet=True)

# Initialize tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

print("✅ Setup complete")

✅ Setup complete


In [3]:
# Core preprocessing functions
def clean_text(text):
    """Clean text: lowercase, remove special chars, remove stop words"""
    if pd.isna(text):
        return ""
    
    # Clean and tokenize
    text = re.sub(r'[^\w\s]', ' ', str(text).lower())
    tokens = word_tokenize(text)
    
    # Remove stop words (keep negations)
    keep_words = {'not', 'no', 'never'}
    filtered_stops = stop_words - keep_words
    tokens = [word for word in tokens if word not in filtered_stops and word.isalpha()]
    
    return ' '.join(tokens)

def lemmatize_text(text):
    """Lemmatize text with POS tagging"""
    if pd.isna(text) or text == "":
        return ""
    
    tokens = word_tokenize(str(text))
    pos_tags = pos_tag(tokens)
    
    # Convert POS tags for lemmatizer
    def get_pos(tag):
        if tag.startswith('V'): return 'v'
        elif tag.startswith('N'): return 'n'
        elif tag.startswith('R'): return 'r'
        elif tag.startswith('J'): return 'a'
        return 'n'
    
    lemmatized = [lemmatizer.lemmatize(word, get_pos(pos)) for word, pos in pos_tags]
    return ' '.join(lemmatized)

def tokenize_text(text):
    """Simple word tokenization"""
    if pd.isna(text) or text == "":
        return []
    return word_tokenize(str(text))

print("✅ Functions defined")

✅ Functions defined


In [4]:
# Load datasets
print("📂 Loading datasets...")

datasets = {}
try:
    datasets['BBC'] = pd.read_csv("../data/bbc-text.csv")
    datasets['CNN'] = pd.read_csv("../data/cnn_dailymail.csv")
    datasets['IMDB'] = pd.read_csv("../data/imdb-dataset.csv", nrows=1000)
    
    for name, df in datasets.items():
        print(f"✅ {name}: {len(df):,} documents")
        
except Exception as e:
    print(f"❌ Error: {e}")

print(f"\n📊 Total: {sum(len(df) for df in datasets.values()):,} documents")

📂 Loading datasets...
✅ BBC: 2,225 documents
✅ CNN: 5,000 documents
✅ IMDB: 1,000 documents

📊 Total: 8,225 documents
✅ BBC: 2,225 documents
✅ CNN: 5,000 documents
✅ IMDB: 1,000 documents

📊 Total: 8,225 documents


In [5]:
# Process all datasets
print("🔄 Processing datasets...\n")

processed = {}

for name, df in datasets.items():
    print(f"Processing {name}...")
    
    # Get text column
    text_col = 'review' if name == 'IMDB' else ('article' if 'article' in df.columns else 'text')
    
    # Apply pipeline
    df_processed = df.copy()
    df_processed['cleaned'] = df[text_col].progress_apply(clean_text)
    df_processed['lemmatized'] = df_processed['cleaned'].progress_apply(lemmatize_text)
    df_processed['tokens'] = df_processed['lemmatized'].apply(tokenize_text)
    df_processed['token_count'] = df_processed['tokens'].apply(len)
    
    # Stats
    avg_tokens = df_processed['token_count'].mean()
    vocab_size = len(set([token for tokens in df_processed['tokens'] for token in tokens]))
    
    print(f"  ✅ Avg tokens: {avg_tokens:.0f}, Vocabulary: {vocab_size:,}\n")
    
    processed[name] = df_processed

print("✅ All datasets processed")

🔄 Processing datasets...

Processing BBC...


100%|██████████| 2225/2225 [00:03<00:00, 613.70it/s]
100%|██████████| 2225/2225 [00:25<00:00, 87.23it/s] 


  ✅ Avg tokens: 220, Vocabulary: 23,122

Processing CNN...


100%|██████████| 5000/5000 [00:06<00:00, 831.17it/s]
100%|██████████| 5000/5000 [01:16<00:00, 65.05it/s]


  ✅ Avg tokens: 345, Vocabulary: 59,444

Processing IMDB...


100%|██████████| 1000/1000 [00:00<00:00, 1324.64it/s]
100%|██████████| 1000/1000 [00:05<00:00, 174.40it/s]


  ✅ Avg tokens: 124, Vocabulary: 14,619

✅ All datasets processed


In [6]:
# Summary and export
print("📋 PREPROCESSING SUMMARY")
print("=" * 40)

total_docs = 0
total_vocab = set()

for name, df in processed.items():
    docs = len(df)
    avg_tokens = df['token_count'].mean()
    vocab = set([token for tokens in df['tokens'] for token in tokens])
    
    total_docs += docs
    total_vocab.update(vocab)
    
    print(f"\n{name}:")
    print(f"  • Documents: {docs:,}")
    print(f"  • Avg tokens: {avg_tokens:.0f}")
    print(f"  • Vocabulary: {len(vocab):,}")
    print(f"  • Status: ✅ Preprocessed")

print(f"\n🎯 TOTALS:")
print(f"  • Total documents: {total_docs:,}")
print(f"  • Total vocabulary: {len(total_vocab):,}")

# Quick export
import os
os.makedirs("../data/final", exist_ok=True)

for name, df in processed.items():
    # Keep only essential columns for Week 2 completion
    export_df = df[['cleaned', 'lemmatized', 'tokens', 'token_count']].copy()
    export_df.to_csv(f"../data/final/{name.lower()}.csv", index=False)


📋 PREPROCESSING SUMMARY

BBC:
  • Documents: 2,225
  • Avg tokens: 220
  • Vocabulary: 23,122
  • Status: ✅ Preprocessed

CNN:
  • Documents: 5,000
  • Avg tokens: 345
  • Vocabulary: 59,444
  • Status: ✅ Preprocessed

CNN:
  • Documents: 5,000
  • Avg tokens: 345
  • Vocabulary: 59,444
  • Status: ✅ Preprocessed

IMDB:
  • Documents: 1,000
  • Avg tokens: 124
  • Vocabulary: 14,619
  • Status: ✅ Preprocessed

🎯 TOTALS:
  • Total documents: 8,225
  • Total vocabulary: 69,131

IMDB:
  • Documents: 1,000
  • Avg tokens: 124
  • Vocabulary: 14,619
  • Status: ✅ Preprocessed

🎯 TOTALS:
  • Total documents: 8,225
  • Total vocabulary: 69,131


In [7]:
# Topic Modeling Implementation (LDA & NMF)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

print("🔍 Topic Modeling Implementation")
print("=" * 40)

# Prepare data for topic modeling
for name, df in processed.items():
    print(f"\n📊 {name} Dataset:")
    
    # Prepare documents
    documents = df['lemmatized'].fillna('')
    
    # LDA Implementation
    count_vectorizer = CountVectorizer(max_features=500, min_df=2, max_df=0.8, stop_words='english')
    count_matrix = count_vectorizer.fit_transform(documents)
    
    lda = LatentDirichletAllocation(n_components=3, random_state=42)
    lda.fit(count_matrix)
    
    print("  LDA Topics:")
    feature_names = count_vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(lda.components_):
        top_words = [feature_names[i] for i in topic.argsort()[-5:][::-1]]
        print(f"    Topic {topic_idx + 1}: {', '.join(top_words)}")
    
    # NMF Implementation  
    tfidf_vectorizer = TfidfVectorizer(max_features=500, min_df=2, max_df=0.8, stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    
    nmf = NMF(n_components=3, random_state=42)
    nmf.fit(tfidf_matrix)
    
    print("  NMF Topics:")
    feature_names = tfidf_vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(nmf.components_):
        top_words = [feature_names[i] for i in topic.argsort()[-5:][::-1]]
        print(f"    Topic {topic_idx + 1}: {', '.join(top_words)}")

print("\n✅ Topic modeling complete!")

🔍 Topic Modeling Implementation

📊 BBC Dataset:
  LDA Topics:
    Topic 1: mr, government, year, labour, party
    Topic 2: game, use, people, make, player
    Topic 3: year, film, company, best, market
  NMF Topics:
    Topic 1: mr, government, labour, election, people
    Topic 2: game, play, win, player, england
    Topic 3: film, award, best, star, actor

📊 CNN Dataset:
  LDA Topics:
    Topic 1: people, year, like, make, time
    Topic 2: obama, new, president, year, make
    Topic 3: state, government, report, official, people
  NMF Topics:
    Topic 1: like, year, think, make, time
    Topic 2: police, report, kill, official, government
    Topic 3: obama, president, clinton, bush, mccain

📊 IMDB Dataset:
  LDA Topics:
    Topic 1: movie, film, like, great, make
    Topic 2: movie, br, bad, like, make
    Topic 3: br, film, make, character, like
  NMF Topics:
    Topic 1: br, character, know, really, episode
    Topic 2: movie, bad, watch, like, good
    Topic 3: film, make, cha