In [None]:
"""
# LUMIN.AI: Text Preprocessing Pipeline for Democratic Governance Analysis

This notebook demonstrates the complete text preprocessing pipeline for democratic governance text analysis,
showcasing the functionality of our custom preprocessing modules.
"""

# Standard library imports
import sys
import os
import json
import time
import random
import re
from pathlib import Path

# Add parent directory to path to import our modules
sys.path.append(os.path.abspath(".."))

# Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Set random seeds for reproducibility
SEED = 42  # single shared seed so every stochastic step below is repeatable
random.seed(SEED)
np.random.seed(SEED)

# Configure visualization settings.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*",
# so pick whichever name this installation actually ships instead of failing
# on older Matplotlib versions.
PLOT_STYLE = (
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
plt.style.use(PLOT_STYLE)
sns.set_palette("deep")
plt.rcParams["figure.figsize"] = (12, 8)
plt.rcParams["font.size"] = 12

In [None]:
"""
## 1. Import Custom Preprocessing Modules

First, we'll import our custom preprocessing modules designed specifically for governance text analysis.
"""

# Import our custom modules
try:
    from src.preprocessing import (
        text_cleaning,
        tokenization,
        feature_extraction,
        data_augmentation,
    )
    from src.utils import config
except ImportError as e:
    print(f"❌ Error importing modules: {e}")
    print(
        "Make sure you're running this notebook from the deep-learning directory or have added it to PYTHONPATH"
    )
    # Re-raise: every later cell needs these modules, so stopping here with the
    # real ImportError is clearer than the NameError the inventory below would
    # otherwise produce.
    raise
else:
    print("✅ Successfully imported custom preprocessing modules")

# Check if all required modules are available
required_modules = {
    "text_cleaning": text_cleaning,
    "tokenization": tokenization,
    "feature_extraction": feature_extraction,
    "data_augmentation": data_augmentation,
    "config": config,
}

# Report how many public callables each module exposes.  Note that this counts
# every public callable attribute (classes included), not only functions.
for name, module in required_modules.items():
    functions = [
        func
        for func in dir(module)
        if callable(getattr(module, func)) and not func.startswith("_")
    ]
    print(f"{name}: {len(functions)} functions available")

In [None]:
"""
## 2. Load Sample Governance Text Data

For this demonstration, we'll use a sample dataset of governance-related text.
In a real-world scenario, we would use the Austria Democracy Radar dataset.
"""

# Sample governance text data (placeholder for actual dataset).
# Each record keeps a text next to its demonstration label so the two can
# never drift out of alignment.
sample_records = [
    (
        "The democratic institutions must be strengthened to ensure proper representation of all citizens.",
        "institutional_reform",
    ),
    (
        "Government policies should focus on transparency and accountability to build public trust.",
        "governance_quality",
    ),
    (
        "The election commission has reported significant improvement in voter participation this year.",
        "electoral_process",
    ),
    (
        "Civil society organizations play a crucial role in democratic oversight and citizen engagement.",
        "civil_society",
    ),
    (
        "Parliamentary debates highlighted concerns about the separation of powers in recent legislation.",
        "legislative_process",
    ),
    (
        "Freedom of the press is essential for maintaining democratic norms and government accountability.",
        "media_freedom",
    ),
    (
        "Voter registration processes need to be simplified to increase electoral participation.",
        "electoral_process",
    ),
    (
        "Constitutional courts provide important checks on executive power in modern democracies.",
        "judicial_independence",
    ),
    (
        "Public policy research indicates that democratic backsliding is occurring in several regions.",
        "democratic_backsliding",
    ),
    (
        "Political parties must engage in constructive dialogue to overcome polarization in society.",
        "political_discourse",
    ),
]

# Split the records back into the parallel lists used by the rest of the notebook
sample_texts = [record_text for record_text, _ in sample_records]
sample_labels = [record_label for _, record_label in sample_records]

# Convert to a DataFrame for easier manipulation
df = pd.DataFrame({"text": sample_texts, "label": sample_labels})

# Display the dataset
print(f"Sample dataset shape: {df.shape}")
df.head()

In [None]:
"""
## 3. Text Cleaning

Apply our custom text cleaning functions to the dataset.
"""

# Apply text cleaning with default settings
cleaned_texts = text_cleaning.clean_text_batch(df["text"].tolist())

# Create a comparison DataFrame
comparison_df = pd.DataFrame({"original": df["text"], "cleaned": cleaned_texts})

# Display the comparison.  Jupyter only renders a bare expression when it is
# the LAST statement of a cell; this one is followed by more code, so it has
# to be shown explicitly (``display`` is provided by the IPython kernel).
display(comparison_df.head(3))

# Examine how cleaning affects text statistics


def text_stats(text_list):
    """Calculate basic text statistics for a collection of strings.

    Args:
        text_list: Sized collection (e.g. list) of strings.

    Returns:
        Dict with three keys:
            ``avg_length`` - mean number of characters per text,
            ``avg_words``  - mean number of whitespace-separated words per text,
            ``total_chars`` - total number of characters over all texts.
        For an empty collection all three values are zero instead of raising
        ``ZeroDivisionError``.
    """
    n_texts = len(text_list)
    if n_texts == 0:
        # Nothing to average over; report zeros rather than dividing by zero.
        return {"avg_length": 0.0, "avg_words": 0.0, "total_chars": 0}

    # Compute each total once and reuse it for both the mean and the total.
    total_chars = sum(len(text) for text in text_list)
    total_words = sum(len(text.split()) for text in text_list)
    return {
        "avg_length": total_chars / n_texts,
        "avg_words": total_words / n_texts,
        "total_chars": total_chars,
    }


# Compute the same statistics for the raw and the cleaned corpus
orig_stats, clean_stats = (
    text_stats(corpus) for corpus in (df["text"].tolist(), cleaned_texts)
)

# One column per corpus, one row per statistic
stats_df = pd.DataFrame(dict(Original=orig_stats, Cleaned=clean_stats))

print("Text statistics before and after cleaning:")
# Transposed so each corpus becomes a row; rendered as the cell's output
stats_df.T

In [None]:
"""
## 4. Tokenization

Apply our custom tokenization functions, which are specifically designed to handle governance text.
"""

# Work on a small preview so the comparison table stays readable
preview_texts = cleaned_texts[:3]

# Apply standard tokenization
standard_tokens = [tokenization.tokenize_text(text) for text in preview_texts]

# Apply governance-specific tokenization with phrase preservation
governance_tokens = [
    tokenization.custom_governance_tokenizer(text) for text in preview_texts
]

# Build the side-by-side comparison
tokenization_df = pd.DataFrame(
    {
        "Cleaned Text": preview_texts,
        "Standard Tokens": [", ".join(tokens) for tokens in standard_tokens],
        "Governance Tokens": [", ".join(tokens) for tokens in governance_tokens],
    }
)

# Jupyter only renders a bare expression when it is the LAST statement of a
# cell; this table is followed by more code, so show it explicitly
# (``display`` is provided by the IPython kernel).
display(tokenization_df)

# Show preserved governance phrases
print("\nPreserved governance phrases (sample):")
for phrase in tokenization.GOVERNANCE_PHRASES[:10]:
    print(f"- {phrase}")

# Compare token counts
std_token_counts = [len(tokens) for tokens in standard_tokens]
gov_token_counts = [len(tokens) for tokens in governance_tokens]

token_counts_df = pd.DataFrame(
    {
        "Standard Tokenization": std_token_counts,
        "Governance Tokenization": gov_token_counts,
    }
)

print("\nToken counts comparison:")
token_counts_df

In [None]:
"""
## 5. Feature Extraction

Transform the tokenized text into numerical features using different approaches:
1. Bag of Words (BoW)
2. TF-IDF
3. Word embeddings (Word2Vec)
"""

# Prepare all tokenized texts for feature extraction
all_governance_tokens = [
    tokenization.custom_governance_tokenizer(text) for text in cleaned_texts
]

# Create Bag of Words features
bow_features, bow_vectorizer = feature_extraction.create_bow_features(
    cleaned_texts, **config.DEFAULT_FEATURE_EXTRACTION_CONFIG.get("bow_params", {})
)

print(f"Bag of Words features shape: {bow_features.shape}")
print(f"Number of unique terms: {len(bow_vectorizer.get_feature_names_out())}")
print("\nSample terms:")
print(bow_vectorizer.get_feature_names_out()[:10])

# Create TF-IDF features
tfidf_features, tfidf_vectorizer = feature_extraction.create_tfidf_features(
    cleaned_texts, **config.DEFAULT_FEATURE_EXTRACTION_CONFIG.get("tfidf_params", {})
)

print(f"\nTF-IDF features shape: {tfidf_features.shape}")

# Train a Word2Vec model on our tokenized documents
w2v_model = feature_extraction.create_word2vec_embeddings(
    all_governance_tokens,
    **config.DEFAULT_FEATURE_EXTRACTION_CONFIG.get("word2vec_params", {}),
)

print(f"\nWord2Vec model vocabulary size: {len(w2v_model.wv.index_to_key)}")

# Generate document embeddings by averaging word vectors
word_vectors = {word: w2v_model.wv[word] for word in w2v_model.wv.index_to_key}
doc_embeddings = feature_extraction.create_document_embeddings(
    all_governance_tokens, word_vectors
)

print(f"Document embeddings shape: {doc_embeddings.shape}")

# Visualize the most similar words to a governance term
governance_term = "democracy"
if governance_term in w2v_model.wv:
    similar_words = w2v_model.wv.most_similar(governance_term, topn=10)
    similar_df = pd.DataFrame(similar_words, columns=["word", "similarity"])
    print(f"\nWords most similar to '{governance_term}':")
    print(similar_df)
else:
    print(
        f"\nThe term '{governance_term}' is not in the vocabulary (try with a larger dataset)"
    )

# If the model has enough vocabulary, let's visualize word embeddings.
# This is a best-effort illustration: any failure is reported, not raised.
try:
    from sklearn.manifold import TSNE

    # Get embeddings for the first 30 vocabulary entries
    words = w2v_model.wv.index_to_key[:30]
    if len(words) < 3:
        raise ValueError(
            f"vocabulary has only {len(words)} word(s); need at least 3 for t-SNE"
        )
    embeddings = np.array([w2v_model.wv[word] for word in words])

    # Apply t-SNE to reduce to 2 dimensions.  t-SNE requires
    # perplexity < n_samples; the library default of 30 can never satisfy that
    # with at most 30 words, so cap it just below the sample count.
    tsne = TSNE(
        n_components=2,
        random_state=42,
        perplexity=min(30, len(words) - 1),
    )
    embeddings_2d = tsne.fit_transform(embeddings)

    # Plot
    plt.figure(figsize=(12, 10))
    plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c="steelblue", alpha=0.7)

    # Label points
    for i, word in enumerate(words):
        plt.annotate(
            word, (embeddings_2d[i, 0], embeddings_2d[i, 1]), fontsize=12, alpha=0.8
        )

    plt.title("Word Embeddings Visualization (t-SNE)", fontsize=15)
    plt.xlabel("t-SNE Dimension 1", fontsize=12)
    plt.ylabel("t-SNE Dimension 2", fontsize=12)
    plt.grid(True, linestyle="--", alpha=0.5)
    plt.show()
except Exception as e:
    print(f"Could not visualize word embeddings: {e}")
    print("Try with a larger dataset to build a more robust vocabulary.")

In [None]:
"""
## 6. Named Entity Recognition for Governance Text

Extract political entities using our specialized NER function.
"""

# We'll use spaCy's NER capabilities.  spaCy and its English model are
# optional here: if either is missing the cell prints an install hint instead
# of failing.
try:
    # Imported inside the try so a missing spaCy installation is reported
    # with a hint rather than aborting the notebook.
    import spacy

    # Try to load the spaCy model
    nlp = spacy.load("en_core_web_sm")

    # Only the first three texts are analysed to keep the output short
    entity_texts = df["text"].tolist()[:3]

    # Extract political entities from our texts
    political_entities = feature_extraction.extract_political_entities(
        entity_texts, nlp
    )

    # Display entities found
    for i, (text, entities) in enumerate(zip(entity_texts, political_entities)):
        print(f"\nText {i+1}: {text}")
        print("Entities found:")
        for entity in entities:
            print(f"- {entity['text']} ({entity['type']})")

    # Count entities by type
    entity_types = {}
    for doc_entities in political_entities:
        for entity in doc_entities:
            entity_type = entity["type"]
            entity_types[entity_type] = entity_types.get(entity_type, 0) + 1

    # Plot entity distribution
    if entity_types:
        plt.figure(figsize=(10, 6))
        plt.bar(entity_types.keys(), entity_types.values(), color="steelblue")
        plt.title("Political Entity Types in Sample Texts")
        plt.xlabel("Entity Type")
        plt.ylabel("Count")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
    else:
        print("No political entities found in the sample texts.")

except ImportError as e:
    # spaCy itself (or one of its dependencies) could not be imported
    print(f"spaCy could not be imported: {e}. Install with: pip install spacy")
except OSError:
    # spacy.load signals a missing model package with OSError, not ImportError
    print(
        "SpaCy model 'en_core_web_sm' not available. Install with: python -m spacy download en_core_web_sm"
    )
except Exception as e:
    print(f"Error during entity extraction: {e}")

In [None]:
"""
## 7. Data Augmentation for Governance Text

Apply our specialized data augmentation techniques for governance text.
"""

# Use the first sample text as the running augmentation example
sample_text = df["text"].iloc[0]
print(f"Original text: {sample_text}\n")

print("Data augmentation examples:\n")

# Token-level techniques 1-4: (printed label, function name, keyword args).
# They are applied in this fixed order, each on a freshly split token list.
token_level_techniques = (
    ("Synonym replacement", "synonym_replacement", {"n": 2}),
    ("Random swap", "random_swap", {"n": 2}),
    ("Random deletion", "random_deletion", {"p": 0.1}),
    ("Random insertion", "random_insertion", {"n": 2}),
)
for technique_label, technique_name, technique_kwargs in token_level_techniques:
    augment = getattr(data_augmentation, technique_name)
    augmented = augment(sample_text.split(), **technique_kwargs)
    print(f"{technique_label}: {' '.join(augmented)}\n")

# Technique 5: governance-specific augmentation works on the raw string
augmented = data_augmentation.governance_specific_augmentation(sample_text)
print(f"Governance-specific: {augmented}\n")

# Several augmented variants of the same text in one call
augmented_examples = data_augmentation.create_augmented_examples(
    sample_text, n_examples=3
)
print("Multiple augmentation examples:")
for example_number, example in enumerate(augmented_examples, start=1):
    print(f"{example_number}: {example}")

# Demonstrate dataset balancing with augmentation.
# The sample labels are used as they are; no extra imbalance is constructed.
from collections import Counter

label_counts = Counter(df["label"])
print("\nLabel distribution before balancing:")
print(label_counts)

# Balance the dataset
balanced_texts, balanced_labels = data_augmentation.balance_dataset(
    df["text"].tolist(), df["label"].tolist()
)

balanced_counts = Counter(balanced_labels)
print("\nLabel distribution after balancing:")
print(balanced_counts)

# Keep the balanced data in a DataFrame as well
balanced_df = pd.DataFrame({"text": balanced_texts, "label": balanced_labels})

# Side-by-side bar charts of the label distribution before and after balancing
plt.figure(figsize=(12, 6))

distribution_panels = (
    ("Label Distribution Before Balancing", label_counts),
    ("Label Distribution After Balancing", balanced_counts),
)
for panel_position, (panel_title, panel_counts) in enumerate(
    distribution_panels, start=1
):
    plt.subplot(1, 2, panel_position)
    plt.bar(panel_counts.keys(), panel_counts.values(), color="steelblue")
    plt.title(panel_title)
    plt.xlabel("Label")
    plt.ylabel("Count")
    plt.xticks(rotation=90)

plt.tight_layout()
plt.show()

In [None]:
"""
## 8. Complete Preprocessing Pipeline

Put everything together into a comprehensive preprocessing pipeline for governance text.
"""


def preprocess_governance_text(texts, labels=None, mode="train", tfidf_vectorizer=None):
    """
    Complete preprocessing pipeline for governance text analysis.

    Runs text cleaning, governance-specific tokenization and TF-IDF feature
    extraction in every mode.  In 'train' mode it additionally trains Word2Vec,
    builds document embeddings and (when labels are given) balances the
    dataset.  Political entity extraction is attempted in every mode and is
    skipped with a message if it fails for any reason.

    Args:
        texts: List of raw texts
        labels: Optional list of labels (only used for balancing in 'train' mode)
        mode: 'train' or 'predict' mode; any value other than 'train' skips
            the training-only steps
        tfidf_vectorizer: Optional, already fitted vectorizer exposing
            ``transform``.  When given, the texts are only transformed with
            it, so the feature columns match the ones seen at training time.
            When None (default) a new vectorizer is fitted on ``texts``.

    Returns:
        Dictionary with preprocessed features and metadata.  Always contains
        'pipeline_steps', 'cleaned_texts', 'tokenized_texts', 'tfidf_features'
        and 'tfidf_vectorizer'; 'word2vec_model', 'doc_embeddings',
        'political_entities', 'augmented_texts' and 'augmented_labels' are
        present only when the corresponding step ran.
    """
    print(f"Starting preprocessing pipeline in {mode} mode...")
    results = {"pipeline_steps": []}

    # Step 1: Text cleaning
    print("Step 1: Text cleaning...")
    cleaned_texts = text_cleaning.clean_text_batch(texts)
    results["cleaned_texts"] = cleaned_texts
    results["pipeline_steps"].append("text_cleaning")

    # Step 2: Tokenization
    print("Step 2: Governance-specific tokenization...")
    tokenized_texts = [
        tokenization.custom_governance_tokenizer(text) for text in cleaned_texts
    ]
    results["tokenized_texts"] = tokenized_texts
    results["pipeline_steps"].append("tokenization")

    # Step 3: Feature extraction
    print("Step 3: Feature extraction...")

    # 3.1: TF-IDF features
    if tfidf_vectorizer is None:
        # No fitted vectorizer supplied: fit a new one on these texts.
        tfidf_features, tfidf_vectorizer = feature_extraction.create_tfidf_features(
            cleaned_texts,
            **config.DEFAULT_FEATURE_EXTRACTION_CONFIG.get("tfidf_params", {}),
        )
    else:
        # Reuse the caller's fitted vectorizer so the output columns line up
        # with the features a downstream model was trained on.
        tfidf_features = tfidf_vectorizer.transform(cleaned_texts)
    results["tfidf_features"] = tfidf_features
    results["tfidf_vectorizer"] = tfidf_vectorizer

    # 3.2: Word embeddings (if in training mode)
    if mode == "train":
        w2v_model = feature_extraction.create_word2vec_embeddings(
            tokenized_texts,
            **config.DEFAULT_FEATURE_EXTRACTION_CONFIG.get("word2vec_params", {}),
        )
        results["word2vec_model"] = w2v_model

        # Create document embeddings
        word_vectors = {word: w2v_model.wv[word] for word in w2v_model.wv.index_to_key}
        doc_embeddings = feature_extraction.create_document_embeddings(
            tokenized_texts, word_vectors
        )
        results["doc_embeddings"] = doc_embeddings

    results["pipeline_steps"].append("feature_extraction")

    # Step 4: Named entity recognition (optional, best-effort: spaCy or its
    # model may be missing, in which case the step is skipped)
    try:
        print("Step 4: Political entity extraction...")
        import spacy

        nlp = spacy.load("en_core_web_sm")
        # Entities are extracted from the RAW texts, not the cleaned ones
        political_entities = feature_extraction.extract_political_entities(texts, nlp)
        results["political_entities"] = political_entities
        results["pipeline_steps"].append("entity_extraction")
    except Exception as e:
        print(f"Skipping entity extraction: {e}")

    # Step 5: Data balancing and augmentation (only in training mode)
    if mode == "train" and labels is not None:
        print("Step 5: Dataset balancing and augmentation...")
        try:
            # Balancing works on the raw texts; the features computed above
            # are NOT recomputed for the augmented texts.
            augmented_texts, augmented_labels = data_augmentation.balance_dataset(
                texts, labels
            )
            results["augmented_texts"] = augmented_texts
            results["augmented_labels"] = augmented_labels
            results["pipeline_steps"].append("data_augmentation")
        except Exception as e:
            print(f"Skipping data augmentation: {e}")

    print("Preprocessing pipeline completed successfully.")
    return results


# Run the complete pipeline on our sample dataset
pipeline_results = preprocess_governance_text(
    df["text"].tolist(), df["label"].tolist(), mode="train"
)

# Show available results: step names verbatim, lengths for lists, shapes for
# array-like values and the type for everything else
print("\nPipeline results summary:")
for key, value in pipeline_results.items():
    if key == "pipeline_steps":
        print(f"{key}: {value}")
    elif isinstance(value, list):
        print(f"{key}: List with {len(value)} items")
    elif hasattr(value, "shape"):
        print(f"{key}: Array/Matrix with shape {value.shape}")
    else:
        print(f"{key}: {type(value)}")

# Use the preprocessed features for model training (placeholder)
print("\nThe preprocessed features can now be used for model training.")
print("For example, using TF-IDF features:")
print(
    f"X = pipeline_results['tfidf_features']  # Shape: {pipeline_results['tfidf_features'].shape}"
)
# The TF-IDF matrix is built from the ORIGINAL texts, so it must be paired
# with the original labels.  'augmented_labels' belongs to 'augmented_texts',
# which may contain more rows and has not been vectorized.
print("y = df['label']  # one label per row of X")
print(
    "# To train on the balanced data, clean and vectorize pipeline_results['augmented_texts'] "
    "and pair the result with pipeline_results['augmented_labels']"
)

In [None]:
"""
## 9. Saving and Loading Pipeline Components

Demonstrate how to save and load preprocessing components for later use.
This is essential for maintaining consistency between training and inference.
"""

import pickle
import os
from pathlib import Path

# Output locations for the saved pipeline components (relative to the notebook)
models_dir = Path("..") / "models"
preprocessing_dir = models_dir / "preprocessing"

# Create the parent first, then the child; existing directories are left alone
for target_dir in (models_dir, preprocessing_dir):
    target_dir.mkdir(exist_ok=True)

# Save components


def save_pipeline_components(results, output_dir):
    """Save preprocessing pipeline components.

    Args:
        results: Dictionary as returned by the preprocessing pipeline.  Must
            contain 'pipeline_steps'; 'tfidf_vectorizer' and 'word2vec_model'
            are saved when present.
        output_dir: Target directory (str or Path).  Created, including any
            missing parent directories, if it does not exist yet.

    Writes (inside ``output_dir``):
        tfidf_vectorizer.pkl  - pickled vectorizer, if present in ``results``
        word2vec_model.w2v    - model saved through its own ``save`` method,
                                if present in ``results``
        pipeline_config.json  - step list plus flags telling the loader which
                                of the two files above to expect

    Raises:
        KeyError: if ``results`` has no 'pipeline_steps' entry.
    """
    output_dir = Path(output_dir)
    # parents=True so a nested target such as "models/preprocessing" works
    # even when the intermediate directory has not been created yet.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save vectorizers
    if "tfidf_vectorizer" in results:
        with open(output_dir / "tfidf_vectorizer.pkl", "wb") as f:
            pickle.dump(results["tfidf_vectorizer"], f)

    # Save Word2Vec model
    if "word2vec_model" in results:
        results["word2vec_model"].save(str(output_dir / "word2vec_model.w2v"))

    # Save preprocessing configuration
    pipeline_config = {
        "steps": results["pipeline_steps"],
        "has_tfidf": "tfidf_vectorizer" in results,
        "has_word2vec": "word2vec_model" in results,
    }

    # Explicit encoding keeps the file identical across platforms
    with open(output_dir / "pipeline_config.json", "w", encoding="utf-8") as f:
        json.dump(pipeline_config, f, indent=2)

    print(f"Pipeline components saved to {output_dir}")


# Persist the components of the pipeline run into the preprocessing directory
save_pipeline_components(results=pipeline_results, output_dir=preprocessing_dir)

# Function for loading pipeline components


def load_pipeline_components(input_dir):
    """Load preprocessing pipeline components.

    Args:
        input_dir: Directory (str or Path) containing 'pipeline_config.json'
            and the component files it refers to.

    Returns:
        Dictionary with 'pipeline_steps' plus 'tfidf_vectorizer' and/or
        'word2vec_model' for every component that the configuration announces
        and that could be loaded.  An empty dict is returned when the
        configuration file is missing or is not valid JSON.
    """
    input_dir = Path(input_dir)
    results = {}

    # Load pipeline configuration
    try:
        with open(input_dir / "pipeline_config.json", "r", encoding="utf-8") as f:
            pipeline_config = json.load(f)
    except FileNotFoundError:
        print("Pipeline configuration not found.")
        return {}
    except json.JSONDecodeError as e:
        # A truncated or corrupted file is treated like a missing one
        print(f"Pipeline configuration is not valid JSON: {e}")
        return {}

    # Tolerate a configuration without a step list instead of raising KeyError
    results["pipeline_steps"] = pipeline_config.get("steps", [])

    # Load TF-IDF vectorizer if available
    if pipeline_config.get("has_tfidf", False):
        try:
            # SECURITY: pickle.load executes arbitrary code embedded in the
            # file.  Only load component directories you created yourself or
            # otherwise trust.
            with open(input_dir / "tfidf_vectorizer.pkl", "rb") as f:
                results["tfidf_vectorizer"] = pickle.load(f)
        except FileNotFoundError:
            print("TF-IDF vectorizer not found.")

    # Load Word2Vec model if available
    if pipeline_config.get("has_word2vec", False):
        try:
            # Imported lazily so gensim is only required when a Word2Vec
            # model was actually saved.
            from gensim.models import Word2Vec

            results["word2vec_model"] = Word2Vec.load(
                str(input_dir / "word2vec_model.w2v")
            )
        except FileNotFoundError:
            print("Word2Vec model not found.")
        except Exception as e:
            print(f"Error loading Word2Vec model: {e}")

    print(f"Pipeline components loaded from {input_dir}")
    return results


# Read the saved components back and list which ones were restored
loaded_components = load_pipeline_components(input_dir=preprocessing_dir)
print("\nLoaded components:")
for component_name in loaded_components:
    print(f"- {component_name}")

# Demonstrate how to use loaded components for inference


def preprocess_for_inference(text, loaded_components):
    """Preprocess one new text with previously saved pipeline components.

    Args:
        text: Raw input string.
        loaded_components: Mapping of loaded components; the keys
            'tfidf_vectorizer' and 'word2vec_model' are used when present.

    Returns:
        None (after printing a notice) when ``loaded_components`` is empty.
        Otherwise a dict with 'cleaned_text', 'tokens' and 'features', where
        'features' may hold 'tfidf' and/or 'word2vec' entries.
    """
    # Guard clause: nothing can be computed without components
    if not loaded_components:
        print("No pipeline components loaded.")
        return None

    # Clean, then tokenize the cleaned text
    cleaned = text_cleaning.clean_governance_text(text)
    token_list = tokenization.custom_governance_tokenizer(cleaned)

    extracted = {}

    # TF-IDF: transform with the saved vectorizer (single-document batch)
    if "tfidf_vectorizer" in loaded_components:
        vectorizer = loaded_components["tfidf_vectorizer"]
        extracted["tfidf"] = vectorizer.transform([cleaned])

    # Word2Vec: average the word vectors into one document embedding
    if "word2vec_model" in loaded_components:
        keyed_vectors = loaded_components["word2vec_model"].wv

        vocab_vectors = {}
        for term in keyed_vectors.index_to_key:
            vocab_vectors[term] = keyed_vectors[term]

        averaged = feature_extraction.document_embedding_average(
            token_list, vocab_vectors
        )

        # Only keep a non-empty embedding
        if len(averaged) > 0:
            extracted["word2vec"] = averaged

    return dict(cleaned_text=cleaned, tokens=token_list, features=extracted)


# Example new text for inference
new_text = "Electoral reform is essential for strengthening democratic institutions."
processed = preprocess_for_inference(new_text, loaded_components)

if processed is None:
    # preprocess_for_inference returns None when no components were loaded;
    # indexing into it would raise TypeError.
    print("\nSkipping inference demo: no pipeline components are available.")
else:
    print("\nPreprocessed new text for inference:")
    print(f"Original: {new_text}")
    print(f"Cleaned: {processed['cleaned_text']}")
    print(f"Tokens: {processed['tokens']}")
    print("Features extracted:")
    for feat_type, feat in processed["features"].items():
        # Matrices/arrays report their shape, anything else its length
        if hasattr(feat, "shape"):
            print(f"- {feat_type}: Shape {feat.shape}")
        else:
            print(f"- {feat_type}: Length {len(feat)}")

In [None]:
"""
## 10. Conclusion and Next Steps

This notebook has demonstrated the complete preprocessing pipeline for governance text analysis,
which will serve as a foundation for our sentiment analysis model and other NLP tasks.

### Summary of what we've covered:
1. Text cleaning specialized for governance text
2. Domain-specific tokenization preserving governance phrases
3. Feature extraction using traditional approaches (BoW, TF-IDF) and word embeddings
4. Political entity recognition for governance text
5. Data augmentation techniques specific to the governance domain
6. Dataset balancing for handling imbalanced class distributions
7. Complete preprocessing pipeline implementation
8. Saving and loading pipeline components for inference

### Next steps:
1. Connect this preprocessing pipeline to the sentiment analysis model
2. Apply these preprocessing techniques to the full Austria Democracy Radar dataset
3. Extend the preprocessing pipeline with language detection and filtering
4. Refine governance-specific text augmentation techniques
5. Integrate the preprocessing pipeline into an end-to-end API service
"""

# Final sanity check: report how many public callables each module exposes
print("Preprocessing modules ready for integration into sentiment analysis pipeline.")

checked_modules = (
    ("Text cleaning", text_cleaning),
    ("Tokenization", tokenization),
    ("Feature extraction", feature_extraction),
    ("Data augmentation", data_augmentation),
)
for module_title, checked_module in checked_modules:
    # Public = name without a leading underscore; callable = functions and classes
    public_callables = [
        attr
        for attr in dir(checked_module)
        if callable(getattr(checked_module, attr)) and not attr.startswith("_")
    ]
    print(f"{module_title} functions: {len(public_callables)}")

print("\nEnd of preprocessing pipeline demonstration.")