# LUMIN.AI: Sentiment Analysis MVP

This notebook implements a minimum viable product for sentiment analysis in the LUMIN.AI project, specifically focused on analyzing governance-related texts. According to our roadmap, this is a key deliverable for Week 2, where we build a basic sentiment classification model.

## Objectives
- Implement a basic sentiment analysis pipeline for governance texts
- Compare traditional ML approaches with neural network methods
- Evaluate model performance on governance-specific language
- Create a foundation for the production-ready API service

## Success Criteria
- Model achieves at least 70% accuracy on governance text sentiment classification
- Clear documentation of the preprocessing and model training process
- Reproducible results for future development

In [None]:
# Import libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import warnings
import pickle
from tqdm import tqdm

# For text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import spacy

# For machine learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline

# For neural networks
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import pipeline

# Set plotting style
sns.set_theme(style="whitegrid", palette="viridis")
plt.rcParams["figure.figsize"] = (12, 8)
warnings.filterwarnings("ignore")

# Download necessary NLTK resources.
# Every resource is checked on its own so that only the missing ones are
# fetched. "punkt_tab" is requested in addition to "punkt" because newer NLTK
# releases read the tokenizer tables from it when word_tokenize() is called.
NLTK_RESOURCES = {
    "punkt": "tokenizers/punkt",
    "punkt_tab": "tokenizers/punkt_tab",
    "stopwords": "corpora/stopwords",
    "wordnet": "corpora/wordnet",
}
for resource_name, resource_path in NLTK_RESOURCES.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(resource_name)

# Load spaCy model
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import subprocess
    import sys

    print("Downloading spaCy model...")
    # Run the download with the interpreter that executes this notebook, so
    # the model is installed into the same environment that spacy.load() uses
    # (a bare "python" on PATH may point at a different installation).
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

print("Environment setup complete!")

In [None]:
# Locations of the input data and of the directory that receives trained models
SAMPLE_DATA_PATH = "../../data/examples/sample_data.csv"
FULL_DATA_PATH = "../../data/raw/democracy-radar/"
MODEL_SAVE_PATH = "../models/"

# Make sure the model output directory is available
os.makedirs(MODEL_SAVE_PATH, exist_ok=True)

# Use the sample CSV when it is present; otherwise build a small
# hand-written data set so the rest of the notebook can still run
if not os.path.exists(SAMPLE_DATA_PATH):
    print(f"Sample data not found at {SAMPLE_DATA_PATH}")
    print("Creating synthetic data for development")
    # One (text, sentiment, category, confidence) record per synthetic row
    synthetic_rows = [
        ("This governance proposal is excellent and transparent.", "positive", "transparency", 0.92),
        ("I don't trust this process at all.", "negative", "trust", 0.87),
        ("The proposal contains both good and bad elements.", "neutral", "evaluation", 0.76),
        ("Very clear communication from the team.", "positive", "communication", 0.95),
        ("Too many decisions made behind closed doors.", "negative", "transparency", 0.89),
        ("The voting process was fair and accessible.", "positive", "participation", 0.91),
        ("The documentation is insufficient.", "negative", "documentation", 0.82),
        ("Waiting to see how implementation goes before judging.", "neutral", "implementation", 0.79),
        ("Great to see community feedback incorporated.", "positive", "feedback", 0.93),
        ("Timeline is unrealistic and poorly planned.", "negative", "planning", 0.88),
    ]
    # Transpose the records back into one list per column
    governance_texts, sentiments, categories, confidence = (
        list(column) for column in zip(*synthetic_rows)
    )

    data = pd.DataFrame(
        {
            "id": range(1, len(governance_texts) + 1),
            "text": governance_texts,
            "sentiment": sentiments,
            "category": categories,
            "confidence": confidence,
        }
    )
else:
    print(f"Loading sample data from {SAMPLE_DATA_PATH}")
    data = pd.read_csv(SAMPLE_DATA_PATH)
    print(f"Sample data loaded successfully with {len(data)} rows")

# Preview the top of the table
print("\nFirst 5 rows of the data:")
display(data.head())

# Show how often each sentiment label and each category occurs
for column_title, column_name in (("Sentiment", "sentiment"), ("Category", "category")):
    print(f"\n{column_title} distribution:")
    display(data[column_name].value_counts())

In [None]:
# Define preprocessing function
# The patterns are compiled once because preprocess_text runs on every row.
_NON_LETTER_PATTERN = re.compile(r"[^a-zA-Z\s]")
_WHITESPACE_PATTERN = re.compile(r"\s+")

# Filled on first use so that defining the function never touches NLTK data.
_PREPROCESSING_RESOURCES = {}


def _get_preprocessing_resources():
    """Return the cached (stop word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESSING_RESOURCES:
        _PREPROCESSING_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
        _PREPROCESSING_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return (
        _PREPROCESSING_RESOURCES["stop_words"],
        _PREPROCESSING_RESOURCES["lemmatizer"],
    )


def preprocess_text(text):
    """
    Preprocess text for sentiment analysis:
    1. Convert to lowercase
    2. Remove special characters and numbers
    3. Remove extra whitespace
    4. Tokenize
    5. Remove stopwords
    6. Lemmatize

    The stop word list and the lemmatizer are created once and reused across
    calls instead of being rebuilt for every text.

    Args:
        text: Text to clean. Any value that is not a ``str`` (for example a
            missing value) is treated as having no text.

    Returns:
        str: The remaining lemmatized tokens joined by single spaces, or an
        empty string when ``text`` is not a string or nothing is left.
    """
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove special characters and numbers. Characters are deleted, not
    # replaced by a space, so "don't" becomes "dont".
    text = _NON_LETTER_PATTERN.sub("", text)

    # Remove extra whitespace
    text = _WHITESPACE_PATTERN.sub(" ", text).strip()

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords, then lemmatize what is left.
    # NOTE(review): the NLTK English stop word list appears to contain
    # negations such as "not" and "no" - confirm whether dropping them is
    # acceptable for sentiment classification.
    stop_words, lemmatizer = _get_preprocessing_resources()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Join tokens back into text
    return " ".join(tokens)


# Store a cleaned version of every text in a new column
data["processed_text"] = data["text"].apply(preprocess_text)

# Show the first rows before and after cleaning, side by side
print("Examples of original and preprocessed text:")
text_examples = (
    data[["text", "processed_text"]]
    .head()
    .rename(columns={"text": "Original Text", "processed_text": "Preprocessed Text"})
)
display(text_examples)

# Report rows whose text turned into an empty string during cleaning
empty_texts = data.loc[data["processed_text"] == ""]
n_empty = len(empty_texts)
if n_empty == 0:
    print("\nAll texts were successfully preprocessed.")
else:
    print(f"\nWarning: {n_empty} texts were empty after preprocessing.")
    display(empty_texts)

In [None]:
# Prepare data for modeling
X = data["processed_text"]
y = data["sentiment"]

# Split data into training and testing sets.
# A stratified split needs at least two samples of every class and at least
# one slot per class in both partitions. The 10-row synthetic fallback data
# does not satisfy this (2 test samples, 3 classes) and would raise a
# ValueError, so in that case a plain random split is used instead.
TEST_SIZE = 0.2
n_classes = y.nunique()
n_test = int(np.ceil(TEST_SIZE * len(y)))  # same rounding train_test_split applies
n_train = len(y) - n_test
can_stratify = bool(
    n_classes > 0
    and y.value_counts().min() >= 2
    and n_test >= n_classes
    and n_train >= n_classes
)
if not can_stratify:
    print("Warning: data set too small for a stratified split; using a random split.")

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=42,
    stratify=y if can_stratify else None,
)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

# Create a CountVectorizer (Bag of Words).
# Both vectorizers are fitted on the training texts only and then applied to
# the test texts, so no vocabulary information leaks from the test set.
count_vectorizer = CountVectorizer(max_features=1000)
X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

# Display the vocabulary size and sample features
print(f"\nBag of Words vocabulary size: {len(count_vectorizer.vocabulary_)}")
print("Sample features (first 10):", list(count_vectorizer.vocabulary_.keys())[:10])

# Create a TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Display the vocabulary size and sample features
print(f"\nTF-IDF vocabulary size: {len(tfidf_vectorizer.vocabulary_)}")
print("Sample features (first 10):", list(tfidf_vectorizer.vocabulary_.keys())[:10])

# Compare the feature representations of the first training sample
print("\nBag of Words representation (first sample):")
print(X_train_counts[0].toarray())

print("\nTF-IDF representation (first sample):")
print(X_train_tfidf[0].toarray())

In [None]:
# Candidate classifiers, keyed by the display name used in the reports.
# Insertion order determines the order in which they are evaluated.
models = {}
models["Naive Bayes"] = MultinomialNB()
models["Logistic Regression"] = LogisticRegression(max_iter=1000, random_state=42)
models["SVM"] = LinearSVC(random_state=42)
models["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42)

# Define a function to evaluate models


def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """
    Fit a classifier, report its test metrics and plot its confusion matrix.

    Args:
        model: Estimator providing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        model_name (str): Name used in the printed report and the plot title.

    Returns:
        tuple: ``(model, accuracy, precision, recall, f1)`` where ``model`` is
        the fitted estimator and precision, recall and F1 are weighted
        averages over the classes.
    """
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate metrics. zero_division=0 keeps the score of a class that is
    # never predicted at 0, which is also the library default, but states it
    # explicitly instead of relying on a warning.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

    # Print metrics
    print(f"{model_name} Performance:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1 Score: {f1:.4f}")

    # Print classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

    # Plot confusion matrix. The label list is derived from the labels passed
    # in and handed to confusion_matrix explicitly, so the matrix always has
    # one row/column per tick label even when a class is missing from the
    # test split or from the predictions.
    class_labels = sorted(set(y_train) | set(y_test))
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.title(f"Confusion Matrix - {model_name}")
    plt.ylabel("True Label")
    plt.xlabel("Predicted Label")
    plt.show()

    return model, accuracy, precision, recall, f1


# Run every classifier once per feature representation. Each entry holds the
# banner to print, the tag appended to the model name, and the train/test
# feature matrices for that representation.
feature_sets = [
    ("Evaluating models with Bag of Words features...\n", "BoW", X_train_counts, X_test_counts),
    ("\nEvaluating models with TF-IDF features...\n", "TF-IDF", X_train_tfidf, X_test_tfidf),
]
results_by_feature = {}

for banner, feature_tag, train_matrix, test_matrix in feature_sets:
    print(banner)
    feature_results = []

    for name, model in models.items():
        print(f"\n{'-'*50}")
        run_label = f"{name} ({feature_tag})"
        model, accuracy, precision, recall, f1 = evaluate_model(
            model, train_matrix, test_matrix, y_train, y_test, run_label
        )
        feature_results.append(
            {
                "Model": run_label,
                "Accuracy": accuracy,
                "Precision": precision,
                "Recall": recall,
                "F1 Score": f1,
            }
        )

    results_by_feature[feature_tag] = feature_results

bow_results = results_by_feature["BoW"]
tfidf_results = results_by_feature["TF-IDF"]

# One table with a row per (model, feature representation) pair
all_results = pd.DataFrame(bow_results + tfidf_results)

# Show the table, best F1 score first
print("\nModel Performance Summary:")
ranked_results = all_results.sort_values("F1 Score", ascending=False)
display(ranked_results)

# Bar chart of the accuracy of every run
plt.figure(figsize=(14, 8))
sns.barplot(data=all_results, x="Model", y="Accuracy", palette="viridis")
plt.title("Model Accuracy Comparison")
plt.xticks(rotation=45)
plt.ylim(0, 1)
plt.tight_layout()
plt.show()

# Pick the run with the highest F1 score
best_row_index = all_results["F1 Score"].idxmax()
best_model_row = all_results.loc[best_row_index]
best_model_name = best_model_row["Model"]
print(f"\nBest model: {best_model_name}")
print(f"Accuracy: {best_model_row['Accuracy']:.4f}")
print(f"F1 Score: {best_model_row['F1 Score']:.4f}")

In [None]:
# Determine which model and feature extraction method to optimize
# For demonstration, let's assume SVM with TF-IDF performed best
# In a real scenario, you would use the best_model_name variable to determine this

# Create a pipeline for the model.
# It is named `svm_pipeline` rather than `pipeline` so that it does not
# overwrite the `pipeline` factory imported from `transformers` at the top of
# the file, which is called later to build the transformer model.
svm_pipeline = Pipeline(
    [("vectorizer", TfidfVectorizer()), ("classifier", LinearSVC(random_state=42))]
)

# Define hyperparameter grid
param_grid = {
    "vectorizer__max_features": [500, 1000, 2000],
    "vectorizer__ngram_range": [(1, 1), (1, 2)],  # Unigrams or unigrams+bigrams
    "classifier__C": [0.1, 1.0, 10.0],  # Regularization parameter
}

# Stratified k-fold fails when the fold count exceeds the size of every
# class, which happens with the small synthetic fallback data. Use 5 folds
# when every class has at least 5 training samples, otherwise as many folds
# as the smallest class allows (but never fewer than 2).
smallest_class_size = int(pd.Series(y_train).value_counts().min())
n_folds = max(2, min(5, smallest_class_size))

# Perform grid search with cross-validation
print("Performing grid search with cross-validation...")
print(f"Using {n_folds}-fold cross-validation")
grid_search = GridSearchCV(
    svm_pipeline, param_grid, cv=n_folds, scoring="f1_weighted", n_jobs=-1, verbose=1
)
grid_search.fit(X_train, y_train)

# Print best parameters
print("\nBest parameters:")
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")

# Evaluate the optimized model (refitted on the full training set by
# GridSearchCV) on the held-out test set
optimized_model = grid_search.best_estimator_
y_pred = optimized_model.predict(X_test)

# Calculate metrics; zero_division=0 scores never-predicted classes as 0
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

print("\nOptimized Model Performance:")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall: {recall:.4f}")
print(f"  F1 Score: {f1:.4f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Plot confusion matrix. Passing the label list explicitly keeps the matrix
# rows/columns aligned with the tick labels even if a class does not occur
# in the test split.
class_labels = sorted(data["sentiment"].unique())
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix - Optimized Model")
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
plt.show()

# Save the optimized model (vectorizer + classifier) as a single pickle
optimized_model_path = os.path.join(MODEL_SAVE_PATH, "optimized_sentiment_model.pkl")
with open(optimized_model_path, "wb") as f:
    pickle.dump(optimized_model, f)

print(f"\nOptimized model saved to {optimized_model_path}")

In [None]:
# Set up the sentiment analysis pipeline using a pre-trained model.
# The factory is imported under an alias so this cell does not depend on the
# bare name `pipeline`, which is easily rebound to a scikit-learn Pipeline
# object elsewhere in a notebook.
from transformers import pipeline as hf_pipeline

print("Setting up transformer-based sentiment analysis pipeline...")
# The checkpoint is named explicitly so results do not change if the
# library's default model for the "sentiment-analysis" task changes.
# NOTE(review): this is believed to be the current default checkpoint for
# the task - confirm against the installed transformers version.
transformer_sentiment_analyzer = hf_pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# Function to map Hugging Face sentiment labels to our labels


def map_sentiment(result, neutral_low=0.4, neutral_high=0.6):
    """
    Map a Hugging Face sentiment result to 'positive', 'negative' or 'neutral'.

    Args:
        result: Sequence whose first element is a mapping with a "label"
            string (e.g. 'POSITIVE' or 'NEGATIVE') and a numeric "score".
        neutral_low (float): Lower bound (inclusive) of the score range that
            is reported as neutral.
        neutral_high (float): Upper bound (inclusive) of the score range that
            is reported as neutral.

    Returns:
        str: "neutral" when the score lies within
        ``[neutral_low, neutral_high]``, otherwise the lower-cased label.

    Raises:
        IndexError: If ``result`` is empty.
        KeyError: If the first element lacks "label" or "score".
    """
    top_prediction = result[0]
    label = top_prediction["label"].lower()
    score = top_prediction["score"]

    # A score close to 0.5 means the model is barely more confident than a
    # coin flip, so the prediction is treated as neutral
    if neutral_low <= score <= neutral_high:
        return "neutral"
    return label


# Test the transformer model on a few examples
print("\nTesting transformer model on example texts:")
example_rows = data.head(5)
for text, true_sentiment in zip(example_rows["text"], example_rows["sentiment"]):
    result = transformer_sentiment_analyzer(text)
    mapped_sentiment = map_sentiment(result)
    print(f"\nText: {text}")
    print(f"True sentiment: {true_sentiment}")
    print(
        f"Transformer prediction: {mapped_sentiment} (raw: {result[0]['label']} with score: {result[0]['score']:.4f})"
    )

# Evaluate the transformer model on the test set.
# X_test holds the cleaned texts (stop words removed, lemmatized) prepared
# for the bag-of-words models. The pre-trained transformer is given the
# original sentences of the same rows instead, as in the examples above.
print("\nEvaluating transformer model on test set...")
raw_test_texts = data.loc[X_test.index, "text"]
transformer_predictions = []

for text in tqdm(raw_test_texts):
    # Non-string entries (e.g. missing values) are scored as empty text,
    # matching how preprocess_text treats them
    result = transformer_sentiment_analyzer(text if isinstance(text, str) else "")
    transformer_predictions.append(map_sentiment(result))

# Calculate metrics; zero_division=0 scores never-predicted classes as 0
accuracy = accuracy_score(y_test, transformer_predictions)
precision = precision_score(
    y_test, transformer_predictions, average="weighted", zero_division=0
)
recall = recall_score(
    y_test, transformer_predictions, average="weighted", zero_division=0
)
f1 = f1_score(y_test, transformer_predictions, average="weighted", zero_division=0)

print("\nTransformer Model Performance:")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall: {recall:.4f}")
print(f"  F1 Score: {f1:.4f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, transformer_predictions, zero_division=0))

# Plot confusion matrix. Passing the label list explicitly keeps the matrix
# rows/columns aligned with the tick labels even if a class does not occur
# in the test split or is never predicted.
class_labels = sorted(data["sentiment"].unique())
cm = confusion_matrix(y_test, transformer_predictions, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix - Transformer Model")
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
plt.show()

In [None]:
# Create a function to predict sentiment using our optimized model
def predict_sentiment_ml(text, model=optimized_model, default_confidence=0.8):
    """
    Predict sentiment using the optimized machine learning model.

    Args:
        text (str): Input text to analyze
        model: Trained sentiment analysis model that accepts a list of
            preprocessed texts in ``predict`` (and optionally in
            ``predict_proba``)
        default_confidence (float): Placeholder confidence reported when the
            model cannot produce class probabilities. It is not derived from
            the model output.

    Returns:
        dict: Prediction results with the original text, the sentiment label,
        a confidence score and the model type
    """
    # Preprocess the text
    processed_text = preprocess_text(text)

    # Make prediction
    prediction = model.predict([processed_text])[0]

    # Use the highest class probability when the model offers probabilities.
    # Models such as LinearSVC have no predict_proba; only that case falls
    # back to the placeholder, so real prediction errors are not hidden.
    if hasattr(model, "predict_proba"):
        confidence = model.predict_proba([processed_text])[0].max()
    else:
        confidence = default_confidence

    return {
        "text": text,
        "sentiment": prediction,
        "confidence": float(confidence),
        "model_type": "machine_learning",
    }


# Create a function to predict sentiment using the transformer model


def predict_sentiment_transformer(text, model=transformer_sentiment_analyzer):
    """
    Predict sentiment using the transformer model.

    Args:
        text (str): Input text to analyze
        model: Transformer sentiment analysis pipeline; called with the text
            and expected to return a list whose first element has "label"
            and "score" entries

    Returns:
        dict: Prediction results with the original text, the sentiment label,
        the raw model score as confidence, and the model type
    """
    # Make prediction
    result = model(text)

    # Map to our sentiment categories with the shared helper so that this
    # function and the test-set evaluation apply the same neutral rule
    sentiment = map_sentiment(result)

    return {
        "text": text,
        "sentiment": sentiment,
        "confidence": float(result[0]["score"]),
        "model_type": "transformer",
    }


# Unseen example sentences used to try out both prediction functions
test_texts = [
    "The government's transparency initiative has been a great success.",
    "Citizens are increasingly frustrated with the lack of accountability.",
    "The new policy has both advantages and disadvantages that need careful consideration.",
    "Public trust in institutions remains stable according to recent surveys.",
]

# Each entry pairs the heading to print with the function that predicts
demo_runs = (
    ("Predictions using optimized machine learning model:", predict_sentiment_ml),
    ("\nPredictions using transformer model:", predict_sentiment_transformer),
)
for heading, predict in demo_runs:
    print(heading)
    for text in test_texts:
        result = predict(text)
        print(f"\nText: {text}")
        print(f"Sentiment: {result['sentiment']} (confidence: {result['confidence']:.4f})")

# Illustrative snippet showing how the saved model could sit behind a
# FastAPI endpoint; it is only printed here, not executed
fastapi_example = """
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import pickle

app = FastAPI()

# Load the model
with open('models/optimized_sentiment_model.pkl', 'rb') as f:
    model = pickle.load(f)

class TextRequest(BaseModel):
    text: str

class SentimentResponse(BaseModel):
    text: str
    sentiment: str
    confidence: float
    model_type: str

@app.post("/predict", response_model=SentimentResponse)
async def predict_sentiment(request: TextRequest):
    try:
        result = predict_sentiment_ml(request.text, model)
        return result
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
"""
print("\nExample FastAPI endpoint code:")
print(fastapi_example)