# Citizen Feedback Text Analysis: Exploratory Data Analysis

This notebook demonstrates the complete pipeline for analyzing citizen feedback about public services in Nigeria.

**Contents:**
1. Data Generation & Loading
2. Text Cleaning & Preprocessing
3. Exploratory Data Analysis
4. Sentiment Analysis
5. Topic Modeling
6. Combined Analysis & Insights
7. Export Results

## Setup & Imports

In [None]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Project modules
from src.data.generate_synthetic_feedback import generate_synthetic_feedback, save_to_csv
from src.text.cleaning import clean_text_df
from src.text.features import compute_tfidf_features, get_top_keywords, compute_text_statistics
from src.text.sentiment import compute_sentiment, get_sentiment_summary
from src.text.topic_modeling import fit_topics, print_topics, save_topic_assignments
from src.viz.plots import (
    plot_topic_trends, plot_sentiment_trends, plot_top_keywords,
    plot_topic_sentiment_heatmap, plot_channel_distribution,
    plot_state_distribution, plot_word_length_distribution
)

# Plotting setup: inline rendering plus a shared style/palette for every figure below.
# The next line is an IPython line magic, so this cell only runs inside a
# notebook/IPython kernel. Keep comments off that line: a line magic receives
# the rest of its line as arguments.
%matplotlib inline
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require matplotlib >= 3.6 — confirm
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Reached only when every import and setup call above succeeded.
print('✓ All imports successful')

## 1. Data Generation & Loading

In [None]:
# Reuse the raw CSV when it is already on disk; otherwise generate it first.
data_path = Path('data/raw/citizen_feedback.csv')

if data_path.exists():
    print(f'✓ Data already exists at {data_path}')
else:
    print('Generating synthetic data...')
    records = generate_synthetic_feedback(n=50000, months=24, seed=42)
    save_to_csv(records, data_path)
    print(f'✓ Data generated and saved to {data_path}')

# Read the CSV and convert the timestamp column to datetime in a single chain.
df = pd.read_csv(data_path).assign(
    created_at=lambda frame: pd.to_datetime(frame['created_at'])
)

frame_shape = df.shape
column_names = df.columns.tolist()
print(f'\nDataset shape: {frame_shape}')
print(f'Columns: {column_names}')
# Last expression of the cell: the notebook renders the first rows.
df.head()

### Data Overview

In [None]:
# Print the schema, per-column null counts and the time span of the raw data.
divider = '=' * 60
print('Dataset Information:')
print(divider)
df.info()

null_counts = df.isna().sum()
print('\nMissing Values:')
print(null_counts)

timestamps = df['created_at']
first_seen, last_seen = timestamps.min(), timestamps.max()
print('\nDate Range:')
print(f'Start: {first_seen}')
print(f'End: {last_seen}')

## 2. Text Cleaning & Preprocessing

In [None]:
# Show up to five raw messages.
# head(5) caps the preview at the number of rows available, so a dataset with
# fewer than five records prints what it has instead of raising IndexError.
print('Sample Raw Messages:')
print('='*60)
for position, raw_message in enumerate(df['raw_text'].head(5), start=1):
    print(f'\n[{position}] {raw_message}')

In [None]:
# Run the project cleaner on the raw text column and report how many rows it dropped.
df_clean = clean_text_df(df, text_col='raw_text')

rows_before = len(df)
rows_after = len(df_clean)
rows_dropped = rows_before - rows_after

print('\nCleaning Results:')
print(f'Original records: {rows_before:,}')
print(f'After cleaning: {rows_after:,}')
print(f'Removed: {rows_dropped:,} records')

In [None]:
# Show up to five messages side by side: the raw text and its cleaned form.
# head(5) caps the preview at the rows that exist, so a cleaned frame with
# fewer than five records no longer raises IndexError.
print('Sample Cleaned Messages (with PII masking):')
print('='*60)
preview = df_clean.head(5)
for position, (raw_message, cleaned_message) in enumerate(
    zip(preview['raw_text'], preview['cleaned_text']), start=1
):
    print(f'\n[{position}] Original: {raw_message}')
    print(f'    Cleaned:  {cleaned_message}')

## 3. Exploratory Data Analysis

### Channel Distribution

In [None]:
# Draw the per-channel distribution, saving the figure and showing it inline.
channel_fig_path = Path('reports/figures/channel_dist.png')
plot_channel_distribution(
    df_clean,
    output_path=channel_fig_path,
    show=True,
)

### State Distribution

In [None]:
# Draw the per-state distribution, limited to 15 states via top_n.
state_fig_path = Path('reports/figures/state_dist.png')
plot_state_distribution(
    df_clean,
    top_n=15,
    output_path=state_fig_path,
    show=True,
)

### Department Distribution

In [None]:
# Bar chart of how many feedback records each department was assigned.
dept_counts = df_clean['assigned_dept'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
dept_counts.plot(kind='bar', ax=ax, color='steelblue')
ax.set(
    xlabel='Department',
    ylabel='Count',
    title='Feedback by Department',
)
plt.xticks(rotation=45)
fig.tight_layout()
fig.savefig('reports/figures/dept_dist.png', dpi=300, bbox_inches='tight')
plt.show()

### Message Length Distribution

In [None]:
# Plot the message-length distribution, then print summary statistics of word_count.
length_fig_path = Path('reports/figures/length_dist.png')
plot_word_length_distribution(df_clean, output_path=length_fig_path, show=True)

word_count_summary = df_clean['word_count'].describe()
print('\nText Length Statistics:')
print(word_count_summary)

### Temporal Patterns

In [None]:
# Bucket each record into its calendar month and plot the monthly volume.
# The 'month' column is stored on df_clean, so it stays available to later cells.
df_clean['month'] = df_clean['created_at'].dt.to_period('M')
monthly_counts = df_clean.groupby('month').size()

fig, ax = plt.subplots(figsize=(14, 5))
monthly_counts.plot(ax=ax, marker='o', linewidth=2)
ax.set(
    xlabel='Month',
    ylabel='Feedback Count',
    title='Feedback Volume Over Time',
)
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('reports/figures/temporal_trend.png', dpi=300, bbox_inches='tight')
plt.show()

### Rating Distribution

In [None]:
# Keep only the rows whose rating parses to a positive number.
# Coercing the whole column first turns blanks, missing values and non-numeric
# text into NaN, and NaN > 0 is False, so a single mask covers every case.
numeric_rating = pd.to_numeric(df_clean['rating'], errors='coerce')
has_rating = numeric_rating > 0

# assign() returns a new frame, so the numeric column is never written into a
# filtered slice of df_clean (which would trigger SettingWithCopyWarning, a
# warning the notebook's global warnings filter would hide).
rated = df_clean.assign(rating=numeric_rating)[has_rating]

if not rated.empty:
    fig, ax = plt.subplots(figsize=(8, 5))
    rated['rating'].value_counts().sort_index().plot.bar(ax=ax, color='coral')
    ax.set_xlabel('Rating')
    ax.set_ylabel('Count')
    ax.set_title('Rating Distribution (1=worst, 5=best)')
    plt.tight_layout()
    plt.savefig('reports/figures/rating_dist.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Share of cleaned records that carry a usable rating, and their mean.
    print(f'\nRatings provided: {len(rated):,} ({len(rated)/len(df_clean)*100:.1f}%)')
    print(f'Average rating: {rated["rating"].mean():.2f}')

## 4. Sentiment Analysis

In [None]:
# Run the project's sentiment step over the cleaned text; the returned frame replaces df_clean.
sentiment_options = {'text_col': 'cleaned_text'}
df_clean = compute_sentiment(df_clean, **sentiment_options)

In [None]:
# Label counts first, then the project-level summary over the whole frame.
print('Overall Sentiment Distribution:')
print('=' * 60)
label_counts = df_clean['sentiment_label'].value_counts()
print(label_counts)

print('\nSentiment Statistics:')
overall_summary = get_sentiment_summary(df_clean)
print(overall_summary)

In [None]:
# Same summary as above, grouped by the 'channel' column.
print('\nSentiment by Channel:')
print('=' * 60)
channel_summary = get_sentiment_summary(df_clean, group_by='channel')
print(channel_summary)

In [None]:
# Print the first three messages for each polarity, prefixed with their score.
# Every message is cut to 150 characters and followed by an ellipsis.
for heading, label in (('Positive', 'positive'), ('Negative', 'negative')):
    print(f'\nSample {heading} Feedback:')
    print('=' * 60)
    examples = df_clean.loc[df_clean['sentiment_label'] == label].head(3)
    for score, message in zip(examples['sentiment_score'], examples['cleaned_text']):
        print(f'[{score:.2f}] {message[:150]}...')

In [None]:
# Plot sentiment over time with by='national'; the figure is saved and shown inline.
sentiment_trend_path = Path('reports/figures/sentiment_trends.png')
plot_sentiment_trends(df_clean, by='national', output_path=sentiment_trend_path, show=True)

## 5. Topic Modeling

In [None]:
# Fit the topic model on the cleaned text; settings are collected in one place.
topic_model_settings = {
    'text_col': 'cleaned_text',
    'n_topics': 10,
    'method': 'lda',
    'random_state': 42,  # fixed seed passed through to fit_topics
}
topic_result = fit_topics(df_clean, **topic_model_settings)

In [None]:
# Print each fitted topic with its top words.
words_per_topic = 10
print_topics(topic_result, n_words=words_per_topic)

In [None]:
# Attach each record's dominant topic from the fit result to the frame.
dominant_topics = topic_result['dominant_topics']
df_clean['dominant_topic'] = dominant_topics

# Count records per topic, ordered by topic id rather than by frequency.
topic_counts = df_clean['dominant_topic'].value_counts()
topic_dist = topic_counts.sort_index()
print('\nTopic Distribution:')
print(topic_dist)

In [None]:
# Plot how the dominant topics evolve over time; saved to disk and shown inline.
topic_trend_path = Path('reports/figures/topic_trends.png')
plot_topic_trends(df_clean, topic_col='dominant_topic', output_path=topic_trend_path, show=True)

## 6. Combined Analysis & Insights

In [None]:
# Heatmap crossing topics with sentiment; saved to disk and shown inline.
heatmap_path = Path('reports/figures/topic_sentiment_heatmap.png')
plot_topic_sentiment_heatmap(df_clean, output_path=heatmap_path, show=True)

In [None]:
# Share of each topic within every channel: normalize='columns' makes each
# channel column sum to 1.
topic_channel = pd.crosstab(
    index=df_clean['dominant_topic'],
    columns=df_clean['channel'],
    normalize='columns',
)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(topic_channel, annot=True, fmt='.2f', cmap='YlOrRd', ax=ax)
ax.set(
    xlabel='Channel',
    ylabel='Topic',
    title='Topic Distribution by Channel',
)
fig.tight_layout()
fig.savefig('reports/figures/topic_channel_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Build TF-IDF features from the cleaned text and chart the 20 top keywords.
texts = list(df_clean['cleaned_text'])
tfidf_matrix, vectorizer, feature_names = compute_tfidf_features(texts)
top_keywords = get_top_keywords(tfidf_matrix, feature_names, top_n=20)

keywords_fig_path = Path('reports/figures/top_keywords.png')
plot_top_keywords(top_keywords, output_path=keywords_fig_path, show=True)

### Key Insights

In [None]:
# Headline numbers drawn from the cleaned, scored and topic-tagged frame.
print('KEY INSIGHTS')
print('='*60)

total_messages = len(df_clean)

print('\n1. VOLUME')
print(f'   Total messages: {total_messages:,}')
# value_counts() sorts by frequency, so index[0] is the most common value.
print(f'   Top channel: {df_clean["channel"].value_counts().index[0]}')
print(f'   Top state: {df_clean["state"].value_counts().index[0]}')

print('\n2. SENTIMENT')
neg_pct = (df_clean['sentiment_label'] == 'negative').sum() / total_messages * 100
print(f'   Negative feedback: {neg_pct:.1f}%')
print(f'   Average sentiment score: {df_clean["sentiment_score"].mean():.3f}')

print('\n3. TOPICS')
# Count once and reuse the result for both the label and its share.
topic_counts = df_clean['dominant_topic'].value_counts()
top_topic = topic_counts.index[0]
top_topic_pct = topic_counts.iloc[0] / total_messages * 100
print(f'   Most common topic: {top_topic} ({top_topic_pct:.1f}%)')

print('\n4. RESPONSE')
# pd.read_csv parses a True/False column into booleans, so comparing against
# the string 'True' never matches and reports a 0% resolution rate. Going
# through a normalized string form accepts both boolean and text encodings.
is_resolved = df_clean['resolved'].astype(str).str.strip().str.lower() == 'true'
resolved = df_clean[is_resolved]
resolution_rate = len(resolved) / total_messages * 100
print(f'   Resolution rate: {resolution_rate:.1f}%')
if 'response_time_days' in df_clean.columns:
    # Non-numeric response times become NaN and are skipped by mean().
    resolved_times = pd.to_numeric(resolved['response_time_days'], errors='coerce')
    avg_response = resolved_times.mean()
    print(f'   Avg response time: {avg_response:.1f} days')

# The recommendations below are fixed text; they are not derived from the
# figures computed above.
print('\n5. RECOMMENDATIONS')
print('   • Focus on top 3 topics for quick wins')
print('   • Improve response time for SMS/Hotline channels')
print('   • Address negative sentiment clusters in specific states')
print('   • Recognize and replicate positive feedback patterns')

## 7. Export Results

In [None]:
# Export the processed frame twice: topic assignments as CSV (via the project
# helper) and the full cleaned frame as Parquet.
output_path = Path('data/processed/topic_assignments.csv')
parquet_path = Path('data/processed/citizen_feedback_clean.parquet')

# DataFrame.to_parquet does not create missing parent folders, so make sure
# the destination exists before anything is written.
for target in (output_path, parquet_path):
    target.parent.mkdir(parents=True, exist_ok=True)

save_topic_assignments(df_clean, topic_result, output_path)
df_clean.to_parquet(parquet_path, index=False)

print('✓ Results saved to:')
print(f'  - {output_path}')
print(f'  - {parquet_path}')
print('  - reports/figures/ (visualizations)')

## Next Steps

1. **Launch Dashboard**: Run `streamlit run dashboards/app.py` for interactive exploration
2. **Review Policy Brief**: See `reports/citizen_feedback_brief.md` for recommendations
3. **Documentation**: Review the `docs/` folder for the ethics notes, data dictionary, and modeling notes
4. **Customization**: Adjust `config/analysis_config.yml` to experiment with parameters