# Module 2: Data Processing & Analysis
## Consumer Security Product Analysis Pipeline

This notebook demonstrates the complete Module 2 pipeline using our tested and working code.

## Option 1: Run Complete Pipeline

In [None]:
# Execute the complete working pipeline
import subprocess
import sys
import os

# Change to the project root directory, but only when we are not already
# there. An unconditional os.chdir('..') climbs one directory further every
# time this cell is re-run, after which run_module2.py can no longer be found.
if not os.path.exists('run_module2.py'):
    os.chdir('..')

print("🚀 Running Module 2 Complete Pipeline...")
print("=" * 50)

# Run the tested pipeline with the interpreter given by sys.executable and
# capture stdout/stderr as text so they can be printed below.
result = subprocess.run([sys.executable, 'run_module2.py'],
                        capture_output=True, text=True)

print(result.stdout)
if result.stderr:
    print("Errors:", result.stderr)
# A non-zero exit status means the script failed even when stderr is empty.
if result.returncode != 0:
    print(f"❌ Pipeline exited with status {result.returncode}")

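## Option 2: Step-by-Step Analysis

The cells below use paths relative to the project root. The Option 1 cell changes into that directory, so either run it first or start the notebook from the project root.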

In [None]:
# Import the working pipeline code
import pandas as pd
import numpy as np
import json
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print("📥 Step 1: Loading Data")
print("-" * 30)

# Combined raw reviews file; the path is relative to the current working
# directory.
data_file = "data/raw/combined_reviews_20250617_151206.json"

try:
    with open(data_file, mode='r', encoding='utf-8') as raw_file:
        data = json.load(raw_file)
    df = pd.DataFrame(data)
    print(f"✅ Loaded {len(df)} reviews")

    # Basic stats: number of distinct products, then distinct sources.
    # Each value is computed right before it is printed so a missing column
    # fails at the same point as before.
    product_total = df['product_name'].nunique()
    print(f"📊 Products: {product_total}")
    source_total = df['collection_source'].nunique()
    print(f"📊 Sources: {source_total}")

except Exception as e:
    # Any failure (missing file, bad JSON, missing column) is reported here
    # instead of being raised.
    print(f"❌ Error: {e}")

In [None]:
print("🧹 Step 2: Data Cleaning")
print("-" * 30)


def _unify_review_text(row):
    """Combine a row's title, review_text and selftext into one string.

    Missing or blank fields are skipped. selftext is left out when it
    duplicates a part that was already collected. The remaining parts are
    joined with a single space.

    Args:
        row: A DataFrame row (anything supporting ``.get(column)``).

    Returns:
        The joined text, or an empty string when no field has content.
    """
    text_parts = []
    for column in ('title', 'review_text', 'selftext'):
        value = row.get(column)
        if pd.isna(value):
            continue
        cleaned = str(value).strip()
        if not cleaned:
            continue
        # Only selftext is de-duplicated against the parts gathered so far.
        if column == 'selftext' and cleaned in text_parts:
            continue
        text_parts.append(cleaned)
    return ' '.join(text_parts)


# Build the unified column with a single positional assignment. Writing each
# row through df.loc[idx, ...] is slow and label-based, so it would
# mis-assign values if index labels were ever repeated.
df['review_text_unified'] = [_unify_review_text(row) for _, row in df.iterrows()]

# Clean and filter: keep rows with more than 10 characters of text and a
# known product name.
df['review_text_unified'] = df['review_text_unified'].fillna('')
df = df[df['review_text_unified'].str.len() > 10].copy()
df = df.dropna(subset=['product_name']).copy()

print(f"✅ Cleaned to {len(df)} valid reviews")

# Show data distribution
print("\n📊 Data Distribution:")
product_counts = df['product_name'].value_counts()
for product, count in product_counts.items():
    print(f"   {product}: {count} reviews")

In [None]:
print("😊 Step 3: Sentiment Analysis")
print("-" * 30)

# Draw at most 20 reviews (fewer when the cleaned frame is smaller); the
# fixed random_state makes the draw repeatable.
sample_size = len(df) if len(df) < 20 else 20
df_sample = df.sample(n=sample_size, random_state=42)
df_sample = df_sample.copy()

print(f"🎯 Analyzing {len(df_sample)} sample reviews...")

# Keyword-based sentiment analysis (adapted from run_module2.py). Keywords are
# matched as whole words here rather than as raw substrings, so that e.g.
# "whatever" is not counted as "hate" and "badge" is not counted as "bad".
def basic_sentiment(text):
    """Classify text as positive, negative or neutral from keyword hits.

    Each keyword counts at most once, however often it occurs. A keyword
    matches as a whole word, optionally followed by a common inflection
    suffix (for example "recommended", "loved", "badly"); it does not match
    inside an unrelated word such as "badge" or "whatever".

    Args:
        text: Review text. Missing values (None/NaN) are treated as neutral;
            anything else is converted with str() and lower-cased.

    Returns:
        A (label, score) tuple: ('positive', 0.6), ('negative', -0.6) or
        ('neutral', 0.0). Ties, including no hits at all, are neutral.
    """
    import re  # local import: keeps this cell independent of the import cell

    if pd.isna(text):
        return 'neutral', 0.0

    text = str(text).lower()
    positive_words = ['good', 'great', 'excellent', 'amazing', 'love', 'best', 'recommend']
    negative_words = ['bad', 'terrible', 'awful', 'hate', 'worst', 'useless', 'horrible']

    def count_hits(words):
        # Number of distinct keywords that occur as a word in the text.
        return sum(
            1 for word in words
            if re.search(rf"\b{re.escape(word)}(?:s|d|ed|ing|ly|er|est)?\b", text)
        )

    pos_count = count_hits(positive_words)
    neg_count = count_hits(negative_words)

    if pos_count > neg_count:
        return 'positive', 0.6
    elif neg_count > pos_count:
        return 'negative', -0.6
    else:
        return 'neutral', 0.0

# Score every sampled review; each result is a (label, score) pair.
df_sample['sentiment_analysis'] = df_sample['review_text_unified'].apply(basic_sentiment)

# Split the pairs into a label column and a numeric score column.
sentiment_pairs = df_sample['sentiment_analysis']
df_sample['ai_sentiment'] = sentiment_pairs.apply(lambda pair: pair[0])
df_sample['ai_sentiment_score'] = sentiment_pairs.apply(lambda pair: pair[1])

print("✅ Sentiment analysis complete!")

# Count reviews per sentiment label and list them
sentiment_counts = df_sample['ai_sentiment'].value_counts()
print("\n📊 Sentiment Distribution:")
for sentiment, count in sentiment_counts.items():
    print(f"   {sentiment}: {count} reviews")

In [None]:
print("📈 Step 4: Product Performance Analysis")
print("-" * 30)

# Per-product review count, mean sentiment score and mean rating, rounded to
# two decimals. Named aggregation sets the output column names directly.
product_summary = (
    df_sample.groupby('product_name')
    .agg(
        Review_Count=('review_text_unified', 'count'),
        Avg_Sentiment=('ai_sentiment_score', 'mean'),
        Avg_Rating=('rating', 'mean'),
    )
    .round(2)
)

print("📊 Product Performance Summary:")
display(product_summary)

# Key insights: products with the highest and lowest mean sentiment, or
# 'N/A' for both when the summary has no rows.
if product_summary.empty:
    best_product = 'N/A'
    worst_product = 'N/A'
else:
    best_product = product_summary['Avg_Sentiment'].idxmax()
    worst_product = product_summary['Avg_Sentiment'].idxmin()
overall_sentiment = df_sample['ai_sentiment_score'].mean()

print(f"\n🎯 Key Insights:")
print(f"   🥇 Best sentiment: {best_product}")
print(f"   📉 Needs improvement: {worst_product}")
print(f"   📈 Overall sentiment: {overall_sentiment:.3f}")
print(f"   📊 Total analyzed: {len(df_sample)} reviews")

In [None]:
print("💾 Step 5: Export Results")
print("-" * 30)

# Make sure the output folder exists before anything is written
import os
os.makedirs('data/processed', exist_ok=True)

# One timestamp shared by both output file names
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
sample_output = f"data/processed/notebook_analyzed_sample_{timestamp}.json"
summary_output = f"data/processed/notebook_summary_{timestamp}.json"

# Write the analyzed sample, one JSON record per review
df_sample.to_json(sample_output, orient='records', indent=2, force_ascii=False)
print(f"✅ Saved analyzed sample: {sample_output}")

# Assemble the summary: counts, per-product table, label distribution and
# the headline insights computed in the previous step.
key_insights = {
    'best_sentiment_product': best_product,
    'worst_sentiment_product': worst_product,
    'overall_sentiment': overall_sentiment,
    'analysis_method': 'Enhanced Keyword Analysis'
}
summary_data = {
    'timestamp': timestamp,
    'total_reviews': len(df),
    'analyzed_reviews': len(df_sample),
    'product_summary': product_summary.to_dict(),
    'sentiment_distribution': sentiment_counts.to_dict(),
    'key_insights': key_insights
}

with open(summary_output, 'w', encoding='utf-8') as summary_file:
    json.dump(summary_data, summary_file, indent=2, ensure_ascii=False)
print(f"✅ Saved summary: {summary_output}")

# Closing recap of what was processed and found
print(f"\n🎉 Module 2 Notebook Complete!")
print(f"📊 Results Summary:")
print(f"   • {len(df)} total reviews processed")
print(f"   • {len(df_sample)} reviews analyzed")
print(f"   • {product_summary.shape[0]} products compared")
print(f"   • Best performer: {best_product}")
print(f"   • Overall sentiment: {overall_sentiment:.3f}")

## View Latest Results from run_module2.py

In [None]:
# Load and display the latest results from the working pipeline
import glob
import json  # imported here too so this cell also runs without the Option 2 cells

# Find the latest summary file.
# NOTE(review): max() picks the lexicographically largest file name; this is
# the newest file only if the names end in a sortable timestamp such as
# YYYYMMDD_HHMMSS - confirm against run_module2.py.
summary_files = glob.glob('data/processed/module2_summary_*.json')
if summary_files:
    latest_summary = max(summary_files)

    # Read as UTF-8 explicitly, like every other file access in this
    # notebook; the platform default encoding may not be UTF-8.
    with open(latest_summary, 'r', encoding='utf-8') as f:
        latest_results = json.load(f)

    print("📊 Latest Results from run_module2.py:")
    print("=" * 40)
    print(f"Timestamp: {latest_results['timestamp']}")
    print(f"Total Reviews: {latest_results['total_reviews']}")
    print(f"Analyzed: {latest_results['analyzed_reviews']}")

    print("\n🏆 Product Performance:")
    for product, sentiment in latest_results['product_summary']['Avg_Sentiment'].items():
        count = latest_results['product_summary']['Review_Count'][product]
        print(f"   {product}: {sentiment} sentiment ({count} reviews)")

    print("\n📈 Key Insights:")
    insights = latest_results['key_insights']
    print(f"   🥇 Best: {insights['best_sentiment_product']}")
    print(f"   📉 Needs improvement: {insights['worst_sentiment_product']}")
    # Summaries written by this notebook store the value under
    # 'overall_sentiment', so accept either key instead of raising KeyError.
    overall = insights.get('avg_sentiment_overall', insights.get('overall_sentiment'))
    if overall is not None:
        print(f"   📊 Overall sentiment: {overall:.3f}")
    else:
        print("   📊 Overall sentiment: N/A")

else:
    print("No results found. Run the pipeline first!")

## Module 2 Summary

### ✅ Completed:
1. **Data Collection**: Multi-source pipeline collecting 69 reviews
2. **Data Cleaning**: Standardization and text unification
3. **Sentiment Analysis**: Working keyword-based analysis with OpenAI fallback
4. **Product Insights**: Comparative performance analysis
5. **Export Pipeline**: JSON outputs ready for visualization

### 🚀 Ready for Module 3:
- Interactive dashboards and visualizations
- Competitive analysis and market positioning
- Business intelligence and strategic recommendations

### 📊 Current Results (snapshot from an earlier pipeline run; re-run the cells above for up-to-date figures):
- **Market Leader**: Bitdefender (0.45 sentiment)
- **Improvement Opportunity**: Norton (0.17 sentiment)
- **Overall Market Health**: Positive (0.27 average sentiment)
- **Data Quality**: 100% processing success rate