# Review Classification Pipeline - Inference Only

This notebook uses the pre-trained model to classify new reviews without retraining.

## Requirements
1. Run the complete training pipeline first (00_colab_complete_pipeline.ipynb)
2. Place your review data in `data/actual/` directory
3. Data should be in CSV or JSON format with a 'text' column; an 'id' column is optional and is generated (1, 2, ...) when missing

## What This Does
- Loads pre-trained models from `models/saved_models/`
- Processes reviews from `data/actual/`
- Outputs policy violation predictions
- Saves results to `results/inference/`

## 1. Environment Setup

In [None]:
# Install required packages (if not already installed)
# The leading "!" is the IPython/Jupyter shell escape: this line only works
# inside a notebook, not when the file is run as a plain Python script.
!pip install -q transformers torch pandas scikit-learn

# Imports shared by every cell below
import pandas as pd
import json
import os
import sys
from datetime import datetime
from pathlib import Path

print("✅ Environment setup complete")
# The notebook uses relative paths (models/saved_models, data/actual,
# results/inference), so show which directory they will be resolved against.
print(f"Current directory: {os.getcwd()}")

## 2. Load Trained Model

In [None]:
# Verify that every saved-model artifact this notebook relies on is present,
# then show the metadata stored with the model.
model_dir = 'models/saved_models'
required_files = [
    'model_config.json',
    'constants.json', 
    'inference_pipeline.py',
    'metadata.json'
]

print("CHECKING TRAINED MODEL")
print("="*30)

# Report the status of each artifact and remember the ones that are absent
missing_files = []
for file in required_files:
    file_path = os.path.join(model_dir, file)
    if not os.path.exists(file_path):
        print(f"❌ {file}")
        missing_files.append(file)
        continue
    print(f"✅ {file}")

# Guard clause: without the artifacts nothing below can work, so explain how
# to produce them and stop the cell here.
if missing_files:
    for line in (
        f"\n❌ ERROR: Missing required model files",
        f"Missing: {missing_files}",
        f"\nPlease run the training notebook first:",
        f"1. Open 00_colab_complete_pipeline.ipynb",
        f"2. Run all cells to train the pipeline",
        f"3. Run the model persistence cell",
        f"4. Then return to this inference notebook",
    ):
        print(line)
    sys.exit()

print(f"\n✅ All required model files found!")

# Load metadata
metadata_path = os.path.join(model_dir, 'metadata.json')
with open(metadata_path, 'r') as f:
    metadata = json.load(f)

print(f"\nModel Information:")
print(f"   Version: {metadata['version']}")
print(f"   Trained: {metadata['timestamp']}")
print(f"   Training size: {metadata['training_data_size']} reviews")

# Performance figures are optional in the metadata file
if 'model_performance' in metadata:
    perf = metadata['model_performance']
    print(f"   Performance: {perf.get('average_confidence', 'N/A')} avg confidence")

## 3. Load Inference Pipeline

In [None]:
# Load the inference pipeline that is stored next to the saved models.
#
# Needs from earlier cells: model_dir
# Defines for later cells:
#   process_reviews, load_constants, load_hf_pipelines - imported from the
#       saved inference_pipeline.py
#   models_loaded - True only when the import, the constants and the models
#       all loaded without raising

# Default to "not loaded" so later cells can always test this flag, even when
# the import below fails.
models_loaded = False

# Make the saved inference_pipeline.py importable; skip the append when the
# cell is re-run so sys.path does not accumulate duplicates.
if model_dir not in sys.path:
    sys.path.append(model_dir)

print("LOADING INFERENCE PIPELINE")
print("="*30)

try:
    # Import inside the try block: a broken or incomplete saved pipeline is
    # reported like any other loading failure instead of aborting the cell
    # before models_loaded is set.
    from inference_pipeline import process_reviews, load_constants, load_hf_pipelines

    # Test load the models to make sure everything works
    model_config_path = os.path.join(model_dir, 'model_config.json')
    constants_path = os.path.join(model_dir, 'constants.json')
    
    # Load configuration
    constants = load_constants(constants_path)
    
    print(f"✅ Constants loaded")
    print(f"   Policy categories: {len(constants['POLICY_CATEGORIES'])}")
    print(f"   Zero-shot labels: {len(constants['ZERO_SHOT_LABELS'])}")
    
    # Load models (this might take a moment)
    print(f"\nLoading HuggingFace models...")
    # load_hf_pipelines returns four values; the last one is reported below
    # as the confidence threshold.
    sentiment, toxic, zshot, tau = load_hf_pipelines(model_config_path)
    
    print(f"✅ Models loaded successfully")
    print(f"   Confidence threshold: {tau}")
    print(f"   Ready for inference!")
    
    models_loaded = True
    
except Exception as e:
    # Notebook-level boundary: report the problem and leave the flag False so
    # the inference cell prints its guidance instead of crashing.
    print(f"❌ Error loading models: {e}")
    print(f"The training pipeline may need to be run again")
    models_loaded = False

## 4. Load Input Data

In [None]:
# Locate and load the review data to classify.
#
# Defines for later cells:
#   input_data - DataFrame with at least 'id' and 'text' columns, or None
#       when no usable file was found or loading failed
input_dir = 'data/actual'
os.makedirs(input_dir, exist_ok=True)

print("LOADING INPUT DATA")
print("="*25)

# Look for CSV and JSON files. os.listdir() returns entries in arbitrary
# order, so sort them to make the "first file" choice below reproducible.
input_files = sorted(
    file for file in os.listdir(input_dir)
    if file.endswith(('.csv', '.json'))
)

print(f"Available input files in {input_dir}:")
for file in input_files:
    file_path = os.path.join(input_dir, file)
    file_size = os.path.getsize(file_path)
    print(f"   {file} ({file_size} bytes)")

if not input_files:
    print(f"❌ No input files found in {input_dir}")
    print(f"\nTo add your review data:")
    print(f"1. Create a CSV file with columns: 'id', 'text'")
    # Plain string on purpose: inside an f-string the literal braces of the
    # JSON example are parsed as a replacement field and raise ValueError.
    print("2. Or create a JSON file with array of objects: [{'id': 1, 'text': 'review text'}, ...]")
    print(f"3. Place the file in {input_dir}")
    print(f"4. Re-run this cell")
    print(f"\nExample files are already created for you to test with.")

# Stays None unless a file is loaded and validated below
input_data = None

# Let user choose which file to process
if input_files:
    print(f"\nChoose a file to process:")
    for i, file in enumerate(input_files):
        print(f"   {i+1}. {file}")
    
    # For demo, automatically use the first file
    # In practice, you might want to manually specify the file
    selected_file = input_files[0]
    print(f"\nUsing: {selected_file}")
    
    # Load the selected file
    file_path = os.path.join(input_dir, selected_file)
    
    try:
        if selected_file.endswith('.csv'):
            loaded = pd.read_csv(file_path)
        else:
            # Only .csv and .json pass the filter above, so this is JSON.
            # Read it as UTF-8 explicitly instead of relying on the
            # platform's default encoding.
            with open(file_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)
            loaded = pd.DataFrame(json_data)
        
        # Validate required columns
        if 'text' not in loaded.columns:
            print(f"❌ Missing required 'text' column")
            print(f"Available columns: {list(loaded.columns)}")
        else:
            # Add ID column if missing
            if 'id' not in loaded.columns:
                loaded['id'] = range(1, len(loaded) + 1)
            
            input_data = loaded
            
            print(f"✅ Data loaded successfully")
            print(f"   Reviews to process: {len(input_data)}")
            print(f"   Columns: {list(input_data.columns)}")
            
            # Show preview
            print(f"\nData Preview:")
            for idx, row in input_data.head(3).iterrows():
                # str() keeps the preview working for NaN / numeric cells
                text = str(row['text'])
                text_preview = text[:60] + "..." if len(text) > 60 else text
                print(f"   ID {row['id']}: {text_preview}")
            
            if len(input_data) > 3:
                print(f"   ... and {len(input_data) - 3} more reviews")
    
    except Exception as e:
        # Notebook-level boundary: report the problem and continue without data
        print(f"❌ Error loading file: {e}")
        input_data = None

## 5. Run Inference

In [None]:
# Classify the loaded reviews, save the predictions and print a summary.
#
# Needs from earlier cells: models_loaded, process_reviews, model_dir, input_data
# Defines for the analysis cell: results, results_dir, timestamp, results_path
if models_loaded and input_data is not None and len(input_data) > 0:
    
    print("RUNNING INFERENCE")
    print("="*20)
    print(f"Processing {len(input_data)} reviews...")
    
    try:
        # Run the inference
        results = process_reviews(input_data, model_dir)
        
        print(f"✅ Inference completed!")
        print(f"   Processed: {len(results)} reviews")
        
        # Create results directory
        results_dir = 'results/inference'
        os.makedirs(results_dir, exist_ok=True)
        
        # Save results under a timestamped name so earlier runs are kept
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        results_file = f"inference_results_{timestamp}.csv"
        results_path = os.path.join(results_dir, results_file)
        
        results.to_csv(results_path, index=False)
        
        print(f"\nRESULTS SUMMARY")
        print("="*20)
        
        # Summary statistics
        total = len(results)
        approve_count = len(results[results['pred_label'] == 'APPROVE'])
        reject_count = len(results[results['pred_label'] == 'REJECT'])
        avg_confidence = results['confidence'].mean()
        
        # Guard the percentages so an empty result set prints 0.0% instead of
        # raising ZeroDivisionError.
        approve_pct = approve_count / total * 100 if total else 0.0
        reject_pct = reject_count / total * 100 if total else 0.0
        
        print(f"Total reviews: {total}")
        print(f"APPROVE: {approve_count} ({approve_pct:.1f}%)")
        print(f"REJECT: {reject_count} ({reject_pct:.1f}%)")
        print(f"Average confidence: {avg_confidence:.3f}")
        
        # Category breakdown for rejected reviews
        if reject_count > 0:
            print(f"\nREJECT Categories:")
            reject_categories = results[results['pred_label'] == 'REJECT']['pred_category'].value_counts()
            for category, count in reject_categories.items():
                print(f"   {category}: {count} reviews")
        
        # Show detailed results
        print(f"\nDETAILED RESULTS")
        print("="*50)
        
        display_df = results.copy()
        # Truncate text for display; str() keeps this working when a cell
        # holds NaN or a number instead of a string.
        display_df['text'] = display_df['text'].apply(
            lambda x: str(x)[:50] + "..." if len(str(x)) > 50 else str(x)
        )
        
        display_cols = ['id', 'text', 'pred_label', 'pred_category', 'confidence']
        print(display_df[display_cols].to_string(index=False))
        
        print(f"\n✅ Results saved to: {results_path}")
        print(f"\nSUCCESS: Inference complete!")
        print(f"Your reviews have been classified for policy violations.")
        
    except Exception as e:
        # Notebook-level boundary: show the full traceback for debugging
        print(f"❌ Error during inference: {e}")
        import traceback
        traceback.print_exc()

else:
    print("❌ Cannot run inference")
    if not models_loaded:
        print("   Models not loaded properly")
    if input_data is None or len(input_data) == 0:
        print("   No input data available")
    
    print(f"\nPlease check:")
    print(f"1. Training notebook was run successfully")
    # The data-loading cell reads from data/actual/, so point the user there
    print(f"2. Input data is placed in data/actual/ directory")
    print(f"3. Input data has required 'text' column")

## 6. Results Analysis

In [None]:
# Advanced analysis of results (if available).
#
# Needs from the inference cell: results, results_dir, timestamp, results_path
# Writes a JSON summary report next to the detailed CSV results.
if 'results' in locals() and len(results) > 0:
    
    print("ADVANCED RESULTS ANALYSIS")
    print("="*30)
    
    total = len(results)
    
    # Confidence distribution: high >= 0.8, medium in [0.6, 0.8), low < 0.6
    high_conf = results[results['confidence'] >= 0.8]
    medium_conf = results[(results['confidence'] >= 0.6) & (results['confidence'] < 0.8)]
    low_conf = results[results['confidence'] < 0.6]
    
    print(f"Confidence Distribution:")
    print(f"   High (≥0.8): {len(high_conf)} reviews ({len(high_conf)/total*100:.1f}%)")
    print(f"   Medium (0.6-0.8): {len(medium_conf)} reviews ({len(medium_conf)/total*100:.1f}%)")
    print(f"   Low (<0.6): {len(low_conf)} reviews ({len(low_conf)/total*100:.1f}%)")
    
    # Policy violations by type
    print(f"\nPolicy Violation Types:")
    category_counts = results['pred_category'].value_counts()
    for category, count in category_counts.items():
        percentage = count / total * 100
        # The category string "None" marks a review without a violation
        status = "Policy Violation" if category != "None" else "Clean Review"
        print(f"   {category}: {count} reviews ({percentage:.1f}%) - {status}")
    
    # Flag high-risk reviews: rejected with confidence of at least 0.8
    high_risk = results[
        (results['pred_label'] == 'REJECT') & 
        (results['confidence'] >= 0.8)
    ]
    
    if len(high_risk) > 0:
        print(f"\nHIGH-RISK REVIEWS (High confidence violations):")
        for idx, row in high_risk.iterrows():
            # str() keeps the preview working when the text cell holds NaN or
            # a number, matching the preview in the data-loading cell.
            text = str(row['text'])
            text_preview = text[:60] + "..." if len(text) > 60 else text
            print(f"   ID {row['id']}: {row['pred_category']} ({row['confidence']:.3f}) - {text_preview}")
    
    # Export summary report
    summary_report = {
        'timestamp': datetime.now().isoformat(),
        'total_reviews': total,
        'approve_count': len(results[results['pred_label'] == 'APPROVE']),
        'reject_count': len(results[results['pred_label'] == 'REJECT']),
        'average_confidence': float(results['confidence'].mean()),
        'high_confidence_count': len(high_conf),
        'category_breakdown': category_counts.to_dict(),
        'high_risk_reviews': len(high_risk)
    }
    
    # Reuse the inference run's timestamp so the report pairs with its CSV
    summary_path = os.path.join(results_dir, f"summary_report_{timestamp}.json")
    with open(summary_path, 'w') as f:
        json.dump(summary_report, f, indent=2)
    
    print(f"\n✅ Summary report saved: {summary_path}")
    
    print(f"\nINFERENCE COMPLETE")
    print(f"Files created:")
    print(f"   {results_path} - Detailed results")
    print(f"   {summary_path} - Summary report")
    
else:
    print("No results available for analysis")
    print("Run the inference cell first to generate results")