# InfraOwl Data Exploration

This notebook explores the infrastructure-issue image dataset used to train the InfraOwl model.

## Overview
- Analyze dataset structure and statistics
- Visualize sample images from each class
- Check for data quality issues
- Identify potential improvements

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
import yaml
from collections import Counter

# Plot styling
plt.style.use('default')
sns.set_palette("husl")

# Reproducibility: seed NumPy's global RNG so that random draws made later in
# the notebook (np.random.*) repeat on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

# Configuration
# The path is relative to the notebook's working directory.
CONFIG_PATH = '../configs/training_config.yaml'
# Explicit encoding: the platform default is not UTF-8 everywhere.
with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Fail early with a clear message: the lines below (and later cells) read
# config['classes'].
if not isinstance(config, dict) or 'classes' not in config:
    raise KeyError(f"'classes' key not found in {CONFIG_PATH}")

print("🚀 InfraOwl Data Exploration Started!")
print(f"Classes: {config['classes']}")

## 1. Dataset Structure Analysis

In [None]:
# Analyze raw data structure
raw_data_path = Path('../data/raw_images')
classes = config['classes']

# Image files are recognised by comparing the lower-cased suffix against this
# set. Globbing separately for '*.jpg' and '*.JPG' counts each file twice
# where glob matching is case-insensitive (e.g. Windows) and misses mixed-case
# suffixes such as '.Jpg' elsewhere.
image_extensions = {'.jpg', '.jpeg', '.png'}

# Maps class name -> number of image files found in its folder
data_stats = {}
for class_name in classes:
    class_dir = raw_data_path / class_name
    if class_dir.is_dir():
        data_stats[class_name] = sum(
            1 for entry in class_dir.iterdir()
            if entry.is_file() and entry.suffix.lower() in image_extensions
        )
    else:
        # A missing class folder is recorded as an empty class
        data_stats[class_name] = 0

# Display statistics
print("📊 Dataset Statistics:")
for class_name, count in data_stats.items():
    print(f"  {class_name}: {count} images")

total_images = sum(data_stats.values())
print(f"\nTotal Images: {total_images}")

In [None]:
# Class distribution: bar chart on the left, pie chart on the right
plt.figure(figsize=(12, 6))

classes_list = [name for name in data_stats]
counts_list = [data_stats[name] for name in classes_list]

# Left panel: absolute number of images per class
bar_ax = plt.subplot(1, 2, 1)
bars = bar_ax.bar(classes_list, counts_list)
bar_ax.set_title('Images per Class')
bar_ax.set_xlabel('Infrastructure Issue Type')
bar_ax.set_ylabel('Number of Images')
plt.xticks(rotation=45)

# Write each count just above its bar
for bar, count in zip(bars, counts_list):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 1
    bar_ax.text(label_x, label_y, str(count), ha='center', va='bottom')

# Right panel: share of each class, or a placeholder when there is no data
pie_ax = plt.subplot(1, 2, 2)
if total_images <= 0:
    pie_ax.text(0.5, 0.5, 'No data found', ha='center', va='center', transform=pie_ax.transAxes)
else:
    pie_ax.pie(counts_list, labels=classes_list, autopct='%1.1f%%', startangle=90)
    pie_ax.set_title('Class Distribution')

plt.tight_layout()
plt.show()

## 2. Sample Image Visualization

In [None]:
# Display sample images from each class
def show_sample_images(num_samples=3):
    """Plot a grid of randomly chosen images, one row per non-empty class.

    Reads the notebook globals ``classes``, ``data_stats`` and
    ``raw_data_path``. Classes whose count in ``data_stats`` is zero are
    skipped. An image that cannot be loaded is reported with ``print`` and its
    grid cell is left blank.

    Args:
        num_samples: Number of grid columns, i.e. the maximum number of images
            shown per class.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    image_extensions = {'.jpg', '.jpeg', '.png'}

    populated_classes = [c for c in classes if data_stats[c] > 0]
    num_classes = len(populated_classes)
    if num_classes == 0:
        print("No images found to display")
        return

    # squeeze=False always returns a 2-D array of Axes, so axes[row, col] is
    # valid even with a single class (one row) or a single column.
    fig, axes = plt.subplots(num_classes, num_samples,
                             figsize=(15, 4*num_classes), squeeze=False)

    for row_idx, class_name in enumerate(populated_classes):
        class_dir = raw_data_path / class_name
        # Match on the lower-cased suffix: globbing once per upper/lower-case
        # pattern lists files twice where glob matching is case-insensitive.
        # Sorting makes the draw reproducible when the RNG is seeded.
        image_files = sorted(
            p for p in class_dir.iterdir()
            if p.is_file() and p.suffix.lower() in image_extensions
        ) if class_dir.is_dir() else []

        sample_size = min(num_samples, len(image_files))
        if sample_size > 0:
            # Draw indices instead of Path objects so NumPy does not have to
            # coerce the paths into an array.
            chosen = np.random.choice(len(image_files), sample_size, replace=False)

            for col_idx, file_idx in enumerate(chosen):
                img_path = image_files[file_idx]
                try:
                    # The context manager closes the file handle; imshow
                    # converts the image to an array while it is still open.
                    with Image.open(img_path) as img:
                        ax = axes[row_idx, col_idx]
                        ax.imshow(img)
                        ax.set_title(f'{class_name}\n{img.size[0]}x{img.size[1]}')
                except Exception as e:
                    print(f"Error loading {img_path}: {e}")

        # Hide the frame and ticks of every cell in the row: cells showing an
        # image, unused cells, and cells whose image failed to load.
        for ax in axes[row_idx]:
            ax.axis('off')

    plt.tight_layout()
    plt.show()

# Draw the sample grid only when the dataset is non-empty
if total_images <= 0:
    print("ℹ️  No images available for visualization.")
    print("   Please add images to the data/raw_images/ folders.")
else:
    show_sample_images(num_samples=4)

## 3. Image Properties Analysis

In [None]:
# Analyze image properties
def analyze_image_properties(max_per_class=50):
    """Collect per-image metadata for a capped number of images in each class.

    Reads the notebook globals ``classes``, ``data_stats`` and
    ``raw_data_path``. Classes whose count in ``data_stats`` is zero are
    skipped. A file that cannot be read is reported with ``print`` and left
    out of the result.

    Args:
        max_per_class: Maximum number of images inspected per class (taken in
            sorted filename order). ``None`` inspects every image.

    Returns:
        pandas.DataFrame with one row per successfully read image and the
        columns ``class``, ``width``, ``height``, ``aspect_ratio``,
        ``pixels``, ``mode``, ``format`` and ``file_size_kb``. The frame is
        empty when no image could be read.
    """
    image_extensions = {'.jpg', '.jpeg', '.png'}
    image_info = []

    for class_name in classes:
        if data_stats[class_name] == 0:
            continue

        class_dir = raw_data_path / class_name
        if not class_dir.is_dir():
            continue

        # Match on the lower-cased suffix: globbing once per upper/lower-case
        # pattern lists files twice where glob matching is case-insensitive.
        # Sorting keeps the capped subset the same from run to run.
        image_files = sorted(
            p for p in class_dir.iterdir()
            if p.is_file() and p.suffix.lower() in image_extensions
        )

        # Slicing already clamps to the list length, so no min() is needed
        for img_path in image_files[:max_per_class]:
            try:
                with Image.open(img_path) as img:
                    width, height = img.size
                    mode = img.mode
                    format_type = img.format

                # File size on disk, in KB
                file_size = os.path.getsize(img_path) / 1024

                image_info.append({
                    'class': class_name,
                    'width': width,
                    'height': height,
                    'aspect_ratio': width / height,
                    'pixels': width * height,
                    'mode': mode,
                    'format': format_type,
                    'file_size_kb': file_size
                })
            except Exception as e:
                print(f"Error analyzing {img_path}: {e}")

    return pd.DataFrame(image_info)

# Run the property analysis and print overall and per-class summaries
if total_images <= 0:
    print("ℹ️  No images available for property analysis.")
else:
    print("🔍 Analyzing image properties...")
    df = analyze_image_properties()

    if df.empty:
        print("❌ No valid images found for analysis")
    else:
        print(f"\n📊 Analyzed {len(df)} images")
        print("\nImage Statistics:")
        print(df.describe())

        # Mean and standard deviation of the size-related columns, per class
        print("\nSummary by Class:")
        summary_columns = ['width', 'height', 'aspect_ratio', 'file_size_kb']
        aggregations = {column: ['mean', 'std'] for column in summary_columns}
        class_summary = df.groupby('class').agg(aggregations).round(2)
        print(class_summary)

In [None]:
# Visualize image properties
if total_images > 0 and len(df) > 0:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Both per-class panels iterate the classes in this same order, so a class
    # takes the same position in each panel's color cycle.
    class_names = df['class'].unique()

    # Resolution distribution: one scatter call per class so that the legend
    # can tell the reader which color belongs to which class.
    for class_name in class_names:
        class_data = df[df['class'] == class_name]
        axes[0, 0].scatter(class_data['width'], class_data['height'],
                           alpha=0.6, label=class_name)
    axes[0, 0].set_xlabel('Width (pixels)')
    axes[0, 0].set_ylabel('Height (pixels)')
    axes[0, 0].set_title('Image Resolutions')
    axes[0, 0].legend()
    axes[0, 0].grid(True, alpha=0.3)

    # Aspect ratio distribution
    for class_name in class_names:
        class_data = df[df['class'] == class_name]
        axes[0, 1].hist(class_data['aspect_ratio'], alpha=0.6, label=class_name, bins=20)
    axes[0, 1].set_xlabel('Aspect Ratio (W/H)')
    axes[0, 1].set_ylabel('Frequency')
    axes[0, 1].set_title('Aspect Ratio Distribution')
    axes[0, 1].legend()
    axes[0, 1].grid(True, alpha=0.3)

    # File size distribution
    df.boxplot(column='file_size_kb', by='class', ax=axes[1, 0])
    # A grouped pandas boxplot sets a figure-level "Boxplot grouped by ..."
    # title; clear it so it does not sit above all four panels.
    fig.suptitle('')
    axes[1, 0].set_title('File Size Distribution by Class')
    axes[1, 0].set_xlabel('Class')
    axes[1, 0].set_ylabel('File Size (KB)')

    # Image mode distribution
    mode_counts = df['mode'].value_counts()
    axes[1, 1].pie(mode_counts.values, labels=mode_counts.index, autopct='%1.1f%%')
    axes[1, 1].set_title('Image Mode Distribution')

    plt.tight_layout()
    plt.show()

## 4. Data Quality Assessment

In [None]:
# Data quality checks
def assess_data_quality(stats=None, image_df=None, min_recommended=100,
                        min_resolution=224, balance_threshold=0.5):
    """Check the dataset for common quality problems.

    Four checks are run: class imbalance, too few samples in a class, classes
    with no images, and images smaller than the target resolution.

    Args:
        stats: Mapping of class name to image count. Defaults to the notebook
            global ``data_stats``.
        image_df: DataFrame with ``width`` and ``height`` columns. Defaults to
            the notebook global ``df`` when it exists and ``stats`` contains
            at least one image; otherwise the resolution check is skipped.
        min_recommended: Classes with fewer images than this (but more than
            zero) are flagged.
        min_resolution: Images whose width or height is below this many pixels
            are counted as low resolution.
        balance_threshold: Imbalance is flagged when the smallest non-empty
            class divided by the largest class is below this ratio.

    Returns:
        Tuple ``(issues, recommendations)`` of two lists of strings.
    """
    if stats is None:
        stats = data_stats
    if image_df is None and sum(stats.values()) > 0:
        # `df` is only created when the property-analysis cell found images
        image_df = globals().get('df')

    issues = []
    recommendations = []

    # Check class balance. Only non-empty classes are compared; when every
    # class is empty there is nothing to compare (and min() of an empty list
    # would raise ValueError).
    populated_counts = [count for count in stats.values() if count > 0]
    if len(stats) > 1 and populated_counts:
        if min(populated_counts) / max(populated_counts) < balance_threshold:
            issues.append("⚠️  Class imbalance detected")
            recommendations.append("Consider data augmentation or collecting more samples for underrepresented classes")

    # Check minimum samples per class
    for class_name, count in stats.items():
        if 0 < count < min_recommended:
            issues.append(f"⚠️  {class_name} has only {count} samples (recommended: {min_recommended}+)")
            recommendations.append(f"Collect more {class_name} samples for better model performance")

    # Check for missing classes
    missing_classes = [c for c, count in stats.items() if count == 0]
    if missing_classes:
        issues.append(f"❌ Missing data for classes: {', '.join(missing_classes)}")
        recommendations.append("Add images to missing class directories")

    # Resolution recommendations
    if image_df is not None and len(image_df) > 0:
        too_small = (image_df['width'] < min_resolution) | (image_df['height'] < min_resolution)
        low_res_count = int(too_small.sum())

        if low_res_count > 0:
            issues.append(f"⚠️  {low_res_count} images have resolution < {min_resolution}x{min_resolution}")
            recommendations.append("Consider using higher resolution images for better quality")

    return issues, recommendations

# Run the quality checks and print the findings
print("🔍 Assessing data quality...\n")
issues, recommendations = assess_data_quality()

if not issues:
    print("✅ No major data quality issues detected!")
else:
    print("Issues Found:")
    for issue in issues:
        print("  " + issue)

    print("\nRecommendations:")
    for rec in recommendations:
        print("  • " + rec)

# Overall verdict on dataset size, checked from the smallest tier upward
print("\n📋 Dataset Status:")
if total_images < 200:
    print("  ❌ Small dataset size, consider collecting more data")
elif total_images < 500:
    print("  ⚠️  Adequate dataset size, but more data would help")
else:
    print("  ✅ Good dataset size for training")

## 5. Next Steps & Recommendations

In [None]:
# Print the follow-up workflow, or setup instructions when no images exist
print("🚀 Next Steps for InfraOwl Training:")
print()

if total_images > 0:
    # (heading, script path, what the step does) for each pipeline stage
    pipeline_steps = [
        ("1. 📁 Data Preprocessing:", "../scripts/data_preprocessing.py",
         "This will resize images and create train/val/test splits"),
        ("2. 🎯 Model Training:", "../scripts/train_model.py",
         "This will train the infrastructure detection model"),
        ("3. 📱 Mobile Conversion:", "../scripts/convert_to_tflite.py",
         "This will convert the model to TensorFlow Lite for mobile"),
        ("4. 📊 Model Evaluation:", "../scripts/evaluate_model.py",
         "This will evaluate model performance and generate reports"),
    ]
    for heading, script, effect in pipeline_steps:
        print(heading)
        print(f"   Run: python {script}")
        print(f"   {effect}")
        print()

else:
    print("❗ First, add images to the data/raw_images/ directories:")
    # List the folders of the classes actually configured; a hard-coded list
    # can drift from the classes in the training config.
    for class_name in classes:
        print(f"   - data/raw_images/{class_name}/")
    print()
    print("Then re-run this notebook to analyze your data.")

print("💡 Tips for Better Results:")
print("   • Aim for 100+ images per class")
print("   • Use high-quality, well-lit images")
print("   • Include variety in angles and conditions")
print("   • Ensure consistent labeling across classes")
print("   • Consider data augmentation for small datasets")