# HaemaVision - Exploratory Data Analysis

This notebook explores the blood cell image dataset: it analyzes the class distribution and image sizes, and visualizes example images from each class.

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
import random
from collections import Counter

## Dataset Structure

First, let's explore the dataset structure and count the number of images per class.

In [None]:
# Define the data directory path
DATA_DIR = "../data"

# Seed the stdlib RNG so the random image sampling done in later cells is
# repeatable across "Restart Kernel -> Run All" runs.
SEED = 42
random.seed(SEED)

# Get all class folders. os.listdir() returns entries in arbitrary order, so
# sort them for a stable class order, and skip hidden directories (for example
# ".ipynb_checkpoints") which are not real classes.
class_folders = sorted(
    f for f in os.listdir(DATA_DIR)
    if not f.startswith('.') and os.path.isdir(os.path.join(DATA_DIR, f))
)
print(f"Found {len(class_folders)} classes: {class_folders}")

# Count images per class. Extensions are compared case-insensitively so that
# files such as "cell_01.JPG" are not silently left out of the counts.
class_counts = {}
for cls in class_folders:
    class_path = os.path.join(DATA_DIR, cls)
    images = [f for f in os.listdir(class_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    class_counts[cls] = len(images)

# Convert to DataFrame (largest class first) and display
counts_df = pd.DataFrame(list(class_counts.items()), columns=['Class', 'Count'])
counts_df = counts_df.sort_values('Count', ascending=False).reset_index(drop=True)
print(f"Total images: {counts_df['Count'].sum()}")
counts_df

## Class Distribution Visualization

In [None]:
# Bar chart of the per-class image counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(counts_df['Class'], counts_df['Count'], color='skyblue')
ax.set_title('Number of Images per Blood Cell Class', fontsize=15)
ax.set_xlabel('Class', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.tick_params(axis='x', labelrotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()

# Share of the whole dataset held by each class, in percent (2 decimals)
total = counts_df['Count'].sum()
percentages = counts_df.assign(
    Percentage=(counts_df['Count'] / total * 100).round(2)
)
percentages

## Image Size Analysis

Let's check the dimensions of a random sample of images (up to 50 per class) to understand the variety of sizes.

In [None]:
def get_image_dimensions(folder, sample_size=50, seed=None):
    """Get the dimensions of a random sample of images in a class folder.

    Args:
        folder: Name of the class sub-folder inside DATA_DIR.
        sample_size: Maximum number of images to inspect. Fewer are used when
            the folder holds fewer images. Defaults to 50 to keep the scan
            cheap on large classes.
        seed: Optional seed for a private random.Random instance. When None
            (the default) the module-level random generator is used.

    Returns:
        A list of (width, height) tuples, one per image that could be opened.
        The list is empty when the folder has no matching images.
    """
    folder_path = os.path.join(DATA_DIR, folder)
    # Sort because os.listdir() order is arbitrary: a fixed seed then always
    # selects the same files. Match extensions case-insensitively so that
    # files like "IMG_1.JPG" are included.
    images = sorted(
        f for f in os.listdir(folder_path)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )

    # Clamp to the valid range accepted by sample(): 0..len(images)
    sample_size = max(0, min(sample_size, len(images)))
    rng = random if seed is None else random.Random(seed)
    sampled_images = rng.sample(images, sample_size)

    dimensions = []
    for img_file in sampled_images:
        img_path = os.path.join(folder_path, img_file)
        try:
            # The context manager closes the file handle; .size is available
            # without decoding the pixel data.
            with Image.open(img_path) as img:
                dimensions.append(img.size)  # (width, height)
        except Exception as e:
            # Best effort: report unreadable files and keep scanning the rest.
            print(f"Error processing {img_path}: {e}")

    return dimensions

# Gather the sampled (width, height) pairs for every class
all_dimensions = {name: get_image_dimensions(name) for name in class_folders}

# Summarise each class; classes without any readable image are left out
dimension_stats = {}
for name, sizes in all_dimensions.items():
    if not sizes:
        continue
    # Split the (width, height) pairs into two parallel sequences
    widths, heights = zip(*sizes)
    (most_common_size, _), = Counter(sizes).most_common(1)
    dimension_stats[name] = {
        'min_width': min(widths),
        'max_width': max(widths),
        'avg_width': sum(widths) / len(widths),
        'min_height': min(heights),
        'max_height': max(heights),
        'avg_height': sum(heights) / len(heights),
        'most_common': most_common_size
    }

# One row per class, one column per statistic
pd.DataFrame.from_dict(dimension_stats, orient='index')

## Sample Image Visualization

Let's visualize sample images from each class to understand what we're working with.

In [None]:
def display_sample_images(class_name, num_samples=5):
    """Display a row of randomly chosen sample images from one class.

    Args:
        class_name: Name of the class sub-folder inside DATA_DIR.
        num_samples: Maximum number of images to show. Fewer are shown when
            the folder holds fewer images.

    Returns:
        None. The figure is rendered with plt.show(). When there is nothing
        to show, a message is printed and no figure is created.
    """
    folder_path = os.path.join(DATA_DIR, class_name)
    # Sorted, case-insensitive listing: stable input for random.sample and
    # files such as "IMG_1.JPG" are not skipped.
    images = sorted(
        f for f in os.listdir(folder_path)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )

    num_samples = min(num_samples, len(images))
    if num_samples <= 0:
        # Avoid drawing an empty figure for a class without images.
        print(f"No images to display for {class_name}")
        return

    sample_images = random.sample(images, num_samples)

    plt.figure(figsize=(15, 3))
    for i, img_file in enumerate(sample_images):
        img_path = os.path.join(folder_path, img_file)
        try:
            # Close the file handle once the image has been handed to
            # matplotlib; the original left every opened file unclosed.
            with Image.open(img_path) as img:
                plt.subplot(1, num_samples, i+1)
                plt.imshow(img)
                plt.axis('off')
                plt.title(f"{i+1}")
        except Exception as e:
            # Best effort: report the unreadable file and continue.
            print(f"Error displaying {img_path}: {e}")

    plt.suptitle(f"Sample Images: {class_name}", fontsize=16)
    plt.tight_layout()
    plt.show()

In [None]:
# Render one row of sample images per class, in class_folders order
for class_name in class_folders:
    display_sample_images(class_name)

## Summary and Observations

Based on the exploration above, we can make the following observations:

1. Class distribution: [to be filled after running]
2. Image dimensions: [to be filled after running]
3. Visual characteristics of different cell types: [to be filled after running]

### Next Steps

1. Preprocessing strategy:
   - Resize all images to a consistent size (e.g., 128x128)
   - Apply normalization
   - Use data augmentation to balance classes if necessary

2. Model considerations:
   - Use a CNN architecture appropriate for this multi-class classification task
   - Consider class weights if distribution is imbalanced