# Iris Species Classification - Exploratory Data Analysis

This notebook performs exploratory data analysis on the Iris dataset for multi-class classification.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import os

# Read the training split into a DataFrame.
# NOTE(review): this is a machine-specific absolute path; the notebook only runs
# where this exact file exists.
train_path = '/Users/yuvalheffetz/ds-agent-projects/session_8f5d6987-98d5-4c7f-84cd-6fe7a7f21976/data/train_set.csv'
df = pd.read_csv(train_path)

# Structural overview: dimensions, column names, dtypes, and the first rows.
column_names = list(df.columns)
print(f"Dataset shape: {df.shape}")
print(f"\nColumn names: {column_names}")
# A header and its value are emitted with one call; sep="\n" puts the value on
# the line after the header, exactly as two consecutive print() calls would.
print("\nData types:", df.dtypes, sep="\n")
print("\nFirst few rows:", df.head(), sep="\n")

In [None]:
# Basic dataset information: size, class balance, missing values, and summary
# statistics of the four measurement columns.
print("Dataset info:")
print(f"Total samples: {len(df)}")

# Count features by excluding the identifier and target columns *by name*.
# Subtracting a fixed 2 from the column count would silently report the wrong
# number if either 'Id' or 'Species' were absent from the file.
non_feature_cols = ('Id', 'Species')
n_features = sum(col not in non_feature_cols for col in df.columns)
print(f"Number of features: {n_features}")

print("\nTarget variable distribution:")
print(df['Species'].value_counts())

print("\nMissing values:")
print(df.isnull().sum())

print("\nBasic statistics for numerical features:")
feature_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
print(df[feature_cols].describe())

## Feature Distribution Analysis

Let's analyze the distribution of features across different species to understand class separability.

In [None]:
# Violin plots of every feature, split by species, laid out on a 2x2 grid.
# One colour is assigned per species; colours carry an alpha of 0.8.
app_color_palette = [
    'rgba(99, 110, 250, 0.8)',   # Blue
    'rgba(239, 85, 59, 0.8)',    # Red/Orange-Red
    'rgba(0, 204, 150, 0.8)',    # Green
    'rgba(171, 99, 250, 0.8)',   # Purple
    'rgba(255, 161, 90, 0.8)',   # Orange
    'rgba(25, 211, 243, 0.8)',   # Cyan
    'rgba(255, 102, 146, 0.8)',  # Pink
    'rgba(182, 232, 128, 0.8)',  # Light Green
    'rgba(255, 151, 255, 0.8)',  # Magenta
    'rgba(254, 203, 82, 0.8)'    # Yellow
]

# Columns to plot and the human-readable titles shown above each subplot.
feature_names = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
feature_labels = ['Sepal Length (cm)', 'Sepal Width (cm)', 'Petal Length (cm)', 'Petal Width (cm)']

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=feature_labels,
    vertical_spacing=0.12,
    horizontal_spacing=0.1
)

# Species appear in the order they are first encountered in the data.
species_list = df['Species'].unique()
# (row, col) grid cell for each feature, in the same order as feature_names.
positions = [(1,1), (1,2), (2,1), (2,2)]

for i, (feature, (row, col)) in enumerate(zip(feature_names, positions)):
    for j, species in enumerate(species_list):
        species_data = df[df['Species'] == species][feature]
        # Cycle through the palette so that having more classes than palette
        # entries reuses colours instead of raising IndexError.
        species_color = app_color_palette[j % len(app_color_palette)]

        fig.add_trace(
            go.Violin(
                y=species_data,
                name=species.replace('Iris-', ''),
                side='positive',
                line_color=species_color,
                fillcolor=species_color,
                # Each species is drawn once per subplot; show its legend entry
                # only for the first subplot to avoid duplicates.
                showlegend=i == 0,
                # Grouping ties the same species' traces together in the legend.
                legendgroup=species
            ),
            row=row, col=col
        )

# Transparent backgrounds and purple typography, plus legend font styling.
fig.update_layout(
    height=600,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=12),
    title_font=dict(color='#7C3AED', size=16),
    violinmode='group',
    legend=dict(font=dict(color='#8B5CF6', size=11))
)

# The x and y axes of every subplot share the same grid and font styling.
axis_style = dict(
    showgrid=True,
    gridcolor='rgba(139,92,246,0.2)',
    tickfont=dict(color='#8B5CF6', size=11),
    title_font=dict(color='#7C3AED', size=12)
)
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)

# Save the plot as an HTML file (the directory is created if it is missing).
# NOTE(review): machine-specific absolute path.
plots_dir = '/Users/yuvalheffetz/ds-agent-projects/session_8f5d6987-98d5-4c7f-84cd-6fe7a7f21976/research/plots'
os.makedirs(plots_dir, exist_ok=True)
fig.write_html(
    os.path.join(plots_dir, "feature_distribution_by_species.html"),
    include_plotlyjs=True,
    config={'responsive': True, 'displayModeBar': False}
)

fig.show()

## Analysis Summary

Based on the violin plots, we can observe:

1. **Class Separability**: The three species show distinct distributions across features
2. **Feature Importance**: Petal measurements (length and width) appear to be more discriminative than sepal measurements
3. **Linear Separability**: Iris-setosa appears to be linearly separable from the other two species, especially based on petal measurements