# Iris Species Classification - Exploratory Data Analysis

This notebook contains exploratory data analysis for the Iris dataset classification task.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os

In [None]:
# Load the training set.
# The path defaults to the original session location; set the IRIS_TRAIN_CSV
# environment variable to read the file from somewhere else without editing
# this cell.
train_csv_path = os.environ.get(
    "IRIS_TRAIN_CSV",
    "/Users/yuvalheffetz/ds-agent-projects/session_d4b0eb9f-8fbe-4190-b004-6f780e178fe3/data/train_set.csv",
)
df = pd.read_csv(train_csv_path)

print(f"Dataset shape: {df.shape}")
print("\nDataset info:")
df.info()  # prints its summary directly
print("\nFirst few rows:")
df.head()  # last expression, so the notebook renders it as the cell output

In [None]:
# Print three quick summaries of the dataset: numeric statistics, class
# balance of the target column, and per-column missing-value counts.
# Each summary is wrapped in a lambda so it is computed only after its
# heading has been printed.
report_sections = (
    ("Dataset Statistics:", lambda: df.describe()),
    ("\nTarget variable distribution:", lambda: df['Species'].value_counts()),
    ("\nMissing values:", lambda: df.isnull().sum()),
)

for heading, compute_summary in report_sections:
    print(heading)
    print(compute_summary())

In [None]:
# Shared color palette for consistent styling across plots.
# Colors are listed as RGB triples and rendered below as CSS
# 'rgba(r, g, b, 0.8)' strings.
_palette_rgb = [
    (99, 110, 250),   # Blue
    (239, 85, 59),    # Red/Orange-Red
    (0, 204, 150),    # Green
    (171, 99, 250),   # Purple
    (255, 161, 90),   # Orange
    (25, 211, 243),   # Cyan
    (255, 102, 146),  # Pink
    (182, 232, 128),  # Light Green
    (255, 151, 255),  # Magenta
    (254, 203, 82),   # Yellow
]

app_color_palette = [
    f'rgba({red}, {green}, {blue}, 0.8)' for red, green, blue in _palette_rgb
]

## EDA Step: Feature Distribution Analysis

Let's plot the distribution of each of the four numerical features as overlaid histograms, one per iris species, arranged in a 2×2 grid so the three species can be compared feature by feature.

In [None]:
# Build a 2x2 grid of histograms: one panel per feature, one trace per species.
feature_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

# The default subplot spec is already a plain x/y panel without a secondary
# y-axis, so no explicit `specs` argument is needed.
fig = make_subplots(rows=2, cols=2, subplot_titles=feature_cols)

# Color mapping for species
species_colors = {
    'Iris-setosa': app_color_palette[0],     # Blue
    'Iris-versicolor': app_color_palette[1], # Red/Orange
    'Iris-virginica': app_color_palette[2]   # Green
}

# (row, col) grid cell for each entry of feature_cols, in the same order.
positions = [(1, 1), (1, 2), (2, 1), (2, 2)]

# Loop-invariant: species labels in order of first appearance in df.
species_names = df['Species'].unique()

for i, (feature, (row, col)) in enumerate(zip(feature_cols, positions)):
    for species in species_names:
        species_data = df.loc[df['Species'] == species, feature]

        fig.add_trace(
            go.Histogram(
                x=species_data,
                name=species,
                # Only the first panel's traces appear in the legend; grouping
                # by species makes a legend click toggle that species in all
                # four panels instead of just the first one.
                legendgroup=species,
                marker_color=species_colors[species],
                opacity=0.7,
                nbinsx=15,
                showlegend=(i == 0)  # One legend entry per species
            ),
            row=row, col=col
        )

# Theme colors reused by the layout and the axes.
theme_text_color = '#8B5CF6'
theme_title_color = '#7C3AED'

# Figure-level styling: transparent backgrounds, themed fonts, and overlaid
# (rather than stacked or grouped) histogram bars.
fig.update_layout(
    barmode='overlay',
    height=600,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color=theme_text_color, size=12),
    title_font=dict(color=theme_title_color, size=16),
    legend=dict(font=dict(color=theme_text_color, size=11)),
)

# Axis styling shared by the x- and y-axes of every panel.
axis_style = dict(
    gridcolor='rgba(139,92,246,0.2)',
    zerolinecolor='rgba(139,92,246,0.3)',
    tickfont=dict(color=theme_text_color, size=10),
    title_font=dict(color=theme_title_color, size=11),
)

# Called without a row/col selector, these updates apply to the axes of all
# subplots in the figure, i.e. every cell of the 2x2 grid.
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)

fig.show()

# Save the plot as an HTML file.
# The output directory defaults to the original session location; set the
# IRIS_PLOTS_DIR environment variable to write somewhere else without editing
# this cell.
plots_dir = os.environ.get(
    "IRIS_PLOTS_DIR",
    "/Users/yuvalheffetz/ds-agent-projects/session_d4b0eb9f-8fbe-4190-b004-6f780e178fe3/research/plots",
)
# write_html does not create missing parent directories, so make sure the
# target directory exists before writing.
os.makedirs(plots_dir, exist_ok=True)
plot_path = os.path.join(plots_dir, "feature_distributions.html")

fig.write_html(
    plot_path,
    include_plotlyjs=True,  # embed plotly.js in the file itself
    config={'responsive': True, 'displayModeBar': False},
)

print("Feature distributions plot saved successfully!")