# Iris Species Classification - Exploratory Data Analysis

This notebook performs exploratory data analysis on the Iris dataset to understand the characteristics of the data and inform the machine learning pipeline.

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import os

## 1. Load Dataset

In [2]:
# Load the training dataset
df = pd.read_csv('/Users/yuvalheffetz/ds-agent-projects/session_1d6e2cbf-cf62-4b6a-8d6e-39369137016f/data/train_set.csv')
print(f"Dataset shape: {df.shape}")
print(f"\nDataset info:")
print(df.info())
print(f"\nFirst few rows:")
df.head()

Dataset shape: (120, 6)

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             120 non-null    int64  
 1   SepalLengthCm  120 non-null    float64
 2   SepalWidthCm   120 non-null    float64
 3   PetalLengthCm  120 non-null    float64
 4   PetalWidthCm   120 non-null    float64
 5   Species        120 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 5.8+ KB
None

First few rows:


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,4.4,2.9,1.4,0.2,Iris-setosa
1,2,4.9,2.5,4.5,1.7,Iris-virginica
2,3,6.8,2.8,4.8,1.4,Iris-versicolor
3,4,4.9,3.1,1.5,0.1,Iris-setosa
4,5,5.5,2.5,4.0,1.3,Iris-versicolor


## 2. Basic Dataset Analysis

In [3]:
# Examine target variable distribution
print("Target variable distribution:")
print(df['Species'].value_counts())
print(f"\nDescriptive statistics for numerical features:")
df.describe()

Target variable distribution:
Species
Iris-setosa        40
Iris-virginica     40
Iris-versicolor    40
Name: count, dtype: int64

Descriptive statistics for numerical features:


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,120.0,120.0,120.0,120.0,120.0
mean,60.5,5.841667,3.044167,3.770833,1.204167
std,34.785054,0.840926,0.445669,1.767417,0.763825
min,1.0,4.3,2.0,1.1,0.1
25%,30.75,5.1,2.8,1.6,0.3
50%,60.5,5.75,3.0,4.25,1.3
75%,90.25,6.4,3.3,5.1,1.8
max,120.0,7.9,4.4,6.9,2.5


## 3. Feature Distribution Analysis with Interactive Plot

In [4]:
# Create feature distribution plot using box plots
feature_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

# Melt the dataframe for easier plotting
df_melted = df.melt(id_vars=['Species'], value_vars=feature_cols, 
                    var_name='Feature', value_name='Value')

# Create interactive box plot
fig = px.box(df_melted, x='Feature', y='Value', color='Species',
             labels={'Value': 'Measurement (cm)', 'Feature': 'Flower Features'})

# Apply consistent color palette
app_color_palette = [
    'rgba(99, 110, 250, 0.8)',   # Blue
    'rgba(239, 85, 59, 0.8)',    # Red/Orange-Red
    'rgba(0, 204, 150, 0.8)',    # Green
    'rgba(171, 99, 250, 0.8)',   # Purple
    'rgba(255, 161, 90, 0.8)',   # Orange
    'rgba(25, 211, 243, 0.8)',   # Cyan
    'rgba(255, 102, 146, 0.8)',  # Pink
    'rgba(182, 232, 128, 0.8)',  # Light Green
    'rgba(255, 151, 255, 0.8)',  # Magenta
    'rgba(254, 203, 82, 0.8)'    # Yellow
]

# Update layout with app styling
fig.update_layout(
    height=600,
    paper_bgcolor='rgba(0,0,0,0)',  # Transparent background
    plot_bgcolor='rgba(0,0,0,0)',   # Transparent plot area
    font=dict(color='#8B5CF6', size=12),  # App's purple color for text
    title_font=dict(color='#7C3AED', size=16),  # Slightly darker purple for titles
    xaxis=dict(
        gridcolor='rgba(139,92,246,0.2)',  # Purple-tinted grid
        zerolinecolor='rgba(139,92,246,0.3)',
        tickfont=dict(color='#8B5CF6', size=11),  # Purple tick labels
        title_font=dict(color='#7C3AED', size=12)  # Darker purple axis titles
    ),
    yaxis=dict(
        gridcolor='rgba(139,92,246,0.2)',  # Purple-tinted grid
        zerolinecolor='rgba(139,92,246,0.3)', 
        tickfont=dict(color='#8B5CF6', size=11),  # Purple tick labels
        title_font=dict(color='#7C3AED', size=12)  # Darker purple axis titles
    ),
    legend=dict(font=dict(color='#8B5CF6', size=11))  # Purple legend
)

# Update colors
for i, trace in enumerate(fig.data):
    trace.marker.color = app_color_palette[i % len(app_color_palette)]
    trace.line.color = app_color_palette[i % len(app_color_palette)]

# Create plots directory if it doesn't exist
os.makedirs('/Users/yuvalheffetz/ds-agent-projects/session_1d6e2cbf-cf62-4b6a-8d6e-39369137016f/research/plots', exist_ok=True)

# Save the plot
fig.write_html('/Users/yuvalheffetz/ds-agent-projects/session_1d6e2cbf-cf62-4b6a-8d6e-39369137016f/research/plots/feature_distribution_by_species.html', 
               include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})

fig.show()

## 4. Key Findings Summary

In [5]:
# Calculate some key statistics for insights
print("Key findings:")
print(f"1. Dataset contains {df.shape[0]} samples with {df.shape[1]-1} features (excluding ID column)")
print(f"2. Perfect class balance: {df['Species'].value_counts().to_dict()}")
print(f"3. No missing values detected")
print(f"4. All features are numerical with different scales:")
for col in feature_cols:
    print(f"   - {col}: {df[col].min():.1f} - {df[col].max():.1f} cm")

# Analyze feature separability
print(f"\n5. Feature separability analysis:")
for col in feature_cols:
    species_means = df.groupby('Species')[col].mean()
    range_diff = species_means.max() - species_means.min()
    print(f"   - {col}: {range_diff:.2f} cm difference between species means")

Key findings:
1. Dataset contains 120 samples with 5 features (excluding ID column)
2. Perfect class balance: {'Iris-setosa': 40, 'Iris-virginica': 40, 'Iris-versicolor': 40}
3. No missing values detected
4. All features are numerical with different scales:
   - SepalLengthCm: 4.3 - 7.9 cm
   - SepalWidthCm: 2.0 - 4.4 cm
   - PetalLengthCm: 1.1 - 6.9 cm
   - PetalWidthCm: 0.1 - 2.5 cm

5. Feature separability analysis:
   - SepalLengthCm: 1.62 cm difference between species means
   - SepalWidthCm: 0.65 cm difference between species means
   - PetalLengthCm: 4.10 cm difference between species means
   - PetalWidthCm: 1.79 cm difference between species means
