In [None]:
"""
Iris Species Classification using Decision Tree Classifier

This script demonstrates a complete machine learning workflow using Scikit-learn
to classify iris species based on their sepal and petal measurements.

Dataset: Iris Species Dataset (built-in sklearn dataset)
Model: Decision Tree Classifier
Features: Sepal length, sepal width, petal length, petal width
Target: Species (setosa, versicolor, virginica)

Author: AI Assistant
Date: 2025-10-16
"""

# =============================================================================
# SECTION 1: IMPORTS AND SETUP
# =============================================================================

import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

print("All necessary libraries imported successfully!", "=" * 60, sep="\n")

# =============================================================================
# SECTION 2: DATA LOADING AND EXPLORATION
# =============================================================================

# Pull in the Iris dataset that ships with sklearn
iris_data = load_iris()

# Summarise what was loaded: sample/feature counts plus the name arrays
dataset_summary = (
    f"Number of samples: {iris_data.data.shape[0]}",
    f"Number of features: {len(iris_data.feature_names)}",
    f"Feature names: {iris_data.feature_names}",
    f"Target names: {iris_data.target_names}",
)
print("Dataset Information:", *dataset_summary, sep="\n", end="\n\n")

# Build a DataFrame from the measurements and append the target as a
# 'species' column in a single expression
iris_df = pd.DataFrame(
    iris_data.data,
    columns=iris_data.feature_names,
).assign(species=iris_data.target)

# Peek at the first rows to see the table layout
print("First 5 rows of the dataset:")
print(iris_df.head(5), end="\n\n")

# Descriptive statistics for every numeric column
print("Basic statistics of the features:")
print(iris_df.describe(), end="\n\n")

# =============================================================================
# SECTION 3: DATA PREPROCESSING
# =============================================================================

# Separate features (X) from target (y).
# The feature columns are selected by name rather than by dropping 'species',
# so re-running this cell (after 'species_encoded' has been added to iris_df
# below) cannot leak the target into the feature matrix.
X = iris_df[list(iris_data.feature_names)]  # Features
y = iris_df['species']  # Target (integer class codes taken from iris_data.target)

print("Features (X) shape:", X.shape)
print("Target (y) shape:", y.shape)
print()

# Recover the species names so LabelEncoder receives real categorical input.
# Encoding the integer codes directly would be a no-op, and the
# "original vs encoded" sample below would show two identical columns.
species_names = iris_data.target_names[y.to_numpy()]

# Fit the encoder on the names and transform them to numerical labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(species_names)

# Later sections decode labels with iris_data.target_names[label]; fail loudly
# if the encoder's class order ever differs from that array.
if not np.array_equal(label_encoder.classes_, iris_data.target_names):
    raise ValueError(
        "LabelEncoder class order does not match iris_data.target_names"
    )

# Display the mapping actually learned by the encoder
print("Label Encoding Mapping:")
for code, species in enumerate(label_encoder.classes_):
    print(f"{code} -> {species}")
print()

# Store both the readable name and its encoded label on the DataFrame
iris_df['species_name'] = species_names
iris_df['species_encoded'] = y_encoded

# Display sample of original (name) vs encoded species
print("Sample of original vs encoded species:")
print(iris_df[['species_name', 'species_encoded']].head(10))
print()

# =============================================================================
# SECTION 4: DATA SPLITTING
# =============================================================================

# Hold out 20% of the samples for testing and train on the remaining 80%.
# A fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

print("Data splitting completed:")
for subset_caption, subset in (("Training", X_train), ("Testing", X_test)):
    print(f"{subset_caption} set size: {len(subset)} samples ({len(subset)/len(X)*100:.1f}%)")
print()

# Count how many samples of each class landed in each subset,
# ordered by class label
train_species_dist = pd.Series(y_train).value_counts().sort_index()
test_species_dist = pd.Series(y_test).value_counts().sort_index()

print("Species distribution in training set:")
for class_label, n_samples in train_species_dist.items():
    print(f"  {iris_data.target_names[class_label]}: {n_samples} samples")

print()
print("Species distribution in test set:")
for class_label, n_samples in test_species_dist.items():
    print(f"  {iris_data.target_names[class_label]}: {n_samples} samples")
print()

# =============================================================================
# SECTION 5: MODEL TRAINING
# =============================================================================

# Create the classifier; only random_state is set, every other
# hyperparameter is left at its default
decision_tree = DecisionTreeClassifier(random_state=42)

# Report a few of the hyperparameters the estimator was created with
print("Decision Tree Classifier initialized with default parameters:")
reported_params = (
    ("Criterion", "criterion"),
    ("Max depth", "max_depth"),
    ("Min samples split", "min_samples_split"),
)
for caption, attribute in reported_params:
    print(f"{caption}: {getattr(decision_tree, attribute)}")
print()

# Fit the tree on the training split
print("Training the model...")
decision_tree.fit(X_train, y_train)
print("Model training completed!", end="\n\n")

# =============================================================================
# SECTION 6: MODEL PREDICTION AND EVALUATION
# =============================================================================

# Predict a label for every sample in the test split
print("Making predictions on test set...")
y_pred = decision_tree.predict(X_test)
print("Predictions completed!", end="\n\n")

# Score the predictions; precision and recall are macro-averaged over classes
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')

# Print each metric both as a fraction and as a percentage
print("MODEL EVALUATION RESULTS:")
print("=" * 40)
metric_rows = (
    ("Accuracy", accuracy),
    ("Precision (macro-average)", precision),
    ("Recall (macro-average)", recall),
)
for metric_label, metric_value in metric_rows:
    print(f"{metric_label}: {metric_value:.4f} ({metric_value*100:.2f}%)")
print()

# Per-class breakdown, labelled with the species names
print("CLASSIFICATION REPORT:")
print("=" * 40)
report_text = classification_report(
    y_test,
    y_pred,
    target_names=iris_data.target_names,
    digits=4,
)
print(report_text)

# =============================================================================
# SECTION 7: ADDITIONAL ANALYSIS (OPTIONAL)
# =============================================================================

# Display the importance score the fitted tree assigned to each feature
print("\nFEATURE IMPORTANCE:")
print("=" * 40)
feature_importance = decision_tree.feature_importances_
for feature, importance in zip(iris_data.feature_names, feature_importance):
    print(f"{feature}: {importance:.4f} ({importance*100:.2f}%)")

# Find the most important feature (position of the largest score)
most_important_idx = np.argmax(feature_importance)
most_important_feature = iris_data.feature_names[most_important_idx]
print(f"\nMost important feature for classification: {most_important_feature}")
print(f"Its importance score: {feature_importance[most_important_idx]:.4f}")

# Display some sample predictions with actual vs predicted.
# The first ten labels are taken positionally and paired with zip, so this
# works whether y_test is a NumPy array or an index-carrying pandas Series
# (where y_test[i] would be a label lookup rather than a positional one).
print("\nSAMPLE PREDICTIONS:")
print("=" * 40)
print("Actual vs Predicted species (first 10 test samples):")
first_actual = np.asarray(y_test)[:10]
first_predicted = np.asarray(y_pred)[:10]
for sample_number, (actual_label, predicted_label) in enumerate(
    zip(first_actual, first_predicted), start=1
):
    actual_species = iris_data.target_names[actual_label]
    predicted_species = iris_data.target_names[predicted_label]
    print(f"Sample {sample_number}: Actual = {actual_species}, Predicted = {predicted_species}")

print("\n" + "=" * 60)
print("SCRIPT EXECUTION COMPLETED SUCCESSFULLY!")
print("=" * 60)

All necessary libraries imported successfully!
Dataset Information:
Number of samples: 150
Number of features: 4
Feature names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Target names: ['setosa' 'versicolor' 'virginica']

First 5 rows of the dataset:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   species  
0        0  
1        0  
2        0  
3        0  
4        0  

Basic statistics of the features:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.0