In [None]:
# Complete End-to-End ML Pipeline for Iris Classification
# Using Decision Tree and Logistic Regression

import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Plot defaults for every figure below: seaborn's "whitegrid" theme first,
# then a 10x6 default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

In [None]:
# ============================================
# STEP 1: DATA LOADING
# ============================================

# Load the Iris dataset and wrap it in a DataFrame with one column per feature.
iris_data = load_iris()
df = pd.DataFrame(iris_data.data, columns=iris_data.feature_names)

# Keep the integer class code and add a readable species name next to it.
df['target'] = iris_data.target
df['species'] = df['target'].map({0: 'Setosa', 1: 'Versicolor', 2: 'Virginica'})

print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\nBasic Statistics:")
print(df.describe())

In [None]:
# ============================================
# STEP 2: DATA CLEANING & EXPLORATORY DATA ANALYSIS
# ============================================

# Check for missing values: count nulls per column, then report the verdict
# based on the actual counts rather than assuming the data is complete.
missing_counts = df.isnull().sum()
print("Missing Values:")
print(missing_counts)

total_missing = int(missing_counts.sum())
if total_missing == 0:
    print("\nNo missing values found!")
else:
    print(f"\nFound {total_missing} missing values - handle them before modeling.")

In [None]:
# Count rows that exactly repeat an earlier row.
duplicate_count = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicate_count}")


In [None]:
# Flag outliers per feature with the IQR rule: a value is an outlier when it
# lies more than 1.5 * IQR below the first quartile or above the third.
print("\nChecking for outliers:")
for col in iris_data.feature_names:
    values = df[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    fence = 1.5 * (q3 - q1)
    is_outlier = (values < q1 - fence) | (values > q3 + fence)
    print(f"{col}: {int(is_outlier.sum())} outliers")


In [None]:
# One panel per feature on a 2x2 grid, each showing that feature's
# distribution split by species.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
for ax, col in zip(axes.flat, iris_data.feature_names):
    df.boxplot(column=col, by='species', ax=ax)
    ax.set_title(f'{col} by Species')
    ax.set_xlabel('Species')
plt.tight_layout()
plt.show()


In [None]:
# Pairwise correlations between the feature columns, drawn as an annotated
# heatmap whose colour scale is centred on zero.
correlation_matrix = df[iris_data.feature_names].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlation Matrix')
fig.tight_layout()
plt.show()


In [None]:
# Pair plot for feature relationships.
# `vars` restricts the grid to the measurement columns; without it the numeric
# 'target' label column in df would be drawn as if it were another feature.
sns.pairplot(df, vars=iris_data.feature_names, hue='species', diag_kind='hist')
plt.suptitle('Pair Plot of Iris Features', y=1.02)
plt.show()

print("\nData cleaning complete! Dataset is clean and ready for modeling.")


In [None]:
# ============================================
# STEP 3: DATA PREPROCESSING
# ============================================

# Model inputs are the feature columns; the label is the integer class code.
feature_columns = list(iris_data.feature_names)
X = df[feature_columns]
y = df['target']

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

n_train, n_test = len(X_train), len(X_test)
print(f"Training set size: {n_train}")
print(f"Test set size: {n_test}")

# Per-class row counts, ordered by class code.
train_class_counts = y_train.value_counts().sort_index()
test_class_counts = y_test.value_counts().sort_index()
print("\nClass distribution in training set:")
print(train_class_counts)
print("\nClass distribution in test set:")
print(test_class_counts)


In [None]:
# Feature Scaling (Standardization)
# The scaler is fitted on the training split only and then reused for the test
# split, so the test rows never influence the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

train_shape, test_shape = X_train_scaled.shape, X_test_scaled.shape
print("\nFeatures scaled successfully!")
print(f"Scaled training set shape: {train_shape}")
print(f"Scaled test set shape: {test_shape}")


In [None]:
# ============================================
# STEP 4: MODEL TRAINING
# ============================================

# Create the two (still unfitted) classifiers. Both use the same fixed seed;
# logistic regression is additionally capped at 1000 solver iterations.
dt_model = DecisionTreeClassifier(random_state=42)
lr_model = LogisticRegression(max_iter=1000, random_state=42)

In [None]:
# Fit logistic regression on the standardized training features.
print("Training Logistic Regression...")
lr_model.fit(X=X_train_scaled, y=y_train)
print("✓ Logistic Regression trained successfully!")


In [None]:
# Fit the decision tree on the same standardized training features that the
# logistic regression uses.
print("Training Decision Tree...")
dt_model.fit(X=X_train_scaled, y=y_train)
print("✓ Decision Tree trained successfully!")


In [None]:
# ============================================
# STEP 5: MODEL EVALUATION
# ============================================

# Predict on both splits with each model; the training-set predictions are
# kept so training and test accuracy can be compared afterwards.
scaled_splits = (X_train_scaled, X_test_scaled)
lr_train_pred, lr_test_pred = [lr_model.predict(split) for split in scaled_splits]
dt_train_pred, dt_test_pred = [dt_model.predict(split) for split in scaled_splits]

In [None]:
# Accuracy of each model on the training and test splits.
lr_train_acc = accuracy_score(y_train, lr_train_pred)
dt_train_acc = accuracy_score(y_train, dt_train_pred)
lr_test_acc = accuracy_score(y_test, lr_test_pred)
dt_test_acc = accuracy_score(y_test, dt_test_pred)

# Report every score both as a fraction and as a percentage.
banner = "=" * 60
print(banner)
print("MODEL PERFORMANCE METRICS")
print(banner)

print("\nLogistic Regression:")
print(f"  Training Accuracy: {lr_train_acc:.4f} ({lr_train_acc*100:.2f}%)")
print(f"  Test Accuracy:     {lr_test_acc:.4f} ({lr_test_acc*100:.2f}%)")

print("\nDecision Tree:")
print(f"  Training Accuracy: {dt_train_acc:.4f} ({dt_train_acc*100:.2f}%)")
print(f"  Test Accuracy:     {dt_test_acc:.4f} ({dt_test_acc*100:.2f}%)")


In [None]:
# Per-class classification report on the test split, one report per model.
class_names = ['Setosa', 'Versicolor', 'Virginica']
rule = "=" * 60
reports = (
    ("LOGISTIC REGRESSION - Classification Report", lr_test_pred),
    ("DECISION TREE - Classification Report", dt_test_pred),
)
for heading, predictions in reports:
    print("\n" + rule)
    print(heading)
    print(rule)
    print(classification_report(y_test, predictions, target_names=class_names))


In [None]:
# Confusion Matrices: one heatmap per model, side by side, computed on the
# test split (rows are true labels, columns are predicted labels).
class_names = ['Setosa', 'Versicolor', 'Virginica']
cm_lr = confusion_matrix(y_test, lr_test_pred)
cm_dt = confusion_matrix(y_test, dt_test_pred)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
panels = (
    (cm_lr, 'Blues', 'Logistic Regression - Confusion Matrix'),
    (cm_dt, 'Greens', 'Decision Tree - Confusion Matrix'),
)
for ax, (matrix, palette, title) in zip(axes, panels):
    sns.heatmap(matrix, annot=True, fmt='d', cmap=palette, ax=ax,
                xticklabels=class_names,
                yticklabels=class_names)
    ax.set_title(title)
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')

plt.tight_layout()
plt.show()

print("\n✓ Model evaluation complete!")


In [None]:
# ============================================
# STEP 6: MODEL EXPORT
# ============================================

# Make sure the output folder is present; exist_ok=True means an already
# existing folder is not an error when this cell is re-run.
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

Logistic Regression Accuracy: 1.0
Decision Tree Accuracy: 1.0


In [None]:
# Persist both classifiers together with the fitted scaler.
artifacts = (
    (lr_model, 'models/logistic_model.pkl'),
    (dt_model, 'models/decision_tree_model.pkl'),
    (scaler, 'models/scaler.pkl'),
)
for estimator, path in artifacts:
    joblib.dump(estimator, path)

print("✓ Models and scaler exported successfully!")
print("\nExported files:")
for _, path in artifacts:
    print(f"  - {path}")


In [None]:
# Verify that every exported artifact exists on disk, and only declare the
# pipeline complete when none of them is missing.
expected_files = [
    'models/logistic_model.pkl',
    'models/decision_tree_model.pkl',
    'models/scaler.pkl',
]
missing_files = []

print("\nVerification:")
for file in expected_files:
    if os.path.exists(file):
        size = os.path.getsize(file)
        print(f"  ✓ {file} ({size} bytes)")
    else:
        # Remember the failure so the final banner reflects it.
        missing_files.append(file)
        print(f"  ✗ {file} - NOT FOUND")

print("\n" + "=" * 60)
if missing_files:
    print(f"PIPELINE INCOMPLETE! ✗ ({len(missing_files)} file(s) missing)")
    print("=" * 60)
    print("\nRe-run the export step before deploying.")
else:
    print("PIPELINE COMPLETE! ✓")
    print("=" * 60)
    print("\nModels are ready for deployment!")


Models and Scaler exported successfully!


# Summary

This notebook demonstrates a complete end-to-end machine learning pipeline:

1. **Data Loading**: Loaded the Iris dataset from scikit-learn
2. **Data Cleaning**: Checked for missing values, duplicates, and outliers
3. **Exploratory Data Analysis**: Visualized data distributions, correlations, and relationships
4. **Data Preprocessing**: Split data into train/test sets and applied feature scaling
5. **Model Training**: Trained both Logistic Regression and Decision Tree classifiers
6. **Model Evaluation**: Comprehensive evaluation with accuracy, classification reports, and confusion matrices
7. **Model Export**: Exported trained models and scaler using joblib for deployment

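The exported models can be integrated into a FastAPI backend for real-time predictions. Both models were trained on standardized features, so the backend must load `models/scaler.pkl` and apply it to incoming measurements before calling `predict`.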
