# Classification Comparison on Iris Dataset

This notebook compares various classification algorithms using the Iris dataset. We will train and evaluate the following models:

1. Logistic Regression
2. K-Nearest Neighbors (KNN)
3. Support Vector Machine (SVM)
4. Decision Tree Classifier
5. Random Forest Classifier
6. Naive Bayes (GaussianNB)

We will evaluate them based on Accuracy, Precision, Recall, and F1-Score, and visualize their performance.

## 1. Import Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import ListedColormap

# Estimator utilities
from sklearn.base import clone

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Model Selection & Evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Dataset
from sklearn.datasets import load_iris

# Set plot style
sns.set(style="whitegrid")
%matplotlib inline

## 2. Load Data

In [None]:
# Request the pandas flavour of the Iris bunch so that the features and the
# target are also available together as one DataFrame
iris_data = load_iris(as_frame=True)
df = iris_data["frame"]

# Preview the first 5 rows
df.head(5)

In [None]:
# Dataset info: print a concise summary of `df` (columns, non-null counts, dtypes)
# to confirm there are no missing values before modelling
df.info()

In [None]:
# Number of samples per encoded class label, followed by the label -> species names
print("Target Class Distribution:", df['target'].value_counts(), sep="\n")
print(f"\nClass Names: {iris_data.target_names}")

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Scatter-matrix of every feature pair, with points coloured by the encoded class label
sns.pairplot(
    data=df,
    hue='target',
    palette='viridis',
)
plt.show()

In [None]:
# Pairwise correlations of all columns in `df`, drawn as an annotated heatmap
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlation Matrix")
plt.show()

## 4. Data Splitting

In [None]:
X = iris_data.data
y = iris_data.target

# Split into training (80%) and testing (20%) sets.
# stratify=y keeps the class proportions identical in both splits; with only
# 30 test samples an unstratified draw leaves the classes unevenly represented,
# which skews the per-class metrics reported later.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## 5. Model Implementation & Training

In [None]:
# Dictionary to store models (display name -> unfitted estimator).
# The tree-based estimators draw random numbers while fitting, so they get a
# fixed random_state; without it the reported scores change on every run.
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "KNN": KNeighborsClassifier(n_neighbors=3),
    "SVM": SVC(kernel='linear'),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Naive Bayes": GaussianNB()
}

# Dictionary to store results (display name -> test-set accuracy)
results = {}

# Train and Predict
for name, model in models.items():
    # Train on the 4-feature training split
    model.fit(X_train, y_train)

    # Predict on the held-out test split
    y_pred = model.predict(X_test)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy

    print(f"--- {name} ---")
    print(f"Accuracy: {accuracy:.4f}")
    # Concatenate instead of passing two print() arguments: the separator space
    # print() inserts would shift the report's header row out of alignment.
    report = classification_report(y_test, y_pred, target_names=iris_data.target_names)
    print("\nClassification Report:\n" + report)
    print("-"*50)

## 6. Model Comparison

In [None]:
# Tabulate the per-model accuracies, best model first
results_df = (
    pd.DataFrame(list(results.items()), columns=['Model', 'Accuracy'])
    .sort_values(by='Accuracy', ascending=False)
)

results_df

In [None]:
# Plot model accuracy
fig, ax = plt.subplots(figsize=(10, 6))
# seaborn 0.13 deprecates passing `palette` without `hue`; mapping hue to the
# same column as the categorical axis gives the same one-colour-per-bar look.
# dodge=False keeps full-width bars on releases that would otherwise offset them.
sns.barplot(
    x='Accuracy', y='Model', data=results_df,
    hue='Model', dodge=False, palette='viridis', ax=ax,
)
# Some seaborn releases add a legend for the hue mapping; it would only repeat
# the y tick labels, so drop it when present.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
# Zoom the x axis into the upper accuracy range so small differences are visible
ax.set_xlim(0.8, 1.05)
ax.set_title("Model Accuracy Comparison")
ax.set_xlabel("Accuracy Score")
ax.set_ylabel("Classification Algorithm")
plt.show()

## 7. Confusion Matrices

In [None]:
# One confusion-matrix heatmap per fitted model, evaluated on the test split
for name, model in models.items():
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        cbar=False,
        xticklabels=iris_data.target_names,
        yticklabels=iris_data.target_names,
        ax=ax,
    )
    # Rows are the true classes, columns the predicted classes
    ax.set(title=f"{name} - Confusion Matrix", xlabel='Predicted', ylabel='Actual')
    plt.show()

## 8. Decision Boundaries Visualization
To better understand how each model classifies the data, we will plot decision boundaries. 
Since decision boundaries are best visualized in 2D, we will retrain our models using only the first two features: **Sepal Length** and **Sepal Width**.

In [None]:
# Use only the first two features for 2D visualization
X_2d = iris_data.data.iloc[:, :2].values  # Sepal Length, Sepal Width
y_2d = iris_data.target.values

# Create color maps: light shades fill the decision regions, bold shades mark the samples
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

def plot_decision_boundary(model, X, y, title):
    """Fit a copy of ``model`` on two features and plot its decision regions.

    Parameters
    ----------
    model : scikit-learn estimator
        Classifier to visualize. It is cloned before fitting, so the object
        passed in (and any fit it already holds) is left untouched.
    X : numpy.ndarray
        Array with two columns; column 0 is drawn on the x axis and
        column 1 on the y axis.
    y : numpy.ndarray
        Integer class labels, used to index ``iris_data.target_names``.
    title : str
        Figure title.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Fit an unfitted copy: fitting `model` itself would overwrite the
    # 4-feature fit stored in `models`, and earlier cells that call
    # model.predict(X_test) would then fail when re-run.
    model_2d = clone(model).fit(X, y)

    # Evaluation grid covering the data range plus a margin of 1 on each side
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))

    # Predicted class for every grid point, reshaped back to the grid layout
    Z = model_2d.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    class_names = iris_data.target_names
    # One contour band per class label (edges at -0.5, 0.5, 1.5, ...), so each
    # class always gets the same region color even if the model never predicts
    # one of the classes on this grid.
    levels = np.arange(len(class_names) + 1) - 0.5

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.contourf(xx, yy, Z, levels=levels, cmap=cmap_light)

    # Plot training points; hue_order pins class -> color to match the regions
    sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=class_names[y],
                    hue_order=list(class_names),
                    palette=list(cmap_bold.colors), edgecolor="black", s=50,
                    ax=ax)

    ax.set_title(title)
    ax.set_xlabel(iris_data.feature_names[0])
    ax.set_ylabel(iris_data.feature_names[1])
    plt.show()

# Plot decision boundaries for each model
for name, model in models.items():
    plot_decision_boundary(model, X_2d, y_2d, f"Decision Boundary - {name}")