In [3]:
pip install plotly

Note: you may need to restart the kernel to use updated packages.


In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import plotly.express as px
import plotly.graph_objects as go
from itertools import combinations
import warnings

# Suppress warnings for cleaner output.
# NOTE(review): with no category/module argument this 'ignore' filter hides
# every warning raised after this point, not only cosmetic ones. Consider
# narrowing it to specific categories once the analysis is stable.
warnings.filterwarnings('ignore')

# Loading and cleaning the Iris dataset
def load_and_clean_data(path='Iris.csv'):
    """Load the Iris CSV, clean it, and split it into features and labels.

    Cleaning steps, applied in this order:
      1. drop the 'Id' column when the file has one;
      2. drop rows containing any missing value;
      3. keep only rows whose 'Species' is one of the three known labels;
      4. for each feature column in turn, keep only rows inside
         [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]. The quartiles of a column are
         computed on the rows that survived the previous columns' filters.

    Args:
        path: Location of the CSV file. Defaults to 'Iris.csv' in the
            current working directory.

    Returns:
        A tuple ``(X, y, data)`` where ``X`` is a 2-D array holding the four
        feature columns, ``y`` is a 1-D array of species labels and ``data``
        is the cleaned DataFrame. If anything goes wrong (file missing,
        expected column absent, no rows left after cleaning, ...) an error
        message is printed and ``(None, None, None)`` is returned.
    """
    feature_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
    valid_species = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

    try:
        data = pd.read_csv(path)

        # 'Id' is never used as a feature; tolerate files that lack it.
        data = data.drop(columns='Id', errors='ignore')

        # Handling missing or invalid values
        data = data.dropna()
        data = data[data['Species'].isin(valid_species)]

        # Removing outliers column by column using the 1.5 * IQR rule.
        # `between` is inclusive on both ends, i.e. a row is dropped only when
        # it is strictly below the lower or strictly above the upper fence.
        for col in feature_cols:
            q1 = data[col].quantile(0.25)
            q3 = data[col].quantile(0.75)
            iqr = q3 - q1
            data = data[data[col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]

        # An empty frame would only fail later (scaling / splitting) with a
        # less helpful message, so report it through the normal error path.
        if data.empty:
            raise ValueError('no valid rows remain after cleaning')

        X = data[feature_cols].values
        y = data['Species'].values
        return X, y, data
    except Exception as e:
        # Deliberately broad: callers rely on the (None, None, None) sentinel
        # rather than on exceptions.
        print(f"Error loading data: {e}")
        return None, None, None

# Normalizing features
def normalize_features(X):
    """Standardize the feature matrix with a freshly fitted StandardScaler.

    The scaler is fitted on ``X`` and then applied to that same ``X``.

    Args:
        X: Feature matrix to standardize.

    Returns:
        A tuple ``(X_normalized, scaler)``: the transformed features and the
        fitted StandardScaler instance that produced them.
    """
    fitted_scaler = StandardScaler().fit(X)
    return fitted_scaler.transform(X), fitted_scaler

# Training and evaluating KNN models
def train_and_evaluate_knn(X, y, k_values=(1, 3, 5, 7, 9, 11, 15)):
    """Fit and score a KNN classifier for every requested K.

    The data is split 70/30 into train and test sets with a fixed
    ``random_state`` of 42. For each K a KNeighborsClassifier is fitted on the
    training part and evaluated on the test part (accuracy, weighted
    precision / recall / F1 and a confusion matrix). In addition a 5-fold
    cross-validated accuracy is computed on the full ``X``/``y``. A short
    report is printed for every K.

    NOTE(review): the cross-validation runs on ``X`` exactly as passed in. If
    ``X`` was scaled with statistics from the whole dataset, the held-out
    folds and the test split have influenced that scaling.

    Args:
        X: Feature matrix.
        y: Species labels aligned with the rows of ``X``.
        k_values: Iterable of neighbour counts to try. Values larger than the
            number of training samples are skipped with a message, because
            the classifier cannot query more neighbours than it was fitted on.

    Returns:
        A tuple ``(results, X_train, X_test, y_train, y_test)``. ``results``
        maps each evaluated K to a dict with the keys 'accuracy',
        'precision', 'recall', 'f1', 'confusion_matrix', 'cv_mean' and
        'cv_std'. Confusion-matrix rows and columns are ordered setosa,
        versicolor, virginica.
    """
    species_labels = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    n_train = len(X_train)
    results = {}

    for k in k_values:
        if k > n_train:
            print(f'\nK={k} skipped: only {n_train} training samples available.')
            continue

        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)

        # Calculating metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        cm = confusion_matrix(y_test, y_pred, labels=species_labels)

        # Cross-validation (mean / std computed once and reused below)
        cv_scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
        cv_mean = cv_scores.mean()
        cv_std = cv_scores.std()

        results[k] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'confusion_matrix': cm,
            'cv_mean': cv_mean,
            'cv_std': cv_std
        }

        # Printing results
        print(f'\nK={k} Results:')
        print(f'Accuracy: {accuracy:.4f}')
        print(f'Precision: {precision:.4f}')
        print(f'Recall: {recall:.4f}')
        print(f'F1-Score: {f1:.4f}')
        print(f'5-Fold CV Accuracy: {cv_mean:.4f} (±{cv_std:.4f})')
        print('Confusion Matrix:')
        print(cm)

    return results, X_train, X_test, y_train, y_test

# Visualizing decision boundaries for all pairs of features
def plot_decision_boundaries(X, y, data, k_values=(3, 5, 7, 9), mesh_step=0.1):
    """Show KNN decision regions for every pair of the four features.

    For each K and each of the six feature pairs, a KNN classifier is fitted
    on the training part of a 70/30 split (``random_state=42``) restricted to
    those two features. Its predictions over a regular grid are drawn as a
    Plotly contour, with the training points overlaid as one scatter trace per
    species. One figure is shown per (K, feature pair), iterating over K in
    the outer loop.

    Args:
        X: Feature matrix with the four feature columns in the order
            SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm. Axis
            titles label the values as normalized.
        y: Species labels aligned with the rows of ``X``.
        data: Unused; kept so existing callers keep working.
        k_values: Iterable of neighbour counts to visualize.
        mesh_step: Spacing of the prediction grid along both axes.

    Returns:
        None. Figures are displayed via ``fig.show()``.
    """
    features = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
    species = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

    # The split and the mesh depend only on the feature pair, not on K, so
    # build them once per pair instead of once per (K, pair).
    panels = []
    for i, j in combinations(range(len(features)), 2):
        X_2d = X[:, [i, j]]
        X_train, _, y_train, _ = train_test_split(X_2d, y, test_size=0.3, random_state=42)

        # Creating mesh grid, padded by one unit around the data
        x_min, x_max = X_2d[:, 0].min() - 1, X_2d[:, 0].max() + 1
        y_min, y_max = X_2d[:, 1].min() - 1, X_2d[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, mesh_step),
                             np.arange(y_min, y_max, mesh_step))
        panels.append((i, j, X_train, y_train, xx, yy))

    for k in k_values:
        for i, j, X_train, y_train, xx, yy in panels:
            # Training KNN
            knn = KNeighborsClassifier(n_neighbors=k)
            knn.fit(X_train, y_train)

            # Predicting on mesh grid; labels are mapped to their position in
            # `species` because the contour needs numeric z values.
            Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
            Z = np.array([species.index(z) for z in Z]).reshape(xx.shape)

            # Creating Plotly contour plot
            fig = go.Figure()
            fig.add_trace(go.Contour(
                x=xx[0], y=yy[:, 0], z=Z,
                colorscale='Viridis',
                opacity=0.4,
                showscale=False
            ))

            # Adding scatter points, one trace per species
            for species_name in species:
                mask = y_train == species_name
                fig.add_trace(go.Scatter(
                    x=X_train[mask, 0], y=X_train[mask, 1],
                    mode='markers',
                    marker=dict(size=8, line=dict(width=1, color='black')),
                    name=species_name
                ))

            fig.update_layout(
                title=f'KNN Decision Boundaries (K={k}, {features[i]} vs {features[j]})',
                xaxis_title=features[i] + ' (Normalized)',
                yaxis_title=features[j] + ' (Normalized)',
                legend_title='Species',
                width=600, height=500
            )
            fig.show()

# Main execution
def main():
    """Run the full workflow: load, scale, evaluate KNN, plot, pick K.

    Returns early without output beyond the loader's own error message when
    the data cannot be loaded.

    NOTE(review): features are scaled on the whole dataset before the
    train/test split and cross-validation happen inside
    ``train_and_evaluate_knn``, so held-out samples influence the scaling.
    """
    # Loading and cleaning data
    X, y, data = load_and_clean_data()
    if X is None:
        return

    # Normalizing features (the fitted scaler is not needed afterwards)
    X_normalized, _ = normalize_features(X)

    # Training and evaluating models (only the per-K results are used here)
    results, *_ = train_and_evaluate_knn(X_normalized, y)

    # Visualizing decision boundaries for all feature pairs
    plot_decision_boundaries(X_normalized, y, data)

    # max() raises on an empty mapping, so bail out explicitly.
    if not results:
        print('\nNo KNN results available to choose a best K from.')
        return

    # Finding best K
    best_k = max(results, key=lambda k: results[k]['accuracy'])
    print(f'\nBest K based on test accuracy: {best_k} (Accuracy: {results[best_k]["accuracy"]:.4f})')

    # Picking K by test accuracy makes that accuracy an optimistic estimate;
    # also report the cross-validated choice, which does not use the split.
    best_cv_k = max(results, key=lambda k: results[k]['cv_mean'])
    print(f'Best K based on 5-fold CV accuracy: {best_cv_k} '
          f'(CV Accuracy: {results[best_cv_k]["cv_mean"]:.4f})')

if __name__ == '__main__':
    main()



K=1 Results:
Accuracy: 0.9091
Precision: 0.9116
Recall: 0.9091
F1-Score: 0.9072
5-Fold CV Accuracy: 0.9384 (±0.0401)
Confusion Matrix:
[[17  0  0]
 [ 0  8  3]
 [ 0  1 15]]

K=3 Results:
Accuracy: 0.9318
Precision: 0.9341
Recall: 0.9318
F1-Score: 0.9322
5-Fold CV Accuracy: 0.9520 (±0.0353)
Confusion Matrix:
[[17  0  0]
 [ 0 10  1]
 [ 0  2 14]]

K=5 Results:
Accuracy: 0.9318
Precision: 0.9341
Recall: 0.9318
F1-Score: 0.9322
5-Fold CV Accuracy: 0.9520 (±0.0353)
Confusion Matrix:
[[17  0  0]
 [ 0 10  1]
 [ 0  2 14]]

K=7 Results:
Accuracy: 0.9318
Precision: 0.9341
Recall: 0.9318
F1-Score: 0.9322
5-Fold CV Accuracy: 0.9520 (±0.0353)
Confusion Matrix:
[[17  0  0]
 [ 0 10  1]
 [ 0  2 14]]

K=9 Results:
Accuracy: 0.9318
Precision: 0.9341
Recall: 0.9318
F1-Score: 0.9322
5-Fold CV Accuracy: 0.9384 (±0.0257)
Confusion Matrix:
[[17  0  0]
 [ 0 10  1]
 [ 0  2 14]]

K=11 Results:
Accuracy: 0.9545
Precision: 0.9545
Recall: 0.9545
F1-Score: 0.9545
5-Fold CV Accuracy: 0.9455 (±0.0342)
Confusion Matrix


Best K based on test accuracy: 15 (Accuracy: 0.9773)
