In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
# Install a global "ignore" filter: no category/module is given, so every
# warning raised after this line is suppressed for the rest of the run.
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries used below — consider narrowing the filter to a specific category.
warnings.filterwarnings('ignore')

# Single seed reused by every randomised step so that runs are repeatable
RANDOM_STATE = 42

# Fetch the Wine dataset and separate the feature matrix from the labels
wine_data = load_wine()
X, y = wine_data.data, wine_data.target

print("Dataset loaded successfully!")
print(f"Dataset shape: {X.shape}")
print(f"Number of features: {X.shape[1]}")
print(f"Number of samples: {X.shape[0]}")
print(f"Number of classes: {len(np.unique(y))}")

# Tabular view of the data: one column per feature plus a trailing 'target' column
wine_df = pd.DataFrame(X, columns=wine_data.feature_names).assign(target=y)

print("\nDataset Info:")
print(f"Shape: {wine_df.shape}")
print(f"Features: {list(wine_data.feature_names)}")
print(f"Target classes: {wine_data.target_names}")

# Per-class sample counts, ordered by class label
print("\nClass Distribution:")
class_counts = pd.Series(y).value_counts().sort_index()
# The position in the sorted counts doubles as the class label used to look up the name
for i, count in enumerate(class_counts.values):
    print(f"Class {i} ({wine_data.target_names[i]}): {count} samples ({count/len(y)*100:.1f}%)")

# Summary statistics restricted to the five leading feature columns
print("\nFeature Statistics (First 5 features):")
leading_columns = wine_df.columns[:5]
print(wine_df[leading_columns].describe())

# Overview figure with two panels: class balance (left) and feature correlations (right)
plt.figure(figsize=(12, 4))

# --- Left panel: number of samples per class ---
plt.subplot(1, 2, 1)
class_names = [
    f"Class {i}\n({wine_data.target_names[i]})"
    for i, _ in enumerate(wine_data.target_names)
]
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']
bar_heights = class_counts.values
plt.bar(class_names, bar_heights, color=colors, alpha=0.8, edgecolor='black')
plt.title('Wine Dataset - Class Distribution', fontsize=14, fontweight='bold')
plt.ylabel('Number of Samples')
plt.grid(axis='y', alpha=0.3)

# Write each count just above the top of its bar
for position, height in enumerate(bar_heights):
    plt.text(position, height + 1, str(height),
             ha='center', va='bottom', fontweight='bold')

# --- Right panel: pairwise correlations of the first 8 columns ---
plt.subplot(1, 2, 2)
sample_features = wine_df.iloc[:, :8]  # limited to 8 columns to keep the annotations readable
correlation_matrix = sample_features.corr()
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    fmt='.2f',
    cbar_kws={'shrink': 0.8},
)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix\n(First 8 Features)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# Hold out 20% of the samples for testing; stratify on y so both parts keep
# the class proportions of the full dataset
split_parts = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)
X_train, X_test, y_train, y_test = split_parts

print(f"\nTraining set size: {X_train.shape[0]} samples ({X_train.shape[0]/X.shape[0]*100:.1f}%)")
print(f"Testing set size: {X_test.shape[0]} samples ({X_test.shape[0]/X.shape[0]*100:.1f}%)")

# Report how many samples of each class ended up in each part
print("\nClass distribution in splits:")
train_classes, test_classes = (
    pd.Series(labels).value_counts().sort_index()
    for labels in (y_train, y_test)
)

# Counts are looked up by class label, which runs 0..n_classes-1
for i, _ in enumerate(wine_data.target_names):
    train_pct = train_classes[i] / len(y_train) * 100
    test_pct = test_classes[i] / len(y_test) * 100
    print(f"Class {i}: Train={train_classes[i]} ({train_pct:.1f}%), Test={test_classes[i]} ({test_pct:.1f}%)")

# K-Nearest Neighbors (KNN): fit one classifier per candidate k and score it
# on the held-out test split.
# NOTE(review): the best k is picked using test-set accuracy, so the reported
# "best" score is optimistic; a validation split or cross-validation would
# give an unbiased estimate.
# NOTE(review): X_train/X_test are passed in as-is; no feature scaling is
# applied in this section even though KNN is distance-based.
print("\nK-NEAREST NEIGHBORS (KNN) IMPLEMENTATION")
print("=" * 60)

# Candidate neighbourhood sizes, plus containers for the per-k results
k_values = [1, 5, 11, 15, 21]
knn_accuracies = []   # test accuracy, aligned index-for-index with k_values
knn_models = {}       # fitted classifier keyed by k

print("Testing different k values for KNN:")
print("-" * 40)

for k in k_values:
    # fit() returns the estimator itself, so the fitted model can be stored directly
    fitted = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    accuracy = accuracy_score(y_test, fitted.predict(X_test))

    knn_models[k] = fitted
    knn_accuracies.append(accuracy)

    print(f"k = {k:2d}: Accuracy = {accuracy:.4f} ({accuracy*100:.2f}%)")

# np.argmax returns the first maximum, so ties resolve to the smallest k
best_k = k_values[np.argmax(knn_accuracies)]
print(f"\nBest KNN performance: k = {k_values[np.argmax(knn_accuracies)]} with accuracy = {max(knn_accuracies):.4f}")

# Re-run the winning model on the test split for the detailed report
best_knn = knn_models[best_k]
y_pred_best_knn = best_knn.predict(X_test)

print(f"\nDETAILED ANALYSIS FOR BEST KNN MODEL (k = {best_k})")
print("=" * 60)

print("\nClassification Report:")
knn_report = classification_report(
    y_test, y_pred_best_knn, target_names=wine_data.target_names
)
print(knn_report)

# Confusion matrix (rows: actual class, columns: predicted class)
cm_knn = confusion_matrix(y_test, y_pred_best_knn)
print("\nConfusion Matrix:")
print(cm_knn)

# Radius Neighbors (RNN): fit one classifier per candidate radius and score it
# on the held-out test split.
#
# Results contract relied on by the later sections of this script:
#   * rnn_accuracies is aligned index-for-index with radius_values;
#   * a radius whose run failed is recorded as 0.0 and has no rnn_models entry;
#   * "max(rnn_accuracies) > 0" therefore means "at least one radius worked".
print("\nRADIUS NEIGHBORS (RNN) IMPLEMENTATION")
print("=" * 60)

# Define radius values to test
radius_values = [350, 400, 450, 500, 550, 600]
rnn_accuracies = []
rnn_models = {}

print("Testing different radius values for RNN:")
print("-" * 45)

for radius in radius_values:
    try:
        # Create and train RNN classifier
        rnn = RadiusNeighborsClassifier(radius=radius)
        rnn.fit(X_train, y_train)

        # predict() raises ValueError when some test sample has no training
        # neighbour inside the radius (no outlier_label is configured), i.e.
        # when the radius is too SMALL for the data.
        y_pred = rnn.predict(X_test)
    except ValueError as e:
        # Only the expected scikit-learn failure is handled; anything else is
        # a genuine bug and is allowed to propagate.
        print(f"Radius = {radius:3d}: Error - {str(e)}")
        rnn_accuracies.append(0.0)  # Add 0 for failed attempts
    else:
        accuracy = accuracy_score(y_test, y_pred)
        rnn_accuracies.append(accuracy)
        rnn_models[radius] = rnn

        print(f"Radius = {radius:3d}: Accuracy = {accuracy:.4f} ({accuracy*100:.2f}%)")

if max(rnn_accuracies) > 0:
    # np.argmax returns the first maximum, so ties resolve to the smallest radius
    best_radius_idx = np.argmax(rnn_accuracies)
    best_radius = radius_values[best_radius_idx]
    print(f"\nBest RNN performance: radius = {best_radius} with accuracy = {max(rnn_accuracies):.4f}")

    # Detailed analysis for best RNN model
    best_rnn = rnn_models[best_radius]
    y_pred_best_rnn = best_rnn.predict(X_test)

    print(f"\nDETAILED ANALYSIS FOR BEST RNN MODEL (radius = {best_radius})")
    print("=" * 60)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_best_rnn,
                                target_names=wine_data.target_names))

    # Confusion Matrix for RNN (rows: actual class, columns: predicted class)
    cm_rnn = confusion_matrix(y_test, y_pred_best_rnn)
    print("\nConfusion Matrix:")
    print(cm_rnn)
else:
    # A failure means some test point had no neighbour within the radius, so
    # the remedy is a larger radius (or scaled features), not a smaller one.
    print("\nAll RNN models failed - radius values may be too small for this dataset")
    print("\nCannot provide detailed RNN analysis - all models failed")
    print("Suggestion: Try larger radius values or scale the data")

# Visualization and Comparison: a 2x2 figure with the KNN accuracy curve, the
# RNN accuracy curve, a best-vs-best bar chart, and the confusion matrix of
# whichever model scored higher.
print("\nVISUALIZATION AND COMPARISON")
print("=" * 50)

# Compute the headline numbers once instead of re-evaluating max() per use.
# Failed RNN runs are stored as 0.0, so a positive maximum means at least one
# radius produced a usable model.
best_knn_acc = max(knn_accuracies)
best_rnn_acc = max(rnn_accuracies)
rnn_succeeded = best_rnn_acc > 0

# Create comprehensive visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('KNN vs RNN Classification Analysis', fontsize=16, fontweight='bold')
ax_knn, ax_rnn = axes[0, 0], axes[0, 1]
ax_compare, ax_cm = axes[1, 0], axes[1, 1]

# Plot 1: KNN Accuracy Trends
ax_knn.plot(k_values, knn_accuracies, 'bo-', linewidth=2, markersize=8,
            markerfacecolor='lightblue', markeredgecolor='blue')
ax_knn.set_xlabel('K Value', fontweight='bold')
ax_knn.set_ylabel('Accuracy', fontweight='bold')
ax_knn.set_title('KNN: Accuracy vs K Values', fontweight='bold')
ax_knn.grid(True, alpha=0.3)
ax_knn.set_ylim([0, 1])

# Add value labels slightly above each marker
for k, acc in zip(k_values, knn_accuracies):
    ax_knn.annotate(f'{acc:.3f}', (k, acc),
                    textcoords="offset points", xytext=(0, 10), ha='center')

# Plot 2: RNN Accuracy Trends
if rnn_succeeded:
    ax_rnn.plot(radius_values, rnn_accuracies, 'ro-', linewidth=2, markersize=8,
                markerfacecolor='lightcoral', markeredgecolor='red')
    ax_rnn.set_xlabel('Radius Value', fontweight='bold')
    ax_rnn.set_ylabel('Accuracy', fontweight='bold')
    ax_rnn.set_title('RNN: Accuracy vs Radius Values', fontweight='bold')
    ax_rnn.grid(True, alpha=0.3)
    ax_rnn.set_ylim([0, 1])

    # Label only the radii that produced a score; 0.0 marks a failed run
    for r, acc in zip(radius_values, rnn_accuracies):
        if acc > 0:
            ax_rnn.annotate(f'{acc:.3f}', (r, acc),
                            textcoords="offset points", xytext=(0, 10), ha='center')
else:
    # RadiusNeighborsClassifier fails when a test sample has no neighbour
    # inside the radius, which points to a radius that is too small.
    ax_rnn.text(0.5, 0.5, 'RNN Failed\n(Radius too small)',
                ha='center', va='center', transform=ax_rnn.transAxes,
                fontsize=14, bbox=dict(boxstyle="round,pad=0.3", facecolor="lightcoral"))
    ax_rnn.set_title('RNN: Failed Analysis', fontweight='bold')

# Plot 3: Comparison Bar Chart (the RNN bar is omitted when every radius failed)
if rnn_succeeded:
    models = ['Best KNN', 'Best RNN']
    accuracies = [best_knn_acc, best_rnn_acc]
    colors = ['lightblue', 'lightcoral']
else:
    models = ['Best KNN']
    accuracies = [best_knn_acc]
    colors = ['lightblue']

bars = ax_compare.bar(models, accuracies, color=colors, edgecolor='black', alpha=0.8)
ax_compare.set_ylabel('Accuracy', fontweight='bold')
ax_compare.set_title('Model Comparison', fontweight='bold')
ax_compare.set_ylim([0, 1])
ax_compare.grid(axis='y', alpha=0.3)

# Add value labels on bars
for bar, acc in zip(bars, accuracies):
    height = bar.get_height()
    ax_compare.text(bar.get_x() + bar.get_width() / 2., height + 0.01,
                    f'{acc:.3f}', ha='center', va='bottom', fontweight='bold')

# Plot 4: Confusion Matrix for Best Model
# RNN is shown only when it is strictly better; ties and RNN failure show KNN.
if rnn_succeeded and best_rnn_acc > best_knn_acc:
    cm = cm_rnn
    title = f'Best Model Confusion Matrix\n(RNN, radius={best_radius})'
else:
    cm = cm_knn
    title = f'Best Model Confusion Matrix\n(KNN, k={best_k})'

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=wine_data.target_names,
            yticklabels=wine_data.target_names,
            ax=ax_cm)
ax_cm.set_title(title, fontweight='bold')
ax_cm.set_xlabel('Predicted', fontweight='bold')
ax_cm.set_ylabel('Actual', fontweight='bold')

plt.tight_layout()
plt.show()

# Performance summary table: one row per tested hyper-parameter value, KNN
# rows first and RNN rows after, with accuracy as a fraction and as a percentage
print("\nPERFORMANCE SUMMARY TABLE")
print("=" * 60)

summary_data = {
    'K/Radius': [*k_values, *radius_values],
    'Algorithm': ['KNN' for _ in k_values] + ['RNN' for _ in radius_values],
    'Accuracy': [*knn_accuracies, *rnn_accuracies],
}

# Derive the percentage column from the fractional accuracy
summary_df = pd.DataFrame(summary_data).assign(
    **{'Accuracy (%)': lambda frame: frame['Accuracy'] * 100}
)

table_text = summary_df.to_string(index=False, float_format='%.3f')
print(table_text)

# Analysis and Interpretation: print the best configuration of each
# algorithm, state which one won, and expose the winner through
# best_overall / best_acc for later reporting.
print("\nANALYSIS AND INTERPRETATION")
print("=" * 50)

# Headline numbers, computed once. np.argmax returns the first maximum, so
# ties between settings resolve to the earliest entry in the tested list.
best_knn_acc = max(knn_accuracies)
best_rnn_acc = max(rnn_accuracies)
best_knn_k = k_values[np.argmax(knn_accuracies)]
# Failed RNN runs are recorded as 0.0, so a positive maximum means that at
# least one radius produced a usable model.
rnn_succeeded = best_rnn_acc > 0

print("\nObservations:")
print(f"• Best KNN model: k = {best_knn_k} with accuracy = {best_knn_acc:.4f}")

if rnn_succeeded:
    best_rnn_idx = np.argmax(rnn_accuracies)
    print(f"• Best RNN model: radius = {radius_values[best_rnn_idx]} with accuracy = {best_rnn_acc:.4f}")

    if best_rnn_acc > best_knn_acc:
        print("• RNN outperformed KNN")
    elif best_knn_acc > best_rnn_acc:
        print("• KNN outperformed RNN")
    else:
        print("• KNN and RNN achieved similar performance")
else:
    # RadiusNeighborsClassifier fails when a test sample has no neighbour
    # inside the radius, i.e. the radius is too small for the feature scale.
    print("• RNN failed with the tested radius values (too small for dataset scale)")

print("\nOverall Best Model:")
# RNN wins only with a strictly higher accuracy; a tie or an RNN failure
# both fall back to the best KNN model.
if rnn_succeeded and best_rnn_acc > best_knn_acc:
    best_overall = f"RNN (radius={radius_values[np.argmax(rnn_accuracies)]})"
    best_acc = best_rnn_acc
else:
    best_overall = f"KNN (k={best_knn_k})"
    best_acc = best_knn_acc

print(f"   {best_overall} with accuracy = {best_acc:.4f} ({best_acc*100:.2f}%)")

# Blank separator (two newlines) before the general remarks
print("\n")
print("• Small k values in KNN can lead to overfitting (high variance)")
print("• Large k values in KNN can lead to underfitting (high bias)")
print("• RNN requires careful radius selection based on data scale")
print("• Feature scaling might improve RNN performance")
print("• Wine dataset has clear class separability, enabling high accuracy")


# Final results summary: restate the dataset dimensions and the winning model
print("\nCONCLUSION")
print("=" * 30)

# Derive the class count from the labels, in the same way the sample and
# feature counts are derived from X, instead of hard-coding it.
n_classes = len(np.unique(y))

print("\nFinal Results:")
print(f"   • Dataset: Wine Dataset ({X.shape[0]} samples, {X.shape[1]} features, {n_classes} classes)")
print(f"   • Best performing model: {best_overall}")
print(f"   • Achieved accuracy: {best_acc:.4f} ({best_acc*100:.2f}%)")

