In [1]:
import pandas as pd
import numpy as np
import random
from collections import Counter

# Load the dataset
df = pd.read_csv('Data/iris.csv')
print("Dataset shape:", df.shape)
print("\nDataset head:")
print(df.head())

# Check the target distribution
print("\nClass distribution:")
print(df.iloc[:, -1].value_counts())

# Prepare features and target
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Convert target to numerical labels
unique_classes = np.unique(y)
class_to_num = {cls: i for i, cls in enumerate(unique_classes)}
num_to_class = {i: cls for i, cls in enumerate(unique_classes)}
y_num = np.array([class_to_num[cls] for cls in y])

print(f"\nClass mapping: {class_to_num}")
print(f"Features shape: {X.shape}")
print(f"Target shape: {y_num.shape}")

Dataset shape: (150, 6)

Dataset head:
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa

Class distribution:
Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

Class mapping: {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
Features shape: (150, 5)
Target shape: (150,)


In [2]:
def stratified_split(X, y, train_size, val_size, test_size, random_state=42):
    """Stratified split maintaining class proportions"""
    np.random.seed(random_state)
    
    # Get indices for each class
    class_indices = {}
    for cls in np.unique(y):
        class_indices[cls] = np.where(y == cls)[0]
    
    train_indices, val_indices, test_indices = [], [], []
    
    for cls, indices in class_indices.items():
        np.random.shuffle(indices)
        n_samples = len(indices)
        
        n_train = int(n_samples * train_size)
        n_val = int(n_samples * val_size)
        
        train_indices.extend(indices[:n_train])
        val_indices.extend(indices[n_train:n_train+n_val])
        test_indices.extend(indices[n_train+n_val:])
    
    # Shuffle final indices
    np.random.shuffle(train_indices)
    np.random.shuffle(val_indices)
    np.random.shuffle(test_indices)
    
    return (X[train_indices], X[val_indices], X[test_indices], 
            y[train_indices], y[val_indices], y[test_indices])

# Create splits for both proportions
print("Creating 80:10:10 split...")
X_train_80, X_val_80, X_test_80, y_train_80, y_val_80, y_test_80 = stratified_split(
    X, y_num, 0.8, 0.1, 0.1)

print("Creating 70:15:15 split...")
X_train_70, X_val_70, X_test_70, y_train_70, y_val_70, y_test_70 = stratified_split(
    X, y_num, 0.7, 0.15, 0.15)

print(f"\n80:10:10 split - Train: {len(X_train_80)}, Val: {len(X_val_80)}, Test: {len(X_test_80)}")
print(f"70:15:15 split - Train: {len(X_train_70)}, Val: {len(X_val_70)}, Test: {len(X_test_70)}")

Creating 80:10:10 split...
Creating 70:15:15 split...

80:10:10 split - Train: 120, Val: 15, Test: 15
70:15:15 split - Train: 105, Val: 21, Test: 24


In [3]:
class LinearRegression:
    def __init__(self):
        self.weights = None
        self.bias = None
    
    def fit(self, X, y):
        # Add bias term to X
        X_with_bias = np.column_stack([np.ones(X.shape[0]), X])
        
        # Compute weights using normal equation: (X^T X)^-1 X^T y
        self.weights = np.linalg.pinv(X_with_bias.T @ X_with_bias) @ X_with_bias.T @ y
        self.bias = self.weights[0]
        self.weights = self.weights[1:]
    
    def predict(self, X):
        return X @ self.weights + self.bias

class LinearClassifier:
    def __init__(self):
        self.models = {}
        self.classes = None
    
    def fit(self, X, y):
        self.classes = np.unique(y)
        
        for cls in self.classes:
            # Create binary target (1 for current class, 0 for others)
            y_binary = (y == cls).astype(int)
            
            # Train linear regression for this class
            model = LinearRegression()
            model.fit(X, y_binary)
            self.models[cls] = model
    
    def predict(self, X):
        # Get predictions from all models
        predictions = {}
        for cls, model in self.models.items():
            predictions[cls] = model.predict(X)
        
        # Choose class with highest prediction for each sample
        pred_array = np.array([predictions[cls] for cls in self.classes]).T
        return self.classes[np.argmax(pred_array, axis=1)]
    
    def predict_proba(self, X):
        predictions = {}
        for cls, model in self.models.items():
            predictions[cls] = model.predict(X)
        
        return np.array([predictions[cls] for cls in self.classes]).T

def accuracy_score(y_true, y_pred):
    return np.mean(y_true == y_pred)

def classification_report(y_true, y_pred, classes):
    report = {}
    for cls in classes:
        tp = np.sum((y_true == cls) & (y_pred == cls))
        fp = np.sum((y_true != cls) & (y_pred == cls))
        fn = np.sum((y_true == cls) & (y_pred != cls))
        
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        report[cls] = {'precision': precision, 'recall': recall, 'f1': f1}
    
    return report

In [4]:
def random_oversampling(X, y):
    """Simple random oversampling to balance classes"""
    class_counts = Counter(y)
    max_count = max(class_counts.values())
    
    X_resampled = []
    y_resampled = []
    
    for cls in np.unique(y):
        cls_indices = np.where(y == cls)[0]
        cls_X = X[cls_indices]
        cls_y = y[cls_indices]
        
        # Randomly sample with replacement to reach max_count
        n_to_sample = max_count
        sampled_indices = np.random.choice(len(cls_X), size=n_to_sample, replace=True)
        
        X_resampled.append(cls_X[sampled_indices])
        y_resampled.append(cls_y[sampled_indices])
    
    return np.vstack(X_resampled), np.hstack(y_resampled)

def evaluate_model(X_train, y_train, X_val, y_val, X_test, y_test, model_name):
    """Train and evaluate a linear classifier"""
    # Train model
    clf = LinearClassifier()
    clf.fit(X_train, y_train)
    
    # Predictions
    train_pred = clf.predict(X_train)
    val_pred = clf.predict(X_val)
    test_pred = clf.predict(X_test)
    
    # Accuracies
    train_acc = accuracy_score(y_train, train_pred)
    val_acc = accuracy_score(y_val, val_pred)
    test_acc = accuracy_score(y_test, test_pred)
    
    print(f"\n{model_name} Results:")
    print(f"Train Accuracy: {train_acc:.4f}")
    print(f"Val Accuracy: {val_acc:.4f}")
    print(f"Test Accuracy: {test_acc:.4f}")
    
    # Detailed classification report for test set
    report = classification_report(y_test, test_pred, np.unique(y_test))
    print("\nTest Set Classification Report:")
    for cls, metrics in report.items():
        cls_name = num_to_class[cls]
        print(f"Class {cls_name}: Precision={metrics['precision']:.4f}, "
              f"Recall={metrics['recall']:.4f}, F1={metrics['f1']:.4f}")
    
    return {'train_acc': train_acc, 'val_acc': val_acc, 'test_acc': test_acc, 'report': report}

In [5]:
results = {}

# Baseline models (no oversampling)
print("="*50)
print("BASELINE MODELS (NO OVERSAMPLING)")
print("="*50)

results['80-10-10_baseline'] = evaluate_model(
    X_train_80, y_train_80, X_val_80, y_val_80, X_test_80, y_test_80, 
    "80:10:10 Baseline"
)

results['70-15-15_baseline'] = evaluate_model(
    X_train_70, y_train_70, X_val_70, y_val_70, X_test_70, y_test_70, 
    "70:15:15 Baseline"
)

# Random Oversampling
print("\n" + "="*50)
print("RANDOM OVERSAMPLING")
print("="*50)

# Random oversampling for 80:10:10 split
print(f"\nOriginal 80:10:10 train class distribution: {Counter(y_train_80)}")
X_train_80_over, y_train_80_over = random_oversampling(X_train_80, y_train_80)
print(f"After oversampling: {Counter(y_train_80_over)}")

results['80-10-10_oversampled'] = evaluate_model(
    X_train_80_over, y_train_80_over, X_val_80, y_val_80, X_test_80, y_test_80,
    "80:10:10 Random Oversampled"
)

# Random oversampling for 70:15:15 split
print(f"\nOriginal 70:15:15 train class distribution: {Counter(y_train_70)}")
X_train_70_over, y_train_70_over = random_oversampling(X_train_70, y_train_70)
print(f"After oversampling: {Counter(y_train_70_over)}")

results['70-15-15_oversampled'] = evaluate_model(
    X_train_70_over, y_train_70_over, X_val_70, y_val_70, X_test_70, y_test_70,
    "70:15:15 Random Oversampled"
)

BASELINE MODELS (NO OVERSAMPLING)

80:10:10 Baseline Results:
Train Accuracy: 0.8750
Val Accuracy: 1.0000
Test Accuracy: 0.8667

Test Set Classification Report:
Class Iris-setosa: Precision=1.0000, Recall=1.0000, F1=1.0000
Class Iris-versicolor: Precision=0.7143, Recall=1.0000, F1=0.8333
Class Iris-virginica: Precision=1.0000, Recall=0.6000, F1=0.7500

70:15:15 Baseline Results:
Train Accuracy: 0.8952
Val Accuracy: 0.8095
Test Accuracy: 0.9167

Test Set Classification Report:
Class Iris-setosa: Precision=1.0000, Recall=1.0000, F1=1.0000
Class Iris-versicolor: Precision=0.8000, Recall=1.0000, F1=0.8889
Class Iris-virginica: Precision=1.0000, Recall=0.7500, F1=0.8571

RANDOM OVERSAMPLING

Original 80:10:10 train class distribution: Counter({np.int64(0): 40, np.int64(2): 40, np.int64(1): 40})
After oversampling: Counter({np.int64(0): 40, np.int64(1): 40, np.int64(2): 40})

80:10:10 Random Oversampled Results:
Train Accuracy: 0.8833
Val Accuracy: 1.0000
Test Accuracy: 0.8667

Test Set Clas

In [6]:
def euclidean_distance(x1, x2):
    """Calculate euclidean distance between two points"""
    return np.sqrt(np.sum((x1 - x2) ** 2))

def find_k_nearest_neighbors(X, point_idx, k=2):
    """Find k nearest neighbors for a given point"""
    point = X[point_idx]
    distances = []
    
    for i, other_point in enumerate(X):
        if i != point_idx:
            dist = euclidean_distance(point, other_point)
            distances.append((dist, i))
    
    distances.sort(key=lambda x: x[0])
    return [idx for _, idx in distances[:k]]

def smote_random_pair(X, y, k=2):
    """SMOTE with random 2 samples from minority class"""
    class_counts = Counter(y)
    max_count = max(class_counts.values())
    
    X_synthetic = []
    y_synthetic = []
    
    for cls in np.unique(y):
        cls_indices = np.where(y == cls)[0]
        cls_X = X[cls_indices]
        current_count = len(cls_X)
        
        X_synthetic.append(cls_X)
        y_synthetic.append(np.full(current_count, cls))
        
        if current_count < max_count:
            n_synthetic = max_count - current_count
            
            for _ in range(n_synthetic):
                # Randomly select 2 samples from minority class
                if len(cls_X) >= 2:
                    idx1, idx2 = np.random.choice(len(cls_X), size=2, replace=False)
                    x1, x2 = cls_X[idx1], cls_X[idx2]
                else:
                    # If only one sample, duplicate it
                    x1 = x2 = cls_X[0]
                
                # Generate synthetic sample
                alpha = np.random.random()
                synthetic_sample = x1 + alpha * (x2 - x1)
                
                X_synthetic.append([synthetic_sample])
                y_synthetic.append([cls])
    
    return np.vstack(X_synthetic), np.hstack(y_synthetic)

def smote_nearest_neighbors(X, y, k=1):
    """SMOTE with nearest neighbor approach"""
    class_counts = Counter(y)
    max_count = max(class_counts.values())
    
    X_synthetic = []
    y_synthetic = []
    
    for cls in np.unique(y):
        cls_indices = np.where(y == cls)[0]
        cls_X = X[cls_indices]
        current_count = len(cls_X)
        
        X_synthetic.append(cls_X)
        y_synthetic.append(np.full(current_count, cls))
        
        if current_count < max_count:
            n_synthetic = max_count - current_count
            
            for _ in range(n_synthetic):
                # Randomly select one sample from minority class
                random_idx = np.random.randint(0, len(cls_X))
                selected_sample = cls_X[random_idx]
                
                # Find its nearest neighbor
                if len(cls_X) > 1:
                    neighbors = find_k_nearest_neighbors(cls_X, random_idx, k=1)
                    neighbor_sample = cls_X[neighbors[0]]
                else:
                    neighbor_sample = selected_sample
                
                # Generate synthetic sample
                alpha = np.random.random()
                synthetic_sample = selected_sample + alpha * (neighbor_sample - selected_sample)
                
                X_synthetic.append([synthetic_sample])
                y_synthetic.append([cls])
    
    return np.vstack(X_synthetic), np.hstack(y_synthetic)

In [7]:
print("\n" + "="*50)
print("SMOTE WITH RANDOM PAIRS")
print("="*50)

# SMOTE with random pairs for 80:10:10 split
X_train_80_smote1, y_train_80_smote1 = smote_random_pair(X_train_80, y_train_80)
print(f"\n80:10:10 SMOTE random pairs - Final distribution: {Counter(y_train_80_smote1)}")

results['80-10-10_smote_random'] = evaluate_model(
    X_train_80_smote1, y_train_80_smote1, X_val_80, y_val_80, X_test_80, y_test_80,
    "80:10:10 SMOTE Random Pairs"
)

# SMOTE with random pairs for 70:15:15 split
X_train_70_smote1, y_train_70_smote1 = smote_random_pair(X_train_70, y_train_70)
print(f"\n70:15:15 SMOTE random pairs - Final distribution: {Counter(y_train_70_smote1)}")

results['70-15-15_smote_random'] = evaluate_model(
    X_train_70_smote1, y_train_70_smote1, X_val_70, y_val_70, X_test_70, y_test_70,
    "70:15:15 SMOTE Random Pairs"
)

print("\n" + "="*50)
print("SMOTE WITH NEAREST NEIGHBORS")
print("="*50)

# SMOTE with nearest neighbors for 80:10:10 split
X_train_80_smote2, y_train_80_smote2 = smote_nearest_neighbors(X_train_80, y_train_80)
print(f"\n80:10:10 SMOTE nearest neighbors - Final distribution: {Counter(y_train_80_smote2)}")

results['80-10-10_smote_nn'] = evaluate_model(
    X_train_80_smote2, y_train_80_smote2, X_val_80, y_val_80, X_test_80, y_test_80,
    "80:10:10 SMOTE Nearest Neighbors"
)

# SMOTE with nearest neighbors for 70:15:15 split
X_train_70_smote2, y_train_70_smote2 = smote_nearest_neighbors(X_train_70, y_train_70)
print(f"\n70:15:15 SMOTE nearest neighbors - Final distribution: {Counter(y_train_70_smote2)}")

results['70-15-15_smote_nn'] = evaluate_model(
    X_train_70_smote2, y_train_70_smote2, X_val_70, y_val_70, X_test_70, y_test_70,
    "70:15:15 SMOTE Nearest Neighbors"
)


SMOTE WITH RANDOM PAIRS

80:10:10 SMOTE random pairs - Final distribution: Counter({np.int64(0): 40, np.int64(1): 40, np.int64(2): 40})

80:10:10 SMOTE Random Pairs Results:
Train Accuracy: 0.8750
Val Accuracy: 1.0000
Test Accuracy: 0.8667

Test Set Classification Report:
Class Iris-setosa: Precision=1.0000, Recall=1.0000, F1=1.0000
Class Iris-versicolor: Precision=0.7143, Recall=1.0000, F1=0.8333
Class Iris-virginica: Precision=1.0000, Recall=0.6000, F1=0.7500

70:15:15 SMOTE random pairs - Final distribution: Counter({np.int64(0): 35, np.int64(1): 35, np.int64(2): 35})

70:15:15 SMOTE Random Pairs Results:
Train Accuracy: 0.8952
Val Accuracy: 0.8095
Test Accuracy: 0.9167

Test Set Classification Report:
Class Iris-setosa: Precision=1.0000, Recall=1.0000, F1=1.0000
Class Iris-versicolor: Precision=0.8000, Recall=1.0000, F1=0.8889
Class Iris-virginica: Precision=1.0000, Recall=0.7500, F1=0.8571

SMOTE WITH NEAREST NEIGHBORS

80:10:10 SMOTE nearest neighbors - Final distribution: Count

In [8]:
print("\n" + "="*70)
print("COMPREHENSIVE PERFORMANCE COMPARISON")
print("="*70)

# Create comparison table
print(f"{'Method':<30} {'Split':<10} {'Train Acc':<12} {'Val Acc':<10} {'Test Acc':<10}")
print("-" * 70)

methods = [
    ('Baseline', 'baseline'),
    ('Random Oversampling', 'oversampled'),
    ('SMOTE Random Pairs', 'smote_random'),
    ('SMOTE Nearest Neighbors', 'smote_nn')
]

splits = [('80-10-10', '80-10-10'), ('70-15-15', '70-15-15')]

for method_name, method_key in methods:
    for split_name, split_key in splits:
        key = f"{split_key}_{method_key}"
        if key in results:
            result = results[key]
            print(f"{method_name:<30} {split_name:<10} {result['train_acc']:<12.4f} "
                  f"{result['val_acc']:<10.4f} {result['test_acc']:<10.4f}")

# Find best performing methods
print(f"\n{'='*70}")
print("BEST PERFORMING METHODS BY TEST ACCURACY")
print("="*70)

sorted_results = sorted(results.items(), key=lambda x: x[1]['test_acc'], reverse=True)

for i, (method, result) in enumerate(sorted_results[:3], 1):
    method_parts = method.split('_')
    split_info = method_parts[0]
    method_type = '_'.join(method_parts[1:])
    
    print(f"{i}. {method_type.replace('_', ' ').title()} ({split_info}): "
          f"Test Accuracy = {result['test_acc']:.4f}")

# Summary insights
print(f"\n{'='*70}")
print("SUMMARY INSIGHTS")
print("="*70)

# Compare splits
split_80_methods = [k for k in results.keys() if k.startswith('80-10-10')]
split_70_methods = [k for k in results.keys() if k.startswith('70-15-15')]

avg_80 = np.mean([results[k]['test_acc'] for k in split_80_methods])
avg_70 = np.mean([results[k]['test_acc'] for k in split_70_methods])

print(f"Average Test Accuracy - 80:10:10 split: {avg_80:.4f}")
print(f"Average Test Accuracy - 70:15:15 split: {avg_70:.4f}")

# Compare methods across splits
method_performance = {}
for method_name, method_key in methods:
    accuracies = []
    for split_name, split_key in splits:
        key = f"{split_key}_{method_key}"
        if key in results:
            accuracies.append(results[key]['test_acc'])
    if accuracies:
        method_performance[method_name] = np.mean(accuracies)

print(f"\nAverage Test Accuracy by Method:")
for method, avg_acc in sorted(method_performance.items(), key=lambda x: x[1], reverse=True):
    print(f"{method}: {avg_acc:.4f}")


COMPREHENSIVE PERFORMANCE COMPARISON
Method                         Split      Train Acc    Val Acc    Test Acc  
----------------------------------------------------------------------
Baseline                       80-10-10   0.8750       1.0000     0.8667    
Baseline                       70-15-15   0.8952       0.8095     0.9167    
Random Oversampling            80-10-10   0.8833       1.0000     0.8667    
Random Oversampling            70-15-15   0.9143       0.8095     0.9167    
SMOTE Random Pairs             80-10-10   0.8750       1.0000     0.8667    
SMOTE Random Pairs             70-15-15   0.8952       0.8095     0.9167    
SMOTE Nearest Neighbors        80-10-10   0.8750       1.0000     0.8667    
SMOTE Nearest Neighbors        70-15-15   0.8952       0.8095     0.9167    

BEST PERFORMING METHODS BY TEST ACCURACY
1. Baseline (70-15-15): Test Accuracy = 0.9167
2. Oversampled (70-15-15): Test Accuracy = 0.9167
3. Smote Random (70-15-15): Test Accuracy = 0.9167

SUMMARY