# Task 1 Kernelized SVM

In [5]:
import numpy as np
from sklearn.datasets import load_digits, load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import itertools
from scipy.spatial.distance import cdist

In [2]:
digits = load_digits()
X_digits, y_digits = digits.data, digits.target

# Split data
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    X_digits, y_digits, test_size=0.3, random_state=42
)

## Subtask 1 SVMs With Different Kernels

In [3]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
best_models = {}
for kernel in kernels:
    if kernel == 'linear':
        param_grid = {'C': [0.1, 1, 10]}
    elif kernel == 'poly':
        param_grid = {
            'C': [0.1, 1, 10],
            'gamma': ['scale', 'auto'],
            'degree': [2, 3]
        }
    else:  # rbf and sigmoid
        param_grid = {
            'C': [0.1, 1, 10],
            'gamma': ['scale', 'auto']
        }
    
    grid = GridSearchCV(SVC(kernel=kernel), param_grid, cv=5)
    grid.fit(X_train_d, y_train_d)
    best_models[kernel] = grid


In [4]:
# Report results
print("Task 1 - Subtask 1: SVM with Different Kernels")
for kernel, model in best_models.items():
    train_acc = accuracy_score(y_train_d, model.predict(X_train_d))
    test_acc = accuracy_score(y_test_d, model.predict(X_test_d))
    print(f"Kernel: {kernel}")
    print(f"Best params: {model.best_params_}")
    print(f"Train Accuracy: {train_acc:.4f}, Test Accuracy: {test_acc:.4f}\n")
print('Method of choosing the best parameters: GridSearchCV')

Task 1 - Subtask 1: SVM with Different Kernels
Kernel: linear
Best params: {'C': 0.1}
Train Accuracy: 1.0000, Test Accuracy: 0.9796

Kernel: poly
Best params: {'C': 0.1, 'degree': 3, 'gamma': 'auto'}
Train Accuracy: 1.0000, Test Accuracy: 0.9889

Kernel: rbf
Best params: {'C': 10, 'gamma': 'scale'}
Train Accuracy: 1.0000, Test Accuracy: 0.9889

Kernel: sigmoid
Best params: {'C': 1, 'gamma': 'scale'}
Train Accuracy: 0.9196, Test Accuracy: 0.9074

Method of choosing the best parameters: GridSearchCV


## Subtask 2 Develop Your Own Kernel

In [12]:
def custom_kernel(X, Y):
    # composite kernel: linear + RBF
    linear = np.dot(X, Y.T)
    rbf = np.exp(-0.1 * np.linalg.norm(X[:, None] - Y[None, :], axis=2)**2)
    return linear + rbf

# Validate kernel properties (positive definiteness)
def is_positive_definite(K):
    try:
        np.linalg.cholesky(K)
        return True
    except np.linalg.LinAlgError:
        return False

# Train SVM with custom kernel
custom_svm = SVC(kernel=custom_kernel)
custom_svm.fit(X_train_d, y_train_d)
custom_train_acc = accuracy_score(y_train_d, custom_svm.predict(X_train_d))
custom_test_acc = accuracy_score(y_test_d, custom_svm.predict(X_test_d))
print(is_positive_definite(custom_kernel))
print("Task 1 - Subtask 2: Custom Kernel")
print(f"Custom Kernel Train Accuracy: {custom_train_acc:.4f}")
print(f"Custom Kernel Test Accuracy: {custom_test_acc:.4f}")


False
Task 1 - Subtask 2: Custom Kernel
Custom Kernel Train Accuracy: 1.0000
Custom Kernel Test Accuracy: 0.9796


## Subtask 3  Classifier Evaluation I

In [7]:
def custom_confusion_matrix(y_true, y_pred):
    labels = np.unique(y_true)
    cm = np.zeros((len(labels), len(labels)), dtype=int)
    for i, true_label in enumerate(labels):
        for j, pred_label in enumerate(labels):
            cm[i, j] = np.sum((y_true == true_label) & (y_pred == pred_label))
    return cm

# Evaluate best models from Subtask 1 and 2
best_standard_model = max(best_models.values(), key=lambda x: x.best_score_)
standard_pred = best_standard_model.predict(X_test_d)
custom_pred = custom_svm.predict(X_test_d)

cm_standard = custom_confusion_matrix(y_test_d, standard_pred)
cm_custom = custom_confusion_matrix(y_test_d, custom_pred)

print("Task 1 - Subtask 3: Confusion Matrices")
print("Standard SVM Confusion Matrix:")
print(cm_standard)
print("\nCustom SVM Confusion Matrix:")
print(cm_custom)


Task 1 - Subtask 3: Confusion Matrices
Standard SVM Confusion Matrix:
[[53  0  0  0  0  0  0  0  0  0]
 [ 0 50  0  0  0  0  0  0  0  0]
 [ 0  0 47  0  0  0  0  0  0  0]
 [ 0  0  1 52  0  1  0  0  0  0]
 [ 0  0  0  0 60  0  0  0  0  0]
 [ 0  0  0  0  0 65  0  0  0  1]
 [ 0  0  0  0  0  0 53  0  0  0]
 [ 0  0  0  0  0  0  0 54  0  1]
 [ 0  0  0  0  0  0  0  0 43  0]
 [ 0  0  0  0  0  0  0  1  1 57]]

Custom SVM Confusion Matrix:
[[53  0  0  0  0  0  0  0  0  0]
 [ 0 50  0  0  0  0  0  0  0  0]
 [ 0  0 47  0  0  0  0  0  0  0]
 [ 0  0  1 52  0  1  0  0  0  0]
 [ 0  0  0  0 60  0  0  0  0  0]
 [ 0  0  0  0  0 65  0  0  0  1]
 [ 0  0  0  0  0  0 53  0  0  0]
 [ 0  0  0  0  0  0  0 54  0  1]
 [ 0  0  0  0  0  1  0  0 42  0]
 [ 0  0  0  0  0  0  0  1  1 57]]


# Task 2 Feature Selection

In [8]:
cancer = load_breast_cancer()
X_cancer, y_cancer = cancer.data, cancer.target
feature_names = cancer.feature_names

# Normalize and split data
scaler = MinMaxScaler()
X_cancer_scaled = scaler.fit_transform(X_cancer)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_cancer_scaled, y_cancer, test_size=0.3, random_state=42
)


## Subtask 1 Forward Greedy Feature Selection

In [9]:
def forward_selection(X, y, feature_names, model):
    n_features = X.shape[1]
    selected = []
    scores = []
    for _ in range(n_features):
        best_score = -1
        best_feature = None
        for feature in range(n_features):
            if feature not in selected:
                current_features = selected + [feature]
                score = np.mean(cross_val_score(model, X[:, current_features], y, cv=10))
                if score > best_score:
                    best_score = score
                    best_feature = feature
        selected.append(best_feature)
        scores.append(best_score)
    return [feature_names[i] for i in selected], scores

In [10]:
model_lsvc = LinearSVC(random_state=42)
forward_order, forward_scores = forward_selection(X_train_c, y_train_c, feature_names, model_lsvc)

In [11]:
print("\nTask 2 - Subtask 1: Forward Selection")
for i, (feature, score) in enumerate(zip(forward_order, forward_scores)):
    print(f"Step {i+1}: {feature} - Score: {score:.4f}")


Task 2 - Subtask 1: Forward Selection
Step 1: mean concave points - Score: 0.9071
Step 2: worst texture - Score: 0.9372
Step 3: worst radius - Score: 0.9648
Step 4: worst concavity - Score: 0.9697
Step 5: texture error - Score: 0.9748
Step 6: mean concavity - Score: 0.9748
Step 7: fractal dimension error - Score: 0.9749
Step 8: mean texture - Score: 0.9749
Step 9: mean smoothness - Score: 0.9749
Step 10: mean compactness - Score: 0.9749
Step 11: mean radius - Score: 0.9749
Step 12: radius error - Score: 0.9749
Step 13: mean symmetry - Score: 0.9749
Step 14: mean perimeter - Score: 0.9749
Step 15: worst smoothness - Score: 0.9749
Step 16: worst concave points - Score: 0.9799
Step 17: mean area - Score: 0.9799
Step 18: mean fractal dimension - Score: 0.9799
Step 19: worst symmetry - Score: 0.9799
Step 20: area error - Score: 0.9799
Step 21: compactness error - Score: 0.9799
Step 22: smoothness error - Score: 0.9799
Step 23: concave points error - Score: 0.9799
Step 24: worst area - Scor

## Subtask 2 Backward Greedy Feature Selection

In [16]:
def backward_selection(X, y, feature_names, model):
    n_features = X.shape[1]
    selected = list(range(n_features))
    scores = [np.mean(cross_val_score(model, X, y, cv=10))]
    for _ in range(n_features-1):
        worst_score = 1
        worst_feature = None
        for feature in selected:
            current_features = [f for f in selected if f != feature]
            score = np.mean(cross_val_score(model, X[:, current_features], y, cv=10))
            if score < worst_score:
                worst_score = score
                worst_feature = feature
        selected.remove(worst_feature)
        scores.append(worst_score)
    return [feature_names[i] for i in selected], scores



In [17]:
model_lsvc = LinearSVC(random_state=42)
backward_order, backward_scores = backward_selection(X_train_c, y_train_c, feature_names, model_lsvc)
print("\nTask 2 - Subtask 2: Backward Selection")
for i, (feature, score) in enumerate(zip(backward_order, backward_scores)):
    print(f"Step {i+1}: {feature} - Score: {score:.4f}")


Task 2 - Subtask 2: Backward Selection
Step 1: compactness error - Score: 0.9723


## Subtask 3 Feature Importance

In [18]:
forward_top6 = forward_order[:6]
backward_top6 = backward_order[:6]

print("\nTask 2 - Subtask 3: Top 6 Features")
print(f"Forward Selection: {forward_top6}")
print(f"Backward Selection: {backward_top6}")
print(f"Overlap: {set(forward_top6) & set(backward_top6)}")


Task 2 - Subtask 3: Top 6 Features
Forward Selection: ['mean concave points', 'worst texture', 'worst radius', 'worst concavity', 'texture error', 'mean concavity']
Backward Selection: ['compactness error']
Overlap: set()
