In [1]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from scipy.ndimage import median_filter, gaussian_filter, sobel
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC



In [2]:
# First step: build the "corrupted" half of the train/test split.
# The 13 obfuscation levels are stored in mnist_10.npy, mnist_15.npy, ..., mnist_70.npy.
# Training: level k contributes rows [k*4615, (k+1)*4615) of its file, so every level
# supplies a different window of images and 13 * 4615 = 59995 images carry the
# "corrupted" label.
# Testing: rows 69230 onward of every level are held out (10010 images in total
# according to the shapes printed by this cell).
# The uncorrupted MNIST images used for the "uncorrupted" label are loaded separately.
NUM_LEVELS = 13
IMGS_PER_LEVEL = 4615
TEST_START = 69230

train_chunks = []
test_chunks = []
for level in range(NUM_LEVELS):
    i = level * IMGS_PER_LEVEL  # first training row taken from this level's file
    file_num = 10 + 5 * level  # files are numbered 10, 15, ..., 70
    current_np = np.load("mnist_noisy/mnist_" + str(file_num) + ".npy")
    # Copy the slices so the full per-level array can be freed before the next load.
    train_chunks.append(current_np[i:i + IMGS_PER_LEVEL].copy())
    test_chunks.append(current_np[TEST_START:].copy())

# Join everything once instead of re-copying the growing arrays on every iteration.
# The leading empty float64 array promotes integer / lower-precision float inputs
# to float64 and fixes the trailing (28, 28) image shape.
corrupted_imgs = np.concatenate([np.empty((0, 28, 28))] + train_chunks, axis=0)
test_corrupted = np.concatenate([np.empty((0, 28, 28))] + test_chunks, axis=0)
print(corrupted_imgs.shape)
print(test_corrupted.shape)
# np.save appends the ".npy" extension, so this writes mnist_noisy/training.npy.
np.save("mnist_noisy/training", corrupted_imgs)



(59995, 28, 28)
(10010, 28, 28)


In [3]:
# Load the uncorrupted MNIST images and split them in one step:
# the first 60000 images are used for training, everything after that for testing.
mnist_clean = np.load('mnist_noisy/mnist.npy')
clean_imgs, test_clean = mnist_clean[:60000], mnist_clean[60000:]


In [4]:
# Combine the corrupted and clean images and attach binary labels
# (1 = corrupted, 0 = uncorrupted).
# The counts are read from the arrays themselves rather than hard-coded, so the
# label vectors always line up with the data even if the split sizes change.
train_corrupted = len(corrupted_imgs)
train_clean = len(clean_imgs)
num_test_corrupted = len(test_corrupted)
num_test_clean = len(test_clean)

train_data = np.concatenate((corrupted_imgs, clean_imgs), axis=0)
train_labels = np.concatenate([np.ones(train_corrupted), np.zeros(train_clean)])
test_data = np.concatenate((test_corrupted, test_clean), axis=0)
test_labels = np.concatenate([np.ones(num_test_corrupted), np.zeros(num_test_clean)])

# Sanity check: exactly one label per image in both splits.
assert len(train_data) == len(train_labels)
assert len(test_data) == len(test_labels)

In [5]:
#Second step: normalize and denoise each image before extracting features - this is applied to both the train and test data

def denoise_image(image):
    """Rescale an image and smooth it with a median filter followed by a Gaussian filter.

    Parameters
    ----------
    image : numpy.ndarray
        Input image. Values are divided by 255, so the input is presumably
        on a 0-255 intensity scale (TODO confirm against the data files).

    Returns
    -------
    numpy.ndarray
        float32 array of the same shape with every value clipped to [0, 1].
    """
    scaled = image.astype(np.float32) / 255.0

    # A 3x3 median suppresses isolated salt-and-pepper pixels ...
    despeckled = median_filter(scaled, size=3)
    # ... and the Gaussian blur (sigma=1) then smooths the remaining Gaussian noise.
    smoothed = gaussian_filter(despeckled, sigma=1)

    # Clip so the result is guaranteed to stay inside [0, 1].
    return np.clip(smoothed, 0, 1)

def extract_features(image):
    """Build a 1-D feature vector: flattened pixels + intensity histogram + edge strength.

    Parameters
    ----------
    image : numpy.ndarray
        2-D image. The histogram only counts values inside [0, 1], so the
        image is expected to be normalized to that range.

    Returns
    -------
    numpy.ndarray
        Concatenation of, in order:
        * the flattened pixels (``image.size`` values),
        * a 16-bin histogram over [0, 1], normalized to sum to 1
          (all zeros if no pixel falls inside [0, 1]),
        * the mean and the sum of the Sobel edge magnitude (2 values).
    """
    # 1. Flattened raw pixels
    flat_pixels = image.flatten()

    # 2. Histogram of pixel intensities: 16 bins between 0 and 1
    counts, _ = np.histogram(image, bins=16, range=(0.0, 1.0))
    in_range = counts.sum()
    if in_range > 0:
        hist = counts / in_range
    else:
        # No pixel lies in [0, 1]; dividing by zero would fill the features with NaN.
        hist = np.zeros(counts.shape, dtype=np.float64)

    # 3. Edge magnitude from the Sobel derivatives along both axes.
    #    mode='constant' pads with zeros outside the image border.
    dx = sobel(image, axis=0, mode='constant')
    dy = sobel(image, axis=1, mode='constant')
    edge_magnitude = np.hypot(dx, dy)
    edge_stats = [np.mean(edge_magnitude), np.sum(edge_magnitude)]

    # Combine all features into a single 1D vector
    return np.concatenate([flat_pixels, hist, edge_stats])

def prepare_dataset(images):
    """Denoise every image and convert it into a feature vector.

    Each element of ``images`` is passed through ``denoise_image`` and then
    ``extract_features``; the resulting vectors are stacked into one array
    with one row per input image.
    """
    return np.array([extract_features(denoise_image(raw)) for raw in images])



In [6]:
# Normalize, denoise and extract features for both the training and the test images.
train_features = prepare_dataset(train_data)
test_features = prepare_dataset(test_data)
# Misspelled alias kept for backward compatibility with cells that use the old name.
test_featrues = test_features
print(train_features.shape)

(119995, 802)


In [7]:
#Third, we scale our data
# The scaler is fit on the training features only. The test features are then
# transformed with the training mean/std; re-fitting on the test set would scale
# the two splits differently and leak test statistics into the evaluation.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_features)
X_test = scaler.transform(test_featrues)
#too computationally expensive for now 
# #set the hyperparam grid for the SVM 
# param_grid = {'C': [0.1, 1, 10, 100, 1000]}

# #perform grid_search using cross validation 
# grid_search = GridSearchCV(SVC(kernel='linear'), param_grid, cv=5, scoring='accuracy', verbose=2)
# grid_search.fit(X_train, train_labels)

# #now that we have the best C, we save it 
# best_C = grid_search.best_params_['C']
# print(f"Best C value: {best_C}")


In [None]:
    # Fit a linear SVM on the scaled training features.
    # C is fixed at 1.0 here rather than chosen by a hyper-parameter search.
    clf = LinearSVC(C=1.0, max_iter=10000).fit(X_train, train_labels)

    # Predict on the held-out test set and report the fraction classified correctly.
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_true=test_labels, y_pred=y_pred)
    print(f"Test Accuracy with C=1: {accuracy:.4f}")