In [None]:
import maxvolpy.maxvol as mv
import numpy as np
import pandas as pd

from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix

from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt

In [None]:
# Set to True to run the same experiment on MinMax-scaled features.
# NOTE: the index-cache and figure file names used elsewhere in this notebook
# contain `noscale`; rename them when enabling scaling so results do not mix.
USE_MINMAX_SCALING = False

train = np.loadtxt('datasets/arcene_train.data')
train_labels = np.loadtxt('datasets/arcene_train.labels')
valid = np.loadtxt('datasets/arcene_valid.data')
valid_labels = np.loadtxt('datasets/arcene_valid.labels')

# Cheap sanity checks: every sample needs a label and both splits must share
# the same feature columns, because column indices selected on `train` are
# later applied to `valid`.
assert train.shape[0] == train_labels.shape[0], 'train data/labels row count mismatch'
assert valid.shape[0] == valid_labels.shape[0], 'valid data/labels row count mismatch'
assert train.shape[1] == valid.shape[1], 'train/valid feature count mismatch'

if USE_MINMAX_SCALING:
    # Fit the scaler on the training split only and reuse it for validation.
    scaler = MinMaxScaler()
    train = scaler.fit_transform(train)
    valid = scaler.transform(valid)

print(f'Train matrix rank: {np.linalg.matrix_rank(train)}')

In [None]:
# Cache of maxvol-selected feature indices, keyed by the number of indices.
maxvol_indices = {}

# Set to True if you don't want to recalculate the maxvol-selected indices.
# `maxvol_arcene_indices_noscale.txt` contains indices calculated for the dataset without scaling
# `maxvol_arcene_indices_minmax.txt` contains indices calculated for the dataset with MinMax scaling
LOAD_MAXVOL_INDICES = False
MAXVOL_INDICES_PATH = 'maxvol_arcene_indices_noscale.txt'

if LOAD_MAXVOL_INDICES:
    # Each line of the file holds one whitespace-separated index set.
    with open(MAXVOL_INDICES_PATH, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # ignore blank lines instead of caching an empty set
            inds = np.array([int(token) for token in line.split()])
            maxvol_indices[len(inds)] = inds

In [None]:
%%time

model = Perceptron(tol=1e-3)

# Seed the random-feature baseline so that re-running this cell reproduces
# the same "random" accuracy curve.
SEED = 0
N_RANDOM_TRIALS = 100  # random feature subsets averaged for every k
rng = np.random.default_rng(SEED)

n_features = train.shape[1]


def holdout_accuracy(feature_idx):
    """Fit `model` on the given columns of `train`, return its accuracy on `valid`.

    `model` is a single estimator that is refitted on every call, so the
    score is taken immediately after fitting.
    """
    model.fit(train[:, feature_idx], train_labels)
    return model.score(valid[:, feature_idx], valid_labels)


mv_accs = []
non_mv_accs = []
mean_rand_accs = []

k_range = [100, 500, 1000, 3000, 5000, 7000, 9000]
for k in k_range:
    print(f'Processing k = {k}')
    if k in maxvol_indices:
        inds = maxvol_indices[k]
    else:
        # train.T puts features on rows; maxK == minK == k presumably pins the
        # selection to exactly k rows -- confirm against the maxvolpy docs.
        inds, _ = mv.rect_maxvol(train.T, maxK=k, minK=k)
        maxvol_indices[k] = inds

    # Complement of the maxvol selection, in ascending column order.
    non_inds = np.setdiff1d(np.arange(n_features), inds)

    # fit on maxvol features
    mv_accs.append(holdout_accuracy(inds))

    # fit on non-maxvol features
    non_mv_accs.append(holdout_accuracy(non_inds))

    # fit on random features: average over N_RANDOM_TRIALS subsets of size k
    rand_accuracies = [
        holdout_accuracy(rng.choice(n_features, k, replace=False))
        for _ in range(N_RANDOM_TRIALS)
    ]
    mean_rand_accs.append(np.mean(rand_accuracies))

In [None]:
# Set to True if you want to save the calculated maxvol indices
# (one whitespace-separated index set per line).
SAVE_MAXVOL_INDICES = False

if SAVE_MAXVOL_INDICES:
    with open('maxvol_arcene_indices_noscale.txt', 'w') as f:
        for saved_inds in maxvol_indices.values():
            f.write(' '.join(map(str, saved_inds)) + '\n')

In [None]:
# Baseline: perceptron trained on the PCA projection of the training data.
# The projection is learned from `train` only and then applied to both splits.
pca = PCA().fit(train)
train_pca = pca.transform(train)
valid_pca = pca.transform(valid)
model_pca = model.fit(train_pca, train_labels)
pca_acc = model_pca.score(valid_pca, valid_labels)

# Baseline: perceptron trained on all original features.
# `fit` returns the estimator itself, so `model_full` is the refitted `model`.
model_full = model.fit(train, train_labels)
full_acc = model_full.score(valid, valid_labels)

In [None]:
# Accuracy of each feature-selection strategy as a function of k; the
# k-independent baselines are drawn as flat reference lines over the same range.
fig, ax = plt.subplots(figsize=(10, 5))
n_points = len(k_range)

ax.plot(k_range, mv_accs, c='g', label='maxvol')
#ax.plot(k_range, non_mv_accs, c='b', label='non-maxvol')
ax.plot(k_range, mean_rand_accs, c='y', label='random')
ax.plot(k_range, [full_acc] * n_points, c='r', linestyle='--', label='all features')
ax.plot(k_range, [pca_acc] * n_points, c='pink', linestyle='-.', label='PCA')

ax.set_title('Perceptron classification accuracy')
ax.legend()
ax.set_xticks(k_range)
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Number of selected features')
ax.set_ylabel('Accuracy')
ax.grid()
fig.savefig('maxvol_noscale.png', dpi=500)