In [9]:
import pandas as pd
import numpy as np
from scipy import linalg
from scipy.spatial.distance import pdist, squareform
from scipy import exp
from scipy.linalg import eigh


In [21]:
class rbfpca():
    def __init__(self, n_components = None, gamma = None):
        self.n_components = n_components
        self.gamma = gamma 

    def fit_transform(self, X): 
        # Calculating the squared Euclidean distances for every pair of points
        # in the MxN dimensional dataset.
        sq_dists = pdist(X, 'sqeuclidean')

        # Converting the pairwise distances into a symmetric MxM matrix.
        mat_sq_dists = squareform(sq_dists)

        # Computing the MxM kernel matrix.
        gamma = self.gamma
        K = exp(-gamma * mat_sq_dists)

        # Centering the symmetric NxN kernel matrix.
        N = K.shape[0]
        one_n = np.ones((N,N)) / N
        K_norm = K - one_n.dot(K) - K.dot(one_n) + one_n.dot(K).dot(one_n)
        eigvals, eigvecs = eigh(K_norm)
        # Obtaining the i eigenvectors (alphas) that corresponds to the i highest eigenvalues (lambdas).
        n_components = self.n_components
        alphas = np.column_stack((eigvecs[:,-i] for i in range(1,n_components+1)))
        lambdas = [eigvals[-i] for i in range(1,n_components+1)]
        self.lambdas = lambdas
        self.alphas = alphas
        self.X_fit = X
        return alphas

    def transform(self, X):
        gamma = self.gamma
        alphas = self.alphas
        lambdas = self.lambdas
        X_fit = self.X_fit
        pairs_d = []
        for x in X:
            pair_d = [np.sum((x-row)**2) for row in X_fit]
            pairs_d.append(pair_d)
        pairs_dist = np.array(pairs_d)
        print pairs_dist.shape
        k = np.exp(-gamma * pairs_dist)
        print k.shape, alphas.shape
        return k.dot(alphas / lambdas)

In [3]:
path=""
X =pd.read_csv(path+"Xtr.csv", header=None)
Y =pd.read_csv(path+"Ytr.csv")

In [4]:
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(
        X, Y["Prediction"], test_size=0.2)

In [5]:
X.shape


(5000, 784)

In [6]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train,y_train)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [7]:
from sklearn.metrics import accuracy_score
predicted_labels = svc.predict(X_test)
print accuracy_score(predicted_labels,y_test)


0.85


In [12]:
X_train.shape

(4000, 784)

In [22]:
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(kernel = 'rbf', degree = 3, n_components= 35, gamma= 0.01)
kpca = rbfpca(n_components= 35, gamma= 0.01)

X_train_kpca = kpca.fit_transform(X_train.as_matrix())
X_test_kpca = kpca.transform(X_test.as_matrix())

(1000, 4000)
(1000, 4000) (4000, 35)


In [23]:
X_test.shape, X_train_kpca.shape

((1000, 784), (4000, 35))

In [24]:
%%time 
from sklearn.grid_search import GridSearchCV 
from sklearn.neighbors import KNeighborsClassifier
NN = {'n_neighbors': range(1,15,1)}
knn = KNeighborsClassifier(weights='distance')
grid_search = GridSearchCV(knn, NN, n_jobs = 3)
alg = grid_search.fit(X_train_kpca, y_train)
#alg = knn.fit(X_train_kpca, y_train)
predicted_label = alg.predict(X_test_kpca)

print("SVM - Best C & associated score", grid_search.best_params_, grid_search.best_score_)
print("SVM - Score on test_data : ", accuracy_score(y_train, alg.predict(X_train_kpca)))
print("SVM - Score on test_data : ", accuracy_score(y_test, predicted_label))

('SVM - Best C & associated score', {'n_neighbors': 6}, 0.89024999999999999)
('SVM - Score on test_data : ', 1.0)
('SVM - Score on test_data : ', 0.879)
CPU times: user 4.7 s, sys: 91.4 ms, total: 4.79 s
Wall time: 20.9 s


In [7]:
from sklearn.grid_search import GridSearchCV 
Cs = {'C': np.logspace(-5, 0, 10)}

grid_search = GridSearchCV(SVC(cache_size= 1000), Cs, n_jobs = 3)
alg = grid_search.fit(X_train, y_train)
predicted_label = alg.predict(X_test)

print("SVM - Best C & associated score", grid_search.best_params_, grid_search.best_score_)
print("SVM - Score on test_data : ", accuracy_score(y_train, alg.predict(X_train)))
print("SVM - Score on test_data : ", accuracy_score(y_test, predicted_label))


('SVM - Best C & associated score', {'C': 1.0}, 0.82999999999999996)
('SVM - Score on test_data : ', 0.86275000000000002)
('SVM - Score on test_data : ', 0.85999999999999999)


In [8]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
svc = SVC(kernel='poly')
svc.fit(X_train,y_train)
predicted_labels = svc.predict(X_test)
print accuracy_score(predicted_labels,y_test)


0.12


In [9]:
svc = SVC(kernel='linear')
svc.fit(X_train,y_train)
predicted_labels = svc.predict(X_test)
print accuracy_score(predicted_labels,y_test)


0.852
