In [1]:
import pandas as pd
import numpy as np
from scipy import linalg
from scipy.spatial.distance import pdist, squareform
from scipy import exp
from scipy.linalg import eigh


In [2]:
class rbfpca():
    def __init__(self, n_components = None, gamma = None):
        self.n_components = n_components
        self.gamma = gamma 

    def fit_transform(self, X): 
        # Calculating the squared Euclidean distances for every pair of points
        # in the MxN dimensional dataset.
        sq_dists = pdist(X, 'sqeuclidean')

        # Converting the pairwise distances into a symmetric MxM matrix.
        mat_sq_dists = squareform(sq_dists)

        # Computing the MxM kernel matrix.
        gamma = self.gamma
        K = exp(-gamma * mat_sq_dists)

        # Centering the symmetric NxN kernel matrix.
        N = K.shape[0]
        one_n = np.ones((N,N)) / N
        K_norm = K - one_n.dot(K) - K.dot(one_n) + one_n.dot(K).dot(one_n)
        eigvals, eigvecs = eigh(K_norm)
        # Obtaining the i eigenvectors (alphas) that corresponds to the i highest eigenvalues (lambdas).
        n_components = self.n_components
        alphas = np.column_stack((eigvecs[:,-i] for i in range(1,n_components+1)))
        lambdas = [eigvals[-i] for i in range(1,n_components+1)]
        self.lambdas = lambdas
        self.alphas = alphas
        self.X_fit = X
        return alphas

    def transform(self, X):
        gamma = self.gamma
        alphas = self.alphas
        lambdas = self.lambdas
        X_fit = self.X_fit
        pairs_d = []
        for x in X:
            pair_d = [np.sum((x-row)**2) for row in X_fit]
            pairs_d.append(pair_d)
        pairs_dist = np.array(pairs_d)
        print pairs_dist.shape
        k = np.exp(-gamma * pairs_dist)
        print k.shape, alphas.shape
        return k.dot(alphas / lambdas)

In [3]:
import numpy as np
class k_nn:
    def __init__(self, n_neighbor = 5, weigh_dist = True):
        self.n_neighbor = n_neighbor
        self.weigh_dist = weigh_dist
    def fit(self, X, y):
        self.training_set = X
        self.training_label = y
        return self
    def distance(self,X,Y) :
        return np.sum((X-Y)**2)
            
    def find_nn(self,X):
        distances = []
        neighbors = []
        train_set = self.training_set
        for x in X:
            neigh = []
            dist = []
            for i,t in enumerate(train_set):
                dist.append(self.distance(x,t))
            dist = np.array(dist)
            neigh = np.argsort(dist)
            dist = dist[neigh]
            distances.append(dist)
            neighbors.append(neigh)
       
        return np.array(neighbors),np.array(distances)
    
    def predict(self, X):
        prediction = []
        neighbors, distances = self.find_nn(X)
        #print neighbors[:,0:self.n_neighbor]
        #print distances[:,0:self.n_neighbor]
        for i,x in enumerate(X):           
            count = np.zeros(10)
            for j, neigh in enumerate(neighbors[i][0:self.n_neighbor]):
                #print j
                #print (self.training_label)[j]
                if(self.weigh_dist):
                    count[self.training_label[neigh]]+= 1./distances[i,j]
                else:
                    count[self.training_label[neigh]]+= 1
                    
            prediction.append(np.argmax(count))
        return prediction

In [4]:
path=""
X_train =pd.read_csv(path+"Xtr.csv", header=None)
Y =pd.read_csv(path+"Ytr.csv")
X_test =pd.read_csv(path+"Xte.csv", header=None)


In [5]:
y_train = Y["Prediction"].values
X_train = X_train.values
X_test = X_test.values

In [6]:
kpca = rbfpca(n_components= 35, gamma= 0.01)

X_train_kpca = kpca.fit_transform(X_train)
X_test_kpca = kpca.transform(X_test)
X_test.shape, X_train_kpca.shape

(10000, 5000)
(10000, 5000) (5000, 35)


((10000, 784), (5000, 35))

In [7]:
knn = k_nn(n_neighbor=6)
alg = knn.fit(X_train_kpca, y_train)
predicted_label = alg.predict(X_test_kpca)


In [8]:
X_test.shape

(10000, 784)

In [22]:
def make_submission(predicted_label, name = 'submit.csv'):
    submit_d = d = {'Id' : pd.Series(np.arange(1,X_test.shape[0]+1).astype(int)),
                'Prediction' : pd.Series(predicted_label).astype(int)}
    submit = pd.DataFrame(submit_d)
    submit.to_csv(name,index=False)
    return submit

In [23]:
submit = make_submission(predicted_label)

In [24]:
submit.describe()

Unnamed: 0,Id,Prediction
count,10000.0,10000.0
mean,5000.5,4.3929
std,2886.89568,2.949068
min,1.0,0.0
25%,2500.75,2.0
50%,5000.5,4.0
75%,7500.25,7.0
max,10000.0,9.0
