In [52]:
import numpy as np
import pandas as pd 
import sklearn
import scipy.sparse 

for p in [np, pd, sklearn, scipy]:
    print (p.__name__, p.__version__)

numpy 1.13.3
pandas 0.21.0
sklearn 0.19.1
scipy 1.0.0


#### Objective: To show efficacy of using features extracted from KNN as an input to a classifier. We should expact that the KNN features should contain better information to strengthen the signal and improve the accuracy of the classifier.

KNN summary:
1. Non-parametric
2. Used for both regression and classification.
3. Computationally demanding as stores entire train data set.
4. Needs to calculate similarity between input and all instances when predicting.
5. Value k determines the number of neighbors to use in kNN model.
6. Algorithm: Find k nearest neighors of the input, and use voting to determine class of input.

In [111]:
from sklearn.datasets import load_digits, fetch_mldata
from sklearn.model_selection import train_test_split
X, y = load_digits(return_X_y=True)
X = np.array(X)
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.1, random_state=10)
X_train, X_valid, y_train, y_valid  = train_test_split(X_train, y_train, test_size=0.3, random_state=10)

In [112]:
skf_seed = 10
n_splits = 5

In [113]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import NearestNeighbors
from multiprocessing import Pool


import numpy as np


class NearestNeighborsFeats(BaseEstimator, ClassifierMixin):
    # Starter class provided as an exercise in Advanced Machine Learning Course by
    # National Research University Higher School of Economics
    ''' This class implements KNN features extraction  '''
    def __init__(self, n_jobs, k_list, metric, n_classes=None, n_neighbors=None, eps=1e-6):
        self.n_jobs = n_jobs
        self.k_list = k_list
        self.metric = metric
        
        if n_neighbors is None:
            self.n_neighbors = max(k_list) 
        else:
            self.n_neighbors = n_neighbors
            
        self.eps = eps        
        self.n_classes_ = n_classes
        
    def fit(self, X, y):
        ''' Set's up the train set and self.NN object '''
        # Create a NearestNeighbors (NN) object. 
        self.NN = NearestNeighbors(n_neighbors=max(self.k_list), 
                                      metric=self.metric, 
                                      n_jobs=1, 
                                      algorithm='brute' if self.metric=='cosine' else 'auto')
        self.NN.fit(X)
        
        # Store labels 
        self.y_train = y
        
        # Save how many classes we have
        self.n_classes = np.unique(y).shape[0] if self.n_classes_ is None else self.n_classes_
        
    def predict(self, X): 
        ''' Produces KNN features for every object of a dataset X '''
        if self.n_jobs == 1:
            test_feats = []
            for i in range(X.shape[0]):
                test_feats.append(self.get_features_for_one(X[i:i+1]))
        else:
            p = Pool(self.n_jobs)
            test_feats = p.map(self.get_features_for_one, map(lambda x: x.reshape(1, -1),X))
            
        return np.vstack(test_feats)
    
    def get_features_for_one(self, x):
        ''' Computes KNN features for a single object `x` '''
        NN_output = self.NN.kneighbors(x)
        
        # Vector of size `n_neighbors`
        # Stores indices of the neighbors
        neighs = NN_output[1][0]
        
        # Vector of size `n_neighbors`
        # Stores distances to corresponding neighbors
        neighs_dist = NN_output[0][0] 

        # Vector of size `n_neighbors`
        # Stores labels of corresponding neighbors
        neighs_y = self.y_train[neighs]

        return_list = [] 

        ### 1. Fraction of objects of every class. ###
        # It is basically a KNNСlassifiers predictions.
        for k in self.k_list:
            feats = np.array(neighs_y[:k])
            # Specificy minlength to make sure assertion passes
            feats = np.bincount(feats.astype('int'), minlength=self.n_classes)/k
            assert np.sum(feats) == 1
            assert len(feats) == self.n_classes
            return_list += [feats]
        
        ### 2. Same label streak ###
        # The largest number N, such that N nearest neighbors have the same label.
        result = np.where(np.diff(neighs_y) != 0)
        # Need to increment by 1 as np.diff reduces length
        feats = np.array([result[0][0]+1] if result[0].size else [len(neighs_y)])
        
        assert len(feats) == 1
        return_list += [feats]
        
        ### 3. Minimum distance to objects of each class ###
        # Find the first instance of a class and take its distance as features.
        #       
        # If there are no neighboring objects of some classes, 
        # then set distance to that class to be 999.
        feats = []
        for c in range(self.n_classes):
            idx = np.where(neighs_y == c)
            # Use .size to check if np.where returns empty tuple
            # TODO: Is this really the best way?
            neighbors = 1 if idx[0].size else 0
            dist = min(neighs_dist[idx]) if neighbors else 999
            feats.append(dist)
        feats = np.array(feats)    
        assert len(feats) == self.n_classes
        return_list += [feats]
        
        ### 4. Minimum *normalized* distance to objects of each class ###
        # As 3. but we normalize (divide) the distances
        # by the distance to the closest neighbor.
               
        # If there are no neighboring objects of some classes, 
        # Then set distance to that class to be 999.
        feats = []
        for c in range(self.n_classes):
            idx = np.where(neighs_y == c)
            neighbors = True if idx[0].size else False
            dist = min(neighs_dist[idx]/(neighs_dist[0]+self.eps)) if neighbors else 999
            feats.append(dist)
        
        feats = np.array(feats)
        assert len(feats) == self.n_classes
        return_list += [feats]
        
        ### 5. 
        # 5.1 Distance to Kth neighbor
        #           
        # 5.2 Distance to Kth neighbor normalized by distance to the first neighbor        
        for k in self.k_list:
            feat_51 = np.array(neighs_dist[k-1])
            feat_52 = np.array(neighs_dist[k-1]/(neighs_dist[0] + self.eps))
            feats = np.array([feat_51, feat_52])
            return_list += [feats]
            
        ### 6. Mean distance to neighbors of each class for each K from `k_list` 
        # For each class select the neighbors of that class among K nearest neighbors 
        # and compute the average distance to those objects.
        # If no objects of a certain class among K neighbors, set mean distance to 999
        for k in self.k_list:
            w = neighs_dist[:k]
            count = np.bincount(neighs_y[:k], minlength=self.n_classes)
            feats = np.bincount(neighs_y[:k],w, minlength=self.n_classes) / (count + self.eps)
            feats[np.where(count == 0)] = 999
            assert len(feats) == self.n_classes
            return_list += [feats]
        
        # merge
        knn_feats = np.hstack(return_list)
        assert knn_feats.shape == (87,) or knn_feats.shape == (87, 1)
        return knn_feats

In [114]:
# a list of K in KNN, starts with one 
k_list = [3, 8, 32]


metric = 'minkowski'
# Create instance of our KNN feature extractor
NNF = NearestNeighborsFeats(n_jobs=2, k_list=k_list, metric=metric)

# Fit on train set
NNF.fit(X_train, y_train)

# Get features for valid
test_knn_feats = NNF.predict(X_test)

# Dump the features to disk
np.save('data/digits_knn_feats_%s_valid.npy' % metric , test_knn_feats)

In [115]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

# We will use two metrics for KNN

metric = 'minkowski'
    
# Set up splitting scheme, use StratifiedKFold
# use skf_seed and n_splits defined above with shuffle=True
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=skf_seed)

# Create instance of our KNN feature extractor
# n_jobs can be larger than the number of cores
NNF = NearestNeighborsFeats(n_jobs=4, k_list=k_list, metric=metric)

# Get KNN features using OOF use cross_val_predict with right parameters
train_knn_feats = cross_val_predict(NNF, X_train,y_train,cv=skf)
# Save the features
np.save('data/knn_feats_%s_train.npy' % metric, train_knn_feats)

Train multi logistic classifier on original train data set

In [117]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression


clf = LogisticRegression(multi_class='multinomial',penalty='l1', solver='saga', tol=0.1)

clf.fit(X_train, y_train)
sparsity = np.mean(clf.coef_ == 0) * 100
score = clf.score(X_test, y_test)
print(score)

0.944444444444


Train multi logistic classifier on knn features generated from train data.

In [123]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
#print(metrics.accuracy_score(y_test, test_knn_feats))

clf = LogisticRegression(multi_class='multinomial',penalty='l1', solver='saga', tol=0.1)

clf.fit(train_knn_feats, y_train)
sparsity = np.mean(clf.coef_ == 0) * 100

score = clf.score(test_knn_feats, y_test)
print(score)

0.972222222222


In [124]:
combined_train = np.c_[X_train, train_knn_feats]
combined_test = np.c_[X_test, test_knn_feats]
print(X_train.shape)
print(train_knn_feats.shape)
print(combined.shape)

(1131, 64)
(1131, 87)
(1131, 151)


Train multi logistic classifier on knn features + original features found in train data.

In [125]:
clf = LogisticRegression(multi_class='multinomial',penalty='l1', solver='saga', tol=0.1)

clf.fit(combined_train, y_train)
sparsity = np.mean(clf.coef_ == 0) * 100

score = clf.score(combined_test, y_test)
print(score)

0.966666666667
