In [1]:
import numpy as np
import pandas as pd 
import sklearn
import scipy.sparse 

for p in [np, pd, sklearn, scipy]:
    print (p.__name__, p.__version__)

numpy 1.13.1
pandas 0.20.3
sklearn 0.19.0
scipy 0.19.1


The versions should be not less than:

    numpy 1.13.1
    pandas 0.20.3
    sklearn 0.19.0
    scipy 0.19.1
    
If it is not the case, uncomment the following cell and run it. Then restart the kernel and check versions again.

In [2]:
# !conda update -y scikit-learn pandas scipy numpy

# Load data

In [3]:
train_path = '../readonly/KNN_features_data/X.npz'
train_labels = '../readonly/KNN_features_data/Y.npy'

test_path = '../readonly/KNN_features_data/X_test.npz'
test_labels = '../readonly/KNN_features_data/Y_test.npy'

# Train data
X = scipy.sparse.load_npz(train_path)
Y = np.load(train_labels)

# Test data
X_test = scipy.sparse.load_npz(test_path)
Y_test = np.load(test_labels)

# Out-of-fold features we loaded above were generated with n_splits=4 and skf seed 123
# So it is better to use seed 123 for generating KNN features as well 
skf_fold = 123
n_splits = 4

Below you need to implement features, based on nearest neaighbours.

In [4]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import NearestNeighbors
from multiprocessing import Pool

import numpy as np

# This class should implement KNN features extraction 
class NearestNeighborsFeats(BaseEstimator, ClassifierMixin):
    def __init__(self, n_jobs, k_list, metric, n_classes=None, n_neighbors=None, eps=1e-6):
        self.n_jobs = n_jobs
        self.k_list = k_list
        self.metric = metric
        
        if n_neighbors is None:
            self.n_neighbors = max(k_list) 
        else:
            self.n_neighbors = n_neighbors
            
        self.eps = eps        
        self.n_classes_ = n_classes
        
        
    def get_features_for_one(self, x):
        '''
            x is of shape (L,)
        '''

        NN_output = self.NN.kneighbors(x)
        
        # Vector of size `n_neighbors`
        # Stores indices of the neighbors
        neighs = NN_output[1][0]
        
        # Vector of size `n_neighbors`
        # Stores distances to corresponding neighbors
        neighs_dist = NN_output[0][0] 

        # Vector of size `n_neighbors`
        # Stores labels of corresponding neighbors
        neighs_y = self.y_train[neighs] 
        
        ## ========================================== ##
        ##              YOUR CODE BELOW
        ## ========================================== ##
        
        # We will accumulate the computed features here
        # Eventually it will be a list of lists or np.arrays
        # and we will use np.hstack to concatenate those
        return_list = [] 
        
        # ----------------------------------------------
        # 1. Fraction of objects of every class.
        #    It is basically a KNNСlassifiers predictions.
        #    take a look at np.bincount function, it can be very helpful
        #    note that the values should sum up to one
        for k in self.k_list:
            # YOUR CODE GOES HERE
            
            assert len(feat) == self.n_classes
            return_list += [feat]
        
        # ----------------------------------------------
        # 2.  Streak: how many closest objects have same label
        #     We need to be careful with the case when all neighbors are of the same class
        #     Answer 'res' is list of size 1
        #     What can help you:
        #     np.where, list.index(), map
        
        out = # YOUR CODE GOES HERE
        
        assert len(out) == 1
        return_list += [out]
        
        # ----------------------------------------------
        # 3. Minimum distance to objects of each class
        #    Finds first instance of a class and take it's distance as features.
        #    `out` is a list of size `n_class`
        #    If there are no neighbouring objects of some classes, 
        #    Then set distance to that class to be max distance + 1
        #    `np.where` might be helpful
        
        out = []
        for c in range(self.n_classes):
            # YOUR CODE GOES HERE
        
        assert len(out) == self.n_classes
        return_list += [out]
        
        # ----------------------------------------------
        # 4. Minimum *normalized* distance to objects of each class
        #    As 3. but we normalize (divide) the distances
        #    by distance to the closest neighbor
        #    Do not forget to add self.eps to denominator
        
        out = []
        for c in range(self.n_classes):
            # YOUR CODE GOES HERE
        
        return_list += [out]
        
        # ----------------------------------------------
        # 5. 
        #    5.1 Distance to Kth neighbor
        #        Think of this as of quantiles of a distribution
        #    5.2 Distance to Kth neighbor normalized by 
        #        distance to the first neighbor
        #    
        #    feat_51, feat_52 are answers to 5.1. and 5.2
        #    should be scalars
        for k in self.k_list:
            
            feat_51 = # YOUR CODE GOES HERE
            feat_52 = # YOUR CODE GOES HERE
            return_list += [[feat_51, feat_52]]
        
        # ----------------------------------------------
        # 6. Mean distance to first K neighbors per class 
        # 
        #    You can use np.bincount with appropriate weights
        #    Don't forget, that if you divide by something, 
        #    You need to add self.eps to denominator
        
        for k in self.k_list:
            
            # YOUR CODE GOES IN HERE
            
            return_list += [feat]
        
        
        # merge
        knn_feats = np.hstack(return_list)
        
        assert knn_feats.shape == (239), 
        return knn_feats
    
    def fit(self, X, y):
        # Create a NearestNeighbors (NN) object. We will use it in `predict` function 
        self.NN = NearestNeighbors(n_neighbors=max(self.k_list), 
                                      metric=self.metric, 
                                      n_jobs=1, 
                                      algorithm='brute' if self.metric=='cosine' else 'auto')
        self.NN.fit(X)
        
        # Store labels 
        self.y_train = y
        
        # Save how many classes we have
        self.n_classes = np.unique(y).shape[0] if self.n_classes_ is None else self.n_classes_
            
        
    def predict(self, X):       
        if self.n_jobs == 1:
            test_feats = []
            for i in range(X.shape[0]):
                test_feats.append(self.get_features_for_one(X[i:i+1]))
        else:
            # Try to make it parallel 
            # You can use whatever you want to do it
            #
            # But I suggest you to use multiprocessing.Pool here.
            # As I could not get joblib working
            #
            # Number of threads should be controlled by self.n_jobs 
            # To make it work you will need to read this 
            # http://qingkaikong.blogspot.ru/2016/12/python-parallel-method-in-class.html
            assert False, 'Not implemented, use n_jobs=1'
            # YOUR CODE GOES HERE
            # test_feats =  # YOUR CODE GOES HERE
            # YOUR CODE GOES HERE
            
        return np.vstack(test_feats)

# http://qingkaikong.blogspot.ru/2016/12/python-parallel-method-in-class.html
def unwrap_self(arg, **kwarg):
    return NearestNeighborsFeats.get_features_for_one(*arg, **kwarg)

SyntaxError: invalid syntax (<ipython-input-4-15cd3b16df39>, line 68)

## Sanity check

To make sure you've implemented everything correctly we provide you the correct features for the first 50 objects.

In [7]:
# Load correct features
true_knn_feats_first50 = np.load('../readonly/KNN_features_data/knn_feats_test_first50.npy')

# Create instance of our KNN feature extractor
NNF = NearestNeighborsFeats(n_jobs=1, k_list=k_list, metric='minkowski')

# Fit on train set
NNF.fit(X,Y)

# Get features for test
test_knn_feats = NNF.predict(X_test[:50])

# This should be zero
print (np.abs(test_knn_feats - true_knn_feats_first50).mean())

NameError: name 'NearestNeighborsFeats' is not defined

Now implement parallel computations and compute features for the train and test sets. 

## Get features for test

In [12]:
for m in ['minkowski', 'cosine']:
    print m
    
    # Create instance of our KNN feature extractor
    NNF = NearestNeighborsFeats(n_jobs=20, k_list=k_list, metric=m)
    
    # Fit on train set
    NNF.fit(X,Y)

    # Get features for test
    test_knn_feats = NNF.predict(X_test)
    
    # Dump the features to disk
    np.save('data/knn_feats_test.npy', test_knn_feats)

minkowski
cosine


## Get features for train

Compute features for train, using out-of-fold strategy.

In [9]:
# Differently from other homework we will not implement OOF predictions ourselves
# but use sklearn's `cross_val_predict`
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

# a list of K in KNN 
k_list = [3, 8,32]

# We will use two metrics for KNN
for m in ['minkowski', 'cosine']:
    print m
    
    # Set up splitting scheme, use StratifiedKFold
    # use skf_seed and n_splits defined above
    skf = # YOUR CODE GOES HERE
    
    # Create instance of our KNN feature extractor
    # n_jobs can be larger than the number of cores
    NNF = NearestNeighborsFeats(n_jobs=10, k_list=k_list, metric=m)
    
    # Get KNN features using OOF use cross_val_predict with right parameters
    preds = # YOUR CODE GOES HERE
    
    # Save the features
    np.save('data/knn_feats_train.npy', preds)

minkowski
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   20.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   30.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   41.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   41.2s finished
cosine
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   21.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   32.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   43.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   43.3s finished


# Submit

You need to submit...