In [6]:
import pickle
import numpy as np

In [7]:
def unpickle(file):
    with open(file, 'rb') as fo:
        data = pickle.load(fo, encoding='bytes')
    return data

In [8]:
def load_CIFAR10(pos, n_chunks=1):
    Xtr = []
    Ytr = []
    for i in range(n_chunks):
        train = unpickle(pos + '/data_batch_{0}'.format(i + 1))
        Xtr.extend(train[b'data'])
        Ytr.extend(train[b'labels'])
    test = unpickle(pos + '/test_batch')
    Xte = test[b'data']
    Yte = test[b'labels']
    return np.array(Xtr), np.array(Ytr), np.array(Xte), np.array(Yte)

In [9]:
Xtr, Ytr, Xte, Yte = load_CIFAR10('cifar-10-batches-py')
Xtr_rows = Xtr.reshape(Xtr.shape[0], 32 * 32 * 3)
Ytr = np.array(Ytr)
Xte_rows = Xte.reshape(Xte.shape[0], 32 * 32 * 3)
Yte = np.array(Yte)

print(Xtr_rows.shape)
print(Ytr.shape)
print(Xte_rows.shape)
print(Yte.shape)

(10000, 3072)
(10000,)
(10000, 3072)
(10000,)


In [10]:
Xval_rows = Xtr_rows[:1000, :]
Yval = Ytr[:1000]
Xtr_rows = Xtr_rows[8000:, :]
Ytr = Ytr[8000:]

print(Xval_rows.shape)
print(Yval.shape)
print(Xtr_rows.shape)
print(Ytr.shape)

(1000, 3072)
(1000,)
(2000, 3072)
(2000,)


In [12]:
def compute_dist(X, point, dist_type):
    if dist_type == 'l1':
        return np.sum(np.abs(X - point), axis = 1)
    elif dist_type == 'l2':
        return np.sqrt(np.sum(np.square(X - point), axis = 1))

In [13]:
def predict_point(distances, ytr, k):
    if k == 1:
        min_index = np.argmin(distances)
        return ytr[min_index]
    elif k > 1:
        min_indices = np.argpartition(distances, k)[:k]
        labels = np.array([ytr[i] for i in min_indices])
        return np.argmax(np.bincount(labels))

In [14]:
class NearestNeighbor(object):
    def __init__(self):
        pass

    def train(self, X, y):
        self.Xtr = X
        self.ytr = y

    def predict(self, X, k=1, dist_type='l1'):
        num_test = X.shape[0]
        Ypred = np.zeros(num_test, dtype = self.ytr.dtype)

        for i in range(num_test):
            distances = compute_dist(self.Xtr, X[i,:], dist_type)
            Ypred[i] = predict_point(distances, self.ytr, k)
            
        return Ypred

In [15]:
validation_accuracies = []
for k in [1, 3, 5, 10, 20, 50, 100]:
    for dist_type in ['l1', 'l2']:
        nn = NearestNeighbor()
        nn.train(Xtr_rows, Ytr)

        Yval_predict = nn.predict(Xval_rows, k=k, dist_type=dist_type)
        acc = np.mean(Yval_predict == Yval)
        print('k: %d, dist_type: %s, accuracy: %f' % (k, dist_type, acc))

        validation_accuracies.append((k, acc))

k: 1, dist_type: l1, accuracy: 0.202000
k: 1, dist_type: l2, accuracy: 0.169000
k: 3, dist_type: l1, accuracy: 0.177000
k: 3, dist_type: l2, accuracy: 0.178000
k: 5, dist_type: l1, accuracy: 0.181000
k: 5, dist_type: l2, accuracy: 0.186000
k: 10, dist_type: l1, accuracy: 0.187000
k: 10, dist_type: l2, accuracy: 0.180000
k: 20, dist_type: l1, accuracy: 0.181000
k: 20, dist_type: l2, accuracy: 0.195000
k: 50, dist_type: l1, accuracy: 0.173000
k: 50, dist_type: l2, accuracy: 0.196000
k: 100, dist_type: l1, accuracy: 0.167000
k: 100, dist_type: l2, accuracy: 0.210000


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [18]:
X, y = make_classification(n_samples=1000, n_features=4, 
                           n_informative=2, n_redundant=0, 
                           random_state=0, n_classes=2, 
                           shuffle=False)

[ 0.17287856  0.80608704  0.01884792  0.00218648]
[1]


In [19]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, y)
print(clf.feature_importances_)
print(clf.predict([[0, 0, 0, 0]]))

[ 0.17287856  0.80608704  0.01884792  0.00218648]
[1]
