## Dataset
https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/


In [1]:
import numpy as np
import operator

### Using the knn defined in previous notebook knn.py
We use the same function defined previously.

In [2]:
# inX : input datapoint  
# dataSet : the provided dataset
# labels : labels / classes
# k : k nearest neighbours
def classify0(inX, dataSet, labels, k):
    # get the shape of the dataset
    dataSetShape = dataSet.shape[0]
    
    # create diff dataset
    diffMat = np.tile(inX, (dataSetShape, 1)).astype(float) - dataSet.astype(float)
    
    # square of differences
    diffMat = diffMat ** 2
    
    # sum of squares of diff
    sqDistances = diffMat.sum(axis=1)
    
    # sqrt of distance
    sqDistances = sqDistances ** 0.5
    
    sortedDistIndices = sqDistances.argsort()
    classCount={}
    for i in range(k):
        votedIlabel = labels[sortedDistIndices[i]]
        classCount[votedIlabel] = classCount.get(votedIlabel, 0) + 1
    
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

### Download the training and testing data 

In [6]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.names

--2019-12-26 00:38:28--  https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 563639 (550K) [application/x-httpd-php]
Saving to: ‘optdigits.tra.1’


2019-12-26 00:38:30 (613 KB/s) - ‘optdigits.tra.1’ saved [563639/563639]

--2019-12-26 00:38:30--  https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 264712 (259K) [application/x-httpd-php]
Saving to: ‘optdigits.tes’


2019-12-26 00:38:31 (313 KB/s) - ‘optdigits.tes’ saved [264712/264712]

--2019-12-26 00:38:31--  https://archive.ics.uci.edu/ml/machine-learni

### Check the contents of the file

In [16]:
#### linux commands
!head -1 optdigits.tra
!echo "---"
!head -1 optdigits.tes
!echo "---"
!head optdigits.names

0,1,6,15,12,1,0,0,0,7,16,6,6,10,0,0,0,8,16,2,0,11,2,0,0,5,16,3,0,5,7,0,0,7,13,3,0,8,7,0,0,4,12,0,1,13,5,0,0,0,14,9,15,9,0,0,0,0,6,14,7,1,0,0,0
---
0,0,5,13,9,1,0,0,0,0,13,15,10,15,5,0,0,3,15,2,0,11,8,0,0,4,12,0,0,8,8,0,0,5,8,0,0,9,8,0,0,4,11,0,1,12,7,0,0,2,14,5,10,12,0,0,0,0,6,13,10,0,0,0,0
---

1. Title of Database: Optical Recognition of Handwritten Digits

2. Source:
	E. Alpaydin, C. Kaynak
	Department of Computer Engineering
	Bogazici University, 80815 Istanbul Turkey
	alpaydin@boun.edu.tr
	July 1998



### Load the training dataset (labelled data)

In [4]:
training_data_file = open('optdigits.tra')
training_data_line = training_data_file.readline().strip()
split_data = training_data_line.split(',')
dataset = np.array([split_data[0:64]])
labels = [split_data[64]]

for i in range(2000):
    training_data_line = training_data_file.readline().strip()
    split_data = training_data_line.split(',')
    dataset = np.vstack([dataset, split_data[0:64]])
    labels.append(split_data[64])

print(len(labels))
print(dataset.shape)

2001
(2001, 64)


### Load the testing dataset 

In [11]:
test_data_file = open('optdigits.tes')
test_data_line = test_data_file.readline().strip()
split_data = test_data_line.split(',')
test_dataset = np.array([split_data[0:64]])
test_labels = [split_data[64]]


for i in range(500):
    test_data_line = test_data_file.readline().strip()
    split_data = test_data_line.split(',')
    test_dataset = np.vstack([test_dataset, split_data[0:64]])
    test_labels.append(split_data[64])


In [19]:
# check one datapoint inside test dataset
test_dataset[4]

array(['0', '0', '0', '1', '11', '0', '0', '0', '0', '0', '0', '7', '8',
       '0', '0', '0', '0', '0', '1', '13', '6', '2', '2', '0', '0', '0',
       '7', '15', '0', '9', '8', '0', '0', '5', '16', '10', '0', '16',
       '6', '0', '0', '4', '15', '16', '13', '16', '1', '0', '0', '0',
       '0', '3', '15', '10', '0', '0', '0', '0', '0', '2', '16', '4', '0',
       '0'], dtype='<U2')

In [20]:
# confirm the test labels are loaded properly
print(len(test_labels))
print(test_labels[0:10])

501
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


### Classify one random datapoint from test data

In [21]:
classify0(test_dataset[4], dataset, labels, 3)

'4'

In [10]:
len(labels)

2001

### Classify the test data and confirm the accuracy of the algorithm
### Also calculate the time for execution

In [15]:
import time
start_time = time.time()

correct_label_count = 0
for i in range(len(test_labels)):
    predicted_label = classify0(test_dataset[i], dataset, labels, 3)
    if predicted_label == test_labels[i] :
        correct_label_count = correct_label_count + 1
    else :
        print("i={} incorrect detected label = {} expected = {}".format(i, predicted_label,test_labels[i]))
    if i%100 == 0 and i != 0:
        print("i={} accuracy = {}".format(i, 100*correct_label_count/i))
print("correct label = ", correct_label_count)
print("Accuracy = ", 100*correct_label_count/500)
print("--- %s seconds ---" % (time.time() - start_time))


i=2 incorrect detected label = 1 expected = 2
i=5 incorrect detected label = 9 expected = 5
i=77 incorrect detected label = 1 expected = 2
i=87 incorrect detected label = 1 expected = 4
i=100 accuracy = 97.0
i=129 incorrect detected label = 1 expected = 8
i=154 incorrect detected label = 1 expected = 4
i=170 incorrect detected label = 1 expected = 8
i=171 incorrect detected label = 1 expected = 4
i=200 accuracy = 96.5
i=228 incorrect detected label = 1 expected = 4
i=239 incorrect detected label = 1 expected = 4
i=242 incorrect detected label = 1 expected = 8
i=250 incorrect detected label = 1 expected = 4
i=300 accuracy = 96.33333333333333
i=400 accuracy = 97.25
i=414 incorrect detected label = 9 expected = 8
i=421 incorrect detected label = 9 expected = 5
i=429 incorrect detected label = 9 expected = 7
i=467 incorrect detected label = 9 expected = 7
i=480 incorrect detected label = 9 expected = 7
i=498 incorrect detected label = 9 expected = 7
i=500 accuracy = 96.6
correct label =  4

### ROUGH WORK

In [48]:
print("type(dataset) dataset.shape", type(dataset), dataset.shape)
print("type(test_dataset[0]) test_dataset[0].shape", type(test_dataset[0]), test_dataset[0].shape)


type(dataset) dataset.shape <class 'numpy.ndarray'> (6, 64)
type(test_dataset[0]) test_dataset[0].shape <class 'numpy.ndarray'> (64,)


In [51]:
inX = list(test_dataset[0])
k = 3

In [53]:
dataSetShape = dataset.shape[0]
print(dataSetShape)

6


In [63]:
type(np.tile(inX, (dataSetShape, 1)).astype(int))

numpy.ndarray

In [65]:
np.tile(inX, (dataSetShape, 1)).astype(int) - dataset.astype(int)

array([[  0,  -1,  -1,  -2,  -3,   0,   0,   0,   0,  -7,  -3,   9,   4,
          5,   5,   0,   0,  -5,  -1,   0,   0,   0,   6,   0,   0,  -1,
         -4,  -3,   0,   3,   1,   0,   0,  -2,  -5,  -3,   0,   1,   1,
          0,   0,   0,  -1,   0,   0,  -1,   2,   0,   0,   2,   0,  -4,
         -5,   3,   0,   0,   0,   0,   0,  -1,   3,  -1,   0,   0],
       [  0,   0,  -5,  -3,   3,   1,   0,   0,   0,  -7,  -3,   7,  -6,
         10,   5,   0,   0,  -8,  -1,   2,  -6,  -3,   5,   0,   0,  -8,
          0,   0,   0,  -3,  -3,   0,   0,  -7,  -4,   0,   0,   1,  -4,
          0,   0,  -3,  -4,  -1,   1,  -1,  -4,   0,   0,   2,  -2,  -3,
          0,  -3,  -3,   0,   0,   0,  -4,  -3,  -5,  -3,   0,   0],
       [  0,   0,  -3,  -2,  -7, -12,   0,   0,   0,  -1,   2,   6,  -1,
         -1,   4,   0,   0,   3,  15,   2,  -7,  -3,   8,   0,   0,   4,
          9,  -4, -14,  -4,   6,   0,   0,   4,  -8, -16, -16,  -7,  -2,
          0,   0,   2,  -1, -16,  -9,  12,   7,   0,   0,  

In [60]:
type(dataset)

numpy.ndarray