In [167]:
import numpy as np

In [168]:
digit_train_data_path = 'trainingimages'
digit_train_label_path = 'traininglabels'
digit_val_data_path = 'validationimages'
digit_val_label_path = 'validationlabels'
digit_test_data_path = 'testimages'
digit_test_label_path = 'testlabels'

face_train_data_path = 'facedatatrain'
face_train_label_path = 'facedatatrainlabels'
face_val_data_path = 'facedatavalidation'
face_val_label_path = 'facedatavalidationlabels'
face_test_data_path = 'facedatatest'
face_test_label_path = 'facedatatestlabels'

In [169]:
def convert_to_matrix(file_path):
    imgs = []
    curr_img = []
    flag = False
    with open(file_path, 'rb') as f:
        for l in f:
            l_cleaned = l.decode('utf-8').replace(' ', '').strip()
            if l_cleaned:
                curr_img.append(l.decode('utf-8'))
            else:
                if curr_img:
                    imgs.append(curr_img)
                curr_img = []
    return imgs

def get_label(label_path):
    train_labels = []
    with open(label_path, 'rb') as f:
        for l in f:
            if l.decode('utf-8').strip():
                train_labels.append(int(l.decode('utf-8').strip()))
    
    return train_labels

In [170]:
def make_features(imgs, n_dim):
    image_features = np.zeros((len(imgs), n_dim), int)
    for i, img in enumerate(imgs):
        img_feat = []
        for r in img:
            r_cleaned = r.replace(' ', '0')\
                         .replace('\n', '0')\
                         .replace('+', '1')\
                         .replace('#', '1')

            r_cleaned = np.array(list(r_cleaned)).astype(np.uint8)
            img_feat.append(r_cleaned)
        m_unravel = np.array(img_feat).ravel().reshape(1, -1)
        if m_unravel.shape[1] > n_dim:
            m_unravel = m_unravel[:n_dim]
        elif m_unravel.shape[1] < n_dim:
            pad_len = n_dim - m_unravel.shape[1]
            padd_val = np.zeros((1, pad_len))
            m_unravel = np.hstack((m_unravel, padd_val))

        image_features[i] = m_unravel
    return image_features

### Featurize Digits Train

In [171]:
imgs = convert_to_matrix(digit_train_data_path)

bad_imgs = []
for i__, im in enumerate(imgs):
    if len(im) < 10:
        bad_imgs.append(i__)
        
# for k in bad_imgs:
#     imgs.pop(k)

imgs = np.delete(np.array(imgs), bad_imgs, axis=0)

In [172]:
digit_train_X = make_features(imgs, 580)

In [173]:
digit_train_y = get_label(digit_train_label_path)

In [174]:
digit_train_X.shape

(5000, 580)

In [175]:
len(digit_train_y)

5000

### Featurize Digits Validation

In [176]:
imgs = convert_to_matrix(digit_val_data_path)

bad_imgs = []
for i__, im in enumerate(imgs):
    if len(im) < 10:
        bad_imgs.append(i__)
        
# for k in bad_imgs:
#     imgs.pop(k)

imgs = np.delete(np.array(imgs), bad_imgs, axis=0)

In [177]:
digit_val_X = make_features(imgs, 580)

In [178]:
digit_val_y = get_label(digit_val_label_path)

In [179]:
digit_val_X.shape

(1000, 580)

In [180]:
len(digit_val_y)

1000

### Featurize Digits Test

In [181]:
imgs = convert_to_matrix(digit_test_data_path)

bad_imgs = []
for i__, im in enumerate(imgs):
    if len(im) < 10:
        bad_imgs.append(i__)
        
# for k in bad_imgs:
#     imgs.pop(k)

imgs = np.delete(np.array(imgs), bad_imgs, axis=0)

In [182]:
digit_test_X = make_features(imgs, 580)
digit_test_y = get_label(digit_test_label_path)
digit_test_X.shape

(1000, 580)

In [183]:
len(digit_test_y)


1000

### Featurize Face Data

conditions:
    - greater than 65
    - where the line is empty 

Today:
    - convert all remaining datasets to features
        - digit val
        - digit test
        - face val
        - face test
    - build a model 

In [51]:
def faces_convert_to_matrix(face_train_data_path):
    main_l = []
    tmp_l = []
    cnt = 0
    with open(face_train_data_path, 'rb') as f:
        for l in f:
            l_cleaned = l.decode('utf-8').replace(' ', '').strip()
            cnt += 1
            if cnt < 65:
                tmp_l.append(l.decode('utf-8'))     
            elif cnt >= 65 and l_cleaned:
                continue
            elif cnt >= 65 and not l_cleaned:
                main_l.append(tmp_l)
                cnt = 0
                tmp_l = []
    return main_l

### Train

In [125]:
faces_imgs = faces_convert_to_matrix(face_train_data_path)
face_train_X = make_features(faces_imgs, 3904)
face_train_y = get_label(face_train_label_path)

In [126]:
face_train_X.shape

(451, 3904)

In [127]:
len(face_train_y)

451

### Validation

In [128]:
faces_imgs = faces_convert_to_matrix(face_val_data_path)
face_val_X = make_features(faces_imgs, 3904)
face_val_y = get_label(face_val_label_path)

In [129]:
face_val_X.shape

(301, 3904)

In [130]:
len(face_val_y)

301

### Test

In [131]:
faces_imgs = faces_convert_to_matrix(face_test_data_path)
face_test_X = make_features(faces_imgs, 3904)
face_test_y = get_label(face_test_label_path)

In [132]:
face_test_X.shape

(150, 3904)

In [133]:
len(face_test_y)

150

### Model Building - Perceptron

Components:
    - gradient descent
    - weight update based on batch GD
    - iteration

In [161]:
digits_n_dims = 580
faces_n_dims = 3904

digits_n_classes = 10
faces_n_classes = 2

alpha = 0.01
epochs = 20

In [None]:
[0.45,30,0.46,65,34,31,9, 8, 0.5, 2] = 3

In [150]:
def init_weight(n_dims, n_classes):
    return np.random.rand(n_classes, n_dims + 1)

def predict(w, x):
    pred_val = np.dot(w, np.append(x, 1))
    return np.argmax(pred_val)

# Back prop
def w_update(x, y, pred_class, alpha=alpha):
    w_tmp = w[pred_class]
    w[y] += alpha + np.append(x, 1) # add a bias term to your x
    w[pred_class] -= alpha * np.append(x, 1)

In [154]:
## train
def train(epochs, x_train, y_train):
    num = len(y_train)
    for j in range(0, epochs):
        acc = 0
        order = np.random.permutation(num)
        for i in range(0, num):
            x_ = x_train[order[i]]
            y_ = y_train[order[i]]
            
            y_hat = predict(w, x_)
            
            if y_hat == y_:
                acc += 1
            else:
                w_update(x_.reshape(-1, 1), y_, y_hat)
        
        accuracy = acc / float(num)
        print('Iteration {}: acc = {}'.format(j+1, accuracy*100))

In [155]:
## test
def test(x_test, y_test):
    num = len(y_test)
    acc = 0
    for i in range(0, num):
        x_ = x_test[i]
        y_ = y_test[i]
        
        y_hat = predict(w, x_)
        
        if y_hat == y_:
            acc += 1
    
    accuracy = acc/float(num)
    return accuracy*100

### Train digits model

In [147]:
w = init_weight(digits_n_dims, digits_n_classes)

In [156]:
train(epochs, digit_train_X, digit_train_y)

Iteration 1: acc = 78.38000000000001
Iteration 2: acc = 85.86
Iteration 3: acc = 86.28
Iteration 4: acc = 86.46000000000001
Iteration 5: acc = 87.52
Iteration 6: acc = 87.5
Iteration 7: acc = 87.62
Iteration 8: acc = 87.53999999999999
Iteration 9: acc = 88.02
Iteration 10: acc = 88.24
Iteration 11: acc = 88.06
Iteration 12: acc = 88.28
Iteration 13: acc = 88.24
Iteration 14: acc = 88.08
Iteration 15: acc = 88.48
Iteration 16: acc = 88.48
Iteration 17: acc = 88.56
Iteration 18: acc = 88.34
Iteration 19: acc = 88.94
Iteration 20: acc = 88.1


In [157]:
test(digit_val_X, digit_val_y)

82.8

In [158]:
test(digit_test_X, digit_test_y)

83.3

### Train faces model

In [162]:
w = init_weight(faces_n_dims, faces_n_classes)

In [163]:
w.shape

(2, 3905)

In [164]:
train(epochs, face_train_X, face_train_y)

Iteration 1: acc = 66.07538802660754
Iteration 2: acc = 90.46563192904657
Iteration 3: acc = 96.23059866962306
Iteration 4: acc = 99.33481152993349
Iteration 5: acc = 99.55654101995566
Iteration 6: acc = 100.0
Iteration 7: acc = 100.0
Iteration 8: acc = 100.0
Iteration 9: acc = 100.0
Iteration 10: acc = 100.0
Iteration 11: acc = 100.0
Iteration 12: acc = 100.0
Iteration 13: acc = 100.0
Iteration 14: acc = 100.0
Iteration 15: acc = 100.0
Iteration 16: acc = 100.0
Iteration 17: acc = 100.0
Iteration 18: acc = 100.0
Iteration 19: acc = 100.0
Iteration 20: acc = 100.0


In [165]:
test(face_val_X, face_val_y)

86.37873754152824

In [166]:
test(face_test_X, face_test_y)

85.33333333333334

### Model - Naive Bayes

In [188]:
import numpy as np
from sklearn.datasets import fetch_openml
from scipy.stats import norm
from scipy.stats import multivariate_normal as mvn
import time

In [198]:
class NaiveBayes(object):
    def fit(self, X, y, smoothing=10e-3):
        self.gaussians = dict()
        self.priors = dict()
        labels = set(y)
        for c in labels:
            current_x = X[y == c]
            self.gaussians[c] = {
                'mean': current_x.mean(axis=0),
                'var': current_x.var(axis=0) + smoothing, 
            }
            self.priors[c] = float(len(y[y==c])) / len(y)
            
    def score(self, X, y):
        p = self.predict(X)
        return np.mean(p == y)
    
    def predict(self, X):
        N, D = X.shape
        K =len(self.gaussians)
        P = np.zeros((N, K))
        for c, g in self.gaussians.items():
            mean, var = g['mean'], g['var']
            P[:, c] = mvn.logpdf(X, mean=mean, cov=var) + np.log(self.priors[c])
        return np.argmax(P, axis=1)

#### Naive Bayes on Digits

In [199]:
X_train, y_train = digit_train_X, np.array(digit_train_y)
X_val, y_val = digit_val_X, np.array(digit_val_y)
X_test, y_test = digit_test_X, np.array(digit_test_y)

model = NaiveBayes()
t0 = time.time()
model.fit(X_train, y_train)
print ("Training time:", time.time() - t0)

print("Train accuracy", model.score(X_train, y_train))
print("Validation accuracy", model.score(X_val, y_val))
print("Test accuracy:", model.score(X_test, y_test))

Training time: 0.023715734481811523
Train accuracy 0.7942
Validation accuracy 0.759
Test accuracy: 0.685


#### Naive Bayes on Faces Data

In [200]:
X_train, y_train = face_train_X, np.array(face_train_y)
X_val, y_val = face_val_X, np.array(face_val_y)
X_test, y_test = face_test_X, np.array(face_test_y)

model = NaiveBayes()
t0 = time.time()
model.fit(X_train, y_train)
print ("Training time:", time.time() - t0)

print("Train accuracy", model.score(X_train, y_train))
print("Validation accuracy", model.score(X_val, y_val))
print("Test accuracy:", model.score(X_test, y_test))

Training time: 0.04402589797973633
Train accuracy 0.9201773835920177
Validation accuracy 0.8604651162790697
Test accuracy: 0.8733333333333333
