In [28]:
import numpy as np
import random
import sys
sys.path.insert(0, '/Users/trimcao/Dropbox/Richardson/Fall-2017/cs6375-ml-ruozzi/solution/lib')
sys.path.insert(0, '/home/trimcao/Dropbox/Richardson/Fall-2017/cs6375-ml-ruozzi/solution/lib')
from importlib import reload
# using reload to avoid having to restart the Python kernel after modifying source code
import NaiveBayes; reload(NaiveBayes)
from NaiveBayes import MultinomialNB, MixtureNB, MixtureNBDirichlet
from scipy.stats import dirichlet

In [8]:
def read_in(file_path):
    X = []
    y = []
    with open(file_path, 'r') as f:
        for line in f:
            info = line.strip('\n').split(',')
            # label is the first column
            X.append(info[1])
            y.append(info[0])
    return X, y

In [9]:
def preprocess(X, y):
    """
    Convert text strings to a data matrix of numbers.
    """
    random.seed(0) # choose a seed so I can reproduce the results
    label2int = {label:i for i, label in enumerate(set(y))}
    int2label = {i:label for i, label in enumerate(set(y))}
    chars = {'A', 'G', 'T', 'C'}
    char2int = {char:i for i, char in enumerate(chars)}
    int2char = {i:char for i, char in enumerate(chars)}
    counts = {i:0 for i in int2char}
    # feature matrix
    data_matrix = np.zeros((len(X), len(chars)))
    for i in range(data_matrix.shape[0]):
        string = X[i].split()[0]
        for char in string:
            if char == 'D':
                char = random.choice(['A', 'G', 'T'])
            elif char == 'N':
                char = random.choice(['A', 'G', 'C', 'T'])
                # print(char)
            elif char == 'S':
                char = random.choice(['C', 'G'])
            elif char == 'R':
                char = random.choice(['A', 'G'])
            data_matrix[i, char2int[char]] += 1
    # label vector
    labels = np.zeros((len(y),))
    for i in range(labels.shape[0]):
        labels[i] = label2int[y[i]]
    return data_matrix, labels, label2int, int2label, char2int, int2char

In [10]:
X, y = read_in('hw4_data/bio.data')

In [11]:
X, y, label2int, int2label, char2int, int2char = preprocess(X, y)

### One Multinomial NB Model

In [97]:
clf = MultinomialNB()
clf.fit(X,y)
preds = clf.predict(X)
np.mean(preds==y)

0.5263322884012539

### Mixture of Naive Bayes

In [12]:
# gather the data
X_train = X[:2126]
y_train = y[:2126]
X_test = X[2126:]
y_test = y[2126:]

In [13]:
X_train.shape

(2126, 4)

In [14]:
K = 5 # number of NB models
L = 3 # number of different labels
D = 4 # number of features
N = X_train.shape[0] # number of training samples
np.random.seed(0)

In [114]:
np.random.seed(0)
clf = MixtureNB(K, L, D)
for i in range(200):
    clf.em(X_train, y_train)
    if i % 50 == 0:
        print('iteration', i)
        print(clf.p_z)

preds = clf.predict(X_test)
np.mean(preds==y_test)

iteration 0
[-1.43156228 -2.97738535 -1.24658236 -1.21156418 -2.08006585]
iteration 50
[-1.42750475 -1.6528797  -2.04554912 -1.53876251 -1.49325698]
iteration 100
[-1.64871911 -1.46614275 -2.10439881 -1.47529213 -1.48607487]
iteration 150
[-1.70886814 -1.42725948 -2.2083292  -1.48988034 -1.41189935]


0.55263157894736847

In [31]:
np.random.seed(0)
# test new Dirichlet naive bayes
num_iter = 200
clf = MixtureNBDirichlet(K=5, L=3, D=4)
for i in range(num_iter):
    clf.em(X_train, y_train)
    if i % 10 == 0:
        print(i)

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190


In [32]:
preds = clf.predict(X_test)
np.mean(preds==y_test)

0.51973684210526316

In [10]:
def initialize(K=5, L=3, D=4):
    """
    k: number of NB models
    D: number of features
    L: number of different labels
    """
    models = [MultinomialNB() for i in range(K)]
    for i in range(K):
        p_y = np.random.random(L)
        p_y = np.log(p_y / p_y.sum())
        models[i].prob_y = p_y
        p_x_given_y = np.random.random((L,D))
        for j in range(L):
            p_x_given_y[j] /= p_x_given_y[j].sum()
        models[i].prob_x_given_y = np.log(p_x_given_y)
    # initialize lambda z
    p_z = np.random.random(K)
    p_z = p_z / p_z.sum()
    p_z = np.log(p_z)
    return models, p_z

#### EM algorithm

In [11]:
def em(X_train, y_train, models, p_z, L=3, D=4):
    """
    EM algorithm for Mixture of Naive Bayes models.
    """
    N = X_train.shape[0] # number of training samples 
    K = len(models)
    
    # E-step, update Qi(z)
    q_z = np.zeros((N, K))
    for i in range(N):
        for j in range(K):
            # update q_z for sample i and NB model j
            q_z[i,j] = p_z[j] + models[j].log_prob_x_y(X_train[i],y_train[i])
        # normalize q_z[i], then take log
        q_z[i] = np.exp(q_z[i])
        q_z[i] = np.log(q_z[i] / q_z[i].sum())
    # M-step, update parameters in each model
    # k = NB model number
    # l = value of y
    # d = value of x

    # NOTE: keep q_z and p_z to be probability, not log probability
    # calculate the p_y as probabilty for each model, only calculate the log 
    # at the end
    q_z_nonlog = np.exp(q_z)
    p_z = np.sum(q_z_nonlog, axis=0)
    for k in range(K):
        # build the mttrix q_z[k] * X_train
        X_q = np.tile(q_z_nonlog[:,k], (D,1)).T * X_train
        # q_z_nonlog for z = l
        q_k = q_z_nonlog[:,k]
        p_k = p_z[k]
        models[k].prob_y = np.zeros(L)
        models[k].prob_x_given_y = np.zeros((L,D))
        for l in range(L):
            # matrix q_z for the rows when y == l
            models[k].prob_y[l] = np.log(q_k[y_train==l].sum() / p_z[k])
            # update P(x|y) for each model
            X_l = X_q[y_train==l] # part of X_q that has y == l
            models[k].prob_x_given_y[l] = np.log(X_l.sum(axis=0) / X_l.sum())        
    # divide p_z by N
    p_z = np.log(p_z / N)
    return models, q_z, p_z

What's next:
- Combine the EM steps to make a training function
- How to check for convergence
- Prediction
- Test the numbers

### Test EM algorithm 

In [12]:
np.random.seed(0)

In [13]:
models, p_z = initialize(K=5, L=3, D=4)

In [14]:
models[1].prob_y

array([-2.37845213, -3.83925239, -0.12126818])

In [15]:
models[1].prob_x_given_y

array([[-1.48220488, -1.37062569, -1.25299114, -1.45557348],
       [-1.46656711, -0.94103233, -2.82799688, -1.13965968],
       [-2.64778195, -0.76225956, -1.3557171 , -1.58563049]])

In [16]:
for i in range(200):
    models, q_z, p_z = em(X_train, y_train, models, p_z)
    if i % 50 == 0:
        print('iteration', i)
        print(p_z)

iteration 0
[-1.43156228 -2.97738535 -1.24658236 -1.21156418 -2.08006585]
iteration 50
[-1.42750475 -1.6528797  -2.04554912 -1.53876251 -1.49325698]
iteration 100
[-1.64871911 -1.46614275 -2.10439881 -1.47529213 -1.48607487]
iteration 150
[-1.70886814 -1.42725948 -2.2083292  -1.48988034 -1.41189935]


In [18]:
models[1].prob_y

array([-0.20289666, -2.00499454, -3.0164174 ])

In [19]:
models[1].prob_x_given_y

array([[-1.66554019, -1.08871828, -1.49101687, -1.38981973],
       [-1.09691085, -2.01982143, -1.95988956, -0.9351018 ],
       [-2.07750529, -1.13208376, -1.10667195, -1.50625281]])

In [20]:
print(p_z)

[-1.791748   -1.85833596 -2.28992597 -1.19878837 -1.29256288]


### Prediction 

In [33]:
def predict_prob_log(X, K=5):
    probs = []
    for i in range(X.shape[0]):
        prob = None
        for k in range(K):
            _, predict_prob = models[k].predict_single(X[i])
            if prob is None:
                prob = np.exp(predict_prob) * np.exp(p_z[k])
            else:
                prob += np.exp(predict_prob) * np.exp(p_z[k])
            # print(prob)
        probs.append(prob)
    return np.log(probs)

In [34]:
predict_prob_log(X_test[:2])

array([[-83.13788439, -83.69393242, -84.26781131],
       [-84.08328226, -83.98040351, -84.96280829]])

In [40]:
def predict(X, K=5):
    preds_prob = predict_prob_log(X)
    return np.argmax(preds_prob, axis=1)

In [43]:
preds = predict(X_test)
np.mean(preds==y_test)

0.5385338345864662

### Experiment: 10 times running EM

In [47]:
num_iter = 200
acc_train = 0
acc_test = 0
for m in range(10):
    print('start EM number', m)
    np.random.seed(m+10)
    models, p_z = initialize(K=5, L=3, D=4)
    for i in range(num_iter):
        models, q_z, p_z = em(X_train, y_train, models, p_z)
    print('finish EM.')
    train_preds = predict(X_train)
    test_preds = predict(X_test)
    acc_train += np.mean(train_preds==y_train)
    acc_test += np.mean(test_preds==y_test)    
    print('Train accuracy:', np.mean(train_preds==y_train))
    print('Test accuracy:', np.mean(test_preds==y_test))
    print()
    
print('Average accuracy on training set:', acc_train/10)
print('Average accuracy on test set:', acc_test/10)

start EM number 0
finish EM.
Train accuracy: 0.53809971778
Test accuracy: 0.522556390977

start EM number 1
finish EM.
Train accuracy: 0.541392285983
Test accuracy: 0.527255639098

start EM number 2
finish EM.
Train accuracy: 0.554562558796
Test accuracy: 0.548872180451

start EM number 3


  from ipykernel import kernelapp as app
  log_prob = (x * self.prob_x_given_y[y]).sum() + self.prob_y[y]


finish EM.
Train accuracy: 0.50987770461
Test accuracy: 0.536654135338

start EM number 4
finish EM.
Train accuracy: 0.56020696143
Test accuracy: 0.536654135338

start EM number 5
finish EM.
Train accuracy: 0.555032925682
Test accuracy: 0.552631578947

start EM number 6
finish EM.
Train accuracy: 0.542803386642
Test accuracy: 0.536654135338

start EM number 7
finish EM.
Train accuracy: 0.540451552211
Test accuracy: 0.523496240602

start EM number 8
finish EM.
Train accuracy: 0.547507055503
Test accuracy: 0.547932330827

start EM number 9
finish EM.
Train accuracy: 0.55409219191
Test accuracy: 0.546052631579

Average accuracy on training set: 0.544402634055
Average accuracy on test set: 0.53787593985


### EM with Dirichlet distribution

In [30]:
from scipy.stats import dirichlet

In [45]:
def initialize_dirichlet(K=5, L=3, D=4):
    """
    k: number of NB models
    D: number of features
    L: number of different labels
    """
    models = [MultinomialNB() for i in range(K)]
    for i in range(K):
        models[i].prob_y = dirichlet([2]*L)
        # create the dirichlet distribution individually so I have
        # separate scipy.stats.dirichlet objects (and I can change
        # them later).
        prob_x_given_y = []
        for j in range(L):
            prob_x_given_y.append(dirichlet([2]*D))
#         models[i].prob_x_given_y = [dirichlet([2]*D)]*L
        models[i].prob_x_given_y = prob_x_given_y
    # initialize lambda z
    p_z = np.random.random(K)
    p_z = p_z / p_z.sum()
    p_z = np.log(p_z)
    return models, p_z

In [85]:
def em_dirichlet(X_train, y_train, models, p_z, L=3, D=4):
    """
    EM algorithm for Mixture of Naive Bayes models.
    """
    N = X_train.shape[0] # number of training samples 
    K = len(models)
    
    # E-step, update Qi(z)
    q_z = np.zeros((N, K))
    for i in range(N):
        for j in range(K):
            # update q_z for sample i and NB model j
            q_z[i,j] = p_z[j] + models[j].log_prob_x_y_dirichlet(X_train[i],y_train[i])
        # normalize q_z[i], then take log
        q_z[i] = np.exp(q_z[i])
        q_z[i] = np.log(q_z[i] / q_z[i].sum())
        
    # M-step, update parameters in each model
    # k = NB model number
    # l = value of y
    # d = value of x

    # NOTE: keep q_z and p_z to be probability, not log probability
    # calculate the p_y as probabilty for each model, only calculate the log 
    # at the end
    q_z_nonlog = np.exp(q_z)
    p_z = np.sum(q_z_nonlog, axis=0)
    for k in range(K):
        # build the mttrix q_z[k] * X_train
        X_q = np.tile(q_z_nonlog[:,k], (D,1)).T * X_train
        # q_z_nonlog for z = l
        q_k = q_z_nonlog[:,k]
        p_k = p_z[k]
        # models[k].prob_y = np.zeros(L)
        # models[k].prob_x_given_y = np.zeros((L,D))
        for l in range(L):
            # matrix q_z for the rows when y == l
            # models[k].prob_y.alpha[l] = np.int(q_k[y_train==l].sum())
            models[k].prob_y.alpha[l] += np.int(q_k[y_train==l].sum())
            # update P(x|y) for each model
            X_l = X_q[y_train==l] # part of X_q that has y == l
            # print(X_l.sum(axis=0))
            # print(X_l.sum(axis=0).astype(np.int64))
            # models[k].prob_x_given_y[l].alpha = X_l.sum(axis=0).astype(np.int64)
            models[k].prob_x_given_y[l].alpha += X_l.sum(axis=0).astype(np.int64)
    # divide p_z by N
    p_z = np.log(p_z / N)
    return models, q_z, p_z

In [86]:
# test EM algorithm with Dirichlet distribution
num_iter = 1000
print('start EM number', 1)
np.random.seed(0)
models, p_z = initialize_dirichlet(K=5, L=3, D=4)
for i in range(num_iter):
    models, q_z, p_z = em_dirichlet(X_train, y_train, models, p_z)
    if i % 50 == 0:
        print(i)
        print(p_z)
        print(models[0].prob_y.alpha)
        print(models[0].prob_x_given_y[0].alpha)
        print()
print('finish EM.')

start EM number 1
0
[-1.64887203 -1.44873541 -1.54561853 -1.61164416 -1.8317879 ]
[104 201 101]
[1419 1261 1599 2018]

50
[-1.60852164 -1.46430882 -1.52467218 -1.64719095 -1.84328641]
[102 217 105]
[1374 1250 1613 1925]

100
[-1.58897832 -1.4730118  -1.52637274 -1.68021009 -1.81416123]
[104 221 108]
[1397 1273 1638 1953]

150
[-1.58950314 -1.51938611 -1.51576935 -1.67947996 -1.76599358]
[105 220 107]
[1410 1283 1654 1985]

200
[-1.60682987 -1.52179235 -1.50873987 -1.67337562 -1.75819163]
[103 217 105]
[1379 1259 1624 1933]

250
[-1.60955387 -1.56197539 -1.49667903 -1.68684157 -1.70745781]
[101 217 105]
[1364 1245 1603 1906]

300
[-1.62287366 -1.56671507 -1.52921972 -1.65799161 -1.67817978]
[101 214 103]
[1353 1237 1591 1893]

350
[-1.6542578  -1.5969406  -1.52178872 -1.61632879 -1.66440462]
[ 98 207 100]
[1320 1202 1550 1849]

400
[-1.65678781 -1.60177321 -1.53453543 -1.59136418 -1.66862   ]
[ 97 207 100]
[1308 1192 1537 1833]

450
[-1.64837321 -1.61822944 -1.56335148 -1.57161408 -1.64

In [87]:
p_z

array([-1.64207738, -1.60597727, -1.63509683, -1.58770854, -1.57792833])

In [None]:
[-1.79174801 -1.85833594 -2.28992598 -1.19878836 -1.29256288]

In [80]:
def predict_prob_log_dirichlet(X, K=5):
    probs = []
    for i in range(X.shape[0]):
        prob = None
        for k in range(K):
            _, predict_prob = models[k].predict_single_dirichlet(X[i])
            if prob is None:
                prob = np.exp(predict_prob) * np.exp(p_z[k])
            else:
                prob += np.exp(predict_prob) * np.exp(p_z[k])
            # print(prob)
        probs.append(prob)
    return np.log(probs)

In [81]:
def predict_dirichlet(X, K=5):
    preds_prob = predict_prob_log_dirichlet(X)
    return np.argmax(preds_prob, axis=1)

In [88]:
predict_prob_log_dirichlet(X_test[1],K=5)

array([[ 21.24369977,  21.50989469,  21.25596562],
       [  8.25809567,   8.51072578,   8.24030257],
       [ 17.24715383,  17.5006175 ,  17.24148748],
       [ 14.24876503,  14.50583837,  14.24863845]])

In [89]:
preds = predict_dirichlet(X_test, K=5)
np.mean(preds==y_test)

0.51221804511278191

### Experiment: 10 times running EM with Dirichlet distribution

In [91]:
num_iter = 600
acc_train = 0
acc_test = 0
for m in range(10):
    print('start EM number', m)
    np.random.seed(m+10)
    models, p_z = initialize_dirichlet(K=5, L=3, D=4)
    for i in range(num_iter):
        models, q_z, p_z = em_dirichlet(X_train, y_train, models, p_z)
    print('finish EM.')
    train_preds = predict_dirichlet(X_train)
    test_preds = predict_dirichlet(X_test)
    acc_train += np.mean(train_preds==y_train)
    acc_test += np.mean(test_preds==y_test)    
    print('Train accuracy:', np.mean(train_preds==y_train))
    print('Test accuracy:', np.mean(test_preds==y_test))
    print()
    
print('Average accuracy on training set:', acc_train/10)
print('Average accuracy on test set:', acc_test/10)

start EM number 0
finish EM.
Train accuracy: 0.498118532455
Test accuracy: 0.513157894737

start EM number 1
finish EM.
Train accuracy: 0.505174035748
Test accuracy: 0.516917293233

start EM number 2
finish EM.
Train accuracy: 0.507996237065
Test accuracy: 0.523496240602

start EM number 3
finish EM.
Train accuracy: 0.504233301976
Test accuracy: 0.515977443609

start EM number 4
finish EM.
Train accuracy: 0.501881467545
Test accuracy: 0.520676691729

start EM number 5
finish EM.
Train accuracy: 0.503762935089
Test accuracy: 0.518796992481

start EM number 6
finish EM.
Train accuracy: 0.50611476952
Test accuracy: 0.515037593985

start EM number 7
finish EM.
Train accuracy: 0.508936970837
Test accuracy: 0.513157894737

start EM number 8
finish EM.
Train accuracy: 0.50611476952
Test accuracy: 0.510338345865

start EM number 9
finish EM.
Train accuracy: 0.503762935089
Test accuracy: 0.515037593985

Average accuracy on training set: 0.504609595484
Average accuracy on test set: 0.51625939849