In [1]:
from __future__ import division
from ps4_utils import load_data,load_experiment
from ps4_utils import AbstractGenerativeModel
from ps4_utils import save_submission
#from scipy.misc import logsumexp
from scipy.special import logsumexp
import numpy as np
import random
data_fn = "datasets-ps4.h5"
MAX_OUTER_ITER = 50

In [8]:
class MixtureModel(AbstractGenerativeModel):
    def __init__(self, CLASSES, NUM_FEATURES, NUM_MIXTURE_COMPONENTS, MAX_ITER=50, EPS=10**(-5)):
        AbstractGenerativeModel.__init__(self, CLASSES, NUM_FEATURES)
        self.num_mixture_components = NUM_MIXTURE_COMPONENTS # list of num_mixture_components (length num_classes)
        self.max_iter = MAX_ITER # max iterations of EM
        self.epsilon = EPS # help with stability, to be used according to hint given at end of pset4.pdf
        self.params = { # lists of length CLASSES
            'pi': [np.repeat(1/k,k) for k in self.num_mixture_components], # with pi_c for each class
            'theta': [np.zeros((self.num_features,k)) for k in self.num_mixture_components], # with theta_c for each class
        }
    def pack_params(self, X, class_idx):
        pi,theta = self.fit(X[class_idx],class_idx) # fit parameters
        self.params['pi'][class_idx] = pi # update member variable pi
        self.params['theta'][class_idx] = theta #update member variable theta
        
    #make classification based on which mixture model gives higher probability to generating point xi
    def classify(self, X):
        P = list()
        pi = self.params['pi']
        theta = self.params['theta']
        for c in range(self.num_classes):
            _,Pc = self.findP(X, pi[c], theta[c])
            P.append(Pc)
        return np.vstack(P).T.argmax(-1) # np.array of class predictions for each data point in X

    # --- E-step
    def updateLatentPosterior(self, X, pi, theta, num_mixture_components): # update the latent posterior
        # --- gamma: responsibilities (probabilities), np.array (matrix)
        # ---        shape: number of data points in X (where X consists of datapoints from class c) by NUM_MIXTURE_COMPONENTS[c]
        # note: can use output of findP here (with care taken to return gamma containing proper probabilities)
        gamma = np.zeros((X.shape[0],num_mixture_components))
        t,total = self.findP(X,pi,theta)
        for c in range(num_mixture_components):
            gamma[:,c] = np.exp(t[:,c]-total)
        return gamma
    
    # --- M-step (1)
    @staticmethod
    def updatePi(gamma): #update the pi component using the posteriors (gammas)
        # --- pi_c: class specific pi, np.array (vector)
        # ---        shape: NUM_MIXTURE_COMPONENTS[c]
        pi_c = np.sum(gamma,axis = 0)/gamma.shape[0]
        return pi_c
    
    # -- M-step (2)
    @staticmethod
    def updateTheta(X, gamma): #update theta component using posteriors (gammas)
        # --- theta_c: class specific theta, np.array matrix
        # ---        shape: NUM_FEATURES by NUM_MIXTURE_COMPONENTS[c]
        theta_c = np.dot(X.T,gamma)/np.sum(gamma,axis = 0)
        return theta_c 
    
    @staticmethod
    def findP(X, pi, theta):
        # --- t: logprobabilities of x given each component of mixture
        # ---        shape: number of data points in X (where X consists of datapoints from class c) by NUM_MIXTURE_COMPONENTS[c] 
        # --- logsumexp(t,axis=1): (for convenience) once exponentiated, gives normalization factor over all mixture components
        # ---        shape: number of data points in X (where X consists of datapoints from class c)
        
        new_theta = theta + 10**(-7)
        t = np.log(pi) + np.dot(X,np.log(new_theta)) + np.dot((1-X),np.log(1-new_theta))
        return t,logsumexp(t,axis=1)
        
    # --- execute EM procedure
    def fit(self, X, class_idx):
        max_iter = self.max_iter
        eps = self.epsilon
        N = X.shape[0]
        pi = self.params['pi'][class_idx]
        theta = self.params['theta'][class_idx]
        num_mixture_components = self.num_mixture_components[class_idx]
        # INITIALIZE theta, note theta is currently set to zeros but needs to be officially initialized here
        for i in range(num_mixture_components):
            a = np.random.randint(N, size= N//num_mixture_components)
            theta[:,i] = np.mean(X[a,:],axis=0)
        for i in range(max_iter):
            # E-step: gamma = self.updateLatentPosterior
            # M-step(1): pi = self.updatePi 
            # M-step(2): theta = self.updateTheta
            theta[theta<eps] = eps
            theta[theta>1-eps] = 1-eps
            gamma = self.updateLatentPosterior(X, pi, theta, num_mixture_components)
            pi = self.updatePi(gamma)
            theta = self.updateTheta(X, gamma)
        theta[theta<eps] = eps
        theta[theta>1-eps] = 1-eps
        return pi,theta #pi and theta, given class_idx

In [3]:
class NaiveBayesModel(AbstractGenerativeModel):
    def __init__(self, CLASSES, NUM_FEATURES, EPS=10**(-12)):
        AbstractGenerativeModel.__init__(self, CLASSES, NUM_FEATURES)
        self.epsilon = EPS # help with stability
        self.params = {
            'p': [np.zeros((NUM_FEATURES))] * self.num_classes # estimated log-probabilities of features for each class
        }
    def pack_params(self, X, class_idx):
        p = self.fit(X[class_idx])
        self.params['p'][class_idx] = p
    def classify(self, X): # naive bayes classifier
        # --- predictions: predictions for data points in X (where X consists of datapoints from class c), np.array (vector)
        # ---        shape: number of data points
        array = np.zeros((X.shape[0],len(self.params['p'])))
        for i in range(X.shape[0]):
            x = X[i]
            for c in range(len(self.params['p'])): 
                prob = self.params['p'][c]+self.epsilon #theta_c
                array[i][c] = np.sum(x*np.log(prob) + (1-x)*np.log(1-prob))
        predictions = np.argmax(array,axis = 1)
        return predictions
    def fit(self, X):
        # --- estimated_p: estimated p's of features for input X (where X consists of datapoints from class c), np.array (vector)
        # ---        shape: NUM_FEATURES
        estimated_p = np.sum(X,axis = 0)/X.shape[0]
        return estimated_p

In [9]:
from sklearn import metrics
experiment_name = "sentiment_analysis"
# --- SENTIMENT ANALYSIS setup
Xtrain,Xval,num_classes,num_features = load_experiment(data_fn, experiment_name)
# -- build naive bayes model for sentiment analysis
print("SENTIMENT ANALYSIS -- NAIVE BAYES MODEL:")
nbm = NaiveBayesModel(num_classes, num_features)
nbm.train(Xtrain)
print("ACCURACY ON VALIDATION: " + str(nbm.val(Xval)))
array1 =[]
array2 =[]

for i in range(num_classes):
    array1.append(nbm.classify(Xval[i]))
    array2.append(np.array([i]*len(nbm.classify(Xval[i]))))
y_pred = np.hstack(array1)
y_true = np.hstack(array2)
print() 
print('The confusion matrix of the validation \n dataset by running sentiment_analysis on naive bayes model')
print(metrics.confusion_matrix(y_pred,y_true))
print()

# -- build mixture model for sentiment analysis
print("SENTIMENT ANALYSIS -- MIXTURE MODEL:")
for i in range(MAX_OUTER_ITER):
    num_mixture_components =  np.random.randint(2,15,num_classes)
    print("COMPONENTS: " + " ".join(str(i) for i in num_mixture_components))
    mm = MixtureModel(num_classes, num_features, num_mixture_components)
    mm.train(Xtrain)
    print("ACCURACY ON VALIDATION: " + str(mm.val(Xval)))

# submit to kaggle
Xkaggle = load_data(data_fn, experiment_name, "kaggle")
save_submission("mm-{}-submission.csv".format(experiment_name), mm.classify(Xkaggle))

SENTIMENT ANALYSIS -- NAIVE BAYES MODEL:
ACCURACY ON VALIDATION: 0.74

The confusion matrix of the validation 
 dataset by running sentiment_analysis on naive bayes model
[[ 91  56]
 [ 74 279]]

SENTIMENT ANALYSIS -- MIXTURE MODEL:
COMPONENTS: 8 4
ACCURACY ON VALIDATION: 0.728
COMPONENTS: 3 14
ACCURACY ON VALIDATION: 0.658
COMPONENTS: 11 13
ACCURACY ON VALIDATION: 0.72
COMPONENTS: 9 11
ACCURACY ON VALIDATION: 0.726
COMPONENTS: 6 9
ACCURACY ON VALIDATION: 0.702
COMPONENTS: 5 6
ACCURACY ON VALIDATION: 0.712
COMPONENTS: 14 5
ACCURACY ON VALIDATION: 0.694
COMPONENTS: 7 13
ACCURACY ON VALIDATION: 0.71
COMPONENTS: 10 5
ACCURACY ON VALIDATION: 0.7
COMPONENTS: 12 4
ACCURACY ON VALIDATION: 0.728
COMPONENTS: 3 7
ACCURACY ON VALIDATION: 0.708
COMPONENTS: 4 13
ACCURACY ON VALIDATION: 0.678
COMPONENTS: 10 12
ACCURACY ON VALIDATION: 0.698
COMPONENTS: 14 13
ACCURACY ON VALIDATION: 0.72
COMPONENTS: 14 8
ACCURACY ON VALIDATION: 0.694
COMPONENTS: 10 7
ACCURACY ON VALIDATION: 0.732
COMPONENTS: 14 4
ACCUR

In [10]:
experiment_name = "mnist"
# --- MNIST DIGIT CLASSIFICATION setup
Xtrain,Xval,num_classes,num_features = load_experiment(data_fn, experiment_name)
# -- build naive bayes model for mnist digit classification
print("MNIST DIGIT CLASSIFICATION -- NAIVE BAYES MODEL:")
nbm = NaiveBayesModel(num_classes, num_features)
nbm.train(Xtrain)
print("ACCURACY ON VALIDATION: " + str(nbm.val(Xval)))

array1 =[]
array2 =[]

for i in range(num_classes):
    array1.append(nbm.classify(Xval[i]))
    array2.append(np.array([i]*len(nbm.classify(Xval[i]))))
y_pred = np.hstack(array1)
y_true = np.hstack(array2)
print()
print('The confusion matrix of the validation \n dataset by running mnist on naive bayes model')
print(metrics.confusion_matrix(y_pred,y_true))
print()

# -- build mixture model for mnist digit classification
print("MNIST DIGIT CLASSIFICATION -- MIXTURE MODEL:")
for i in range(MAX_OUTER_ITER):
    num_mixture_components =  np.random.randint(2,15,num_classes)
    print("COMPONENTS: " + " ".join(str(i) for i in num_mixture_components))
    mm = MixtureModel(num_classes, num_features, num_mixture_components)
    mm.train(Xtrain)
    print("ACCURACY ON VALIDATION: " + str(mm.val(Xval)))
    
# submit to kaggle
Xkaggle = load_data(data_fn, experiment_name, "kaggle")
save_submission("mm-{}-submission.csv".format(experiment_name), mm.classify(Xkaggle))

MNIST DIGIT CLASSIFICATION -- NAIVE BAYES MODEL:
ACCURACY ON VALIDATION: 0.7355

The confusion matrix of the validation 
 dataset by running mnist on naive bayes model
[[151   0   5   3   2   7   2   1   2   1]
 [  0 206  14   7   3  12   8  13  18   5]
 [  5   5 147  10   5   3   6   2  13   3]
 [  5   1  11 137   0  31   2   3  20  10]
 [  0   2   6   0 147   9   9  12   5  22]
 [ 10   3   2   8   4  91   9   1   8   5]
 [ 11   1   9   2   2   5 177   0   3   1]
 [  1   2   3   4   6   6   1 179   0  21]
 [  2   2   9   8   3   6   2   2 122   4]
 [  2   1   4   4  21   2   1  20   5 114]]

MNIST DIGIT CLASSIFICATION -- MIXTURE MODEL:
COMPONENTS: 3 12 14 5 12 11 12 2 7 11
ACCURACY ON VALIDATION: 0.7625
COMPONENTS: 10 6 8 2 12 10 11 12 6 7
ACCURACY ON VALIDATION: 0.7655
COMPONENTS: 14 4 8 8 6 3 7 13 6 3
ACCURACY ON VALIDATION: 0.7685
COMPONENTS: 2 11 5 11 9 13 8 14 9 10
ACCURACY ON VALIDATION: 0.772
COMPONENTS: 8 11 11 10 13 8 4 12 7 2
ACCURACY ON VALIDATION: 0.763
COMPONENTS: 14 5 6 