In [1]:
from __future__ import division
from ps4_utils import load_data,load_experiment
from ps4_utils import AbstractGenerativeModel
from ps4_utils import save_submission
from scipy.misc import logsumexp
import numpy as np
# Dataset file name; passed as the first argument to load_experiment / load_data below.
data_fn = "datasets-ps4.h5"
# Number of random mixture-size configurations tried per experiment in the search loops below.
MAX_OUTER_ITER = 15

In [2]:
def log(x):
    """Elementwise natural log with a machine-epsilon offset so that log(0) stays finite.

    Args:
        x: array-like or scalar. Floating (and complex) inputs keep their dtype;
           integer, boolean and plain Python numeric inputs are promoted to
           float first, because ``np.finfo`` is only defined for inexact types.

    Returns:
        ``np.log(x + eps)`` where ``eps`` is the machine epsilon of x's
        (possibly promoted) dtype.
    """
    # asanyarray keeps ndarray subclasses intact and gives scalars a dtype.
    x = np.asanyarray(x)
    if not np.issubdtype(x.dtype, np.inexact):
        x = x.astype(float)
    return np.log(x + np.finfo(x.dtype).eps)

In [3]:
class MixtureModel(AbstractGenerativeModel):
    """Per-class mixture of Bernoulli components, fitted with EM.

    For every class c the model stores mixing weights ``pi_c`` (length K_c) and
    component parameters ``theta_c`` (NUM_FEATURES x K_c), where K_c is
    ``NUM_MIXTURE_COMPONENTS[c]``. The likelihood used throughout is
    ``x*log(theta) + (1-x)*log(1-theta)``, so features are presumably binary
    (or at least in [0, 1]) -- confirm against the data loader.

    ``self.num_classes`` and ``self.num_features`` are read but not assigned
    here; they are presumably set by ``AbstractGenerativeModel.__init__``.
    """

    def __init__(self, CLASSES, NUM_FEATURES, NUM_MIXTURE_COMPONENTS, MAX_ITER=50, EPS=10**(-7)):
        AbstractGenerativeModel.__init__(self, CLASSES, NUM_FEATURES)
        self.num_mixture_components = NUM_MIXTURE_COMPONENTS # list of num_mixture_components (length num_classes)
        self.max_iter = MAX_ITER # max iterations of EM
        self.epsilon = EPS # theta is clipped to [EPS, 1 - EPS] for stability
        self.params = { # lists of length CLASSES
            'pi': [np.repeat(1/k,k) for k in self.num_mixture_components], # uniform pi_c for each class
            'theta': [np.zeros((self.num_features,k)) for k in self.num_mixture_components], # theta_c for each class
        }

    def pack_params(self, X, class_idx):
        """Fit the mixture for one class on ``X[class_idx]`` and store its parameters."""
        pi,theta = self.fit(X[class_idx],class_idx) # fit parameters
        self.params['pi'][class_idx] = pi # update member variable pi
        self.params['theta'][class_idx] = theta # update member variable theta

    def classify(self, X):
        """Return, for each row of X, the index of the class whose mixture gives the
        highest log-likelihood. No class prior is added here."""
        pi = self.params['pi']
        theta = self.params['theta']
        P = list()
        for c in range(self.num_classes):
            _,Pc = self.findP(X, pi[c], theta[c])
            P.append(Pc)
        return np.vstack(P).T.argmax(-1) # np.array of class predictions for each data point in X

    @staticmethod
    def _log_joint(X, pi, theta):
        """Log of pi_k * p(x_n | theta_k) for every point n and component k.

        Returns an array of shape (number of rows of X) by (number of components).
        """
        return log(pi) + X.dot(log(theta)) + (1 - X).dot(log(1 - theta))

    # --- E-step
    def updateLatentPosterior(self, X, pi, theta):
        """Compute the responsibilities gamma[n, k] = p(z_n = k | x_n).

        Shape: number of data points in X by number of mixture components.

        The normalisation is done in log space (subtracting the per-row
        logsumexp before exponentiating). Exponentiating the joint first can
        underflow to 0 for every component and produce 0/0 = NaN.
        """
        t = self._log_joint(X, pi, theta)
        return np.exp(t - logsumexp(t, axis=1)[:, np.newaxis])

    # --- M-step (1)
    @staticmethod
    def updatePi(gamma):
        """New mixing weights: the mean responsibility of each component.

        Returns a vector whose length is the number of mixture components.
        """
        return gamma.mean(axis=0)

    # -- M-step (2)
    @staticmethod
    def updateTheta(X, gamma):
        """New component parameters: responsibility-weighted feature means.

        Returns a matrix of shape NUM_FEATURES by number of mixture components.
        """
        weights = gamma.sum(axis=0)
        # Floor the denominator so a component with zero total responsibility
        # yields 0 rather than 0/0 = NaN; non-degenerate weights are unchanged.
        if np.issubdtype(weights.dtype, np.floating):
            tiny = np.finfo(weights.dtype).tiny
        else:
            tiny = np.finfo(float).tiny
        return X.T.dot(gamma) / np.maximum(weights, tiny)

    @staticmethod
    def findP(X, pi, theta):
        """Return ``(p, logP)`` for the mixture described by ``pi`` and ``theta``.

        p:    joint probability of each point with each component,
              shape: number of data points by number of mixture components.
        logP: log-likelihood of each point under the whole mixture
              (logsumexp over components), shape: number of data points.
        """
        t = MixtureModel._log_joint(X, pi, theta)
        return np.e ** t, logsumexp(t, axis=1)

    # --- execute EM procedure
    def fit(self, X, class_idx):
        """Run EM on the data of one class and return ``(pi, theta)``.

        Args:
            X: data points of class ``class_idx``, one row per point.
            class_idx: index of the class being fitted.

        theta is initialised by assigning every point to a random component and
        taking per-component feature means. pi starts from the value currently
        stored in ``self.params``.
        """
        max_iter = self.max_iter
        eps = self.epsilon
        N = X.shape[0]
        pi = self.params['pi'][class_idx]
        num_mixture_components = self.num_mixture_components[class_idx]

        # INITIALIZE theta from a random hard assignment of points to components.
        assignments = np.random.randint(num_mixture_components, size=N)
        overall_mean = X.mean(axis=0)
        theta = np.empty((X.shape[1], num_mixture_components))
        for k in range(num_mixture_components):
            members = X[assignments == k]
            # A component that drew no points has no mean of its own; fall back
            # to the class-wide mean instead of producing NaN.
            theta[:, k] = members.mean(axis=0) if len(members) else overall_mean
        theta = np.clip(theta, eps, 1 - eps)

        for _ in range(max_iter):
            # E-step
            gamma = self.updateLatentPosterior(X, pi, theta)
            # M-step (1)
            pi = self.updatePi(gamma)
            # M-step (2); clipping keeps log(theta) and log(1 - theta) finite
            theta = np.clip(self.updateTheta(X, gamma), eps, 1 - eps)
        return pi,theta # pi and theta, given class_idx

In [4]:
class NaiveBayesModel(AbstractGenerativeModel):
    """Bernoulli naive Bayes: one log-probability per feature and class.

    ``self.num_classes`` is read but not assigned here; it is presumably set by
    ``AbstractGenerativeModel.__init__``.
    """

    def __init__(self, CLASSES, NUM_FEATURES, EPS=10**(-12)):
        AbstractGenerativeModel.__init__(self, CLASSES, NUM_FEATURES)
        # Stored for stability tweaks; nothing in this class reads it
        # (the smoothing in fit comes from the module-level log helper).
        self.epsilon = EPS
        self.params = {
            # estimated log-probabilities of features for each class;
            # one independent array per class rather than one shared object
            'logp': [np.zeros((NUM_FEATURES)) for _ in range(self.num_classes)]
        }

    def pack_params(self, X, class_idx):
        """Fit the feature log-probabilities of one class on ``X[class_idx]`` and store them."""
        logp = self.fit(X[class_idx])
        self.params['logp'][class_idx] = logp

    def classify(self, X): # naive bayes classifier
        """Return the index of the highest-scoring class for each row of X.

        The score is ``x . log(p) + (1 - x) . log(1 - p)``; no class prior is added.
        Returns a vector with one prediction per data point.
        """
        logp = np.array(self.params['logp'])
        # exp(logp) can exceed 1 by one machine epsilon when a feature is on for
        # every training point of a class. Flooring the complement at 0 keeps
        # its log finite instead of -inf, which would turn 0 * -inf into NaN.
        log_not_p = log(np.maximum(1 - np.e ** logp, 0))
        return (X.dot(logp.T) + (1 - X).dot(log_not_p.T)).argmax(axis=1)

    def fit(self, X):
        """Estimate per-feature log-probabilities from the rows of X.

        Returns a vector of length NUM_FEATURES: the log of each column mean.
        """
        return log(X.mean(axis=0))

In [5]:
experiment_name = "sentiment_analysis"
# --- SENTIMENT ANALYSIS setup
Xtrain,Xval,num_classes,num_features = load_experiment(data_fn, experiment_name)

# -- build naive bayes model for sentiment analysis
print("SENTIMENT ANALYSIS -- NAIVE BAYES MODEL:")
nbm = NaiveBayesModel(num_classes, num_features)
nbm.train(Xtrain)
print("ACCURACY ON VALIDATION: " + str(nbm.val(Xval)))

# -- build mixture model for sentiment analysis
print("SENTIMENT ANALYSIS -- MIXTURE MODEL:")

# Random search over per-class component counts; keep the model with the
# best validation accuracy (strict '>' keeps the earliest on ties).
best_model = None
best_accuracy = None

for i in range(MAX_OUTER_ITER):
    num_mixture_components =  np.random.randint(2,15,num_classes)
    print("COMPONENTS: " + " ".join(str(k) for k in num_mixture_components))
    mm = MixtureModel(num_classes, num_features, num_mixture_components)
    mm.train(Xtrain)
    accuracy = mm.val(Xval)
    print("ACCURACY ON VALIDATION:", accuracy)
    
    if best_accuracy is None or accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = mm

# submit to kaggle -- use the best model found, not the last one trained
Xkaggle = load_data(data_fn, experiment_name, "kaggle")
save_submission("mm-{}-submission.csv".format(experiment_name), best_model.classify(Xkaggle))

SENTIMENT ANALYSIS -- NAIVE BAYES MODEL:
ACCURACY ON VALIDATION: 0.72
SENTIMENT ANALYSIS -- MIXTURE MODEL:
COMPONENTS: 6 13
ACCURACY ON VALIDATION: 0.718
COMPONENTS: 2 14
ACCURACY ON VALIDATION: 0.61
COMPONENTS: 9 8
ACCURACY ON VALIDATION: 0.7
COMPONENTS: 4 9
ACCURACY ON VALIDATION: 0.724
COMPONENTS: 10 14
ACCURACY ON VALIDATION: 0.712
COMPONENTS: 11 5
ACCURACY ON VALIDATION: 0.708
COMPONENTS: 10 4
ACCURACY ON VALIDATION: 0.732
COMPONENTS: 11 9
ACCURACY ON VALIDATION: 0.702
COMPONENTS: 14 10
ACCURACY ON VALIDATION: 0.718
COMPONENTS: 9 9
ACCURACY ON VALIDATION: 0.734
COMPONENTS: 4 3
ACCURACY ON VALIDATION: 0.74
COMPONENTS: 11 2
ACCURACY ON VALIDATION: 0.698
COMPONENTS: 10 12
ACCURACY ON VALIDATION: 0.748
COMPONENTS: 9 13
ACCURACY ON VALIDATION: 0.71
COMPONENTS: 9 8
ACCURACY ON VALIDATION: 0.728
Saved: mm-sentiment_analysis-submission.csv


In [6]:
# --- MNIST DIGIT CLASSIFICATION setup
experiment_name = "mnist"
Xtrain,Xval,num_classes,num_features = load_experiment(data_fn, experiment_name)

# -- naive Bayes baseline on the digits
print("MNIST DIGIT CLASSIFICATION -- NAIVE BAYES MODEL:")
nbm = NaiveBayesModel(num_classes, num_features)
nbm.train(Xtrain)
print("ACCURACY ON VALIDATION: " + str(nbm.val(Xval)))

# -- random search over per-class mixture sizes, tracking the best validation score
print("MNIST DIGIT CLASSIFICATION -- MIXTURE MODEL:")

best_model, best_accuracy = None, None

for i in range(MAX_OUTER_ITER):
    # one component count per class
    num_mixture_components = np.random.randint(2, 15, num_classes)
    print("COMPONENTS: " + " ".join(map(str, num_mixture_components)))
    mm = MixtureModel(num_classes, num_features, num_mixture_components)
    mm.train(Xtrain)
    accuracy = mm.val(Xval)
    print("ACCURACY ON VALIDATION:", accuracy)

    # only a strictly better score replaces the incumbent
    if best_accuracy is not None and not accuracy > best_accuracy:
        continue
    best_model, best_accuracy = mm, accuracy

# submit the best model's predictions to kaggle
Xkaggle = load_data(data_fn, experiment_name, "kaggle")
save_submission(
    "mm-{}-submission.csv".format(experiment_name),
    best_model.classify(Xkaggle),
)

MNIST DIGIT CLASSIFICATION -- NAIVE BAYES MODEL:
ACCURACY ON VALIDATION: 0.733
MNIST DIGIT CLASSIFICATION -- MIXTURE MODEL:
COMPONENTS: 5 6 3 12 9 12 14 8 12 7
ACCURACY ON VALIDATION: 0.7805
COMPONENTS: 4 10 10 5 2 13 8 7 11 3
ACCURACY ON VALIDATION: 0.782
COMPONENTS: 13 5 12 6 13 10 9 12 12 7
ACCURACY ON VALIDATION: 0.782
COMPONENTS: 9 11 9 10 6 7 3 13 10 4
ACCURACY ON VALIDATION: 0.778
COMPONENTS: 11 5 3 6 5 4 2 5 3 9
ACCURACY ON VALIDATION: 0.785
COMPONENTS: 8 2 14 13 3 3 4 5 10 13
ACCURACY ON VALIDATION: 0.7825
COMPONENTS: 10 10 12 3 5 3 5 4 9 7
ACCURACY ON VALIDATION: 0.7915
COMPONENTS: 4 5 4 3 12 3 3 6 10 7
ACCURACY ON VALIDATION: 0.7795
COMPONENTS: 14 2 5 8 10 13 3 8 13 6
ACCURACY ON VALIDATION: 0.777
COMPONENTS: 8 3 7 7 10 13 7 12 6 6
ACCURACY ON VALIDATION: 0.7935
COMPONENTS: 5 6 14 14 7 2 11 13 4 8
ACCURACY ON VALIDATION: 0.773
COMPONENTS: 2 14 9 7 14 7 11 11 10 9
ACCURACY ON VALIDATION: 0.783
COMPONENTS: 4 13 13 7 11 11 8 6 14 12
ACCURACY ON VALIDATION: 0.7885
COMPONENTS: 9 