# Naive Bayes v2
In this version of the Naive Bayes notebook, we simplify our previous notebook by creating a function that automatically fits the models and finds the optimal alpha value for each one.

## Load the libraries

In [1]:
# set the path
import sys, os

# Put the parent of the current working directory on the import path so the
# project-local `tools` package imported below can be resolved.
# os.path.dirname is used instead of splitting on "/" so the cell also works
# where the path separator is not "/".
scriptPath = os.path.dirname(os.getcwd())

# Guarded so that re-running this cell does not pile up duplicate entries.
if scriptPath not in sys.path:
    sys.path.append(scriptPath)

# import my tools
from tools import save4later, submit, getdata

# import the sklearn libraries and numpy
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
import numpy as np

## Define accuracy functions

In [18]:
# Divisor used to turn an absolute prediction error into a fraction
# (see get_accuracy).
IMAGE_SIZE = 96

def get_accuracy(models_list, verbose=False, ret_acc=True, data=None, labels=None):
    ''' Calculates the accuracy for a given suite of models

    Accuracy for one model is mean(1 - |label - prediction| / IMAGE_SIZE),
    computed over every row of `data`.

    models_list -- sequence of (feature_name, fitted_model) tuples; the model
                   at position i is scored against column i of `labels`
    verbose     -- when True, print one line per model
    ret_acc     -- when True, return the list of accuracies (otherwise None)
    data        -- inputs handed to each model's predict(); defaults to the
                   notebook-level train_data.tolist().  Pass the dataset the
                   suite was actually fitted on (e.g. the masked images).
    labels      -- 2-D label array; defaults to the notebook-level train_labels
    '''
    # Resolve the defaults once, outside the loop.
    if data is None:
        data = train_data.tolist()
    if labels is None:
        labels = train_labels

    if verbose:
        print("{:30} Accuracy".format("Model"))

    # A float divisor keeps this a true division under Python 2 even when the
    # labels and the predictions are both integer arrays.
    scale = float(IMAGE_SIZE)

    acc_list = []

    for index, (feat, model) in enumerate(models_list):
        predictions = model.predict(data)
        accuracy = np.mean(1 - abs(labels[:, index] - predictions) / scale)
        acc_list.append(accuracy)

        if verbose:
            print(" - {f:<27} {a:.3%}".format(f=feat, a=accuracy))

    if ret_acc:
        return acc_list

def compare_accuracies(iter_model_lists, model_labels, data_list=None, labels=None):
    ''' Compares the accuracy of different model suites

    iter_model_lists -- iterable of model suites (each a sequence of
                        (feature_name, fitted_model) tuples)
    model_labels     -- column headings, one per suite
    data_list        -- optional; one dataset per suite, in the same order,
                        forwarded to get_accuracy(data=...).  When omitted
                        every suite is scored on get_accuracy's default data.
    labels           -- optional label array forwarded to get_accuracy

    Prints a table with one row per feature and one column per suite.
    Raises ValueError when data_list is given but its length does not match
    the number of suites.
    '''
    suites = list(iter_model_lists)

    if data_list is None:
        data_list = [None] * len(suites)
    elif len(data_list) != len(suites):
        raise ValueError("data_list must contain one dataset per model suite")

    accuracies = []
    for suite, data in zip(suites, data_list):
        accuracies.append( get_accuracy(suite, verbose=False, data=data, labels=labels) )

    # Print report
    print(" Feature  |    ACCURACIES:     " + '   '.join(model_labels))

    # Nothing to tabulate without at least one suite.
    if not accuracies:
        return

    # Row names come from the first suite's (feature_name, model) tuples.
    for f in range(len(accuracies[0])):  # Num of features
        # format all the accuracies
        _entry = "{:<27}   ".format(suites[0][f][0])
        for suite_acc in accuracies:
            _entry += " {:.2%}     ".format(suite_acc[f])

        print(_entry)

## Load the data

In [3]:
# Fetch the dataset through the project loader (arguments are defined by
# tools.getdata and passed exactly as before).
_loaded = getdata.load_data(0, test=True, nonas=True)

# Feature names; MultiNBfit pairs entry i with column i of the labels.
FEATURES = _loaded['features']
print('Number of features: {}'.format(len(FEATURES)))

# Training inputs and their labels
train_data = _loaded['training']['data']
train_labels = _loaded['training']['labels']
print('Training dataset size:  {}'.format(train_data.shape))

# Test inputs (no labels are read here)
test_data = _loaded['test']['data']
print('Test dataset size:  {}'.format(test_data.shape))

Number of features: 30
Training dataset size:  (2140,)
Test dataset size:  (1783,)


## Load all the preprocessed data

In [4]:
# Load every pre-processed variant of the training data in one pass.
# Order of the names matches the order of the targets on the left:
#   masked, sobel, blurred HOG, laplace & gaussian, gaussian blurred
(train_masked, train_sobel, train_HOG, train_LapG, train_gauss) = map(
    save4later.load_preprod,
    ("masked_nonas", "sobel_nonas", "bhog_nonas", "lapgauss_nonas", "gauss_nonas")
)

print("...finished loading pre-processed data")

Loaded pk
Loaded pk
Loaded pk
Loaded pk
Loaded pk
...finished loading pre-processed data


## Create a function that will train a multinomial naive bayes model on some data and find the optimal alpha values

In [5]:
def MultiNBfit(training_data, training_labels, features, parameters = {'alpha':[0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0]}):
    """Fit one tuned multinomial naive bayes model per feature.

    For every entry of `features`, a grid search over `parameters` is run
    against the matching column of `training_labels`, and the winning model
    is kept.

    training_data   -- inputs passed to GridSearchCV.fit for every feature
    training_labels -- 2-D array; column i holds the labels for features[i]
    features        -- iterable of feature names, one per label column
    parameters      -- parameter grid handed to GridSearchCV (by default a
                       set of candidate alpha values).  The default dict is
                       shared between calls; this function never modifies it.

    Returns a list of (feature_name, fitted_model) tuples in the order of
    `features`.
    """

    # create an array to store the models
    multinomials = []

    # loop through all the possible features
    for index, feature in enumerate(features):

        # search for the optimal parameters given our options
        alpha_search = GridSearchCV(MultinomialNB(), parameters)
        alpha_search.fit(training_data, training_labels[:,index])

        # GridSearchCV is left at its default refit behaviour, so after the
        # search it has already re-fitted a model with the best parameters on
        # the full training data; reuse it instead of fitting a second,
        # equivalent model by hand.
        multinomial_optimal = alpha_search.best_estimator_

        # store the model together with its associated facial feature
        multinomials.append((feature, multinomial_optimal))

    # return the list of models
    return multinomials

## Fit a naive bayes model to the original training data

In [6]:
# Tune and fit one model per feature on the raw training data, then hand the
# resulting suite to save4later.save_model.
multi_base = MultiNBfit(train_data.tolist(), train_labels, FEATURES)
save4later.save_model(
    multi_base,
    'Multi_NB',
    'Multinomial naive bayes without preprocessed data with no NAs',
    overwrite=True
)



## Fit a naive bayes model to masked training data
We mask our training data by removing the non-facial parts of the photographs.

In [7]:
# Tune and fit one model per feature on the masked training data, then hand
# the resulting suite to save4later.save_model.
multi_mask = MultiNBfit(train_masked, train_labels, FEATURES)
save4later.save_model(
    multi_mask,
    'Multi_NB_mask',
    'Multinomial naive bayes with masked data with no NAs',
    overwrite=True
)

## Fit the naive bayes model to the sobel training data

In [8]:
# Tune and fit one model per feature on the sobel training data, then hand
# the resulting suite to save4later.save_model.
multi_sobel = MultiNBfit(train_sobel, train_labels, FEATURES)
save4later.save_model(
    multi_sobel,
    'Multi_NB_sobel',
    'Multinomial naive bayes with sobel data with no NAs',
    overwrite=True
)

## Fit the naive bayes model to the blurred HOG training data

In [9]:
# Tune and fit one model per feature on the blurred HOG training data, then
# hand the resulting suite to save4later.save_model.
multi_HOG = MultiNBfit(train_HOG, train_labels, FEATURES)
save4later.save_model(
    multi_HOG,
    'Multi_NB_HOG',
    'Multinomial naive bayes with blurred HOG data with no NAs',
    overwrite=True
)

## Fit the naive bayes model to the Laplace and Gaussian transformed data

In [10]:
# Tune and fit one model per feature on the Laplace & Gaussian transformed
# training data, then hand the resulting suite to save4later.save_model.
multi_LapG = MultiNBfit(train_LapG, train_labels, FEATURES)
save4later.save_model(
    multi_LapG,
    'Multi_NB_LapG',
    'Multinomial naive bayes with Laplace and Gaussian transformed data with no NAs',
    overwrite=True
)

## Fit the naive bayes model to the Gaussian blurred data

In [11]:
# Tune and fit one model per feature on the Gaussian blurred training data,
# then hand the resulting suite to save4later.save_model.
multi_gauss = MultiNBfit(train_gauss, train_labels, FEATURES)
save4later.save_model(
    multi_gauss,
    'Multi_NB_gauss',
    'Multinomial naive bayes with Gaussian blurred data with no NAs',
    overwrite=True
)

## Compare the accuracies of the different models

In [14]:
# Report labels and the model suites they describe, kept in matching order.
# NOTE(review): multi_gauss is fitted and saved earlier in this notebook but is
# not listed here, so it is neither compared nor submitted - confirm that this
# is intentional.
all_model_names = ['MultNB', 'Mult_mask', 'Mult_sobel', 'Mult_HOG', 'Mult_LapG']
all_models = [multi_base, multi_mask, multi_sobel, multi_HOG, multi_LapG]

In [19]:
# Print the per-feature accuracy table, one column per model suite.
# Note: scoring uses the raw train_data for every suite, including the suites
# that were fitted on pre-processed data.
compare_accuracies(iter_model_lists=all_models, model_labels=all_model_names)

 Feature  |    ACCURACIES:     MultNB   Mult_mask   Mult_sobel   Mult_HOG   Mult_LapG
left_eye_center_x              96.02%      92.85%      93.59%      87.82%      93.54%     
left_eye_center_y              96.26%      92.56%      91.99%      89.60%      92.45%     
right_eye_center_x             97.17%      91.19%      91.80%      90.31%      93.66%     
right_eye_center_y             96.63%      92.03%      92.63%      88.60%      93.53%     
left_eye_inner_corner_x        97.40%      93.87%      92.02%      89.67%      93.48%     
left_eye_inner_corner_y        96.87%      93.76%      93.78%      90.30%      93.63%     
left_eye_outer_corner_x        96.66%      91.17%      93.08%      86.21%      93.01%     
left_eye_outer_corner_y        96.04%      91.53%      92.04%      88.88%      91.81%     
right_eye_inner_corner_x       97.66%      94.68%      95.43%      92.01%      94.03%     
right_eye_inner_corner_y       96.66%      93.22%      93.59%      90.68%      93.56%     
righ

## Create a submission from each of the models on the test data

In [22]:
# Generate one submission file per model suite, pairing each suite with its label.
# NOTE(review): every suite receives the same test_data, including suites that
# were fitted on pre-processed training data - confirm that
# submit.create_generate applies the matching preprocessing.
for suite_name, suite in zip(all_model_names, all_models):
    submit.create_generate(test_data, suite, suite_name, verbose=False)
    


... Created the csv file: ../../data/submissions/MultNB_submission.csv

... Created the csv file: ../../data/submissions/Mult_mask_submission.csv

... Created the csv file: ../../data/submissions/Mult_sobel_submission.csv

... Created the csv file: ../../data/submissions/Mult_HOG_submission.csv

... Created the csv file: ../../data/submissions/Mult_LapG_submission.csv
