# Alex's explorations

## Gaussian Blur
Let's modify the Guassian blur from the first problem set. This may help reduce some of the noise in the data. We especially work on the efficiency of the function from the first problem set.

In [1]:
def GaussianWeight(x, y, sigma = 1.5):
    "function that calculates the weights of neighboring pixels using a 2-D normal distribution"
    
    # calculate the 2 halves of the formula separately for readability
    first_half = 1/(2 * np.power(np.pi,2))
    second_half = np.exp(-(np.power(x,2) + np.power(y,2))/(2 * np.power(sigma,2)))
    
    # bring the two-halves together
    weight = first_half * second_half
    
    return weight

def GaussianBlur(array, pic_dimension):
    "function that takes the weights the value of a individual pixel by the 8 neighboring pixels"
    
    # source for Gaussian Blur method:
    # http://www.pixelstech.net/article/1353768112-Gaussian-Blur-Algorithm
    
    # create a new array to hold the blurred values
    blurred = []
    
    # create an array to store the relative locations, we hard-code these values
    # as specific to the 8 pixel neighbors
    relative_locations = [[0,1],[-1,1],[1,1],[0,-1],[-1,-1],[1,-1],[-1,0],[1,0]]

    # we find the weights for each of the relative locations
    # create an array to hold the relative weights
    relative_weights = []
    
    # loop through each of the relative locations, finding the appropriate weight
    for relative_location in relative_locations:
        
        # append the Gaussian weight to the relative weights array
        relative_weights.append(GaussianWeight(relative_location[0],relative_location[1]))
    
    # pull each data point in the array
    for digit in array: 
        
        # reshape the digit to be a numpy array using the constansts defined
        # above for the size of the digit
        digit = np.array(digit)
        digit_reshaped = np.reshape(digit,(pic_dimension,pic_dimension))
        
        # create an array to hold the blurred digit outcome
        digit_blurred = []
    
        # loop through every row of the array
        for index_i, row in enumerate(digit_reshaped):
            
            # loop through every column of the row of the array
            for index_j, column in enumerate(row):

                # set the coordinates relative to the pixel in question
                # for example, the pixel in question is at (0,0), but the 
                # the pixel just above would be (0,1) because we go up 1
                # but not move horizontally

                # set the coordinates of the pixel and weight
                coor = index_i, index_j, GaussianWeight(0,0)

                # create an array to hold the value of each pixel
                coordinates = []

                # create an array to hold the weights from the Gaussian function
                Gaussian_weights = []   
                
                # first add the initial pixel that is being weighted to the first place in the
                # array
                coordinates.append(digit_reshaped[coor[0]][coor[1]])
                Gaussian_weights.append(coor[2])

                # loop through each of the relative locations
                for index_k,location in enumerate(relative_locations):

                    # calculate the new location for the neighboring pixel
                    relative_i = index_i + location[0]
                    relative_j = index_j + location[1]

                    # attempt 2 things: (1) find the neighboring pixel's value and 
                    # (2) find that pixels Gaussian weight
                    try:
                        coordinates.append(digit_reshaped[relative_i][relative_j])
                        Gaussian_weights.append(relative_weights[index_k])
                    except Exception:
                        pass               

                # calculate the relative Gaussian weights
                # find the sum of the Gaussian weights and create an array to store the relative weights
                relative_weights_sum = sum(Gaussian_weights)
                relative_Gaussian_weights = []

                # loop through each of the weights, recalculating as relative to the other weights
                for weight in Gaussian_weights:
                    relative_Gaussian_weights.append(weight/relative_weights_sum)

                # create a new array to store the relative weights multiplied by the pixel values
                Gaussian_pixels = []

                # multiply the relative Gaussian weights by each pixel value
                for index, weight in enumerate(relative_Gaussian_weights):
                    Gaussian_pixels.append(coordinates[index] * weight)

                # recalculate the new pixel as the sum of all neighboring pixels, appropriately weighted
                new_pixel = sum(Gaussian_pixels)

                # append this new pixel value to blurred array
                digit_blurred.append(new_pixel)
                
        # append the blurred digit to the blurred array
        blurred.append(digit_blurred)
            
    # return the new blurred array
    return blurred

In [2]:
# we verified that this modified function worked using digit data, now let's try it with
# our actual data

# set up our tools
import sys
sys.path.append('/Users/Alex/Documents/Berkeley/1601Spring/W207/PS4/facial-keypoint-detection/scripts')

# import submit module from our tools subfolder
from tools import submit, getdata

# import our libraries
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor


# load the data
_loaded = getdata.load_data(0, test=True, nonas=True)

FEATURES = _loaded['features']
print 'Number of features:', len(FEATURES)

train_data = _loaded['training']['data']
train_labels = _loaded['training']['labels']
print 'Training dataset size: ', train_data.shape

test_data = _loaded['test']['data']
print 'Test dataset size: ', test_data.shape

Number of features: 30
Training dataset size:  (2140,)
Test dataset size:  (1783,)


In [7]:
# dimensions of the image, 96x96
PIC_DIM = 96

# let's put our data through the Gaussian blur function
test_blurred = GaussianBlur(train_data[:10].tolist(), PIC_DIM)
print "Done blurring the data"

Done blurring the data


In [22]:
mini_train_data = train_data[:250]
mini_train_labels = train_labels[:250]
print mini_train_data.shape
print mini_train_labels.shape
print FEATURES

(250,)
(250, 30)
Index([u'left_eye_center_x', u'left_eye_center_y', u'right_eye_center_x',
       u'right_eye_center_y', u'left_eye_inner_corner_x',
       u'left_eye_inner_corner_y', u'left_eye_outer_corner_x',
       u'left_eye_outer_corner_y', u'right_eye_inner_corner_x',
       u'right_eye_inner_corner_y', u'right_eye_outer_corner_x',
       u'right_eye_outer_corner_y', u'left_eyebrow_inner_end_x',
       u'left_eyebrow_inner_end_y', u'left_eyebrow_outer_end_x',
       u'left_eyebrow_outer_end_y', u'right_eyebrow_inner_end_x',
       u'right_eyebrow_inner_end_y', u'right_eyebrow_outer_end_x',
       u'right_eyebrow_outer_end_y', u'nose_tip_x', u'nose_tip_y',
       u'mouth_left_corner_x', u'mouth_left_corner_y', u'mouth_right_corner_x',
       u'mouth_right_corner_y', u'mouth_center_top_lip_x',
       u'mouth_center_top_lip_y', u'mouth_center_bottom_lip_x',
       u'mouth_center_bottom_lip_y'],
      dtype='object')


## Train a logistic regression model

In [23]:
### LOGISTIC REGRESSION FITTED WITH THE BEST 'C'
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV

# create a list to hold logistic regressions
logistics = []

# initalize a potential set of reasonable C values
Lparameters = {'C':[0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0, 100.0]}


# loop through every facial feature
for index,facial_feature in enumerate(FEATURES):

    # initalize the logistic regression model
    logistic = LogisticRegression()

    # set the C search with the given C options and the logistic model
    C_search = GridSearchCV(logistic,Lparameters)

    # fit the Gridsearch model to the data
    C_search.fit(mini_train_data.tolist(),mini_train_labels[:,index])

    # find the best C parameter
    best_C = C_search.best_params_

    # initalize a model with the best C
    logistic_optimal = LogisticRegression(C = best_C['C'])
    logistic_optimal.fit(mini_train_data.tolist(),mini_train_labels[:,index])
    
    # create a tuple with the name of the feature and the model
    appending = facial_feature, logistic_optimal
    
    # append the name and the model to our list of facial feature models
    logistics.append(appending)



## Use the logistic regression model to predict the testing data

## Train a multinomial naive bayes model 

In [8]:
## NAIVE BAYES WITH OPTIMAL ALPHA

from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# create an array to store the multinomial naive bayes models
multinomials = []

# initalize a set of reasonable alphas that we would like to search for the optimal alpha
MNparameters = {'alpha':[0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0]}

# loop through all the facial features
for index,facial_feature in enumerate(FEATURES):
    
    # initalize the multinomail naive bayes model
    Multinomial = MultinomialNB()

    # set the alpha search with the given alpha options and the Multinomial model
    alpha_search = GridSearchCV(Multinomial,MNparameters)

    # fit the Gridsearch model on the training data
    alpha_search.fit(train_data.tolist(),train_labels[:,index])

    # find the best parameter
    best_alpha = alpha_search.best_params_

    # fit a model with the best alpha
    Multinomial_optimal = MultinomialNB(alpha = best_alpha['alpha'])
    Multinomial_optimal.fit(train_data.tolist(),train_labels[:,index])
    
    # create a tuple with the model and its associated facial feature
    appending = facial_feature, Multinomial_optimal
    
    # append the model and its name to our list
    multinomials.append(appending)

## Use the multinomial naive bayes model to predict the test data

In [10]:
submit.create_generate(test_data.tolist(), multinomials, 'multinomials', verbose=True)

 Predicting "left_eye_center_x"...

TypeError: list indices must be integers, not tuple

## Train a decision tree model on the data

In [11]:
# import the decision tree libraries
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# create an array to hold all the decision trees
decisiontrees = []

# loop through the facial features
for index,facial_feature in enumerate(FEATURES):
    
    # fit a decision tree on the training data
    decisiontree = DecisionTreeClassifier.fit(train_data.tolist(),train_labels[:,index])
    
    # create a tuple with the model and its associated facial feature name
    appending = facial_feature, decisiontree
    
    # append the model to the array of decision trees
    decisiontrees.append(decisiontree)

TypeError: unbound method fit() must be called with DecisionTreeClassifier instance as first argument (got list instance instead)

## Use the decision tree model to predict the test data

## Fit the random forest model on the training data
This classifier is quite cool because it fits a number of different decision trees on subsets of the data and averages over all these samples to improve the accuracy of the prediction.

In [5]:
# import the library
from sklearn.ensemble import RandomForestClassifier 

# create an array to hold the random forest models
randomforests = []

# loop through the facial features
for index,facial_feature in enumerate(FEATURES):
    
    # fit the random forest meta model to the data
    randomforest = RandomForestClassifier.fit(train_data.tolist(),train_labels[:,index])
    
    # create a tuple with the model and its associated facial feature name
    appending = facial_feature, randomforest
    
    # append the model and its name to the array of random forests
    randomforests.append(appending)

TypeError: unbound method fit() must be called with RandomForestClassifier instance as first argument (got list instance instead)

## Use the random forests to make some predictions

In [2]:
### RUN SOME TESTS USING DATA WE ALREADY KNOW WORKS

# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# Import a bunch of libraries.
import time
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_mldata
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report

# Set the randomizer seed so results are the same each time.
np.random.seed(0)



In [3]:
# Load the digit data either from mldata.org, or once downloaded to data_home, from disk. The data is about 53MB 
# so this cell should take a while the first time your run it.
mnist = fetch_mldata('MNIST original', data_home='~/datasets/mnist')
X, Y = mnist.data, mnist.target

# Rescale grayscale values to [0,1].
X = X / 255.0

# Shuffle the input: create a random permutation of the integers between 0 and the number of data points and 
# apply this permutation to X and Y.
# NOTE: Each time you run this cell, you'll re-shuffle the data, resulting in a different ordering.
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, Y = X[shuffle], Y[shuffle]

print 'data shape: ', X.shape
print 'label shape:', Y.shape

# Set some variables to hold test, dev, and training data.
test_data, test_labels = X[61000:], Y[61000:]
dev_data, dev_labels = X[60000:61000], Y[60000:61000]
train_data, train_labels = X[:60000], Y[:60000]
mini_train_data, mini_train_labels = X[:1000], Y[:1000]

data shape:  (70000, 784)
label shape: (70000,)


In [14]:
# set up some real mini data
real_data, real_labels = test_data[:10], test_labels[:10]

# blur some data
real_blurred = GaussianBlur(real_data, 28)
print "done"

done
