<font face="Calibri" size="18px" ><strong> helper </strong></font>

# Setup Notebook

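This notebook contains all the helper functions and data necessary to run the other notebooks for this week. Everything is stored within the object <font color="purple">helper</font>: the wine data set is attached to <font color="purple">helper</font> itself, and the functions are grouped under <font color="purple">helper.uni</font> (univariate notebook), <font color="purple">helper.bi</font> (bivariate notebook) and <font color="purple">helper.hw</font> (homework notebook).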

In [33]:
import numpy as np
import matplotlib.pyplot as plt
# Useful module for dealing with the Gaussian density
from scipy.stats import norm, multivariate_normal 
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider

In [34]:
# `helper` is an instance of an empty, dynamically created class that serves
# as a plain attribute container. One equally empty sub-container is attached
# per notebook section (bi, hw, uni).
helper = type('helper', (), {})()
for _section in ('bi', 'hw', 'uni'):
    setattr(helper, _section, type(_section, (), {})())
del _section

In [35]:
# Confirmation message emitted when this notebook is executed.
# The parenthesised single-argument form behaves the same on Python 2 and 3.
print("The object 'helper' has been imported into this notebook.")

The object 'helper' has been imported into this notebook.


# Create Dataset

In [4]:
# Read the wine data set from "wine.data.txt", which has to live in the same
# directory as this notebook. The file holds 178 comma-separated rows, one
# point per row: the first value is the label (1, 2 or 3) and the remaining
# 13 values are the features.
helper.data = np.loadtxt('wine.data.txt', delimiter=',')

# Human-readable name of each of the 13 feature columns, in column order
helper.featurenames = [
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

In [5]:
# Hard-coded ordering of the data rows used for the train/test split below.
# NOTE(review): appears to be a stored permutation of the indices 0..177
# (one per row of helper.data) - confirm before editing any value.
helper.perm = np.array([  4,  93, 103, 152,  77,  81,  14,  58, 139,  53,  40, 167,  20,
        80, 130,  16, 110, 158,  42, 135,   8,  69, 153,  94,  91,  51,
       117, 146,  72, 142, 137,  88, 165, 106,  33,  67, 133, 113, 171,
       129, 141,  21,  12,  44,   3, 164, 169,  41,   6, 177,  17, 174,
       104, 176, 168,  26, 173, 122, 159, 111, 163,  50,  15,  37, 114,
         2, 109,  68,  39,  96,  36, 149, 151, 124, 156, 108, 107,  30,
        43,  28,  54,  59, 154,  78,  92, 157, 140,  73,  34,  49, 160,
       118, 125, 126, 127, 145, 144,   9,  24,  90,  84,  55,  19, 148,
        25,  61, 123,   0,  38,  97,  32,  85,  29,  45, 128,  75,  66,
        86,  47, 102, 175,  63,  82,  83, 115, 136,  98,  46,  62, 150,
       162, 134, 138,  76,  87, 170, 105,  65,  89,  71, 112,  56,  74,
       132, 100,  27,  64, 166,  22, 155,  57, 119,  99,   7,  23,  13,
       121, 101, 116, 172,  95, 131,  10,  35,  11,  60, 161,   1,  18,
       147, 143,  31,  79,  48,   5, 120,  52,  70])
# Fixed train/test split of the 178 instances, driven by helper.perm:
#   * first 130 permuted rows -> training set (x, y)
#   * remaining 48 rows       -> test set (tx, ty)
# Column 0 of helper.data is the label, columns 1..13 are the features.
# (The stored permutation takes the place of a fresh
#  np.random.permutation(178) call.)
helper.x, helper.y = helper.data[helper.perm[:130], 1:14], helper.data[helper.perm[:130], 0]
helper.tx, helper.ty = helper.data[helper.perm[130:178], 1:14], helper.data[helper.perm[130:178], 0]

# Short module-level aliases; the helper functions below read these names.
x, y, tx, ty = helper.x, helper.y, helper.tx, helper.ty

# Univariate Notebook

In [6]:
def densityPlot(feature, label):
    """Plot a normalised histogram of one feature for one class, with the
    Gaussian fitted to those values drawn on top.

    Reads the module-level training arrays ``x`` and ``y``.

    feature -- column index into ``x`` (also indexes ``helper.featurenames``)
    label   -- class label; only rows where ``y == label`` are plotted
    """
    values = x[y == label, feature]
    # `density=True` is the replacement for the `normed=True` keyword, which
    # current matplotlib releases no longer accept.
    plt.hist(values, density=True)
    #
    mu = np.mean(values)  # mean
    std = np.std(values)  # standard deviation (square root of np.var)
    #
    # Draw the fitted normal density over +/- 3 standard deviations
    x_axis = np.linspace(mu - 3*std, mu + 3*std, 1000)
    plt.plot(x_axis, norm.pdf(x_axis, mu, std), 'r', lw=2)
    plt.title("Winery "+str(label) )
    # Use helper.featurenames: no bare `featurenames` is defined in this notebook
    plt.xlabel(helper.featurenames[feature], fontsize=14, color='red')
    plt.ylabel('Density', fontsize=14, color='red')
    plt.show()

# Publish the function as helper.uni.densityPlot and drop the bare name
helper.uni.densityPlot = densityPlot
del densityPlot

In [7]:
# Assumes y takes on values 1,2,3
def fit_generative_model(feature):
    """Fit one univariate Gaussian per class to a single feature.

    Reads the module-level training arrays ``x`` and ``y`` and assumes the
    labels in ``y`` are 1, 2, ..., k.

    feature -- column index into ``x``

    Returns ``(mu, var, pi)``: three arrays of length k+1 holding the mean,
    variance and class weight for each label. Entry 0 of each array is left
    at zero so the arrays can be indexed directly by label.
    """
    n_classes = len(np.unique(y))
    mu, var, pi = (np.zeros(n_classes + 1) for _ in range(3))
    n_total = float(len(y))
    for label in range(1, n_classes + 1):
        members = x[y == label, feature]
        mu[label] = members.mean()
        var[label] = members.var()
        # Class weight = fraction of training points carrying this label
        pi[label] = float(len(members)) / n_total
    return mu, var, pi

# Publish the function as helper.uni.fit_generative_model, then delete the
# bare name (another function with the same name is defined further down).
helper.uni.fit_generative_model = fit_generative_model
del fit_generative_model

In [8]:
def gaussians(feature):
    """Plot the fitted class-conditional Gaussian of every class for one feature.

    feature -- column index of the feature; also indexes ``helper.featurenames``

    The per-class parameters come from ``helper.uni.fit_generative_model``.
    Each curve spans +/- 3 standard deviations around its class mean.
    """
    # Class weights are not needed for plotting the densities
    mu, var, _ = helper.uni.fit_generative_model(feature)

    colors = ['r', 'k', 'g']
    # mu has one slot per label plus the unused slot 0
    for label in range(1, len(mu)):
        m = mu[label]
        s = np.sqrt(var[label])
        x_axis = np.linspace(m - 3*s, m + 3*s, 1000)
        # Cycle through the colours so more than three classes do not fail
        color = colors[(label - 1) % len(colors)]
        plt.plot(x_axis, norm.pdf(x_axis, m, s), color, label="class " + str(label))
    # Use helper.featurenames: no bare `featurenames` is defined in this notebook
    plt.xlabel(helper.featurenames[feature], fontsize=14, color='red')
    plt.ylabel('Density', fontsize=14, color='red')
    plt.legend()
    plt.show()

# Publish the function as helper.uni.gaussians and drop the bare name
helper.uni.gaussians = gaussians
del gaussians

In [16]:
def test_model(feature):
    mu, var, pi = helper.uni.fit_generative_model( feature)

    k = len(np.unique(y)) # Labels 1,2,...,k
    nt = len(ty) # Number of test points
    score = np.zeros((nt,k+1))
    for i in range(0,nt):
        for label in range(1,k+1):
            score[i,label] = np.log(pi[label]) + \
            norm.logpdf(tx[i,feature], mu[label], np.sqrt(var[label]))
    predictions = np.argmax(score[:,1:4], axis=1) + 1
    # Finally, tally up score
    errors = np.sum(predictions != ty)
    print "Test error using feature " + featurenames[feature] + ": " + str(errors) + "/" + str(nt)
    return errors/float(nt)
    
# Publish the function as helper.uni.test_model, then delete the bare name
# (another function named test_model is defined further down).
helper.uni.test_model = test_model
del test_model

# Bivariate Notebook

In [10]:
# Fit a Gaussian to a data set using the selected features
def fit_gaussian(x, features):
    """Fit a Gaussian to the selected feature columns of a data set.

    x        -- 2-D array with one point per row
    features -- list of column indices to use

    Returns ``(mu, covar)``: the per-column mean and the covariance matrix
    of the selected columns, computed with ``bias=1``.
    """
    selected = x[:, features]
    mu = selected.mean(axis=0)
    covar = np.cov(selected, rowvar=0, bias=1)
    return mu, covar

# Publish the function as helper.bi.fit_gaussian and drop the bare name
helper.bi.fit_gaussian = fit_gaussian
del fit_gaussian

In [11]:
def fit_generative_model(x, y, features):
    """Fit one Gaussian per class on the selected features.

    x        -- 2-D array of points, one per row
    y        -- labels for the rows of ``x``; assumed to be 1, 2, ..., k
    features -- list of column indices to use

    Returns ``(mu, covar, pi)`` with shapes (k+1, d), (k+1, d, d) and (k+1,),
    where d = len(features). Slot 0 of each array stays zero so the arrays
    can be indexed directly by label.
    """
    n_classes = len(np.unique(y))
    n_features = len(features)
    mu = np.zeros((n_classes + 1, n_features))
    covar = np.zeros((n_classes + 1, n_features, n_features))
    pi = np.zeros(n_classes + 1)
    n_total = float(len(y))
    for label in range(1, n_classes + 1):
        mask = (y == label)
        class_mean, class_cov = helper.bi.fit_gaussian(x[mask, :], features)
        mu[label, :] = class_mean
        covar[label, :, :] = class_cov
        # Class weight = fraction of points carrying this label
        pi[label] = float(np.count_nonzero(mask)) / n_total
    return mu, covar, pi

# Publish the function as helper.bi.fit_generative_model and drop the bare name
helper.bi.fit_generative_model = fit_generative_model
del fit_generative_model

In [12]:
def twoFeaturesPlot(f1,f2,label):
    """Scatter-plot one class along two features and overlay contour lines
    of the Gaussian fitted to that class.

    Reads the module-level training arrays ``x`` and ``y``.

    f1, f2 -- column indices of the two (distinct) features; they also index
              ``helper.featurenames``
    label  -- class label; only rows where ``y == label`` are used

    Prints a message and returns None without plotting when ``f1 == f2``.
    """
    # Setting up variables
    if f1 == f2: #<-- if f1 == f2 then then we would get an error
        print("Please choose different features for f1 and f2.")
        return
    features = [f1, f2]
    in_class = (y == label)
    mu, covar = helper.bi.fit_gaussian(x[in_class, :], features)

    # Plot the training points along the two selected features
    plt.plot(x[in_class, f1], x[in_class, f2], 'ro')
    # Use helper.featurenames: no bare `featurenames` is defined in this notebook
    plt.xlabel(helper.featurenames[f1], fontsize=14, color='red')
    plt.ylabel(helper.featurenames[f2], fontsize=14, color='red')
    # For the plot: limits along each axis, padded by 20% of the data width
    # (helper.bi.find_range performs exactly this computation)
    x1_lower, x1_upper = helper.bi.find_range(x[in_class, f1])
    x2_lower, x2_upper = helper.bi.find_range(x[in_class, f2])
    plt.xlim(x1_lower, x1_upper)
    plt.ylim(x2_lower, x2_upper)

    # Finally, plot a contour of the Gaussian
    res = 200 # resolution
    xg = np.linspace(x1_lower, x1_upper, res)
    yg = np.linspace(x2_lower, x2_upper, res)
    # Evaluate the log-density on the whole grid in one call.
    # meshgrid yields arrays of shape (len(yg), len(xg)), so z[j, i] is the
    # value at (xg[i], yg[j]) as plt.contour expects.
    grid_x, grid_y = np.meshgrid(xg, yg)
    z = multivariate_normal.logpdf(np.dstack((grid_x, grid_y)), mean=mu, cov=covar)
    # Log-density at the mean: -0.5 * (d*log(2*pi) + log|covar|)
    sign, logdet = np.linalg.slogdet(covar)
    normalizer = -0.5 * (len(features) * np.log(2 * np.pi) + sign * logdet)
    # One contour per unit drop in log-density below the peak
    for offset in range(0,4):
        plt.contour(xg,yg,z, levels=[normalizer - offset], colors='k', linewidths=2.0, linestyles='solid')
    # Finally, display
    plt.show()

# Publish the function as helper.bi.twoFeaturesPlot and drop the bare name
helper.bi.twoFeaturesPlot = twoFeaturesPlot
del twoFeaturesPlot

In [13]:
def find_range(x):
    """Return ``(lower, upper)`` bounds covering the values in ``x``.

    The bounds are the minimum and maximum of ``x``, each pushed outwards by
    20% of the distance between them.
    """
    lo, hi = min(x), max(x)
    margin = 0.2 * (hi - lo)
    return lo - margin, hi + margin

# Publish the function as helper.bi.find_range and drop the bare name
helper.bi.find_range = find_range
del find_range

In [14]:
def threeGaussiansPlot(f1,f2):
    """Scatter-plot the three classes along two features and draw one
    contour line of each class's fitted Gaussian.

    Reads the module-level training arrays ``x`` and ``y``; the class
    parameters come from ``helper.bi.fit_generative_model``.

    f1, f2 -- column indices of the two (distinct) features; they also index
              ``helper.featurenames``

    Prints a message and returns None without plotting when ``f1 == f2``.
    """
    # Setting up variables
    if f1 == f2: #<-- if f1 == f2 then then we would get an error
        print("Please choose different features for f1 and f2.")
        return
    features = [f1,f2]
    # Class weights are not needed to draw the densities
    mu, covar, _ = helper.bi.fit_generative_model(x, y, features)

    # Show the Gaussian fit to each class, using features f1,f2
    col = ['r', 'k', 'g']
    # Find rough ranges along each feature
    x1_lower, x1_upper = helper.bi.find_range(x[:,f1])
    x2_lower, x2_upper = helper.bi.find_range(x[:,f2])
    plt.xlim(x1_lower,x1_upper)
    plt.ylim(x2_lower,x2_upper)
    # Plot the training points along the two selected features
    plt.plot(x[y==1,f1], x[y==1,f2], 'ro')
    plt.plot(x[y==2,f1], x[y==2,f2], 'k^')
    plt.plot(x[y==3,f1], x[y==3,f2], 'gs')
    # Use helper.featurenames: no bare `featurenames` is defined in this notebook
    plt.xlabel(helper.featurenames[f1], fontsize=14, color='red')
    plt.ylabel(helper.featurenames[f2], fontsize=14, color='red')
    # Now draw a contour line for each label's Gaussian
    res = 200
    d = len(features)
    xg = np.linspace(x1_lower,x1_upper,res)
    yg = np.linspace(x2_lower,x2_upper,res)
    # Grid points, shape (len(yg), len(xg), 2), so z[j, i] is the value at
    # (xg[i], yg[j]) as plt.contour expects.
    grid_x, grid_y = np.meshgrid(xg, yg)
    grid = np.dstack((grid_x, grid_y))
    for label in range(1,4):
        gmean = mu[label,:]
        gcov = covar[label,:,:]
        # Evaluate this class's log-density on the whole grid in one call
        z = multivariate_normal.logpdf(grid, mean=gmean, cov=gcov)
        # Log-density at the mean: -0.5 * (d*log(2*pi) + log|gcov|)
        sign, logdet = np.linalg.slogdet(gcov)
        normalizer = -0.5 * (d * np.log(2 * np.pi) + sign * logdet)
        # Contour 4 log-density units below the peak
        plt.contour(xg,yg,z,levels=[normalizer - 4.0],colors=col[label-1],linewidths=2.0,linestyles='solid')
    # Finally, display
    plt.show()

# Publish the function as helper.bi.threeGaussiansPlot and drop the bare name
helper.bi.threeGaussiansPlot = threeGaussiansPlot
del threeGaussiansPlot

In [73]:
def test_model(f1, f2):
    # Setting up variables
    if f1 == f2: #<-- if f1 == f2 then then we would get an error
        print "Please choose different features for f1 and f2."
        return  
    features= [f1,f2]
    mu, covar, pi = helper.bi.fit_generative_model(x, y, features)
    
    k = len(np.unique(y))  # Labels 1,2,...,k
    nt = len(ty) # Number of test points
    score = np.zeros((nt,k+1))
    for i in range(0,nt):
        for label in range(1,k+1):
            score[i,label] = np.log(pi[label]) + \
            multivariate_normal.logpdf(tx[i,features], mean=mu[label,:], cov=covar[label,:,:])
    predictions = np.argmax(score[:,1:4], axis=1) + 1
    # Finally, tally up score
    errors = np.sum(predictions != ty)
    print "Test error using feature(s): ",
    for f in features:
        print "'" + featurenames[f] + "'" + " ",
    print
    # Now test the performance of a predictor based on a subset of features
    print "Errors: " + str(errors) + "/" + str(nt)
    return errors/float(nt)

# Publish the function as helper.bi.test_model and drop the bare name
helper.bi.test_model = test_model
del test_model

In [74]:

def plot_boundary(f1,f2):
    """Plot the training points and the decision boundary of the Gaussian
    generative classifier restricted to two features.

    Reads the module-level training arrays ``x`` and ``y``; the class
    parameters come from ``helper.bi.fit_generative_model``. Three classes
    (labels 1, 2, 3) are plotted.

    f1, f2 -- column indices of the two (distinct) features; they also index
              ``helper.featurenames``

    Prints a message and returns None without plotting when ``f1 == f2``
    (the two-feature covariance would be singular).
    """
    if f1 == f2:
        print("Please choose different features for f1 and f2.")
        return

    features = [f1,f2]
    mu, covar, pi = helper.bi.fit_generative_model(x, y, features)
    x1_lower, x1_upper = helper.bi.find_range(x[:,f1])      #  <--  Finds rough ranges along each feature
    x2_lower, x2_upper = helper.bi.find_range(x[:,f2])

    # Plot the decision boundary for a classifier based only on the two selected features
    delta = 0.005
    # Upper bound on grid points per axis. A fixed step over a wide-ranging
    # feature would otherwise produce an enormous grid.
    max_points = 2000

    def grid_axis(lower, upper):
        # Step-`delta` grid, falling back to `max_points` evenly spaced values
        if (upper - lower) / delta > max_points:
            return np.linspace(lower, upper, max_points)
        return np.arange(lower, upper, delta)

    x1 = grid_axis(x1_lower, x1_upper)
    x2 = grid_axis(x2_lower, x2_upper)
    # Grid points, shape (len(x2), len(x1), 2): rows follow x2 and columns
    # follow x1, which is the orientation plt.contour expects.
    g1, g2 = np.meshgrid(x1, x2)
    points = np.dstack((g1, g2))
    # Log-posterior (up to a constant) of each class over the whole grid
    scores = np.array([
        np.log(pi[label]) + multivariate_normal.logpdf(points, mean=mu[label,:], cov=covar[label,:,:])
        for label in (1, 2, 3)
    ])
    # Zero where a class attains the maximum score, negative elsewhere
    margins = scores - scores.max(axis=0)

    plt.plot(x[y==1,f1], x[y==1,f2], 'ro')
    plt.plot(x[y==2,f1], x[y==2,f2], 'k^')
    plt.plot(x[y==3,f1], x[y==3,f2], 'gs')
    # Use helper.featurenames: no bare `featurenames` is defined in this notebook
    plt.xlabel(helper.featurenames[f1], fontsize=14, color='red')
    plt.ylabel(helper.featurenames[f2], fontsize=14, color='red')
    # A contour just below zero traces the edge of each class's winning region
    for class_margin in margins:
        plt.contour(x1, x2, class_margin, [-0.001])
    plt.xlim([x1_lower,x1_upper])
    plt.ylim([x2_lower,x2_upper])
    plt.show()

# Publish the function as helper.bi.plot_boundary and drop the bare name
helper.bi.plot_boundary = plot_boundary
del plot_boundary

# HW_2 notebook

In [68]:
from sklearn import datasets
# Load the iris data set that ships with scikit-learn; its .data, .target and
# .feature_names attributes are used in the cells below.
iris = datasets.load_iris()

In [69]:
# Hard-coded row indices into the iris data selecting the training points
# (120 indices); used below to build helper.hw.x / helper.hw.y.
ites = np.array([129,  40, 126,   4,  23, 147,  65,  93,  79, 117,  77,  27,  68,
        24,  50, 115, 144,  37,  85,  86,  14,  49,  54, 105, 135,  83,
        44,  99, 112,  61,  57,  64,  84,  36, 111,  69,  87,  80,  91,
       118,  47,  39,  26,  29,  51,   8,   7,  92,  97,  33,  70,  12,
       100,  53, 106,  71,  48,  25,  60,  20,  62, 143, 119,  32,  18,
       109,  52,  31, 116,  74,   2,  96,  94,  42,   6, 137,  88,   5,
        66, 131,  95, 133,  13,  58, 114, 130,  90,  11, 120,  45, 128,
       124,  73, 122, 125, 108,  43,  35, 148,  55, 145,  76,  75,  67,
       103,   1,   0, 107, 136, 113,  21,  98,  17, 102, 123, 104, 101,139,  10,  59 ])

# Hard-coded row indices into the iris data selecting the test points
# (30 indices); used below to build helper.hw.tx / helper.hw.ty.
tites = np.array([ 138, 141,  30,  81, 140,  28,  56,  41,  89, 134,
            16, 142,  63, 132,  72, 149,  15,  34,  46,  38,  78,  22,  19,
           121,   3, 127,   9, 146, 110,  82])

In [72]:
# Training split of the iris data. Targets are shifted by +1 before being
# stored as labels.
helper.hw.x = iris.data[ites]
helper.hw.y = iris.target[ites] + 1

# Test split, built the same way from the `tites` indices
helper.hw.tx = iris.data[tites]
helper.hw.ty = iris.target[tites] + 1

In [66]:
# Feature names for the homework data, taken from the loaded iris data set
helper.hw.featurenames = iris.feature_names