In [1]:
# This is code for Quadratic Discriminant Analysis
# Written by William F Basener
# University of Virginia, School of Data Science
# For use in teaching Bayesian Machine Learning
#
# The code currently computes the maximum likelihood classification
# Student is to add method to compute posterior probabilities and maximum probability classification

import pandas as pd
import numpy as np


def multivariate_gaussian_pdf(X, MU, SIGMA):
    """Code from Data Blog https://xavierbourretsicotte.github.io/MLE_Multivariate_Gaussian.html
    Maximum Likelihood Estimator: Multivariate Gaussian Distribution
        by Xavier Bourret Sicotte, Fri 22 June 2018
    Returns the pdf of a multivariate Gaussian distribution evaluated at X
     - X, MU are length-p vectors (any array-like; reshaped internally to p x 1)
     - SIGMA is a p x p covariance matrix (a scalar variance is accepted when p == 1)
    Returns:
     - the density as a plain Python float
    Raises:
     - numpy.linalg.LinAlgError if SIGMA is singular (raised by np.linalg.inv)"""
    # Initialize and reshape; np.asarray lets callers pass plain lists as well
    X = np.asarray(X, dtype=float).reshape(-1, 1)
    MU = np.asarray(MU, dtype=float).reshape(-1, 1)
    # np.cov returns a 0-d array for single-feature data; promote it to 1 x 1
    SIGMA = np.atleast_2d(np.asarray(SIGMA, dtype=float))
    p, _ = SIGMA.shape

    # Compute values
    SIGMA_inv = np.linalg.inv(SIGMA)
    denominator = np.sqrt((2 * np.pi) ** p * np.linalg.det(SIGMA))
    deviation = X - MU
    exponent = -(1 / 2) * (deviation.T @ SIGMA_inv @ deviation)

    # Return result; the product is a 1 x 1 array, so extract the scalar with
    # .item() because float() on an array with ndim > 0 is deprecated in NumPy
    density = (1. / denominator) * np.exp(exponent)
    return float(density.item())


class QDA:
    """Creates a class for Quadratic Discriminant Analysis
    Input:
        fname = file name for a csv file, must have one column labeled "Class"
            (the class labels) and the rest numeric data
    Attributes (all computed in __init__):
        data_labels = the "Class" column of the csv file
        data = the remaining columns as a numpy array (one row per observation)
        num_rows, num_cols = shape of data
        class_names = sorted unique class labels
        num_obs, means, covs = dicts keyed by class label holding the number of
            observations, the mean vector and the covariance matrix of each class
    Methods:
        compute_likelihoods: given an input observation, computes the likelihood
            for each class and the GML class
        compute_probabilities: given an input observation and prior probabilities,
            is meant to compute the posterior probabilities for each class and the
            most probable class; in this version it is a placeholder returning -1"""

    def __init__(self, fname):
        """Reads the csv file and computes the per-class statistics needed for classification."""
        # read the data as a Pandas data frame
        df = pd.read_csv(fname)

        # separate the class labels from the rest of the data
        # we are assuming the column name with class labels is 'Class'
        # and all other columns are numeric
        self.data_labels = df['Class']
        self.data = np.asarray(df.drop('Class', axis=1))

        # get information about the dimensions the data
        self.num_rows, self.num_cols = self.data.shape

        # get the class names as a sorted array of unique labels
        self.class_names = np.unique(self.data_labels)

        # per-class observation count, mean vector and covariance matrix
        self.num_obs = dict()
        self.means = dict()
        self.covs = dict()
        for name in self.class_names:
            in_class = np.asarray(self.data_labels == name)
            class_data = self.data[in_class, :]
            self.num_obs[name] = int(in_class.sum())
            self.means[name] = np.mean(class_data, 0)
            # rowvar=False: columns are the variables; atleast_2d keeps a
            # (1 x 1) matrix when the data has a single feature column
            self.covs[name] = np.atleast_2d(np.cov(class_data, rowvar=False))

    def compute_likelihoods(self, x):
        """Computes and prints the likelihood of each class and the maximum likelihood class.

        x is a sequence of num_cols feature values.
        Returns the likelihoods as a numpy array ordered like self.class_names,
        or -1 (after printing a message) when x has the wrong number of values."""
        # check that the input data x has the correct number of values
        if not (len(x) == self.num_cols):
            print('Data vector has wrong number of values.')
            return -1

        # reformat x as a numpy array, in case the user input a list
        x = np.asarray(x)

        # compute the likelihood of each class
        likelihoods = np.array(
            [multivariate_gaussian_pdf(x, self.means[name], self.covs[name])
             for name in self.class_names],
            dtype=float,
        )

        # get the indices for sorting the likelihoods (in descending order)
        indices_sorted = np.argsort(likelihoods)[::-1]

        # print the predicted class and all class likelihoods
        # (!s formatting so non-string class labels print instead of raising)
        print(f'QDA Predicted Class: {self.class_names[indices_sorted[0]]!s}')
        print('QDA Class Likelihoods:')
        for i in indices_sorted:
            print(f'{self.class_names[i]!s}: {likelihoods[i]!s}')

        # return the likelihoods
        return likelihoods

    def compute_probabilities(self, x, priors):
        """Placeholder for the posterior probability computation; always returns -1."""
        # compute and output the probability of each class and the maximum probability class
        return -1


# Fit the QDA model from the iris csv file.
model_qda = QDA(fname='iris_data.csv')

# Observation to classify, and equal prior weight for each of the three classes.
Iris_setosa_observation = [5.1, 3.5, 1.4, 0.2]
uninformative_priors = dict.fromkeys(
    ("Iris-setosa", "Iris-versicolor", "Iris-virginica"), 1 / 3
)

# Maximum likelihood classification; the method prints its own report.
model_qda.compute_likelihoods(Iris_setosa_observation)

# compute_probabilities is still a stub in this cell's QDA (it returns -1),
# so this call prints nothing; print(model_qda) shows the default object repr.
model_qda.compute_probabilities(Iris_setosa_observation, uninformative_priors)
print(model_qda)

QDA Predicted Class: Iris-setosa
QDA Class Likelihoods:
Iris-setosa: 13.725594445123031
Iris-versicolor: 6.846866360095676e-25
Iris-virginica: 4.150482018069567e-40
<__main__.QDA object at 0x7f544843ac50>


In [2]:
# This is code for Quadratic Discriminant Analysis
# Written by William F Basener
# University of Virginia, School of Data Science
# For use in teaching Bayesian Machine Learning
#
# The code computes the maximum likelihood classification (QDA.compute_likelihoods)
# The student-added method QDA.compute_probabilities computes posterior probabilities and maximum probability classification

import pandas as pd
import numpy as np


def multivariate_gaussian_pdf(X, MU, SIGMA):
    """Code from Data Blog https://xavierbourretsicotte.github.io/MLE_Multivariate_Gaussian.html
    Maximum Likelihood Estimator: Multivariate Gaussian Distribution
        by Xavier Bourret Sicotte, Fri 22 June 2018
    Returns the pdf of a multivariate Gaussian distribution evaluated at X
     - X, MU are length-p vectors (any array-like; reshaped internally to p x 1)
     - SIGMA is a p x p covariance matrix (a scalar variance is accepted when p == 1)
    Returns:
     - the density as a plain Python float
    Raises:
     - numpy.linalg.LinAlgError if SIGMA is singular (raised by np.linalg.inv)"""
    # Initialize and reshape; np.asarray lets callers pass plain lists as well
    X = np.asarray(X, dtype=float).reshape(-1, 1)
    MU = np.asarray(MU, dtype=float).reshape(-1, 1)
    # np.cov returns a 0-d array for single-feature data; promote it to 1 x 1
    SIGMA = np.atleast_2d(np.asarray(SIGMA, dtype=float))
    p, _ = SIGMA.shape

    # Compute values
    SIGMA_inv = np.linalg.inv(SIGMA)
    denominator = np.sqrt((2 * np.pi) ** p * np.linalg.det(SIGMA))
    deviation = X - MU
    exponent = -(1 / 2) * (deviation.T @ SIGMA_inv @ deviation)

    # Return result; the product is a 1 x 1 array, so extract the scalar with
    # .item() because float() on an array with ndim > 0 is deprecated in NumPy
    density = (1. / denominator) * np.exp(exponent)
    return float(density.item())


class QDA:
    """Creates a class for Quadratic Discriminant Analysis
    Input:
        fname = file name for a csv file, must have one column labeled "Class"
            (the class labels) and the rest numeric data
    Attributes (all computed in __init__):
        data_labels = the "Class" column of the csv file
        data = the remaining columns as a numpy array (one row per observation)
        num_rows, num_cols = shape of data
        class_names = sorted unique class labels
        num_obs, means, covs = dicts keyed by class label holding the number of
            observations, the mean vector and the covariance matrix of each class
    Methods:
        compute_likelihoods: given an input observation, computes the likelihood
            for each class and the GML class
        compute_probabilities: given an input observation and prior probabilities,
            computes the posterior probabilities for each class and most probable class"""

    def __init__(self, fname):
        """Reads the csv file and computes the per-class statistics needed for classification."""
        # read the data as a Pandas data frame
        df = pd.read_csv(fname)

        # separate the class labels from the rest of the data
        # we are assuming the column name with class labels is 'Class'
        # and all other columns are numeric
        self.data_labels = df['Class']
        self.data = np.asarray(df.drop('Class', axis=1))

        # get information about the dimensions the data
        self.num_rows, self.num_cols = self.data.shape

        # get the class names as a sorted array of unique labels
        self.class_names = np.unique(self.data_labels)

        # per-class observation count, mean vector and covariance matrix
        self.num_obs = dict()
        self.means = dict()
        self.covs = dict()
        for name in self.class_names:
            in_class = np.asarray(self.data_labels == name)
            class_data = self.data[in_class, :]
            self.num_obs[name] = int(in_class.sum())
            self.means[name] = np.mean(class_data, 0)
            # rowvar=False: columns are the variables; atleast_2d keeps a
            # (1 x 1) matrix when the data has a single feature column
            self.covs[name] = np.atleast_2d(np.cov(class_data, rowvar=False))

    def _class_likelihoods(self, x):
        """Returns the Gaussian likelihood of x under each class, ordered like self.class_names."""
        return np.array(
            [multivariate_gaussian_pdf(x, self.means[name], self.covs[name])
             for name in self.class_names],
            dtype=float,
        )

    def compute_likelihoods(self, x):
        """Computes and prints the likelihood of each class and the maximum likelihood class.

        x is a sequence of num_cols feature values.
        Returns the likelihoods as a numpy array ordered like self.class_names,
        or -1 (after printing a message) when x has the wrong number of values."""
        # check that the input data x has the correct number of values
        if not (len(x) == self.num_cols):
            print('Data vector has wrong number of values.')
            return -1

        # reformat x as a numpy array, in case the user input a list
        x = np.asarray(x)

        # compute the likelihood of each class
        likelihoods = self._class_likelihoods(x)

        # get the indices for sorting the likelihoods (in descending order)
        indices_sorted = np.argsort(likelihoods)[::-1]

        # print the predicted class and all class likelihoods
        # (!s formatting so non-string class labels print instead of raising)
        print(f'QDA Predicted Class: {self.class_names[indices_sorted[0]]!s}')
        print('QDA Class Likelihoods:')
        for i in indices_sorted:
            print(f'{self.class_names[i]!s}: {likelihoods[i]!s}')

        # return the likelihoods
        return likelihoods

    def compute_probabilities(self, x, priors):
        """Computes and prints the posterior probability of each class and the most probable class.

        x is a sequence of num_cols feature values.
        priors maps every class label to its prior weight; the weights are
        normalized by the evidence, so they do not need to sum to 1.
        Returns the posterior probabilities as a numpy array ordered like
        self.class_names. Returns -1 (after printing a message) when x has the
        wrong number of values, when a class has no prior, or when the evidence
        (sum of likelihood * prior) is not positive so the posterior is undefined."""
        # check that the input data x has the correct number of values
        if not (len(x) == self.num_cols):
            print('Data vector has wrong number of values.')
            return -1

        # every class needs a prior; report the gap instead of raising a bare KeyError
        missing = [str(name) for name in self.class_names if name not in priors]
        if missing:
            print('Priors are missing for classes: ' + ', '.join(missing))
            return -1

        # reformat x as a numpy array, in case the user input a list
        x = np.asarray(x)

        # compute the likelihood of each class
        likelihoods = self._class_likelihoods(x)

        print("Likeli:", likelihoods)
        print("Priors:", priors)

        # numerator of Bayes' rule for each class: likelihood * prior
        prior_weights = np.array([priors[name] for name in self.class_names], dtype=float)
        joint = likelihoods * prior_weights

        # compute the Denominator (sum of likelihoods*priors)
        denom = joint.sum()
        print("Denom: ", denom)

        # a zero (all likelihoods underflowed / all priors zero), negative or NaN
        # denominator would silently produce NaN or meaningless probabilities
        if not denom > 0:
            print('Posterior probabilities are undefined: sum of likelihood * prior is not positive.')
            return -1

        # compute the probabilities for each class
        probabilities = joint / denom

        # get the indices for sorting the probabilities (in descending order)
        indices_sorted = np.argsort(probabilities)[::-1]

        # print the predicted class and all class probabilities
        print(f'QDA Predicted Class: {self.class_names[indices_sorted[0]]!s}')
        print('QDA Class Probabilities:')
        for i in indices_sorted:
            print(f'{self.class_names[i]!s}: {probabilities[i]!s}')

        return probabilities


# Fit the QDA model from the iris csv file.
model_qda = QDA(fname='iris_data.csv')

# Observation to classify, and equal prior weight for each of the three classes.
Iris_setosa_observation = [5.1, 3.5, 1.4, 0.2]
uninformative_priors = dict.fromkeys(
    ("Iris-setosa", "Iris-versicolor", "Iris-virginica"), 1 / 3
)
#model_qda.compute_likelihoods(Iris_setosa_observation)

# Posterior classification; kept as the final bare expression so the notebook
# also displays the returned probability array as the cell result.
model_qda.compute_probabilities(Iris_setosa_observation, uninformative_priors)

Likeli: [1.37255944e+01 6.84686636e-25 4.15048202e-40]
Priors: {'Iris-setosa': 0.3333333333333333, 'Iris-versicolor': 0.3333333333333333, 'Iris-virginica': 0.3333333333333333}
Denom:  4.575198148374343
QDA Predicted Class: Iris-setosa
QDA Class Probabilities:
Iris-setosa: 1.0
Iris-versicolor: 4.9883933169309835e-26
Iris-virginica: 3.023899645777684e-41


array([1.00000000e+00, 4.98839332e-26, 3.02389965e-41])

In [3]:
# Two observations, each with four feature values, used as classifier inputs.
first, second = (
    [5.5, 2.4, 3.8, 1.1],
    [5.5, 3.1, 5, 1.5],
)

In [4]:
# Classify both observations under the uniform priors; each call prints its own report.
print("\nFirst")
model_qda.compute_probabilities(first, uninformative_priors)
print("\nSECOND")
# Final bare expression: its return value is what the notebook displays as the cell result.
model_qda.compute_probabilities(second, uninformative_priors)


First
Likeli: [6.96249484e-52 2.56791233e+00 7.67146865e-05]
Priors: {'Iris-setosa': 0.3333333333333333, 'Iris-versicolor': 0.3333333333333333, 'Iris-virginica': 0.3333333333333333}
Denom:  0.8559963490929874
QDA Predicted Class: Iris-versicolor
QDA Class Probabilities:
Iris-versicolor: 0.9999701265523081
Iris-virginica: 2.987344769195638e-05
Iris-setosa: 2.711263447644606e-52

SECOND
Likeli: [2.07142499e-105 3.30607636e-003 4.29103948e-003]
Priors: {'Iris-setosa': 0.3333333333333333, 'Iris-versicolor': 0.3333333333333333, 'Iris-virginica': 0.3333333333333333}
Denom:  0.0025323719450157847
QDA Predicted Class: Iris-virginica
QDA Class Probabilities:
Iris-virginica: 0.5648248061588753
Iris-versicolor: 0.43517519384112463
Iris-setosa: 2.726593926101304e-103


array([2.72659393e-103, 4.35175194e-001, 5.64824806e-001])

In [5]:
# Non-uniform priors: most of the prior weight is placed on Iris-virginica.
Bagend_priors = dict(zip(
    ("Iris-setosa", "Iris-versicolor", "Iris-virginica"),
    (0.1, 0.2, 0.7),
))

In [6]:
# Classify the same two observations again, now under the non-uniform Bagend_priors.
print("\nFirst")
model_qda.compute_probabilities(first, Bagend_priors)
print("\nSECOND")
# Final bare expression: its return value is what the notebook displays as the cell result.
model_qda.compute_probabilities(second, Bagend_priors)


First
Likeli: [6.96249484e-52 2.56791233e+00 7.67146865e-05]
Priors: {'Iris-setosa': 0.1, 'Iris-versicolor': 0.2, 'Iris-virginica': 0.7}
Denom:  0.5136361667990312
QDA Predicted Class: Iris-versicolor
QDA Class Probabilities:
Iris-versicolor: 0.9998954507411951
Iris-virginica: 0.00010454925880481196
Iris-setosa: 1.3555304878994901e-52

SECOND
Likeli: [2.07142499e-105 3.30607636e-003 4.29103948e-003]
Priors: {'Iris-setosa': 0.1, 'Iris-versicolor': 0.2, 'Iris-virginica': 0.7}
Denom:  0.0036649429064580428
QDA Predicted Class: Iris-virginica
QDA Class Probabilities:
Iris-virginica: 0.8195837457481517
Iris-versicolor: 0.1804162542518483
Iris-setosa: 5.651997976619857e-104


array([5.65199798e-104, 1.80416254e-001, 8.19583746e-001])