# Lab 4 - 10/17/2024

### Split Data
These functions are the same as in Lab 3.

In [1]:
"""
Name: Yi Qian Goh
Date: 10/17/2024
Class: CSEN140 Machine Learning and Data Mining
Brief Description: This program implements Linear Discriminant Analysis (LDA) and Quadratic Discriminant Analysis (QDA) 
                using only NumPy. It does not use any machine learning library (e.g. Scikit-learn). Classifications are performed on the 
                same Iris dataset as Lab 3.
"""

import numpy as np
import time

#Function to transform text file of data instances into a List[List[]]
def txt_to_list(filename):
    """Load a comma-separated data file into a list of rows.

    Every record is read as four float columns followed by one text column
    (the text column keeps at most 20 characters, per the 'U20' dtype).

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    list of list
        One ``[np.float64, np.float64, np.float64, np.float64, str]`` entry
        per record, in file order.
    """
    dtype = [('col1', np.float64), ('col2', np.float64), ('col3', np.float64), ('col4', np.float64), ('col5', 'U20')]
    # genfromtxt hands back a 0-d array when the file holds a single record;
    # atleast_1d makes that case iterable like any other.
    data = np.atleast_1d(np.genfromtxt(filename, dtype=dtype, delimiter=','))
    return [
        [np.float64(row[i]) for i in range(4)] + [str(row[4])]
        for row in data
    ]

#Function to check if dataset is in proper form
def test_dataset(data, expected_rows=150):
    """Check that ``data`` has the layout produced by the loader.

    A valid dataset has exactly ``expected_rows`` rows, and every row has
    five elements: four ``np.float64`` values followed by one string.
    The first violation found is reported with ``print``.

    Parameters
    ----------
    data : sequence of sequences
        The rows to validate.
    expected_rows : int, optional
        Required number of rows (default 150).

    Returns
    -------
    bool
        True when every check passes, False otherwise.
    """
    if len(data) != expected_rows:
        print('list length incorrect')
        return False
    for row in data:
        if len(row) != 5:
            print('row does not have exactly 5 elements')
            return False
        if not all(isinstance(column, np.float64) for column in row[:-1]):
            print('column type, excluding last one, is not np.float64')
            return False
        if not isinstance(row[-1], str):
            print('last element of row is not string')
            return False
    return True


# The name starts with "test_", so flag it for test collectors that honor the
# __test__ convention: this is a data validator, not a test case.
test_dataset.__test__ = False

#Function to split data into training and test sets
def split_data(data_list, train_fraction=0.8):
    """Split rows into training and test sets, category by category.

    The last element of each row is its category. Within every category the
    first ``train_fraction`` of the rows (rounded down, in input order) goes
    to the training set and the remainder to the test set. Categories are
    processed in the sorted order returned by ``np.unique``.

    Rows pass through ``np.array``; when they mix numbers and text, NumPy
    stores everything as text, so the returned values are strings.

    Parameters
    ----------
    data_list : list of list
        Rows whose last element is the category label.
    train_fraction : float, optional
        Share of each category assigned to training (default 0.8).

    Returns
    -------
    tuple of (list, list)
        ``(training_set, test_set)``.

    Raises
    ------
    ValueError
        If ``train_fraction`` is outside the range [0, 1].
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError("train_fraction must be between 0 and 1")
    # An empty input has no label column to index, so answer it directly.
    if len(data_list) == 0:
        return [], []

    data_arr = np.array(data_list)
    labels = data_arr[:, -1]
    training_set = []
    test_set = []
    for cat in np.unique(labels):
        category_data = data_arr[labels == cat]
        num_training = int(len(category_data) * train_fraction)
        training_set.extend(category_data[:num_training].tolist())
        test_set.extend(category_data[num_training:].tolist())
    return training_set, test_set

#Load the dataset from the text file
data_list = txt_to_list('iris.data.txt')

#Split the data into training and test sets (80% / 20% within each category)
training_set, test_set = split_data(data_list)

print("Training set:\n", training_set)
print("Test set:\n", test_set)


#Separate features and labels for training and test sets
#(every column except the last is a feature; the last column is the label)
x_train = np.array([row[:-1] for row in training_set]).astype(np.float64)
y_train = np.array([row[-1] for row in training_set])
x_test = np.array([row[:-1] for row in test_set]).astype(np.float64)
y_test = np.array([row[-1] for row in test_set])
#Encode labels to numerical values
#The mapping is built from the training labels only, numbered in np.unique order;
#a test label that never occurs in the training set would raise KeyError below.
label_encoder = {label: idx for idx, label in enumerate(np.unique(y_train))}
y_train = np.array([label_encoder[label] for label in y_train])
y_test = np.array([label_encoder[label] for label in y_test])

print("x_test (features):\n", x_test)
print("y_test (enumerated labels):\n", y_test)

Training set:
 [['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'], ['4.9', '3.0', '1.4', '0.2', 'Iris-setosa'], ['4.7', '3.2', '1.3', '0.2', 'Iris-setosa'], ['4.6', '3.1', '1.5', '0.2', 'Iris-setosa'], ['5.0', '3.6', '1.4', '0.2', 'Iris-setosa'], ['5.4', '3.9', '1.7', '0.4', 'Iris-setosa'], ['4.6', '3.4', '1.4', '0.3', 'Iris-setosa'], ['5.0', '3.4', '1.5', '0.2', 'Iris-setosa'], ['4.4', '2.9', '1.4', '0.2', 'Iris-setosa'], ['4.9', '3.1', '1.5', '0.1', 'Iris-setosa'], ['5.4', '3.7', '1.5', '0.2', 'Iris-setosa'], ['4.8', '3.4', '1.6', '0.2', 'Iris-setosa'], ['4.8', '3.0', '1.4', '0.1', 'Iris-setosa'], ['4.3', '3.0', '1.1', '0.1', 'Iris-setosa'], ['5.8', '4.0', '1.2', '0.2', 'Iris-setosa'], ['5.7', '4.4', '1.5', '0.4', 'Iris-setosa'], ['5.4', '3.9', '1.3', '0.4', 'Iris-setosa'], ['5.1', '3.5', '1.4', '0.3', 'Iris-setosa'], ['5.7', '3.8', '1.7', '0.3', 'Iris-setosa'], ['5.1', '3.8', '1.5', '0.3', 'Iris-setosa'], ['5.4', '3.4', '1.7', '0.2', 'Iris-setosa'], ['5.1', '3.7', '1.5', '0.4', 'Iris-seto

### Accuracy Function

In [17]:

#function used to see what percentage of the predicted outcome matches to the true classification
def get_accuracy(y_true, y_pred):
    """Return the percentage (0-100) of predictions that equal the true labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same shape as ``y_true``.

    Returns
    -------
    numpy.float64
        ``100 * (number of matching positions) / (number of positions)``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Convert first: comparing two plain lists with == yields a single bool,
    # not an element-wise result.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Refuse mismatched shapes instead of letting broadcasting compare the wrong pairs.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    return np.mean(y_true == y_pred) * 100

### LDA Class and Methods

In [18]:
#Linear Discriminant Analysis
class LDA:
    """Gaussian classifier with one covariance matrix shared by all classes.

    Every class is treated as equally likely (no prior term), so a sample is
    assigned to the class whose mean is nearest in Mahalanobis distance.

    Attributes set by ``train``:
        classes    -- sorted unique labels
        means      -- dict mapping label -> feature mean vector
        covariance -- pooled within-class covariance matrix
    """

    def train(self, x, y):
        """Estimate the class means and the pooled covariance.

        Parameters
        ----------
        x : array-like, shape (n_samples, n_features)
        y : array-like, shape (n_samples,)
            Class label of each row of ``x``.
        """
        x = np.asarray(x, dtype=np.float64)
        y = np.asarray(y)
        self.classes = np.unique(y)
        n_features = x.shape[1]
        self.means = {}
        scatter = np.zeros((n_features, n_features))
        for c in self.classes:
            x_c = x[y == c]
            self.means[c] = np.mean(x_c, axis=0)
            centered = x_c - self.means[c]
            scatter += centered.T @ centered
        # Pooled estimate: total within-class scatter over (N - K) degrees of
        # freedom. Adding the per-class covariances instead would over-weight
        # small classes whenever the class sizes differ.
        dof = max(x.shape[0] - len(self.classes), 1)
        self.covariance = scatter / dof

    def predict(self, x):
        """Return the predicted label for every row of ``x``.

        Parameters
        ----------
        x : array-like, shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray, shape (n_samples,)
        """
        x = np.asarray(x, dtype=np.float64)
        #the covariance is the same in the probability of each class
        inv_cov = np.linalg.inv(self.covariance)
        distances = np.empty((x.shape[0], len(self.classes)))
        for j, c in enumerate(self.classes):
            diff = x - self.means[c]
            # Squared Mahalanobis distance of every sample to this class mean
            distances[:, j] = np.sum(diff @ inv_cov * diff, axis=1)
        # Smallest distance == largest exp(-0.5 * distance); comparing the
        # distances directly avoids exp() underflowing to 0 for far-away samples.
        return self.classes[np.argmin(distances, axis=1)]
    
#Train and test LDA
lda = LDA()
lda.train(x_train, y_train)
y_train_pred = lda.predict(x_train)
y_test_pred = lda.predict(x_test)
print("LDA training accuracy:", get_accuracy(y_train, y_train_pred), "%")
#Print the "%" unit here too, so this line matches every other accuracy line
print("LDA test accuracy:", get_accuracy(y_test, y_test_pred), "%")

LDA training accuracy: 97.5 %
LDA test accuracy: 100.0


### QDA Class and Methods

In [22]:
#Quadratic Discriminant Analysis
class QDA:
    """Gaussian classifier with a separate covariance matrix per class.

    Every class is treated as equally likely (no prior term); a sample is
    assigned to the class with the highest Gaussian log-density.

    Attributes set by ``train``:
        classes     -- sorted unique labels
        means       -- dict mapping label -> feature mean vector
        covariances -- dict mapping label -> sample covariance matrix
    """

    def train(self, x, y):
        """Estimate the mean and covariance of each class.

        Parameters
        ----------
        x : array-like, shape (n_samples, n_features)
        y : array-like, shape (n_samples,)
            Class label of each row of ``x``.
        """
        x = np.asarray(x, dtype=np.float64)
        y = np.asarray(y)
        self.classes = np.unique(y)
        self.means = {}
        self.covariances = {}

        for c in self.classes:
            x_c = x[y == c]
            self.means[c] = np.mean(x_c, axis=0)
            # atleast_2d keeps a matrix even when there is a single feature
            # (np.cov returns a 0-d array in that case).
            self.covariances[c] = np.atleast_2d(np.cov(x_c, rowvar=False))

    def predict(self, x):
        """Return the predicted label for every row of ``x``.

        Parameters
        ----------
        x : array-like, shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray, shape (n_samples,)
        """
        x = np.asarray(x, dtype=np.float64)
        log_scores = np.empty((x.shape[0], len(self.classes)))
        for j, c in enumerate(self.classes):
            cov_matrix = self.covariances[c]
            # Inverse and log-determinant depend only on the class, so they
            # are computed once per class rather than once per sample.
            inv_cov = np.linalg.inv(cov_matrix)
            _, log_det = np.linalg.slogdet(cov_matrix)
            diff = x - self.means[c]
            mahalanobis = np.sum(diff @ inv_cov * diff, axis=1)
            # Log of the Gaussian density without the (2*pi)^(d/2) factor,
            # which is identical for every class and cannot change the argmax.
            # Working with logs avoids exp() underflowing to 0 for far-away samples.
            log_scores[:, j] = -0.5 * (log_det + mahalanobis)
        return self.classes[np.argmax(log_scores, axis=1)]


#Train and test QDA
#perf_counter is the high-resolution clock intended for timing short intervals
start_time = time.perf_counter()
qda = QDA()
qda.train(x_train, y_train)
qda_train_time = time.perf_counter() - start_time
y_train_pred = qda.predict(x_train)
y_test_pred = qda.predict(x_test)
print("\nQDA training accuracy:", get_accuracy(y_train, y_train_pred), "%")
print("QDA test accuracy:", get_accuracy(y_test, y_test_pred), "%")
print("QDA duration:", qda_train_time, "seconds")

#check QDA classes, means and covariances
#(the model fitted above is reused; fitting the same data again gives the same values)
print("Classes:\n", qda.classes)
print("Covariances:\n", qda.covariances)
print("Means:\n", qda.means)



QDA training accuracy: 98.33333333333333 %
QDA test accuracy: 100.0 %
QDA duration: 0.002796649932861328 seconds
Classes:
 [0 1 2]
Covariances:
 {0: array([[0.13112179, 0.09897436, 0.01298077, 0.01362179],
       [0.09897436, 0.13271795, 0.00205128, 0.0145641 ],
       [0.01298077, 0.00205128, 0.02958333, 0.00458333],
       [0.01362179, 0.0145641 , 0.00458333, 0.00994231]]), 1: array([[0.27374359, 0.08661538, 0.17212821, 0.05230769],
       [0.08661538, 0.11087179, 0.08087179, 0.04538462],
       [0.17212821, 0.08087179, 0.20353205, 0.07371795],
       [0.05230769, 0.04538462, 0.07371795, 0.04307692]]), 2: array([[0.46794231, 0.11041026, 0.35777564, 0.05125641],
       [0.11041026, 0.11323077, 0.08107692, 0.04625641],
       [0.35777564, 0.08107692, 0.34532692, 0.05930769],
       [0.05125641, 0.04625641, 0.05930769, 0.07425641]])}
Means:
 {0: array([5.0375, 3.44  , 1.4625, 0.2325]), 1: array([6.01  , 2.78  , 4.3175, 1.35  ]), 2: array([6.6225, 2.96  , 5.6075, 1.99  ])}


### Diagonal QDA Class and Methods

In [23]:
#QDA with Diagonal Covariance Matrix
class diagonal_qda:
    """QDA variant whose per-class covariance matrices are diagonal.

    Only the per-feature variances of each class are modelled. Every class is
    treated as equally likely (no prior term).

    Attributes set by ``train``:
        classes     -- sorted unique labels
        means       -- dict mapping label -> feature mean vector
        covariances -- dict mapping label -> diagonal covariance matrix
    """

    def train(self, x, y):
        """Estimate the mean and per-feature variance of each class.

        Parameters
        ----------
        x : array-like, shape (n_samples, n_features)
        y : array-like, shape (n_samples,)
            Class label of each row of ``x``.
        """
        x = np.asarray(x, dtype=np.float64)
        y = np.asarray(y)
        self.classes = np.unique(y)
        self.means = {}
        self.covariances = {}

        for c in self.classes:
            x_c = x[y == c]
            self.means[c] = np.mean(x_c, axis=0)
            # np.var uses its default normalisation here (divide by n).
            self.covariances[c] = np.diag(np.var(x_c, axis=0))

    def predict(self, x):
        """Return the predicted label for every row of ``x``.

        Parameters
        ----------
        x : array-like, shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray, shape (n_samples,)

        Raises
        ------
        numpy.linalg.LinAlgError
            If a class has a feature with zero variance (singular covariance).
        """
        x = np.asarray(x, dtype=np.float64)
        log_scores = np.empty((x.shape[0], len(self.classes)))
        for j, c in enumerate(self.classes):
            # A diagonal matrix needs no general inverse/determinant: the
            # inverse is 1/variance and the log-determinant is sum(log(variance)).
            variances = np.diag(self.covariances[c])
            if np.any(variances <= 0):
                raise np.linalg.LinAlgError("Singular matrix")
            diff = x - self.means[c]
            log_det = np.sum(np.log(variances))
            mahalanobis = np.sum(diff * diff / variances, axis=1)
            # Log of the Gaussian density without the (2*pi)^(d/2) factor,
            # which is identical for every class and cannot change the argmax.
            # Working with logs avoids exp() underflowing to 0 for far-away samples.
            log_scores[:, j] = -0.5 * (log_det + mahalanobis)
        return self.classes[np.argmax(log_scores, axis=1)]

#Train and test Diagonal QDA
#perf_counter is the high-resolution clock intended for timing short intervals
start_time = time.perf_counter()
diag_qda = diagonal_qda()
diag_qda.train(x_train, y_train)
diag_qda_train_time = time.perf_counter() - start_time
y_train_pred_diag = diag_qda.predict(x_train)
y_test_pred_diag = diag_qda.predict(x_test)
print("\nDiagonal QDA Training Accuracy:", get_accuracy(y_train, y_train_pred_diag), "%")
print("Diagonal QDA Test Accuracy:", get_accuracy(y_test, y_test_pred_diag), "%")
print("Diagonal QDA Duration:", diag_qda_train_time, "seconds")


Diagonal QDA Training Accuracy: 95.83333333333334 %
Diagonal QDA Test Accuracy: 100.0 %
Diagonal QDA Duration: 0.0015060901641845703 seconds


### Binary Classification Method

In [24]:
#One-vs-rest evaluation of LDA and QDA for a single iris type.
def binary_classification(irisTypeString, x_train, y_train, x_test, y_test):
    """Train LDA and QDA to tell one iris type apart from the other types.

    The encoded labels are collapsed to two classes: 1 for samples whose label
    equals ``label_encoder[irisTypeString]`` and 0 for everything else. Both
    models are trained on the collapsed training labels and scored on the
    training and the test split.

    Returns
    -------
    tuple
        ``(lda_training_acc, lda_test_acc, qda_training_acc, qda_test_acc)``,
        each a percentage as returned by ``get_accuracy``.
    """
    #1 where the sample is the requested iris type, 0 otherwise
    train_targets = np.where(y_train == label_encoder[irisTypeString], 1, 0)
    test_targets = np.where(y_test == label_encoder[irisTypeString], 1, 0)

    #same procedure for both models: fit, predict both splits, score both splits
    accuracies = []
    for model_class in (LDA, QDA):
        model = model_class()
        model.train(x_train, train_targets)
        train_predictions = model.predict(x_train)
        test_predictions = model.predict(x_test)
        accuracies.append(get_accuracy(train_targets, train_predictions))
        accuracies.append(get_accuracy(test_targets, test_predictions))

    #order: LDA train, LDA test, QDA train, QDA test
    return tuple(accuracies)


#One-vs-rest check for Iris-setosa: 1 = setosa, 0 = any other type
lda_train_acc, lda_test_acc, qda_train_acc, qda_test_acc = binary_classification('Iris-setosa', x_train, y_train, x_test, y_test)
print("Setosa LDA training accuracy: ", lda_train_acc, "%")
print("Setosa LDA test accuracy: ", lda_test_acc, "%")
print("Setosa QDA training accuracy: ", qda_train_acc, "%")
print("Setosa QDA test accuracy: ", qda_test_acc, "%")

Setosa LDA training accuracy:  100.0 %
Setosa LDA test accuracy:  100.0 %
Setosa QDA training accuracy:  100.0 %
Setosa QDA test accuracy:  100.0 %


### Train and test everything

In [27]:
print("----------------------------------Overall Accuracy----------------------------------------")

#Train and test LDA
lda = LDA()
lda.train(x_train, y_train)
y_train_pred = lda.predict(x_train)
y_test_pred = lda.predict(x_test)
print("LDA training accuracy:", get_accuracy(y_train, y_train_pred), "%")
#Print the "%" unit here too, so this line matches every other accuracy line
print("LDA test accuracy:", get_accuracy(y_test, y_test_pred), "%")

#Train and test QDA
#perf_counter is the high-resolution clock intended for timing short intervals
start_time = time.perf_counter()
qda = QDA()
qda.train(x_train, y_train)
qda_train_time = time.perf_counter() - start_time
y_train_pred = qda.predict(x_train)
y_test_pred = qda.predict(x_test)
print("\nQDA training accuracy:", get_accuracy(y_train, y_train_pred), "%")
print("QDA test accuracy:", get_accuracy(y_test, y_test_pred), "%")
print("QDA duration:", qda_train_time, "seconds")

#Train and test Diagonal QDA
start_time = time.perf_counter()
diag_qda = diagonal_qda()
diag_qda.train(x_train, y_train)
diag_qda_train_time = time.perf_counter() - start_time
y_train_pred_diag = diag_qda.predict(x_train)
y_test_pred_diag = diag_qda.predict(x_test)
print("\nDiagonal QDA Training Accuracy:", get_accuracy(y_train, y_train_pred_diag), "%")
print("Diagonal QDA Test Accuracy:", get_accuracy(y_test, y_test_pred_diag), "%")
print("Diagonal QDA Duration:", diag_qda_train_time, "seconds")



print("\n\n---------------------------Binary Classification and Accuracy---------------------------------")
print("\n\nAccuracy on Setosa Data\n-------------------------------------------------")
#call the binary_classification function to train and test LDA and QDA on setosa-vs-rest labels
lda_train_acc, lda_test_acc, qda_train_acc, qda_test_acc = binary_classification('Iris-setosa', x_train, y_train, x_test, y_test)
print("Setosa LDA training accuracy: ", lda_train_acc, "%")
print("Setosa LDA test accuracy: ", lda_test_acc, "%")
print("Setosa QDA training accuracy: ", qda_train_acc, "%")
print("Setosa QDA test accuracy: ", qda_test_acc, "%")

print("\n\nAccuracy on Versicolor Data\n-------------------------------------------------")
#call the binary_classification function to train and test LDA and QDA on versicolor-vs-rest labels
lda_train_acc, lda_test_acc, qda_train_acc, qda_test_acc = binary_classification('Iris-versicolor', x_train, y_train, x_test, y_test)
print("Versicolor LDA training accuracy: ", lda_train_acc, "%")
print("Versicolor LDA test accuracy: ", lda_test_acc, "%")
print("Versicolor QDA training accuracy: ", qda_train_acc, "%")
print("Versicolor QDA test accuracy: ", qda_test_acc, "%")

print("\n\nAccuracy on Virginica Data\n-------------------------------------------------")
#call the binary_classification function to train and test LDA and QDA on virginica-vs-rest labels
lda_train_acc, lda_test_acc, qda_train_acc, qda_test_acc = binary_classification('Iris-virginica', x_train, y_train, x_test, y_test)
print("Virginica LDA training accuracy: ", lda_train_acc, "%")
print("Virginica LDA test accuracy: ", lda_test_acc, "%")
print("Virginica QDA training accuracy: ", qda_train_acc, "%")
print("Virginica QDA test accuracy: ", qda_test_acc, "%")


----------------------------------Overall Accuracy----------------------------------------
LDA training accuracy: 97.5 %
LDA test accuracy: 100.0

QDA training accuracy: 98.33333333333333 %
QDA test accuracy: 100.0 %
QDA duration: 0.0 seconds

Diagonal QDA Training Accuracy: 95.83333333333334 %
Diagonal QDA Test Accuracy: 100.0 %
Diagonal QDA Duration: 0.0 seconds


---------------------------Binary Classification and Accuracy---------------------------------


Accuracy on Setosa Data
-------------------------------------------------
Setosa LDA training accuracy:  100.0 %
Setosa LDA test accuracy:  100.0 %
Setosa QDA training accuracy:  100.0 %
Setosa QDA test accuracy:  100.0 %


Accuracy on Versicolor Data
-------------------------------------------------
Versicolor LDA training accuracy:  70.83333333333334 %
Versicolor LDA test accuracy:  90.0 %
Versicolor QDA training accuracy:  97.5 %
Versicolor QDA test accuracy:  96.66666666666667 %


Accuracy on Virginica Data
-----------------

 # Q1 Is there any class linearly separable from other classes? Explain your answer based on your experiments.
We can infer whether any class is linearly separable from the others by evaluating the accuracy rates and misclassifications. The accuracy rate is 100% for both the testing and training datasets in LDA and QDA for the Setosa class. This means that the Setosa class is linearly separable from the other classes. It is perfectly classified in all cases without any errors, indicating that a linear decision boundary can separate this class from the others.

Both Versicolor and Virginica have errors in classification. In the LDA results, Versicolor is often misclassified as Virginica and vice versa, as seen in the specific index errors during both training and testing. The LDA training accuracies are 69.17% and 88.33% for the Versicolor and Virginica classes, respectively. This suggests that Versicolor and Virginica are not perfectly linearly separable. They are harder to distinguish from each other using LDA, which assumes a shared covariance, but QDA (which accounts for different covariances) does a better job of separating them, albeit with some errors during training.