# Lab 3 Assignments

In [9]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)

#transform text file of data instances into a List[List[]]
def txt_to_list(filename):
    """Load a comma-separated data file into a list of rows.

    Every data line is read as four numeric fields followed by a text
    label (the 'U20' dtype keeps at most 20 characters of the label).

    Parameters:
        filename: the file to read; passed straight to np.genfromtxt.

    Returns:
        A list with one entry per data line, each entry being
        [np.float64, np.float64, np.float64, np.float64, str].
    """
    #'U20' indicates a unicode string of up to 20 characters
    dtype = [('col1', np.float64), ('col2', np.float64), ('col3', np.float64), ('col4', np.float64), ('col5', 'U20')]
    #genfromtxt collapses a one-line file into a 0-d record that cannot be
    #iterated; atleast_1d keeps the result row-iterable in every case
    data = np.atleast_1d(np.genfromtxt(filename, dtype=dtype, delimiter=','))
    return [
        [np.float64(row[0]), np.float64(row[1]), np.float64(row[2]), np.float64(row[3]), str(row[4])]
        for row in data
    ]

#Check if dataset is in proper form. Provided function.
def test_dataset(data):
    """Validate the shape and element types of a loaded dataset.

    Returns True only when data has exactly 150 rows, every row has exactly
    5 elements, the first four elements of each row are np.float64 and the
    last element is a str. On the first violation found, a short message is
    printed and False is returned.
    """
    #the dataset must contain exactly 150 rows
    if len(data) != 150:
        print('list length incorrect')
        return False
    for row in data:
        #each row is expected to hold four features plus one label
        if len(row) != 5:
            print('row does not have exactly 5 elements')
            return False
        #exact type comparison: only np.float64 itself passes, a plain Python float does not
        for column in row[:-1]:
            if type(column) != np.float64:
                print('column type, excluding last one, is not np.float64')
                return False
        #same exact comparison for the label: only str itself passes
        if type(row[-1]) != str:
            print('last element of row is not string')
            return False
    return True

#Choose the first 80% instances from each class for training and the remaining 20% for testing.
def split_data(data_list):
    """Split rows into training and test sets, class by class.

    The last element of every row is treated as its class label. For each
    distinct label (in the sorted order np.unique reports), the first 80%
    of that label's rows, in their original order, go to the training set
    and the remaining rows go to the test set.

    Parameters:
        data_list: a list of equally long rows whose last element is the label.

    Returns:
        (training_set, test_set), both plain lists of rows produced by
        ndarray.tolist(). NOTE(review): because the rows pass through a
        single np.array, mixed number/string rows presumably come back with
        every cell as text - the caller's astype(np.float64) relies on that.
    """
    samples = np.array(data_list)
    labels = samples[:, -1]

    training_set, test_set = [], []
    for label, n_rows in zip(*np.unique(labels, return_counts=True)):
        #boolean mask keeps only the rows that belong to this label
        rows_of_label = samples[labels == label]
        #int() truncates, so a fractional 80% rounds down
        cutoff = int(n_rows * 0.8)
        training_set += rows_of_label[:cutoff].tolist()
        test_set += rows_of_label[cutoff:].tolist()
    return training_set, test_set

#train using LDA
def lda_classifier(x_train, y_train):
    """Fit a LinearDiscriminantAnalysis model with default settings.

    Parameters:
        x_train: feature rows used for fitting.
        y_train: class label for each row of x_train.

    Returns:
        The fitted LinearDiscriminantAnalysis instance.
    """
    #fit() hands back the estimator it was called on, so build, train and return in one step
    return LinearDiscriminantAnalysis().fit(x_train, y_train)
    
def lda_predict(lda, x_train, y_train, x_test, y_test):
    """Print the accuracy of a fitted model on the training and test sets.

    Parameters:
        lda: fitted classifier; only its predict() method is used.
        x_train, y_train: training features and their true labels.
        x_test, y_test: test features and their true labels.

    Returns:
        None; the two accuracy values are only printed.
    """
    #predict both splits first, then score them, then report
    train_guess = lda.predict(x_train)
    test_guess = lda.predict(x_test)
    acc_train, acc_test = accuracy_score(y_train, train_guess), accuracy_score(y_test, test_guess)
    print(f'Training Accuracy: {acc_train}')
    print(f'Test Accuracy: {acc_test}')

#train using QDA
def qda_classifier(x_train, y_train):
    """Fit a QuadraticDiscriminantAnalysis model with default settings.

    Parameters:
        x_train: feature rows used for fitting.
        y_train: class label for each row of x_train.

    Returns:
        The fitted QuadraticDiscriminantAnalysis instance.
    """
    #fit() hands back the estimator it was called on, so build, train and return in one step
    return QuadraticDiscriminantAnalysis().fit(x_train, y_train)

def qda_predict(qda, x_train, y_train, x_test, y_test):
    """Print the accuracy of a fitted model on the training and test sets.

    Parameters:
        qda: fitted classifier; only its predict() method is used.
        x_train, y_train: training features and their true labels.
        x_test, y_test: test features and their true labels.

    Returns:
        None; the two accuracy values are only printed.
    """
    #predict both splits first, then score them, then report
    train_guess = qda.predict(x_train)
    test_guess = qda.predict(x_test)
    acc_train, acc_test = accuracy_score(y_train, train_guess), accuracy_score(y_test, test_guess)
    print(f'Training Accuracy: {acc_train}')
    print(f'Test Accuracy: {acc_test}')

#Load the iris data, validate it, split it 80/20 per class, then train and
#score LDA and QDA on several feature subsets to see which features matter.
data_list = txt_to_list('iris.data.txt')
print(test_dataset(data_list))
training_set, test_set = split_data(data_list)

#transform from list to np.array() type for easier manipulation
training_arr = np.array(training_set)
test_arr = np.array(test_set)
#target labels sit in the last column and are shared by every experiment
y_train = training_arr[:, -1]
y_test = test_arr[:, -1]

#Feature columns: 0 = sepal length, 1 = sepal width, 2 = petal length, 3 = petal width.
#Each experiment is (title printed above its results, feature columns to keep).
experiments = [
    ("Train and test with all features", [0, 1, 2, 3]),
    ("Train and test without sepal length", [1, 2, 3]),
    ("Train and test without sepal width", [0, 2, 3]),
    ("Train and test without petal length", [0, 1, 3]),
    ("Train and test without petal width", [0, 1, 2]),
    ("Train and test with ONLY petal length and width", [2, 3]),
    #both sepal columns; the slice 0:1 used here before kept sepal length alone
    ("Train and test with ONLY sepal width and length", [0, 1]),
]

for experiment_number, (title, feature_columns) in enumerate(experiments):
    print("__________________________________________________________________")
    print(title + "\n")

    #the acronyms are spelled out only the first time they are printed
    if experiment_number == 0:
        lda_title = "LDA (Linear Discriminant Analysis) Accuracy"
        qda_title = "\nQDA (Quadratic Discriminant Analysis) Accuracy"
    else:
        lda_title = "LDA Accuracy"
        qda_title = "\nQDA Accuracy"

    #keep only the selected feature columns and convert them back to floats
    x_train = training_arr[:, feature_columns].astype(np.float64)
    x_test = test_arr[:, feature_columns].astype(np.float64)

    #LDA train and predict
    print(lda_title)
    lda = lda_classifier(x_train, y_train)
    lda_predict(lda, x_train, y_train, x_test, y_test)

    #QDA train and predict
    print(qda_title)
    qda = qda_classifier(x_train, y_train)
    qda_predict(qda, x_train, y_train, x_test, y_test)



True
__________________________________________________________________
Train and test with all features

LDA (Linear Discriminant Analysis) Accuracy
Training Accuracy: 0.975
Test Accuracy: 1.0

QDA (Quadratic Discriminant Analysis) Accuracy
Training Accuracy: 0.9833333333333333
Test Accuracy: 1.0
__________________________________________________________________
Train and test without sepal length

LDA Accuracy
Training Accuracy: 0.9833333333333333
Test Accuracy: 1.0

QDA Accuracy
Training Accuracy: 0.975
Test Accuracy: 1.0
__________________________________________________________________
Train and test without sepal width

LDA Accuracy
Training Accuracy: 0.975
Test Accuracy: 1.0

QDA Accuracy
Training Accuracy: 0.9833333333333333
Test Accuracy: 1.0
__________________________________________________________________
Train and test without petal length

LDA Accuracy
Training Accuracy: 0.9416666666666667
Test Accuracy: 1.0

QDA Accuracy
Training Accuracy: 0.9583333333333334
Test Accurac

### Q1 Are any of the variables not important in classifying iris type? Explain your answer based on your experiments.

Either sepal length or sepal width can be excluded from the data set while still producing high classification accuracy. I trained and tested several scenarios in which I removed one or more of the measured iris features. The largest decrease in accuracy occurred when I trained and tested without the petal width or the petal length. I then tried using only the petal width and length; however, that did not give the best results, and neither did training with only the sepal width and length. Looking back at the results of training and testing with three or more features, we can conclude that the data set must include petal width and petal length, plus either sepal width or sepal length, but not both. Thus, sepal width and sepal length are the least important features for classifying iris type.