In [6]:
# Import useful packages
import numpy as np
import sklearn
import sklearn.tree
import sklearn.metrics
from sklearn.model_selection import GridSearchCV

In [7]:
# Define functions and common variables
def create_output_file(y_actual, y_predicted, labels, filename):
    '''
    Function for creating the output file as specified in the assignment instructions:
    a) instance number and predicted label (number)
    b) Confusion matrix
    c) Precision, recall and f1-measure of each class
    d) Accuracy, macro-average f1 and weighted-average f1 score of the model

    The report is written to 'A1-Output/<filename>.csv'; the directory is created if it
    does not exist yet. Consecutive sections are separated by one blank line.

    y_actual: numpy array of shape (N,) containing the actual class of each test instance
    y_predicted: numpy array of shape (N,) containing the class of each test instance predicted by the model
    labels: 1D numpy array containing the class labels of the dataset (only its length is used:
            one precision/recall/f1 row is written per label)
    filename: name of the output file, without the '.csv' extension
    '''
    import os  # local import: only needed here, to create the output directory

    # Compute and format the whole report before opening the file, so that a failing
    # metric neither leaks an open file handle nor truncates an existing report.
    confusion_matrix = sklearn.metrics.confusion_matrix(y_actual, y_predicted)
    precision = sklearn.metrics.precision_score(y_actual, y_predicted, average=None)
    recall = sklearn.metrics.recall_score(y_actual, y_predicted, average=None)
    f1 = sklearn.metrics.f1_score(y_actual, y_predicted, average=None)
    accuracy = sklearn.metrics.accuracy_score(y_actual, y_predicted)
    macro_avg_f1 = sklearn.metrics.f1_score(y_actual, y_predicted, average='macro')
    weighted_avg_f1 = sklearn.metrics.f1_score(y_actual, y_predicted, average='weighted')

    lines = []

    # a) Predicted class of every test instance (instance numbers start at 1)
    lines.append('instance,prediction')
    for i in range(y_predicted.shape[0]):
        lines.append(str(i+1) + ',' + str(y_predicted[i]))
    lines.append('')

    # b) Confusion matrix, one comma-separated line per matrix row
    lines.append('confusion matrix')
    for row in confusion_matrix:
        lines.append(','.join(str(count) for count in row))
    lines.append('')

    # c) Precision, recall and f1-measure of each class (rounded to 2 decimals)
    lines.append('precision,recall,f1-measure')
    for i in range(labels.shape[0]):
        lines.append('{:.2f},{:.2f},{:.2f}'.format(precision[i], recall[i], f1[i]))
    lines.append('')

    # d) Accuracy, macro-average f1 and weighted-average f1 of the model (rounded to 2 decimals)
    lines.append('accuracy,macro-average f-1,weighted-average f1')
    lines.append('{:.2f},{:.2f},{:.2f}'.format(accuracy, macro_avg_f1, weighted_avg_f1))

    path = 'A1-Output/' + filename + '.csv'
    os.makedirs(os.path.dirname(path), exist_ok=True)

    # The context manager closes the file even if the write itself fails
    with open(path, 'w') as output:
        output.write('\n'.join(lines) + '\n')

def load_dataset(filename, nb_pixels=32**2):
    '''
    Function for loading the X and Y data of the passed csv file

    The file 'Assig1-Dataset/<filename>.csv' is parsed as comma-separated 32-bit integers,
    one instance per row.

    filename: name of the file containing the dataset, without the '.csv' extension (ex: train_1)
    nb_pixels: number of leading columns holding the features; the class is read from
               the column right after them (column index nb_pixels)

    Return: 
    X: 2D numpy array containing the value of the features of each instance
    Y: 1D numpy array containing the true class of each instance 
    '''
    # ndmin=2 keeps the array two-dimensional when the file holds a single instance;
    # without it loadtxt returns a 1D array and the column slicing below raises IndexError.
    data = np.loadtxt('Assig1-Dataset/' + filename + '.csv', delimiter=',', dtype=np.int32, ndmin=2)
    return data[:, :nb_pixels], data[:, nb_pixels]

In [9]:
# Load training and validation data for dataset 1
# Class labels: second column of info_1.csv, first row skipped, read as strings.
# Builtin `str` replaces `np.str`: that alias was deprecated in NumPy 1.20 and removed in 1.24.
ds1_labels = np.loadtxt('Assig1-Dataset/info_1.csv', skiprows=1, usecols=1, delimiter=',', dtype=str)

ds1_training_X, ds1_training_Y = load_dataset('train_1')
ds1_val_X, ds1_val_Y = load_dataset('val_1')

In [10]:
# Execute a grid search for the best hyperparameters to use with Best DT Model, for Data set 1
# Both split criteria are searched over the same space, so it is written once and reused.
# One grid per criterion keeps the candidates in the same order as two hand-written dicts.
dt_search_space = {'max_depth': [None, 10],
                   'min_samples_split': [2, 3, 4, 5, 6],
                   'min_impurity_decrease': [0.0, 0.02, 0.05, 0.1, 0.15, 0.2],
                   'class_weight': [None, 'balanced']}
parameters = [{'criterion': [criterion], **dt_search_space}
              for criterion in ('gini', 'entropy')]

print("Finding best hyperparameters using grid search..." + "\n")
# random_state is fixed so the search is reproducible: a decision tree permutes the
# features at every split, so equally good splits are otherwise chosen differently per run.
ds1_best_dt_model = GridSearchCV(
    sklearn.tree.DecisionTreeClassifier(random_state=0), parameters)
ds1_best_dt_model.fit(ds1_training_X, ds1_training_Y)

# Best combination found by the search; used to build the final model in the next cell
best_params_ds1 = ds1_best_dt_model.best_params_

print("Best parameters set found on development set:")
print()
print(best_params_ds1)

Finding best hyperparameters using grid search...

Best parameters set found on development set:

{'class_weight': None, 'criterion': 'entropy', 'max_depth': None, 'min_impurity_decrease': 0.0, 'min_samples_split': 3}


In [11]:
# Train Best DT Model with dataset 1, using the hyperparameters found in the previous grid search.
# random_state is fixed so that re-running this cell rebuilds the same tree.
# set_params applies every hyperparameter selected by the search, so this cell does not
# have to be edited when the grid changes.
ds1_best_dt_model = sklearn.tree.DecisionTreeClassifier(random_state=0).set_params(**best_params_ds1)
ds1_best_dt_model = ds1_best_dt_model.fit(ds1_training_X, ds1_training_Y)

In [12]:
# Score the model on the validation split of dataset 1 before touching the test data
ds1_val_Y_predict = ds1_best_dt_model.predict(ds1_val_X)

# Validation metrics, used to decide whether the hyper-parameters of the previous cells need changing:
# confusion matrix first, then precision, recall, f1-measure (macro and weighted) and accuracy
print(
    sklearn.metrics.confusion_matrix(ds1_val_Y, ds1_val_Y_predict),
    '\n',
    sklearn.metrics.classification_report(ds1_val_Y, ds1_val_Y_predict, target_names=ds1_labels),
    sep='\n',
)


[[7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0]
 [0 1 0 3 1 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1]
 [0 0 8 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 3 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0]
 [0 1 0 0 3 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 1 3 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [1 0 1 0 0 0 3 0 0 0 0 2 0 0 0 0 0 0 2 0 0 0 0 1 0 0]
 [1 0 0 0 0 0 0 4 1 0 1 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 5 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0]
 [0 1 0 1 2 0 0 0 0 1 0 1 0 0 0 0 0 2 0 1 0 0 0 0 0 1]
 [2 0 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 8 1 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 1 1 0 1 0 5 0 0 0 0 0 0 0 0 0 0 1 0 1]
 [0 0 0 1 0 0 0 1 0 0 1 0 1 3 0 0 0 0 0 0 0 1 1 0 1 0]
 [0 0 0 1 1 0 1 0 0 0 0 0 0 0 5 0 0 0 1 0 1 0 0 0 0 0]
 [0 0 0 0 2 2 0 0 1 0 0 0 0 0 0 4 0 0 1 0 0 0 0 0 0 0]
 [0 0 1 0 0 1 0 0 0 0 1 0 0 0 4 0 2 0 1 0 0 0 0 0 0 0]
 [0 2 0 0 0 0 0 0 0 0 0 0 0 0 1 4 0 2 0 0 0 0 0 0 0 1]
 [0 0 1 0 

In [16]:
# Final evaluation of the dataset 1 model on the labelled test split
ds1_test_X, ds1_test_Y = load_dataset('test_with_label_1')
ds1_test_Y_predict = ds1_best_dt_model.predict(ds1_test_X)

# Test metrics: confusion matrix first, then precision, recall, f1-measure
# (macro and weighted average) and accuracy
print(
    sklearn.metrics.confusion_matrix(ds1_test_Y, ds1_test_Y_predict),
    '\n',
    sklearn.metrics.classification_report(ds1_test_Y, ds1_test_Y_predict, target_names=ds1_labels),
    sep='\n',
)

# Persist the test results in the output file Best-DT-DS1.csv
create_output_file(y_actual=ds1_test_Y,
                   y_predicted=ds1_test_Y_predict,
                   labels=ds1_labels,
                   filename='Best-DT-DS1')

[[3 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 3 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 2 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 2 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0]
 [0 0 1 0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# Load training and validation data for dataset 2
# Class labels: second column of info_2.csv, first row skipped, read as strings.
# Builtin `str` replaces `np.str`: that alias was deprecated in NumPy 1.20 and removed in 1.24.
ds2_labels = np.loadtxt('Assig1-Dataset/info_2.csv', skiprows=1, usecols=1, delimiter=',', dtype=str)

ds2_training_X, ds2_training_Y = load_dataset('train_2')
ds2_val_X, ds2_val_Y = load_dataset('val_2')


In [18]:
# Execute a grid search for the best hyperparameters to use with Best DT Model, for Data set 2
# Both split criteria are searched over the same space, so it is written once and reused.
# One grid per criterion keeps the candidates in the same order as two hand-written dicts.
dt_search_space = {'max_depth': [None, 10],
                   'min_samples_split': [2, 3, 4, 5, 6],
                   'min_impurity_decrease': [0.0, 0.02, 0.05, 0.1, 0.15, 0.2],
                   'class_weight': [None, 'balanced']}
parameters = [{'criterion': [criterion], **dt_search_space}
              for criterion in ('gini', 'entropy')]

print("Finding best hyperparameters using grid search..." + "\n")
# random_state is fixed so the search is reproducible: a decision tree permutes the
# features at every split, so equally good splits are otherwise chosen differently per run.
ds2_best_dt_model = GridSearchCV(
    sklearn.tree.DecisionTreeClassifier(random_state=0), parameters)
ds2_best_dt_model.fit(ds2_training_X, ds2_training_Y)

# Best combination found by the search; used to build the final model in the next cell
best_params_ds2 = ds2_best_dt_model.best_params_

print("Best parameters set found on development set:")
print()
print(best_params_ds2)

Finding best hyperparameters using grid search...

Best parameters set found on development set:

{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'min_impurity_decrease': 0.0, 'min_samples_split': 2}


In [19]:
# Train Best DT Model with dataset 2, using the hyperparameters found in the previous grid search.
# random_state is fixed so that re-running this cell rebuilds the same tree.
# set_params applies every hyperparameter selected by the search, so this cell does not
# have to be edited when the grid changes.
ds2_best_dt_model = sklearn.tree.DecisionTreeClassifier(random_state=0).set_params(**best_params_ds2)
ds2_best_dt_model = ds2_best_dt_model.fit(ds2_training_X, ds2_training_Y)

In [20]:
# Score the model on the validation split of dataset 2 before touching the test data
ds2_val_Y_predict = ds2_best_dt_model.predict(ds2_val_X)

# Validation metrics, used to decide whether the hyper-parameters of the previous cells need adjusting:
# confusion matrix first, then precision, recall, f1-measure (macro and weighted) and accuracy
print(
    sklearn.metrics.confusion_matrix(ds2_val_Y, ds2_val_Y_predict),
    '\n',
    sklearn.metrics.classification_report(ds2_val_Y, ds2_val_Y_predict, target_names=ds2_labels),
    sep='\n',
)

[[125   7   0   2   9   3   3   0   8   8]
 [  7 343   0   7   2   1   3   3   4   5]
 [  4   0  31   1   1   2   1   0   2   3]
 [  1   9   0  30   0   0   0   3   0   2]
 [ 11   0   2   3  90  11   1   1   4  27]
 [  2   3   3   1  10 106   0   0   1  39]
 [  3   0   1   0   1   0  29   0   3   8]
 [  0   4   0   0   1   2   0  38   0   0]
 [  6   9   6   0   4   1   1   0 117   6]
 [  8   8   5   3  27  22   4   2   7 289]]


              precision    recall  f1-score   support

          pi       0.75      0.76      0.75       165
       alpha       0.90      0.91      0.91       375
        beta       0.65      0.69      0.67        45
       sigma       0.64      0.67      0.65        45
       gamma       0.62      0.60      0.61       150
       delta       0.72      0.64      0.68       165
      lambda       0.69      0.64      0.67        45
       omega       0.81      0.84      0.83        45
          mu       0.80      0.78      0.79       150
          xi       0.75   

In [21]:
# Final evaluation of the dataset 2 model on the labelled test split
ds2_test_X, ds2_test_Y = load_dataset('test_with_label_2')
ds2_test_Y_predict = ds2_best_dt_model.predict(ds2_test_X)

# Test metrics: confusion matrix first, then precision, recall, f1-measure
# (macro and weighted average) and accuracy
print(
    sklearn.metrics.confusion_matrix(ds2_test_Y, ds2_test_Y_predict),
    '\n',
    sklearn.metrics.classification_report(ds2_test_Y, ds2_test_Y_predict, target_names=ds2_labels),
    sep='\n',
)

# Persist the test results in the output file Best-DT-DS2.csv
create_output_file(y_actual=ds2_test_Y,
                   y_predicted=ds2_test_Y_predict,
                   labels=ds2_labels,
                   filename='Best-DT-DS2')

[[ 41   2   3   0   0   0   3   0   4   2]
 [  0 115   0   1   0   1   2   1   2   3]
 [  1   0   4   0   0   1   0   0   3   6]
 [  1   2   0  11   1   0   0   0   0   0]
 [  4   0   0   1  31   3   1   0   1   9]
 [  1   1   1   0   1  43   0   0   0   8]
 [  2   0   0   0   2   0   7   0   2   2]
 [  0   0   0   0   0   0   0  15   0   0]
 [  0   0   0   0   0   0   0   0  43   7]
 [  1   5   4   0  10   8   3   0   2  92]]


              precision    recall  f1-score   support

          pi       0.80      0.75      0.77        55
       alpha       0.92      0.92      0.92       125
        beta       0.33      0.27      0.30        15
       sigma       0.85      0.73      0.79        15
       gamma       0.69      0.62      0.65        50
       delta       0.77      0.78      0.77        55
      lambda       0.44      0.47      0.45        15
       omega       0.94      1.00      0.97        15
          mu       0.75      0.86      0.80        50
          xi       0.71   