In [None]:
#-------------------------------------------------------------------
# Author Wessel Olaf van Dam 09-30-2019
#
# Machine Learning script on practice dataset "Iris.csv"
#-------------------------------------------------------------------

import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import scipy as sc
import numpy as np
import sklearn

# Column labels for the CSV, which is read with `names=` (i.e. treated as having
# no header row). The summary below groups by a 'class' column, so that label is
# required. The list is bound to its own name so it cannot be confused with the
# `names` list of model labels that a later cell builds.
# NOTE(review): the four measurement labels assume the usual Iris column order
# (sepal length, sepal width, petal length, petal width) -- confirm against iris.csv.
column_names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']

# Load the data into `data`, the name the later cells read from.
data = pd.read_csv('iris.csv', names=column_names)

# Summary statistics of the numeric columns, then the number of rows per class.
print(data.describe())
print("\n")
print(data.groupby('class').size())

# histograms
data.hist()
plt.show()


In [None]:
#importing relevant libraries
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

#We will split the loaded dataset into two, 80% of which we will use to train our models 
#and 20% that we will hold back as a validation dataset. X_train and Y_train contain the
#training data, and X_validation and Y_validation will be used later to test the performance
#of our models

# Split-out validation dataset
# Work on the raw values of the DataFrame: the first four columns are the
# features (X) and the fifth column is the label (Y), i.e. the type of iris
# we want to predict from the features.
array = data.values
X, Y = array[:, :4], array[:, 4]

# Hold back 20% of the rows for validation; the fixed seed makes the split repeatable.
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X,
    Y,
    test_size=validation_size,
    random_state=seed,
)

# Shapes of the two feature matrices: 80% of the rows go to training and
# 20% to validation (120 and 30 rows respectively for a 150-sample dataset).
print(X_train.shape, X_validation.shape, sep="\n")

In [None]:
# Test options and evaluation metric
# `seed` is not consumed by the unshuffled KFold below; the assignment is kept
# so the name stays defined exactly as before for any other cell that reads it.
seed = 7
scoring = 'accuracy'

# 10-fold cross-validation over the training split. The folds are taken in
# order (no shuffling), so they are deterministic without a seed. Passing
# `random_state` while `shuffle` is False has no effect and raises ValueError
# in scikit-learn >= 0.24, so it is omitted here. If shuffled folds are wanted
# instead, use KFold(n_splits=10, shuffle=True, random_state=seed).
# The splitter does not depend on the model, so it is built once, outside the loop.
kfold = model_selection.KFold(n_splits=10)

# Spot Check Algorithms: (short label, unfitted estimator) pairs.
models = [
    ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]

# evaluate each model in turn
results = []  # per-model arrays of fold scores, in the same order as `names`
names = []    # model labels, in the same order as `results`

# Here we print the mean accuracy (and standard deviation across folds) of
# each of the 6 models to see which one is most accurate.
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = f"{name}: {cv_results.mean():f} ({cv_results.std():f})"
    print(msg)


In [None]:
# The model using Support Vector Machines (SVM) has the largest estimated
# accuracy score. Now we want to get an idea of the accuracy of the model on our validation set.
#This will give us an independent final check on the accuracy of the best model. 

#We can run the SVM model directly on the validation set and summarize the results as a final accuracy score, 
#a confusion matrix and a classification report.

# Make predictions on validation dataset
# Fit the SVM on the training split (fit returns the estimator itself), then
# predict the labels of the held-back validation samples.
SVM = SVC(gamma='auto').fit(X_train, Y_train)
predictions = SVM.predict(X_validation)

# Report, in order: overall accuracy, the confusion matrix, and the
# per-class classification report, each comparing true labels to predictions.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_validation, predictions))

# We can see that the accuracy is 0.933, or 93.3%. The confusion matrix shows
# the two errors that were made: in both cases the model predicted Iris-virginica
# when the sample was actually Iris-versicolor.

#Finally, the classification report provides a breakdown of each class by precision, 
#recall, f1-score and support showing excellent results (granted the validation dataset was small).