**Requirements**

1. Using sklearn library, perform classifications on the Iris dataset.

2. Break the sample into 70% for training, and 30% for validation datasets. 

3. Using standard functions, compute the F1-score and accuracy of the model for both training and validation.

In [1]:
import csv
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
#importing necessary modules


#in this lab we will use K nearest neighbor classification to find the three nearest
#neighbors and predict the class of 3/10 of our data set (VALIDATION SET)
#using 7/10 of the data as training set
print (".")

.


In [2]:
#we want to read iris.data into a pandas dataframe
#iris.data has no header columns so we create a list of column names
columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "plant_class"]
data = pd.read_csv("../data/iris.data", delimiter=",", names=columns)

#separate each "series" (column) of the dataframe into its own list
sepal_length = data["sepal_length"].tolist()
sepal_width = data["sepal_width"].tolist()
petal_length = data["petal_length"].tolist()
petal_width = data["petal_width"].tolist()
plant_class = data["plant_class"].tolist()

#prints the data frame
print(data)
#prints the sepal_length column
print(sepal_length)
#confirm sepal_length is a list
print(type(sepal_length))
#prints length of our list (150)
print(len(sepal_length))

#at this point we have labelled data

     sepal_length  sepal_width  petal_length  petal_width     plant_class
0             5.1          3.5           1.4          0.2     Iris-setosa
1             4.9          3.0           1.4          0.2     Iris-setosa
2             4.7          3.2           1.3          0.2     Iris-setosa
3             4.6          3.1           1.5          0.2     Iris-setosa
4             5.0          3.6           1.4          0.2     Iris-setosa
..            ...          ...           ...          ...             ...
145           6.7          3.0           5.2          2.3  Iris-virginica
146           6.3          2.5           5.0          1.9  Iris-virginica
147           6.5          3.0           5.2          2.0  Iris-virginica
148           6.2          3.4           5.4          2.3  Iris-virginica
149           5.9          3.0           5.1          1.8  Iris-virginica

[150 rows x 5 columns]
[5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9, 5.4, 4.8, 4.8, 4.3, 5.8, 5.7, 5.4, 5.

In [3]:
#we want to convert our labeled data to encoded data
#use fit_transform function in our label encoder
le = preprocessing.LabelEncoder()

sepal_length_encoded = le.fit_transform(sepal_length)
print('sepal length encoded', sepal_length_encoded)
sepal_width_encoded = le.fit_transform(sepal_width)
print('sepal width encoded', sepal_width_encoded)
petal_length_encoded = le.fit_transform(petal_length)
print('petal length encoded', petal_length_encoded)
petal_width_encoded = le.fit_transform(petal_width)
print('petal width encoded', petal_width_encoded)
#the plant class will be our label (this is what we are predicting given the training set)
label = le.fit_transform(plant_class)
print('label (plant class) encoded', label)

#at this point the data is encoded

sepal length encoded [ 8  6  4  3  7 11  3  7  1  6 11  5  5  0 15 14 11  8 14  8 11  8  3  8
  5  7  7  9  9  4  5 11  9 12  6  7 12  6  1  8  7  2  1  7  8  5  8  3
 10  7 27 21 26 12 22 14 20  6 23  9  7 16 17 18 13 24 13 15 19 13 16 18
 20 18 21 23 25 24 17 14 12 12 15 17 11 17 24 20 13 12 12 18 15  7 13 14
 14 19  8 14 20 15 28 20 22 32  6 30 24 29 22 21 25 14 15 21 22 33 33 17
 26 13 33 20 24 29 19 18 21 29 31 34 21 20 18 33 20 21 17 26 24 26 15 25
 24 24 20 22 19 16]
sepal width encoded [14  9 11 10 15 18 13 13  8 10 16 13  9  9 19 22 18 14 17 17 13 16 15 12
 13  9 13 14 13 11 10 13 20 21 10 11 14 10  9 13 14  2 11 14 17  9 17 11
 16 12 11 11 10  2  7  7 12  3  8  6  0  9  1  8  8 10  9  6  1  4 11  7
  4  7  8  9  7  9  8  5  3  3  6  6  9 13 10  2  9  4  5  9  5  2  6  9
  8  8  4  7 12  6  9  8  9  9  4  8  4 15 11  6  9  4  7 11  9 17  5  1
 11  7  7  6 12 11  7  9  7  9  7 17  7  7  5  9 13 10  9 10 10 10  6 11
 12  9  4  9 13  9]
petal length encoded [ 4  4  3  5  4  7  4 

In [4]:
#combine our features into one list
features = list(zip(sepal_length_encoded, sepal_width_encoded, petal_length_encoded, petal_width_encoded))
print('Features', features)

Features [(8, 14, 4, 1), (6, 9, 4, 1), (4, 11, 3, 1), (3, 10, 5, 1), (7, 15, 4, 1), (11, 18, 7, 3), (3, 13, 4, 2), (7, 13, 5, 1), (1, 8, 4, 1), (6, 10, 5, 0), (11, 16, 5, 1), (5, 13, 6, 1), (5, 9, 4, 0), (0, 9, 1, 0), (15, 19, 2, 1), (14, 22, 5, 3), (11, 18, 3, 3), (8, 14, 4, 2), (14, 17, 7, 2), (8, 17, 5, 2), (11, 13, 7, 1), (8, 16, 5, 3), (3, 15, 0, 1), (8, 12, 7, 4), (5, 13, 8, 1), (7, 9, 6, 1), (7, 13, 6, 3), (9, 14, 5, 1), (9, 13, 4, 1), (4, 11, 6, 1), (5, 10, 6, 1), (11, 13, 5, 3), (9, 20, 5, 0), (12, 21, 4, 1), (6, 10, 5, 0), (7, 11, 2, 1), (12, 14, 3, 1), (6, 10, 5, 0), (1, 9, 3, 1), (8, 13, 5, 1), (7, 14, 3, 2), (2, 2, 3, 2), (1, 11, 3, 1), (7, 14, 6, 5), (8, 17, 8, 3), (5, 9, 4, 2), (8, 17, 6, 1), (3, 11, 4, 1), (10, 16, 5, 1), (7, 12, 4, 1), (27, 11, 23, 10), (21, 11, 21, 11), (26, 10, 25, 11), (12, 2, 16, 9), (22, 7, 22, 11), (14, 7, 21, 9), (20, 12, 23, 12), (6, 3, 10, 6), (23, 8, 22, 9), (9, 6, 15, 10), (7, 0, 11, 6), (16, 9, 18, 11), (17, 1, 16, 6), (18, 8, 23, 10), (13,

In [5]:
#split our feature list into training and validation
#we will have 70:30 ratio; in this case 105 data points in training, 45 in validation
#we will use data collected in training to predict the class of the validation samples
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.3)

In [6]:
#training the classifier
#k=3 thus we look for the three nearest neighbors
#using the Minkowski distance metrics for proximity
#majority vote will determine our class prediction
model = KNeighborsClassifier(n_neighbors=3)
#train the classifier
model.fit(X_train, y_train)

In [7]:
#predicting our validation set using the trained classifier
y_validation = model.predict(X_test)
print('Predicting the test set', y_validation)
print('Actual data from the test set', y_test)

#predicting values of the training set using the trained classifier
y_training = model.predict(X_train)
print('Predicting the train set', y_training)
print('Actual data from the train set', y_train)

Predicting the test set [0 1 2 1 2 2 2 0 1 0 0 1 1 1 1 2 2 1 1 0 0 1 1 0 0 0 2 0 2 2 2 1 2 1 1 1 2
 1 0 2 2 0 2 1 1]
Actual data from the test set [0 1 2 1 2 2 1 0 1 0 0 1 1 1 1 2 2 1 1 0 0 1 1 0 0 0 2 0 2 2 2 1 2 1 1 1 2
 1 0 2 2 0 2 1 1]
Predicting the train set [1 2 1 1 0 1 0 1 2 0 0 2 1 2 0 2 0 0 2 0 1 2 0 1 0 1 2 2 2 0 1 1 0 0 0 0 1
 0 1 2 2 0 0 2 2 2 0 1 0 2 1 2 0 0 1 2 2 1 2 1 2 1 1 2 1 1 2 2 1 2 2 2 2 2
 0 1 2 2 0 0 1 2 0 0 0 0 2 1 0 0 2 1 0 1 1 0 2 0 0 0 2 0 2 1 0]
Actual data from the train set [1 2 2 1 0 1 0 1 2 0 0 1 1 2 0 2 0 0 2 0 1 2 0 1 0 1 2 2 2 0 1 1 0 0 0 0 1
 0 1 2 2 0 0 2 1 2 0 1 0 2 1 2 0 0 1 2 2 1 2 1 2 1 1 2 1 1 2 2 1 2 2 2 2 2
 0 1 2 2 0 0 1 2 0 0 0 0 2 1 0 0 2 1 0 1 1 0 2 0 0 0 2 0 2 1 0]


In [8]:
#evaluating accuracy of prediction on validation model
accuracy = accuracy_score(y_test, y_validation)
print('Accuracy of prediction model on the validation set', accuracy)

accuracy = accuracy_score(y_test, y_validation, normalize=False)
print(accuracy, 'out of 45 samples classified correctly')

#evaluating accuracy of prediction on training set
accuracy = accuracy_score(y_train, y_training)
print('Accuracy of predicion model on the training set', accuracy)

accuracy = accuracy_score(y_train, y_training, normalize=False)
print(accuracy, 'out of 105 samples classified correctly')

Accuracy of prediction model on the validation set 0.9777777777777777
44 out of 45 samples classified correctly
Accuracy of predicion model on the training set 0.9714285714285714
102 out of 105 samples classified correctly


In [9]:
from sklearn.metrics import f1_score

#evaluating f1_score of our prediction on the validation set
f1_score = f1_score(y_test, y_validation, average='weighted')
print('f1 score of prediction on validation set', f1_score)

f1 score of prediction on validation set 0.9778606192399296


In [10]:
from sklearn.metrics import f1_score

#evaluating f1_score of our prediction on the training set
f1_score = f1_score(y_train, y_training, average='weighted')
print('f1 score of prediction on training set', f1_score)

f1 score of prediction on training set 0.9713900741073435
