In [49]:
# import libraries
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing

In [50]:
# read the iris measurements from the CSV file (path is relative to the working directory)
iris = pd.read_csv(filepath_or_buffer='iris.csv')
# preview the first five rows
iris.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [51]:
# independent variables: positional columns 1-4 of the frame, which the
# preview above shows to be the four sepal/petal measurements after Id
# (.values already yields a 2-D array, so no reshape is needed)
X = iris.iloc[:, 1:5].values
# standardise each feature column
# NOTE(review): scaling is done on the full dataset before the train/test
# split, so the test rows contribute to the scaling statistics — consider
# fitting the scaler on the training rows only
X = preprocessing.scale(X)

# create new column with 0 if the species is setosa, 1 otherwise
# (label 0 = "setosa", label 1 = "not setosa")
iris['Setosa'] = np.where(iris['Species'] == 'Iris-setosa', 0, 1)

# dependent variable
y = iris['Setosa'].values

# test-train split: hold out 25% of the rows, stratified on y so both parts
# keep the same class proportions; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=0)


In [52]:
# check that the Setosa column now appears in the first five rows
iris.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,Setosa
0,1,5.1,3.5,1.4,0.2,Iris-setosa,0
1,2,4.9,3.0,1.4,0.2,Iris-setosa,0
2,3,4.7,3.2,1.3,0.2,Iris-setosa,0
3,4,4.6,3.1,1.5,0.2,Iris-setosa,0
4,5,5.0,3.6,1.4,0.2,Iris-setosa,0


In [53]:
# build the logistic regression classifier and train it on the training split
# (fit returns the fitted estimator, so construction and fitting are chained)
iris_log = LogisticRegression().fit(X_train, y_train)

# predicted labels for the held-out test split
y_pred = iris_log.predict(X_test)

In [54]:
# confusion matrix
# class names listed in label order: label 0 = setosa, label 1 = not setosa
classes = ['setosa',  'not setosa']
# labels are passed explicitly so that row/column i always corresponds to
# classes[i] and the matrix stays 2x2 even if a label is absent from the split;
# rows are the actual classes, columns are the predicted classes
confusion = confusion_matrix(y_test, y_pred, labels=[0, 1])
conf_df = pd.DataFrame(confusion, columns=classes, index=classes)
conf_df

Unnamed: 0,setosa,not setosa
setosa,13,0
not setosa,0,25


The predictions made by the logistic regression are perfect, with no errors or mislabelled data, so both recall and precision will be high. There are no false negatives or false positives, so precision = 1, recall = 1, accuracy = 1 and F1 score = 1.

In [55]:
# write code to calculate accuracy, precision and recall from a confusion matrix

# correct predictions over total predictions
def accuracy(conf_matrix):
    """Return the fraction of correct predictions in a confusion matrix.

    Parameters
    ----------
    conf_matrix : pandas.DataFrame or 2-D array-like
        Matrix of counts whose main diagonal holds the correctly classified
        samples. Any number of classes is supported.

    Returns
    -------
    float
        Sum of the diagonal divided by the sum of all cells, or ``nan`` when
        the matrix contains no counts at all.
    """
    # work on the raw counts so the result does not depend on row/column labels
    counts = np.asarray(conf_matrix)
    total = counts.sum()
    if total == 0:
        # accuracy is undefined without any observations
        return float('nan')
    # values along the diagonal are the correct predictions
    return np.trace(counts) / total
                
# accuracy of the setosa / not-setosa classifier
accuracy(conf_matrix=conf_df)

1.0

If I'm writing code for just the confusion matrix in the compulsory task, precision and recall are easy because there are clear positive and negative columns. Precision is correct over total in column 0 and recall is correct over total in row 0. If I'm writing functions for 3x3 or bigger matrices, there's no single positive or negative class, so these functions will not apply directly: each class would have to be treated as the positive one in turn (one-vs-rest). The accuracy function can work for matrices of all sizes.

In [56]:
# true positives over all positive predictions
# positive condition needs to be entered as column name
def precision(conf_matrix, positive_col):
    """Return the precision for one class of a labelled confusion matrix.

    Parameters
    ----------
    conf_matrix : pandas.DataFrame
        Square matrix of counts whose index and columns carry the same class
        labels, with predicted classes along the columns.
    positive_col : label
        Class to treat as the positive condition. Every other class counts as
        negative, so the matrix may have any number of classes.

    Returns
    -------
    float
        Diagonal cell of the class divided by its column total, or ``nan``
        when the column total is zero.

    Raises
    ------
    KeyError
        If ``positive_col`` is not a label of the matrix.
    """
    # the diagonal cell holds the correctly predicted positives
    true_p = conf_matrix.loc[positive_col, positive_col]
    # the column total is everything that was predicted as the positive class
    all_p = conf_matrix[positive_col].sum()
    if all_p == 0:
        # precision is undefined when nothing was predicted as positive
        return float('nan')
    return true_p / all_p

# precision with setosa as the positive class
precision(conf_df, positive_col='setosa')

1.0

In [61]:
# true positives over all actual positive values
def recall(conf_matrix, positive_row):
    """Return the recall for one class of a labelled confusion matrix.

    Parameters
    ----------
    conf_matrix : pandas.DataFrame
        Square matrix of counts whose index and columns carry the same class
        labels, with actual classes along the rows.
    positive_row : label
        Class to treat as the positive condition. Every other class counts as
        negative, so the matrix may have any number of classes.

    Returns
    -------
    float
        Diagonal cell of the class divided by its row total, or ``nan`` when
        the row total is zero.

    Raises
    ------
    KeyError
        If ``positive_row`` is not a label of the matrix.
    """
    # the diagonal cell holds the correctly predicted positives
    true_p = conf_matrix.loc[positive_row, positive_row]
    # the row total is every sample that actually belongs to the positive class
    all_p = conf_matrix.loc[positive_row].sum()
    if all_p == 0:
        # recall is undefined when the class has no actual samples
        return float('nan')
    return true_p / all_p

# recall with setosa as the positive class
recall(conf_df, positive_row='setosa')

1.0

**OPTIONAL**

In [62]:
# X is the same; the target is now the species name (three classes)
# NOTE: this cell reuses, and therefore overwrites, y, the train/test splits
# and y_pred from the binary setosa task
y = iris['Species']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=5)

# log regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# species names in order of first appearance; they are passed as labels so the
# row/column order of the matrix is guaranteed to match the names used below
# (without labels, confusion_matrix orders the classes by sorted value)
species = iris['Species'].unique()
lr_cm = confusion_matrix(y_test, y_pred, labels=species)
data = pd.DataFrame(lr_cm, columns=species, index=species)
data

Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica
Iris-setosa,12,0,0
Iris-versicolor,0,12,1
Iris-virginica,0,2,11


Running the code a few times with different values for the random state changes which versicolor and virginica samples are misclassified, but the model gets setosa right 100% of the time.

In [64]:
# accuracy of the three-class species classifier
accuracy(conf_matrix=data)

0.9210526315789473