In [1]:
# Imports and pip installations (if needed)
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

# Part 1: Load the dataset

In [2]:
# Load the dataset (load remotely, not locally)
irisData = load_iris()
# Output the first 15 rows of the data
# Display a summary of the table information (number of datapoints, etc.)
df = pd.DataFrame(data= np.c_[irisData['data'], irisData['target']], columns= irisData['feature_names'] + ['target'])
print(df.head(15))

    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                 5.1               3.5                1.4               0.2   
1                 4.9               3.0                1.4               0.2   
2                 4.7               3.2                1.3               0.2   
3                 4.6               3.1                1.5               0.2   
4                 5.0               3.6                1.4               0.2   
5                 5.4               3.9                1.7               0.4   
6                 4.6               3.4                1.4               0.3   
7                 5.0               3.4                1.5               0.2   
8                 4.4               2.9                1.4               0.2   
9                 4.9               3.1                1.5               0.1   
10                5.4               3.7                1.5               0.2   
11                4.8               3.4 

In [3]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    float64
dtypes: float64(5)
memory usage: 6.0 KB
None


## About the dataset
Explain what the data is in your own words. What are your features and labels? What is the mapping of your labels to the actual classes?

The dataset consists of 150 rows with no null values, there are total of 5 columns namely sepal length (in cm), sepal width (in cm), petal length (in cm), petal width (in cm), and target (0 for Iris setosa, 1 for Iris versicolor, and 2 for Iris virginica)<br>

The features for this model are:
- sepal length (cm)
- sepal width (cm)
- petal length (cm)
- petal width (cm) 

and the label is:

- target

# Part 2: Split the dataset into train and test

In [4]:
# Take the dataset and split it into our features (X) and label (y)
X = irisData.data
y = irisData.target
# Use sklearn to split the features and labels into a training/test set. (90% train, 10% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.90, random_state=42)

# Part 3: Logistic Regression

In [5]:
# i. Use sklearn to train a LogisticRegression model on the training set
logReg = LogisticRegression()
logReg.fit(X_train, y_train)
# ii. For a sample datapoint, predict the probabilities for each possible class
y_prediction = logReg.predict(X_test)
y_prediction_prob = logReg.predict_proba(X_test)
classes = load_iris().target_names
for class_name, proba in zip(classes, y_prediction_prob):
    index = np.argmax(proba)
    print(f'predicted class = {class_name} and confidence = {proba[index]:.2%}')
# iii. Report on the score for Logistic regression model, what does the score measure?
score = logReg.score(X_test, y_test)
print('Accuracy of Logistic Regression Classifier on test set: {}'.format(score))

confusion = confusion_matrix(y_test, y_prediction)
print(f'Confusion Matrix:\n{confusion}')
# iv. Extract the coefficents and intercepts for the boundary line(s)
intercept = logReg.intercept_
coeff = logReg.coef_.T
print(f'intercept: {intercept} \ncoeff: {coeff}')


predicted class = setosa and confidence = 83.72%
predicted class = versicolor and confidence = 94.95%
predicted class = virginica and confidence = 99.89%
Accuracy of Logistic Regression Classifier on test set: 1.0
Confusion Matrix:
[[6 0 0]
 [0 6 0]
 [0 0 3]]
intercept: [  9.50134804   1.90990729 -11.41125533] 
coeff: [[-0.42689859  0.51325838 -0.08635979]
 [ 0.97268339 -0.22357238 -0.74911101]
 [-2.4446162  -0.21492369  2.6595399 ]
 [-1.03175179 -0.85147306  1.88322485]]


The score of 100% shows that on the given 10% of the data, i.e. test set, our logistic regression classifier model predicts the outcome of 100%. <br>
Also looking at the confusion matrix we can see that:<br>
- for true positive of setosa we got 6 out of 6
- for true positive of versicolor we got 6 out of 6
- for true positive of virginica we got 3 out of 3

# Part 4: Support Vector Machine

In [6]:
# i. Use sklearn to train a Support Vector Classifier on the training set
svc = svm.SVC(probability=True)
svc.fit(X_train, y_train)
# ii. For a sample datapoint, predict the probabilities for each possible class
y_prediction_svc = svc.predict(X_test)
y_prediction_prob_svc = svc.predict_proba(X_test)
for class_name, proba in zip(classes, y_prediction_prob_svc):
    index = np.argmax(proba)
    print(f'predicted class = {class_name} and confidence = {proba[index]:.2%}')
# iii. Report on the score for the SVM, what does the score measure?
score = svc.score(X_test, y_test)
print('Accuracy of Support Vector Classifier on test set: {}'.format(score))

confusion = confusion_matrix(y_test, y_prediction_svc)
print(f'Confusion Matrix:\n{confusion}')

predicted class = setosa and confidence = 90.79%
predicted class = versicolor and confidence = 95.21%
predicted class = virginica and confidence = 98.25%
Accuracy of Support Vector Classifier on test set: 1.0
Confusion Matrix:
[[6 0 0]
 [0 6 0]
 [0 0 3]]


Same as the Logistic Regression we also got the score of 100% for our Support Vector Machine model

# Part 5: Neural Network

In [7]:
# i. Use sklearn to train a Neural Network (MLP Classifier) on the training set
mlp = MLPClassifier(solver='sgd', learning_rate_init=0.01, hidden_layer_sizes=10, max_iter=500)
mlp.fit(X_train, y_train)
# ii. For a sample datapoint, predict the probabilities for each possible class
y_prediction_mlp = mlp.predict(X_test)
y_prediction_prob_mlp = mlp.predict_proba(X_test)
for class_name, proba in zip(classes, y_prediction_prob_mlp):
    index = np.argmax(proba)
    print(f'predicted class = {class_name} and confidence = {proba[index]:.2%}')
# iii. Report on the score for the Neural Network, what does the score measure?
score = mlp.score(X_test, y_test)
print('Accuracy of Neural Network on test set: {}'.format(score))
# iv: Experiment with different options for the neural network, report on your best configuration (the highest score I was able to achieve was 0.8666)

predicted class = setosa and confidence = 91.73%
predicted class = versicolor and confidence = 99.78%
predicted class = virginica and confidence = 100.00%
Accuracy of Neural Network on test set: 1.0


# Part 6: K-Nearest Neighbors

In [8]:
# i. Use sklearn to 'train' a k-Neighbors Classifier
# Note: KNN is a nonparametric model and technically doesn't require training
# fit will essentially load the data into the model see link below for more information
# https://stats.stackexchange.com/questions/349842/why-do-we-need-to-fit-a-k-nearest-neighbors-classifier
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train, y_train)
# ii. For a sample datapoint, predict the probabilities for each possible class
y_prediction_knn = knn.predict(X_test)
y_prediction_prob_knn = knn.predict_proba(X_test)
for class_name, proba in zip(classes, y_prediction_prob_knn):
    index = np.argmax(proba)
    print(f'predicted class = {class_name} and confidence = {proba[index]:.2%}')
# iii. Report on the score for kNN, what does the score measure?
score = knn.score(X_test, y_test)
print('Accuracy of k-Neighbors on test set: {}'.format(score))

predicted class = setosa and confidence = 100.00%
predicted class = versicolor and confidence = 100.00%
predicted class = virginica and confidence = 100.00%
Accuracy of k-Neighbors on test set: 1.0


# Part 7: Conclusions and takeaways

In your own words describe the results of the notebook. Which model(s) performed the best on the dataset? Why do you think that is? Did anything surprise you about the exercise?

In this assignment we use the Classification Model, as we were ask to classify; given some features like sepal length (cm), sepal width (cm), petal length (cm), petal width (cm); into 3 classes of the iris plant. <br>
We use 4 different types of Classifier to solve this assignment and compare the result for each of those: 
- Logistic Regression
- Support Vector Machine
- Neural Network
- K-Nearest Neighbors

Though all 4 model give us 100% accuracy, K Nearest Neighbors gave 100% probabilities for each class, it is because for a plant to be in that class it has to be within the range of its respective features, if its smaller or bigger than that range it belongs to the different class. <br>

One thing that surprise me is that we getting 100% accuracy for all the models, I am not sure if its because of the dataset or because of the spliting of data into 90% test data and 10% data i.e. having less number of data to test with. 