# IRIS DATASET

In [63]:
import numpy as np                #importing the useful libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")        

In [11]:
iris = pd.read_csv("iris.csv")                   #loading the data
iris.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [12]:
del iris['Id']                      # delete the 'Id' column
iris.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [13]:
iris['Species'].value_counts()       #counting the number of data points belonging to each class

Iris-setosa        50
Iris-virginica     50
Iris-versicolor    50
Name: Species, dtype: int64

In [14]:
x = iris.iloc[:,0:4]
y = iris.iloc[:,4]              

print(x.head())
print(y.head())

   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
0            5.1           3.5            1.4           0.2
1            4.9           3.0            1.4           0.2
2            4.7           3.2            1.3           0.2
3            4.6           3.1            1.5           0.2
4            5.0           3.6            1.4           0.2
0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
Name: Species, dtype: object


In [15]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)              #encoding labels with value between 0 and 2
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [16]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.33,random_state=0)  #dividing the data into train and test

# KNN 


In [49]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_jobs=-1)

params = {'n_neighbors':[1,2,3,4,5,6,7,8,9,10],                 #dictionary of parameters which will passed to GridSearchCV
          'leaf_size':[1,2,3,5],
          'weights':['uniform', 'distance'],
          'algorithm':['auto', 'ball_tree','kd_tree','brute'],
          'n_jobs':[-1]}

model1 = GridSearchCV(model,cv=5, param_grid=params, n_jobs=1)  #using GridSearchCV for hyperparameter tuning

model1.fit(x_train,y_train)         #fitting the model

print("Best Hyper Parameters:\n",model1.best_params_)   #displaying most suitable parameters

y_pred = model1.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)*100
print("Accuracy: "+str(accuracy)+"%")                   # displaying the accuracy of the model

Best Hyper Parameters:
 {'algorithm': 'brute', 'leaf_size': 1, 'n_jobs': -1, 'n_neighbors': 5, 'weights': 'uniform'}
Accuracy: 96.0%


# DECISION TREES

In [56]:
from sklearn.tree import DecisionTreeClassifier

model= DecisionTreeClassifier(random_state=None)

params = {'max_features': ['auto', 'sqrt', 'log2'],         #the number of features to consider when looking for best split
          'min_samples_split': [2,3,4,5,6,7,8,9,10,11,12,13,14,15], #The minimum number of samples required to split an internal node
          'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10,11], #The minimum number of samples required to be at a leaf node
          }

model1 = GridSearchCV(model, param_grid=params, n_jobs=-1)

model1.fit(x_train,y_train)

print("Best Hyper Parameters:",model1.best_params_)

y_test=model1.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)*100 
print("Accuracy: "+str(accuracy)+"%")           #displaying the accuracy of the model

Best Hyper Parameters: {'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 6}
Accuracy: 96.0%


# RANDOM FORESTS

In [36]:
from sklearn.ensemble import RandomForestClassifier

model=RandomForestClassifier()

params = {'criterion':['gini','entropy'],   #function to measure quality of split-gini impurity and entropy
          'n_estimators':[10,15,20,25,30],  #number of trees in forest
          'min_samples_leaf':[1,2,3],       #The minimum number of samples required to be at a leaf nod
          'min_samples_split':[3,4,5,6,7],  #The minimum number of samples required to split an internal node
          'random_state':[123],
          'n_jobs':[-1]}                 # n_jobs = -1 means using all processors

model1 = GridSearchCV(model, param_grid=params, n_jobs=-1)

model1.fit(x_train, y_train)

print("Best Hyper Parameters:\n",model1.best_params_)

y_pred = model1.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)*100
print("Accuracy: "+str(accuracy)+"%")      #displaying accuracy of the model

Best Hyper Parameters:
 {'criterion': 'gini', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 30, 'n_jobs': -1, 'random_state': 123}
Accuracy: 98.0%


# NAIVE BAYES

In [35]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold  
                                            #This cross-validation object is a variation of KFold that returns stratified folds. 
                                            #The folds are made by preserving the percentage of samples for each class.

skf = StratifiedKFold(n_splits=10)   #number of folds must be atleast 2
model = GaussianNB()
params = {}

#First off GaussianNB only accepts priors as an argument
#so unless you have some priors to set for your model ahead of time you will have nothing to grid search over.
#Furthermore, your param_grid is set to an empty dictionary which ensures that you only fit one estimator with GridSearchCV. 
#This is the same as fitting an estimator without using a grid search

model1 = GridSearchCV(model,cv=skf,param_grid=params)

model1.fit(x_train, y_train)

print("Best Hyper Parameters:\n",model1.best_params_)

y_pred = model1.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)*100
print("Accuracy: "+str(accuracy)+"%")

Best Hyper Parameters:
 {}
Accuracy: 94.0%


# LOGISTIC REGRESSION

In [60]:
from sklearn import linear_model

model = linear_model.LogisticRegression()

penalty = ['l1','l2']          #l1 and l2 regularization
c = np.logspace(0,10,10)       #Return numbers spaced evenly on a log scale
params = dict(C=c,penalty=penalty)   #Inverse of regularization strength; must be a positive float.

model1 = GridSearchCV(model,param_grid=params,cv=5,verbose=0)

model1.fit(x_train, y_train)

print("Best Hyper Parameters:\n",model1.best_params_)

y_pred = model1.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)*100
print("Accuracy: "+str(accuracy)+"%")        #displaying accuracy of the model

Best Hyper Parameters:
 {'C': 12.91549665014884, 'penalty': 'l1'}
Accuracy: 96.0%


In [43]:
#we can also perform hyperparameter tuning with LogisticRegressionCV in this case

from sklearn import linear_model
from sklearn.linear_model import LogisticRegressionCV


model1 = LogisticRegressionCV(cv=5,multi_class='multinomial')

model1.fit(x_train, y_train)

y_pred = model1.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)*100
print("Accuracy: "+str(accuracy)+"%")     #displaying accuracy of the model

Accuracy: 96.0%


# CONCLUSION

In [62]:
# MODEL                       ACCURACY

# KNN                            96%
# DECISION TREES                 96%
# RANDOM FORESTS                 98%
# NAIVE BAYES                    94%
# LOGISTIC REGRESSION            96%

#This is a relatively small data set
#In this the Random Forest gives the best performance and Naive Bayes give the worst performance