# Do the following in the iris dataset.
### 1. Read the dataset to the python environment.
### 2. Do necessary pre-processing steps.
### 3. Find out which classification model gives the best result to predict iris
###     species.(also do random forest algorithm)|


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Read the dataset to the python environment.

In [2]:
data=pd.read_excel("E:\PAATSHAALA\Assignments\iris2.xlsx")

In [3]:
data.shape

(150, 5)

In [4]:
data.head()

Unnamed: 0,SL,SW,PL,PW,Classification
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


#### Data read into the system. Data is of shape (150, 5)

# 2. Do necessary pre-processing steps.

In [5]:
data.isna().sum()

SL                7
SW                6
PL                6
PW                0
Classification    0
dtype: int64

#### There are missing values in 3 columns which needs to be fixed

In [6]:
data['SL']=data['SL'].fillna(data['SL'].median())

In [7]:
data['SW']=data['SW'].fillna(data['SW'].median())
data['PL']=data['PL'].fillna(data['PL'].median())

In [9]:
data.isna().sum()

SL                0
SW                0
PL                0
PW                0
Classification    0
dtype: int64

####  Missig data is fixed by filling median for respecttive columns

In [11]:
data.columns

Index(['SL', 'SW', 'PL', 'PW', 'Classification'], dtype='object')

In [12]:
data.dtypes

SL                float64
SW                float64
PL                float64
PW                float64
Classification     object
dtype: object

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SL              150 non-null    float64
 1   SW              150 non-null    float64
 2   PL              150 non-null    float64
 3   PW              150 non-null    float64
 4   Classification  150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [14]:
data.describe()

Unnamed: 0,SL,SW,PL,PW
count,150.0,150.0,150.0,150.0
mean,5.853333,3.047333,3.78,1.198667
std,0.808567,0.421995,1.729424,0.763161
min,4.3,2.0,1.0,0.1
25%,5.2,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.275,5.1,1.8
max,7.9,4.4,6.9,2.5


#### Insight:- Data read into the system.
####          3 columns were having data na. These were filled with the median value
####         data looks to be okay

In [15]:
df=pd.DataFrame(data)

In [16]:
df.head()

Unnamed: 0,SL,SW,PL,PW,Classification
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,5.8,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [17]:
df['Classification'].value_counts()

Iris-versicolor    50
Iris-setosa        50
Iris-virginica     50
Name: Classification, dtype: int64

In [18]:
data['Classification'].value_counts()

Iris-versicolor    50
Iris-setosa        50
Iris-virginica     50
Name: Classification, dtype: int64

In [19]:
data['Classification'].value_counts(normalize=True)

Iris-versicolor    0.333333
Iris-setosa        0.333333
Iris-virginica     0.333333
Name: Classification, dtype: float64

#### 3 types of classification are there. Each have a count of 50. 
#### For algorithms to perform well, One Hot Encoding needs to be invoked for column - Classification

# One Hot Encoding

In [20]:
data=pd.get_dummies(data)

In [21]:
data.tail()

Unnamed: 0,SL,SW,PL,PW,Classification_Iris-setosa,Classification_Iris-versicolor,Classification_Iris-virginica
145,6.7,3.0,5.2,2.3,0,0,1
146,6.3,2.5,5.0,1.9,0,0,1
147,6.5,3.0,4.35,2.0,0,0,1
148,6.2,3.4,5.4,2.3,0,0,1
149,5.9,3.0,5.1,1.8,0,0,1


#### Insight:- Necessary preprocessing steps are followed and data is ready for testing 

### Splitting Data

In [22]:
x=data.drop(['Classification_Iris-setosa','Classification_Iris-versicolor','Classification_Iris-virginica'],axis=1)
y=data['Classification_Iris-virginica']

In [23]:
x.head()

Unnamed: 0,SL,SW,PL,PW
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,5.8,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [24]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Classification_Iris-virginica, dtype: uint8

In [25]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=42,test_size=0.2)

In [26]:
x_train.shape

(120, 4)

In [27]:
x_test.shape

(30, 4)

# 3. Find out which classification model gives the best result to predict iris

In [28]:
from sklearn.linear_model import LogisticRegression
logit_model=LogisticRegression()

In [29]:
logit_model.fit(x_train,y_train)
y_pred=logit_model.predict(x_test)

In [30]:
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score

In [31]:
print('Accuracy is : ' ,accuracy_score(y_test,y_pred))
print('Precision is : ' ,precision_score(y_test,y_pred))
print('Recall_score is : ', recall_score(y_test,y_pred))
print('f1_score is : ' ,f1_score(y_test,y_pred))

Accuracy is :  1.0
Precision is :  1.0
Recall_score is :  1.0
f1_score is :  1.0


In [32]:
confusion_matrix(y_test,y_pred)

array([[19,  0],
       [ 0, 11]], dtype=int64)

In [33]:
from sklearn.metrics import classification_report
print (classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



#### Based on the above results, all the classifications made with Classification_Iris-virginica appears correctly classified
#### and, it also scores 1 for Accuracy, precision, recall and f1_score

## Using Random Forest Algorithm - Gradient Boosting 

In [38]:
from sklearn.ensemble import GradientBoostingClassifier
gb=GradientBoostingClassifier()
gb.fit(x_train,y_train)
y_pred=gb.predict(x_test)
f1_score(y_test,y_pred)

1.0

## Again a score of 1 and that shows the correctness of the model or the Classification_Iris-virginica

### Classification_Iris-versicolor

In [39]:
data.columns

Index(['SL', 'SW', 'PL', 'PW', 'Classification_Iris-setosa',
       'Classification_Iris-versicolor', 'Classification_Iris-virginica'],
      dtype='object')

In [40]:
x=data.drop(['Classification_Iris-setosa','Classification_Iris-versicolor','Classification_Iris-virginica'],axis=1)
y=data['Classification_Iris-versicolor']

In [41]:
from sklearn.linear_model import LogisticRegression
logit_model=LogisticRegression()

In [42]:
logit_model.fit(x_train,y_train)
y_pred=logit_model.predict(x_test)

In [43]:
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score

In [44]:
print('Accuracy is : ' ,accuracy_score(y_test,y_pred))
print('Precision is : ' ,precision_score(y_test,y_pred))
print('Recall_score is : ', recall_score(y_test,y_pred))
print('f1_score is : ' ,f1_score(y_test,y_pred))

Accuracy is :  1.0
Precision is :  1.0
Recall_score is :  1.0
f1_score is :  1.0


In [45]:
confusion_matrix(y_test,y_pred)

array([[19,  0],
       [ 0, 11]], dtype=int64)

In [46]:
from sklearn.metrics import classification_report
print (classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



#### Based on the above results, all the classifications made with Classification_Iris-versicolor appears correctly classified
#### and, it also scores 1 for Accuracy, precision, recall and f1_score

In [47]:
from sklearn.ensemble import GradientBoostingClassifier
gb=GradientBoostingClassifier()
gb.fit(x_train,y_train)
y_pred=gb.predict(x_test)
f1_score(y_test,y_pred)

1.0

## Again a score of 1 and that shows the correctness of the model or the Classification_Iris-versicolor

In [48]:
data.columns

Index(['SL', 'SW', 'PL', 'PW', 'Classification_Iris-setosa',
       'Classification_Iris-versicolor', 'Classification_Iris-virginica'],
      dtype='object')

In [49]:
x=data.drop(['Classification_Iris-setosa','Classification_Iris-versicolor','Classification_Iris-virginica'],axis=1)
y=data['Classification_Iris-setosa']

In [50]:
from sklearn.linear_model import LogisticRegression
logit_model=LogisticRegression()

In [51]:
logit_model.fit(x_train,y_train)
y_pred=logit_model.predict(x_test)

In [52]:
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score

In [54]:
print('Accuracy is : ' ,accuracy_score(y_test,y_pred))
print('Precision is : ' ,precision_score(y_test,y_pred))
print('Recall_score is : ', recall_score(y_test,y_pred))
print('f1_score is : ' ,f1_score(y_test,y_pred))

Accuracy is :  1.0
Precision is :  1.0
Recall_score is :  1.0
f1_score is :  1.0


In [55]:
confusion_matrix(y_test,y_pred)

array([[19,  0],
       [ 0, 11]], dtype=int64)

In [56]:
from sklearn.metrics import classification_report
print (classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



#### Based on the above results, all the classifications made with Classification_Iris-versicolor appears correctly classified
#### and, it also scores 1 for Accuracy, precision, recall and f1_score

## Random Forest -- Gradiant Boosting

In [57]:
from sklearn.ensemble import GradientBoostingClassifier
gb=GradientBoostingClassifier()

In [58]:
gb.fit(x_train,y_train)

GradientBoostingClassifier()

In [59]:
y_pred=gb.predict(x_test)

In [60]:
f1_score(y_test,y_pred)

1.0

## Again a score of 1 and that shows the correctness of the model or the Classification_Iris-setosa

### Based on all the testing, all the classification methods appears with no misclassifications 
### and model is corrrect to the maximum.