In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [3]:
data = pd.read_csv('iris_classification/final_data.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,0,0,5.1,3.5,1.4,0.2,Iris-setosa
1,1,1,4.7,3.2,1.6,0.2,Iris-setosa
2,2,2,4.9,3.1,1.5,0.1,Iris-setosa
3,3,3,4.4,2.9,1.4,0.2,Iris-setosa
4,4,4,5.0,3.4,1.5,0.2,Iris-setosa


In [9]:
data.drop(['Unnamed: 0', 'Id'], axis=1, inplace=True)

In [10]:
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.7,3.2,1.6,0.2,Iris-setosa
2,4.9,3.1,1.5,0.1,Iris-setosa
3,4.4,2.9,1.4,0.2,Iris-setosa
4,5.0,3.4,1.5,0.2,Iris-setosa


## Label Encoding

In [11]:
le = LabelEncoder()

In [14]:
data['Species'] = le.fit_transform(data['Species'])

In [15]:
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.7,3.2,1.6,0.2,0
2,4.9,3.1,1.5,0.1,0
3,4.4,2.9,1.4,0.2,0
4,5.0,3.4,1.5,0.2,0


### Data Controls

In [16]:
data.isna().sum()

SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [18]:
data.dtypes

SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species            int32
dtype: object

## Train_test_split

In [19]:
from sklearn.model_selection import train_test_split

In [40]:
X_train, X_test, Y_train, Y_test = train_test_split(data.iloc[:, :-1], data.iloc[:, -1], test_size=0.2)

In [41]:
X_train

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
22,5.1,3.5,1.400000,0.3
117,6.2,2.8,4.800000,1.8
84,6.3,3.3,3.707229,1.6
65,6.1,2.8,4.700000,1.2
11,4.8,3.4,1.600000,0.2
...,...,...,...,...
99,6.1,2.9,3.707229,1.4
20,5.1,3.8,1.500000,0.3
71,6.6,2.9,4.600000,1.3
132,6.7,3.1,5.600000,2.4


In [42]:
Y_train

22     0
117    2
84     1
65     1
11     0
      ..
99     1
20     0
71     1
132    2
138    2
Name: Species, Length: 123, dtype: int32

In [43]:
Y_train.value_counts()

2    44
1    42
0    37
Name: Species, dtype: int64

In [44]:
Y_test.value_counts()

1    11
0    10
2    10
Name: Species, dtype: int64

## Creating Model

In [45]:
import xgboost as xgb

In [48]:
xgb_cls = xgb.XGBClassifier(objective="multiclass:softmax", num_class=3)

In [49]:
xgb_cls.fit(X_train, Y_train)

In [51]:
X_test

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
47,5.7,2.8,4.5,1.3
5,4.6,3.4,1.4,0.3
144,5.8,2.8,5.1,1.180357
72,7.0,3.2,4.7,1.4
114,6.4,2.8,5.6,2.1
86,5.7,2.9,4.2,1.3
150,6.8,3.0,5.5,2.1
28,5.2,4.1,1.5,0.1
19,5.1,3.7,1.5,0.4
37,4.4,3.2,1.3,0.2


In [52]:
preds = xgb_cls.predict(X_test)

In [53]:
preds

array([1, 0, 2, 1, 2, 1, 2, 0, 0, 0, 0, 1, 0, 0, 2, 2, 1, 2, 1, 2, 1, 1,
       1, 2, 0, 2, 1, 2, 0, 0, 2], dtype=int64)

In [54]:
import numpy as np

In [55]:
np.array(Y_test)

array([1, 0, 2, 1, 2, 1, 2, 0, 0, 0, 0, 1, 0, 0, 2, 2, 1, 2, 1, 1, 1, 1,
       1, 2, 0, 2, 1, 2, 0, 0, 2])

In [56]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [57]:
accuracy_score(Y_test, preds)

0.967741935483871

In [58]:
confusion_matrix(Y_test, preds)

array([[10,  0,  0],
       [ 0, 10,  1],
       [ 0,  0, 10]], dtype=int64)