In [65]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np

In [5]:
data = pd.read_csv("final_data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,0,0,5.1,3.5,1.4,0.2,Iris-setosa
1,1,1,4.7,3.2,1.6,0.2,Iris-setosa
2,2,2,4.9,3.1,1.5,0.1,Iris-setosa
3,3,3,4.4,2.9,1.4,0.2,Iris-setosa
4,4,4,5.0,3.4,1.5,0.2,Iris-setosa


In [8]:
data.drop(["Unnamed: 0","Id"],axis=1 ,inplace=True)

In [9]:
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.7,3.2,1.6,0.2,Iris-setosa
2,4.9,3.1,1.5,0.1,Iris-setosa
3,4.4,2.9,1.4,0.2,Iris-setosa
4,5.0,3.4,1.5,0.2,Iris-setosa


In [10]:
#label encoding

In [11]:
le = LabelEncoder()

In [13]:
data["Species"]=le.fit_transform(data["Species"])

In [14]:
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.7,3.2,1.6,0.2,0
2,4.9,3.1,1.5,0.1,0
3,4.4,2.9,1.4,0.2,0
4,5.0,3.4,1.5,0.2,0


In [15]:
## Data Checking

In [17]:
data.isna().sum()

SepalLengthCm    3
SepalWidthCm     1
PetalLengthCm    4
PetalWidthCm     2
Species          0
dtype: int64

In [19]:
for column in data.columns[:-1]:
    data[column].fillna(value = data[column].mean(),inplace=True)

In [20]:
data.isna().sum()

SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [21]:
data.dtypes

SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species            int32
dtype: object

In [25]:
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.7,3.2,1.6,0.2,0
2,4.9,3.1,1.5,0.1,0
3,4.4,2.9,1.4,0.2,0
4,5.0,3.4,1.5,0.2,0


In [23]:
#TrainTestSplit

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
X_train,X_test,y_train,y_test = train_test_split(data.iloc[:,:-1],data.iloc[:,-1],test_size=0.2)

In [48]:
X_train.value_counts()

SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
4.9            3.1           1.500000       0.1             3
5.2            3.4           1.400000       0.2             2
6.4            2.8           5.600000       2.2             2
5.8            2.7           5.100000       1.9             2
5.0            3.5           1.300000       0.3             2
                                                           ..
5.4            3.4           1.700000       0.2             1
                             1.500000       0.4             1
               3.0           4.500000       1.5             1
5.3            3.7           3.785161       0.2             1
7.9            3.8           6.400000       2.0             1
Length: 119, dtype: int64

In [49]:
y_train.value_counts()

0    44
1    42
2    41
Name: Species, dtype: int64

In [30]:
#Modelling

In [53]:
import xgboost as xgb

In [57]:
xgb_cls = xgb.XGBClassifier(objective = "multiclass:softmax",num_class=3)

In [58]:
xgb_cls.fit(X_train,y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_class=3, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=None, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [59]:
preds = xgb_cls.predict(X_test)

In [60]:
preds

array([1, 2, 0, 0, 1, 2, 2, 0, 2, 2, 2, 1, 0, 1, 1, 0, 2, 2, 1, 2, 2, 1,
       2, 0, 2, 0, 0, 2, 0, 2, 2, 0])

In [66]:
np.array(y_test)

array([1, 2, 0, 0, 1, 2, 2, 0, 1, 2, 1, 1, 0, 1, 1, 0, 2, 2, 1, 2, 2, 1,
       2, 0, 2, 0, 0, 2, 0, 2, 2, 0])

In [68]:
## Accuracy and matrix

In [69]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [70]:
accuracy_score(y_test,preds)

0.9375

In [71]:
confusion_matrix(y_test,preds)

array([[10,  0,  0],
       [ 0,  7,  2],
       [ 0,  0, 13]], dtype=int64)