## Imports

In [119]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

## Read the Data

In [120]:
# Load the Iris CSV and keep every column except the first one
# (positional slice; presumably the first column is a row-id column — confirm against the file).
df = (
    pd.read_csv('Iris.csv')
    .iloc[:, 1:]
)
df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [121]:
# Feature matrix: every column of df except the target column 'Species'.
X = df.drop('Species', axis='columns')
X

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [122]:
# Binary target: 1 when the (whitespace-stripped) species is 'Iris-setosa', 0 otherwise.
species_clean = df['Species'].str.strip()
y = (species_clean == 'Iris-setosa').astype(int)
y

0      1
1      1
2      1
3      1
4      1
      ..
145    0
146    0
147    0
148    0
149    0
Name: Species, Length: 150, dtype: int64

## Split the Data

In [123]:
# Hold out 20% of the rows for testing; stratify on y and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Sanity check: feature/target shapes of each split.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(120, 4) (120,)
(30, 4) (30,)


In [124]:
# stratify=y was passed to train_test_split so that the train and test sets
# keep the same class distribution; the normalized counts below verify that.

print(
    y_train.value_counts(normalize=True),
    '-----------------------------------',
    y_test.value_counts(normalize=True),
    sep='\n',
)

Species
0    0.666667
1    0.333333
Name: proportion, dtype: float64
-----------------------------------
Species
0    0.666667
1    0.333333
Name: proportion, dtype: float64


## Train the Model

In [125]:
# X_train is a DataFrame, while X_train.values is the same data as a plain NumPy array.
# The model is fitted on the NumPy arrays, so later predict() calls are also given NumPy arrays.
# NOTE(review): presumably this avoids feature-name mismatch warnings when predicting on raw arrays — confirm.
model = LogisticRegression()
model.fit(X_train.values, y_train.values)
model

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


## Model Evaluation

In [126]:
# Predict on both splits, passing NumPy arrays just like at fit time.
y_train_pred, y_test_pred = (
    model.predict(split.values) for split in (X_train, X_test)
)

# Compare train and test accuracy to spot over- or under-fitting.
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f'Train Acc: {train_acc}')
print(f'Test Acc: {test_acc}')

Train Acc: 1.0
Test Acc: 1.0


In [127]:
# Per-class precision / recall / F1 on the held-out test set.
test_report = classification_report(y_true=y_test, y_pred=y_test_pred)
print(test_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        20
           1       1.00      1.00      1.00        10

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



## Model Output Format

In [128]:
# Predicted class labels for the training rows, returned as a NumPy array
# (1 = Iris-setosa, 0 = any other species, following the encoding used to build y).
y_train_pred

array([0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0])

In [129]:
# The model was fitted on X_train.values, in:  model = LogisticRegression().fit(X_train.values, y_train.values)
# X_train.values is the training feature matrix as a 2-D NumPy array (one row per sample), displayed below.
# NOTE(review): predict() appears to return a NumPy array regardless of whether the model
# was fitted on a DataFrame or on .values — confirm against the scikit-learn docs.

X_train.values

array([[6. , 2.7, 5.1, 1.6],
       [4.6, 3.4, 1.4, 0.3],
       [6.5, 3. , 5.8, 2.2],
       [7.6, 3. , 6.6, 2.1],
       [5.6, 2.5, 3.9, 1.1],
       [5.1, 3.8, 1.6, 0.2],
       [6.6, 3. , 4.4, 1.4],
       [5.5, 4.2, 1.4, 0.2],
       [5. , 3.2, 1.2, 0.2],
       [5.1, 3.5, 1.4, 0.3],
       [5.4, 3.9, 1.7, 0.4],
       [5.1, 2.5, 3. , 1.1],
       [6.8, 3. , 5.5, 2.1],
       [6.7, 2.5, 5.8, 1.8],
       [5. , 2.3, 3.3, 1. ],
       [5.9, 3. , 4.2, 1.5],
       [7. , 3.2, 4.7, 1.4],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [6. , 2.2, 5. , 1.5],
       [7.9, 3.8, 6.4, 2. ],
       [6.1, 3. , 4.9, 1.8],
       [5.4, 3.4, 1.5, 0.4],
       [7.4, 2.8, 6.1, 1.9],
       [6.9, 3.1, 5.4, 2.1],
       [4.6, 3.1, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [6.3, 2.5, 4.9, 1.5],
       [5.1, 3.7, 1.5, 0.4],
       [5.6, 3. , 4.5, 1.5],
       [6.8, 3.2, 5.9, 2.3],
       [6.6, 2.9, 4.6, 1.3],
       [7.7, 2.6, 6.9, 2.3],
       [5.5, 2.4, 3.7, 1. ],
       [5.1, 3

In [130]:
# First training sample as a 1-D array holding its four feature values.
X_train.values[0]

array([6. , 2.7, 5.1, 1.6])

In [131]:
# model.predict([6. , 2.7, 5.1, 1.6])

# This won't work because the model expects a 2D array as input.

In [132]:
# Both prints show the predicted value: the first prints the array returned by predict(),
# the second indexes into that array to print the scalar value.

# A single sample must still be 2-D: one row, four feature columns.
sample = np.array([[6. , 2.7, 5.1, 1.6]])

print(model.predict(sample))

print(model.predict(sample)[0])

[0]
0


## Serialization

In [133]:
# Persist the fitted model to disk so it can be reloaded later with joblib.load.
# NOTE(review): joblib files are pickle-based — only load them from trusted sources.
joblib.dump(value=model, filename='model.joblib')

['model.joblib']