
# Iris Classification with Logistic Regression (FastAPI Project Prep)

This notebook demonstrates:
- Loading the Iris dataset (built-in sklearn)
- Validating data quality (missing values, duplicate rows, summary statistics)
- Training a Pipeline (StandardScaler + Logistic Regression)
- Evaluating the model
- Saving the model (`model.pkl`) and metadata (`model_info.json`)
- Making sample predictions


In [1]:
import json
from datetime import datetime, timezone

import joblib
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the Iris dataset bundled with scikit-learn and unpack the pieces
# that the later cells rely on.
iris = load_iris()
X, y = iris['data'], iris['target']
feature_names = iris['feature_names']
class_names = iris['target_names'].tolist()

# Build a frame for inspection: the measurement columns plus the class name
# looked up from each integer label in y.
df = pd.DataFrame(X, columns=feature_names).assign(
    target=[class_names[label] for label in y]
)

print("First 5 rows of dataset:")
df.head()

First 5 rows of dataset:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
# Data Validation / Cleaning
# These checks only report on df; nothing is dropped or modified here.
missing_per_column = df.isnull().sum()
print("\nMissing values per column:")
print(missing_per_column)

duplicate_row_count = df.duplicated().sum()
print("\nNumber of duplicate rows:", duplicate_row_count)

# Summary statistics are the cell's displayed result (last expression).
print("\nBasic statistics:")
df.describe()


Missing values per column:
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64

Number of duplicate rows: 1

Basic statistics:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [4]:
# Train/Test Split
# Hold out 20% of the rows for testing. The split is stratified on y and
# uses a fixed random_state so it is the same on every run.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [5]:
# Build & Train Model
# The scaler lives inside the Pipeline, so it is fitted on the training split
# only and applied automatically whenever clf predicts.
#
# `multi_class` is intentionally not passed: the parameter is deprecated in
# recent scikit-learn releases and "auto" was its default, so omitting it
# keeps the same model while avoiding the deprecation warning.
clf = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(max_iter=1000)),
])
clf.fit(X_train, y_train)



0,1,2
,steps,"[('scaler', ...), ('lr', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [6]:
# Evaluation
# Score the fitted pipeline on the held-out test split.
y_pred = clf.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=class_names,
)

print(f"\nTest Accuracy: {acc:.4f}")
print("\nClassification Report:\n", report)


Test Accuracy: 0.9333

Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       0.90      0.90      0.90        10
   virginica       0.90      0.90      0.90        10

    accuracy                           0.93        30
   macro avg       0.93      0.93      0.93        30
weighted avg       0.93      0.93      0.93        30



In [7]:
# Save Artifacts
# Persist the whole fitted pipeline (scaler + classifier) as a single object,
# so whatever loads it later gets the same preprocessing as training.
# NOTE: joblib files are pickle-based; only load model.pkl from a trusted source.
joblib.dump(clf, "model.pkl")

info = {
    "model_type": "Pipeline(StandardScaler -> LogisticRegression)",
    "problem_type": "classification",
    # datetime.utcnow() is deprecated since Python 3.12. Use a timezone-aware
    # UTC timestamp and keep the original ISO 8601 "...Z" suffix format.
    "created_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
    "features": list(feature_names),
    "classes": list(class_names),
    "metrics": {
        # Coerce to a plain float so the value is always JSON-serializable.
        "test_accuracy": float(acc)
    }
}
# Explicit encoding keeps the metadata file identical across platforms.
with open("model_info.json", "w", encoding="utf-8") as f:
    json.dump(info, f, indent=2)

print("\nArtifacts saved: model.pkl, model_info.json")


Artifacts saved: model.pkl, model_info.json


  "created_at": datetime.utcnow().isoformat() + "Z",


In [9]:
# Sample Prediction
# One observation, in the same feature order the model was trained on.
sample = np.array([[5.1, 3.5, 1.4, 0.2]])  # expected setosa
pred_idx = clf.predict(sample)[0]
proba = clf.predict_proba(sample)[0]

# Pair each class name with its predicted probability as a plain float.
proba_by_class = dict(zip(class_names, map(float, proba)))

print("\nSample Input:", sample.tolist())
print("Prediction:", class_names[pred_idx])
print("Probabilities:", proba_by_class)


Sample Input: [[5.1, 3.5, 1.4, 0.2]]
Prediction: setosa
Probabilities: {'setosa': 0.9808127381969137, 'versicolor': 0.019186992121989523, 'virginica': 2.6968109686920263e-07}
