Classify Iris dataset

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [None]:
data = pd.read_csv('iris.csv')

# Encode label (species)
label_map = {label: idx for idx, label in enumerate(data['species'].unique())}
data['species'] = data['species'].map(label_map)

mask = data['species'] < 2
data = data[mask]
# Chia dữ liệu thành input (X) và output (y)
value = data.values
X = value[:, 0:4]  # Input features (150, 5)
y = value[:, -1]   # Output labels (150,)

# Chia tập train/test
train_x = X[:67, :]
train_y = y[:67]
test_x = X[67:100, :]
test_y = y[67:100]

# Hàm scale dữ liệu thủ công về khoảng [0, 1]
def manual_min_max_scaler(data):
    # Tìm min và max của từng cột (feature)
    min_vals = np.min(data, axis=0)
    max_vals = np.max(data, axis=0)
    # Áp dụng công thức scale
    scaled_data = (data - min_vals) / (max_vals - min_vals)
    return scaled_data

# Scale dữ liệu
train_x_scaled = manual_min_max_scaler(train_x)
test_x_scaled = manual_min_max_scaler(test_x)


In [181]:
import numpy as np

class LogisticRegression:
    def __init__(self, learning_rate=1e-2, epoch=10000):
        self.learning_rate = learning_rate
        self.epoch = epoch
        self.w = None
        self.bias = 0
        self.loss = []

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def compute_cost(self, X, y):
        n = X.shape[0]
        z = np.dot(X, self.w) + self.bias
        s = self.sigmoid(z)
        cost = (-1 / n) * np.sum(y * np.log(s) + (1 - y) * np.log(1 - s))
        return cost

    def compute_gradient(self, X, y):
        n = X.shape[0]
        z = np.dot(X, self.w) + self.bias
        s = self.sigmoid(z)
        dw = (1 / n) * np.dot(X.T, (s - y))
        db = (1 / n) * np.sum(s - y)
        return dw, db

    def train(self, X, y):
        self.w = np.zeros(X.shape[1])
        for i in range(self.epoch):
            dw, db = self.compute_gradient(X, y)
            self.w -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

            cost = self.compute_cost(X, y)
            self.loss.append(cost)

            if i % 100 == 0:
                print(f"Iteration: {i}, Cost = {cost}")

    def predict(self, x_test, threshold=0.5):
        s = self.sigmoid(np.dot(x_test, self.w) + self.bias)
        return s >= threshold


In [182]:
model = LogisticRegression()
model.train(train_x_scaled, train_y)

Iteration: 0, Cost = 0.6921365157378996
Iteration: 100, Cost = 0.6108659627137228
Iteration: 200, Cost = 0.5545559857986103
Iteration: 300, Cost = 0.5111155616698746
Iteration: 400, Cost = 0.4750891901221578
Iteration: 500, Cost = 0.4439180895742659
Iteration: 600, Cost = 0.41631244872781525
Iteration: 700, Cost = 0.3915521153995611
Iteration: 800, Cost = 0.3691826890007213
Iteration: 900, Cost = 0.34888139678490376
Iteration: 1000, Cost = 0.3303968903406982
Iteration: 1100, Cost = 0.3135215283139004
Iteration: 1200, Cost = 0.2980779991213439
Iteration: 1300, Cost = 0.28391229741404994
Iteration: 1400, Cost = 0.2708895493100522
Iteration: 1500, Cost = 0.2588911605309163
Iteration: 1600, Cost = 0.24781262953427738
Iteration: 1700, Cost = 0.2375617430892751
Iteration: 1800, Cost = 0.22805703077520395
Iteration: 1900, Cost = 0.21922642048720198
Iteration: 2000, Cost = 0.21100606327601368
Iteration: 2100, Cost = 0.20333930611147097
Iteration: 2200, Cost = 0.19617579529300072
Iteration: 230

In [183]:
y_pred = model.predict(test_x_scaled)
print(classification_report(test_y, y_pred, zero_division=0))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         1.0       1.00      0.39      0.57        33

    accuracy                           0.39        33
   macro avg       0.50      0.20      0.28        33
weighted avg       1.00      0.39      0.57        33

