# Environment setup details

## Installing locally (rather than colab)

Using `uv` (from https://docs.astral.sh/uv/)

```
# install uv in its own global location (using pipx)
pipx install uv
# create a virtual environment
uv venv
# activate the environment
source .venv/bin/activate
# install the Jupyter notebook packages
uv pip install ipykernel jupyter notebook
# install required packages
uv pip install numpy pandas matplotlib
```


In [None]:
# imports for the project

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from matplotlib.colors import ListedColormap

## Use the same iris data

In [None]:
# Load the iris measurements straight from the UCI repository; the CSV has no header row,
# so the columns are addressed by integer position (0-4).
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)

# Quick look at the last rows of the frame (a notebook only renders a bare expression
# when it is the final statement of the cell).
df.tail()

# Extract the class labels (column 4) of the first hundred rows.
# NOTE(review): these rows are assumed to be 50 Iris-setosa followed by 50 Iris-versicolor,
# matching the 'setosa' / 'versicolor' legend labels used by the plots in this file - confirm against the data.
y = df.iloc[0:100, 4].values
# Encode the labels numerically: -1 for 'Iris-setosa', 1 for every other label
y = np.where(y == 'Iris-setosa', -1, 1)
# Extract columns 0 and 2 of the first hundred rows; the plots in this file label them
# sepal length and petal length respectively
X = df.iloc[0:100, [0, 2]].values


## The code for the Adaline class

In [None]:
class AdalineGD:
    """ADAptive LInear NEuron classifier trained with batch gradient descent.

    Adaline implements a continuous linear activation function (identity)
    and uses gradient descent to minimize the cost function, making it
    different from the Perceptron which uses a unit step function.

    Parameters
    ------------
    eta : float
      Learning rate (between 0.0 and 1.0)
    n_iter : int
      Passes over the training dataset.
    random_state : int
      Random number generator seed for random weight
      initialization.


    Attributes
    -----------
    w_ : 1d-array
      Weights after fitting; w_[0] is the bias, w_[1:] are the feature weights.
    cost_ : list
      Sum-of-squares cost function value (divided by 2) in each epoch.

    """
    def __init__(self, eta=0.01, n_iter=50, random_state=1):
        self.eta = eta
        self.n_iter = n_iter
        self.random_state = random_state

    def fit(self, X, y):
        """Fit training data using batch gradient descent.

        Unlike the Perceptron, Adaline updates weights based on a continuous
        linear activation function rather than a threshold function, which
        allows for gradient-based optimization.

        Implementation steps:
        1. Initialize weights with small random values
        2. For each epoch:
           a. Calculate the net input (weighted sum via the aggregation function)
           b. Apply activation function (in this case it is the identity function)
           c. Calculate errors (predicted output minus desired output)
           d. Calculate the derivative (gradient) of the cost function wrt the weights and bias
           e. Update all weights and the bias by subtracting the gradient multiplied by the learning rate
           f. Calculate and store cost for this epoch

        The cost is E = 1/2 * sum((y - φ(z))^2). Its derivative wrt the
        weights follows from the chain rule:
          ∂E/∂w_j = ∂E/∂φ * ∂φ/∂z * ∂z/∂w_j

        where:
          ∂E/∂φ = -(y - φ) = (φ - y)  [actual output minus desired output]
          ∂φ/∂z = 1                   [derivative of the identity activation wrt the net input]
          ∂z/∂w_j = x_j               [derivative of the net input wrt the jth weight]

        Rather than looping through each feature, the whole gradient vector
        is computed at once with a matrix product of the transposed input
        matrix and the error vector:
          ∂E/∂w = X.T.dot(φ - y) = X.T.dot(X.dot(w) + bias - y)
          ∂E/∂bias = sum(φ - y)

        The loop below breaks this out to mirror the steps above.

        Parameters
        ----------
        X : {array-like}, shape = [n_examples, n_features]
          Training vectors, where n_examples is the number of examples and
          n_features is the number of features.
        y : array-like, shape = [n_examples]
          Target values.

        Returns
        -------
        self : object

        """
        # Accept any array-like (e.g. nested lists): .shape and .T below
        # need real ndarrays.
        X = np.asarray(X)
        y = np.asarray(y)

        rgen = np.random.RandomState(self.random_state)
        # Step 1: Initialize weights with small random values (index 0 is the bias)
        self.w_ = rgen.normal(loc=0.0, scale=0.01, size=1 + X.shape[1])
        self.cost_ = []

        for _ in range(self.n_iter):
            # Step 2a: Calculate net input (weighted sum via aggregation_function)
            net_input = self.aggregation_function(X)

            # Step 2b: Apply activation function (in this case it is the identity function)
            output = self.activation_function(net_input)

            # Step 2c: Calculate prediction error (φ - y)
            error_vector = output - y

            # Step 2d: Gradient of the cost wrt the weights and the bias
            derivative_cost_wrt_weights = X.T.dot(error_vector)
            derivative_cost_wrt_bias = error_vector.sum()

            # Step 2e: Standard gradient descent update: w = w - eta * gradient
            self.w_[1:] -= self.eta * derivative_cost_wrt_weights
            self.w_[0] -= self.eta * derivative_cost_wrt_bias

            # Step 2f: Store the cost measured with the weights used at the
            # start of this epoch (sum of squared errors / 2)
            cost = (error_vector**2).sum() / 2.0
            self.cost_.append(cost)
        return self

    def aggregation_function(self, X):
        """Calculate net input: X · w_[1:] + w_[0]"""
        return np.dot(X, self.w_[1:]) + self.w_[0]

    def activation_function(self, X):
        """Compute linear (identity) activation: returns its input unchanged"""
        return X

    def predict(self, X):
        """Return class label after unit step: 1 where the activation is >= 0.0, otherwise -1"""
        return np.where(self.activation_function(self.aggregation_function(X)) >= 0.0, 1, -1)

## Test different learning rates

It was interesting to see the effect of different learning rates

In [None]:
# One subplot per learning rate; each panel shows the per-epoch cost of a freshly trained model.
fig, ax = plt.subplots(nrows=1, ncols=5, figsize=(10, 4))

# Panels 0 and 1 plot log10 of the cost
ada1 = AdalineGD(n_iter=10, eta=0.001).fit(X, y)
ax[0].plot(range(1, len(ada1.cost_) + 1), np.log10(ada1.cost_), marker='o')
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('log(SSE)')
ax[0].set_title('eta 0.001')


ada5 = AdalineGD(n_iter=10, eta=0.0006).fit(X, y)
ax[1].plot(range(1, len(ada5.cost_) + 1), np.log10(ada5.cost_), marker='x')
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('log(SSE)')
ax[1].set_title('eta 0.0006')


# Panels 2-4 plot the raw cost on a linear axis
ada3 = AdalineGD(n_iter=40, eta=0.0005).fit(X, y)
ax[2].plot(range(1, len(ada3.cost_) + 1), ada3.cost_, marker='x')
ax[2].set_xlabel('Epochs')
ax[2].set_ylabel('SSE')
ax[2].set_title('eta 0.0005')


ada2 = AdalineGD(n_iter=500, eta=0.0002).fit(X, y)
ax[3].plot(range(1, len(ada2.cost_) + 1), ada2.cost_, marker='o')
ax[3].set_xlabel('Epochs')
ax[3].set_ylabel('SSE')
ax[3].set_title('eta 0.0002')


ada4 = AdalineGD(n_iter=1000, eta=0.00009).fit(X, y)
ax[4].plot(range(1, len(ada4.cost_) + 1), ada4.cost_, marker='+')
ax[4].set_xlabel('Epochs')
ax[4].set_ylabel('SSE')
# The title must match the eta actually used above (0.00009, not 0.000009)
ax[4].set_title('eta 0.00009')

# plt.savefig('images/02_11.png', dpi=300)
plt.show()


The first two graphs have a logarithmic y-axis; we can see that the cost (equivalent to the error rate in the Perceptron) increases and doesn't come back down. Interestingly, the second does start to learn; however, it overshoots the optimal value.

The third graph (note that it does not have a logarithmic y-axis) shows that a learning rate of 0.0005 approaches an optimal value but goes past it.

The last two graphs show settings of 0.0002 and 0.00009, both converge on the optimal value.

## Viewing the decision boundaries

### The messy version - investigation


In [None]:

def plot_decision_regions(X, y, classifier, resolution=0.02):
    """
    Shade the regions of a 2D feature space by the class a classifier predicts.

    Parameters:
    X : array-like, shape = [n_samples, n_features]
        Feature matrix; columns 0 and 1 define the two plotted axes.
    y : array-like, shape = [n_samples]
        Target vector; only its number of distinct values is used, to size the color map.
    classifier : object
        Trained classifier with a predict method.
    resolution : float, optional (default=0.02)
        Step between neighbouring grid points of the decision surface.

    Returns:
    None

    The surface is drawn on the current pyplot axes with a filled contour
    plot, and the axis limits are set to the extent of the evaluated grid.
    """
    # one color per distinct class value in y
    palette = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    n_classes = len(np.unique(y))
    region_cmap = ListedColormap(palette[:n_classes])

    # extend each feature's observed range by one unit on both sides
    first_lo = X[:, 0].min() - 1
    first_hi = X[:, 0].max() + 1
    second_lo = X[:, 1].min() - 1
    second_hi = X[:, 1].max() + 1

    # regular grid covering the padded feature ranges
    first_axis = np.arange(first_lo, first_hi, resolution)
    second_axis = np.arange(second_lo, second_hi, resolution)
    grid_first, grid_second = np.meshgrid(first_axis, second_axis)

    # one row per grid point: (feature 1, feature 2)
    grid_points = np.array([grid_first.ravel(), grid_second.ravel()]).T

    # predict a label for every grid point, then fold the flat result back into the grid shape
    grid_labels = classifier.predict(grid_points).reshape(grid_first.shape)

    # semi-transparent fill so anything drawn on the same axes stays visible
    plt.contourf(grid_first, grid_second, grid_labels, alpha=0.4, cmap=region_cmap)

    # clamp the view to the evaluated grid
    plt.xlim(grid_first.min(), grid_first.max())
    plt.ylim(grid_second.min(), grid_second.max())



In [None]:
def plot_class_samples(X, y, label_names):
    """
    Scatter the samples of each class on the current pyplot axes and add a legend.

    Parameters:
    X : array-like, shape = [n_samples, n_features]
        Feature matrix; column 0 is plotted on the x-axis and column 1 on the y-axis.
    y : array-like, shape = [n_samples]
        Target vector.
    label_names : list
        Legend labels, indexed in the sorted order of the unique values of y.

    Returns:
    None

    Each class gets its own marker and color, taken by position from fixed
    five-element tuples.
    """
    marker_styles = ('s', 'x', 'o', '^', 'v')
    palette = ('red', 'blue', 'lightgreen', 'gray', 'cyan')

    class_values = np.unique(y)
    class_cmap = ListedColormap(palette[:len(class_values)])

    for position, class_value in enumerate(class_values):
        # boolean mask selecting the rows that belong to this class
        belongs = y == class_value
        plt.scatter(
            x=X[belongs, 0],
            y=X[belongs, 1],
            alpha=0.8,
            c=[class_cmap(position)],
            marker=marker_styles[position],
            label=label_names[position],
        )

    plt.legend()

In [None]:
# Train for 25 epochs on X exactly as loaded (no feature scaling is applied before this call)
ada_gd = AdalineGD(n_iter=25, eta=0.0005).fit(X, y)

# Shade the predicted regions, then overlay the training samples
plot_decision_regions(X, y, classifier=ada_gd)
label_names = ['setosa', 'versicolor']
plot_class_samples(X, y, label_names)

plt.title('Adaline - Gradient Descent')
# X is not standardized, so the axis labels must not claim it is
plt.xlabel('sepal length')
plt.ylabel('petal length')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('images/02_14_1.png', dpi=300)
plt.show()

# Cost per epoch for the same model
plt.plot(range(1, len(ada_gd.cost_) + 1), ada_gd.cost_, marker='o')
plt.xlabel('Epochs')
plt.ylabel('SSE')

plt.tight_layout()
# plt.savefig('images/02_14_2.png', dpi=300)
plt.show()

In [None]:
# Train for 20 epochs on X exactly as loaded (no feature scaling is applied before this call)
ada_gd = AdalineGD(n_iter=20, eta=0.0005).fit(X, y)

# Shade the predicted regions, then overlay the training samples
plot_decision_regions(X, y, classifier=ada_gd)
label_names = ['setosa', 'versicolor']
plot_class_samples(X, y, label_names)

plt.title('Adaline - Gradient Descent')
# X is not standardized, so the axis labels must not claim it is
plt.xlabel('sepal length')
plt.ylabel('petal length')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('images/02_14_1.png', dpi=300)
plt.show()

# Cost per epoch for the same model
plt.plot(range(1, len(ada_gd.cost_) + 1), ada_gd.cost_, marker='o')
plt.xlabel('Epochs')
plt.ylabel('SSE')

plt.tight_layout()
# plt.savefig('images/02_14_2.png', dpi=300)
plt.show()

In [None]:
# Train for 35 epochs on X exactly as loaded (no feature scaling is applied before this call)
ada_gd = AdalineGD(n_iter=35, eta=0.0005).fit(X, y)

# Shade the predicted regions, then overlay the training samples
plot_decision_regions(X, y, classifier=ada_gd)
label_names = ['setosa', 'versicolor']
plot_class_samples(X, y, label_names)

plt.title('Adaline - Gradient Descent')
# X is not standardized, so the axis labels must not claim it is
plt.xlabel('sepal length')
plt.ylabel('petal length')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('images/02_14_1.png', dpi=300)
plt.show()

# Cost per epoch for the same model
plt.plot(range(1, len(ada_gd.cost_) + 1), ada_gd.cost_, marker='o')
plt.xlabel('Epochs')
plt.ylabel('SSE')

plt.tight_layout()
# plt.savefig('images/02_14_2.png', dpi=300)
plt.show()

In [None]:
# Train for 50 epochs on X exactly as loaded (no feature scaling is applied before this call)
ada_gd = AdalineGD(n_iter=50, eta=0.0005).fit(X, y)

# Shade the predicted regions, then overlay the training samples
plot_decision_regions(X, y, classifier=ada_gd)
label_names = ['setosa', 'versicolor']
plot_class_samples(X, y, label_names)

plt.title('Adaline - Gradient Descent')
# X is not standardized, so the axis labels must not claim it is
plt.xlabel('sepal length')
plt.ylabel('petal length')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('images/02_14_1.png', dpi=300)
plt.show()

# Cost per epoch for the same model
plt.plot(range(1, len(ada_gd.cost_) + 1), ada_gd.cost_, marker='o')
plt.xlabel('Epochs')
plt.ylabel('SSE')

plt.tight_layout()
# plt.savefig('images/02_14_2.png', dpi=300)
plt.show()

### The clean version - conclusion

In [None]:
def run_adaline_analysis(X, y, custom_config=None):
    """
    Train one AdalineGD model per experiment and plot the results side by side.

    The figure has two rows and one column per experiment: the top row shows
    the cost per epoch, the bottom row the decision regions of the trained
    model. The first two columns plot log10 of the cost; the remaining
    columns plot the raw cost.

    Parameters:
    X : array-like, feature matrix
    y : array-like, target vector
    custom_config : dict, custom configuration (optional). Replaces the
        default configuration entirely when given. Keys:
          "learning_rates" : list of eta values, one per experiment (required)
          "n_iters"        : list of epoch counts, one per experiment (required)
          "markers"        : list of matplotlib markers for the cost curves
                             (optional, defaults to 'o' for every experiment)

    Returns:
    list of trained models, in experiment order

    Raises:
    ValueError : if no experiment is configured or the configuration lists
        do not all have the same length
    """
    # Default configuration: fixed learning rate, increasing number of epochs
    default_config = {
        "learning_rates": [0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005],
        "n_iters": [10, 15, 20, 25, 30, 35],
        "markers": ['o', 'x', 'x', 'o', '+', '+']
    }

    # Use custom config if provided, otherwise use default
    config = default_config if custom_config is None else custom_config

    # Extract configuration values
    learning_rates = config["learning_rates"]
    n_iters = config["n_iters"]
    markers = config.get("markers", ['o'] * len(learning_rates))

    n_experiments = len(learning_rates)
    # zip() would silently drop experiments and leave empty panels, so
    # reject inconsistent configurations up front.
    if n_experiments == 0:
        raise ValueError("config must define at least one experiment")
    if not (len(n_iters) == len(markers) == n_experiments):
        raise ValueError(
            "config lists must all have the same length: "
            f"learning_rates={n_experiments}, n_iters={len(n_iters)}, "
            f"markers={len(markers)}"
        )

    # Create a figure with 2 rows, N columns. squeeze=False keeps `axes`
    # two-dimensional even when there is only a single experiment.
    fig, axes = plt.subplots(nrows=2, ncols=n_experiments,
                             figsize=(3 * n_experiments, 8), squeeze=False)

    # Train models and plot results
    models = []
    for i, (eta, n_iter, marker) in enumerate(zip(learning_rates, n_iters, markers)):
        # Train model
        model = AdalineGD(n_iter=n_iter, eta=eta).fit(X, y)
        models.append(model)

        cost_axis = axes[0, i]
        boundary_axis = axes[1, i]
        epochs = range(1, len(model.cost_) + 1)

        # Row 1: Learning curves (log10 scale for the first two columns only)
        if i < 2:
            cost_axis.plot(epochs, np.log10(model.cost_), marker=marker)
            cost_axis.set_ylabel('log(SSE)')
        else:
            cost_axis.plot(epochs, model.cost_, marker=marker)
            cost_axis.set_ylabel('SSE')

        cost_axis.set_xlabel('Epochs')
        cost_axis.set_title(f'eta {eta}')

        # Row 2: Decision boundaries
        plot_decision_regions(X, y, classifier=model, resolution=0.02, ax=boundary_axis)

        # Add the legend to the last subplot in the second row, whatever the
        # number of experiments. The handles are taken from the labelled
        # artists on the axis so the names are paired with them explicitly.
        if i == n_experiments - 1:
            handles, _ = boundary_axis.get_legend_handles_labels()
            boundary_axis.legend(handles, ['setosa', 'versicolor'])

        boundary_axis.set_xlabel('sepal length')
        boundary_axis.set_ylabel('petal length')
        boundary_axis.set_title(f'Decision Boundary (eta={eta})')

    fig.tight_layout()
    plt.show()

    return models

# Modify plot_decision_regions to accept an axis parameter
def plot_decision_regions(X, y, classifier, resolution=0.02, ax=None):
    """
    Plot decision regions and the class samples in a 2D feature space.

    Parameters:
    X : array-like, shape = [n_samples, n_features]
        Feature matrix; columns 0 and 1 define the two plotted axes.
    y : array-like, shape = [n_samples]
        Target vector.
    classifier : object
        Trained classifier with a predict method.
    resolution : float, optional (default=0.02)
        Resolution of the mesh grid used to plot the decision surface.
    ax : matplotlib axis, optional
        Axis to plot on. If None, uses current axis.

    Returns:
    None

    This function shades the decision surface of the classifier and then
    scatters the samples of every class on top of it. Each scatter is
    labelled with its class value so a legend can be built from the axis.
    """
    # Imported locally so the function does not depend on an extra file-level import
    from matplotlib.markers import MarkerStyle

    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    class_values = np.unique(y)
    cmap = ListedColormap(colors[:len(class_values)])

    # If ax is None, get current axis
    if ax is None:
        ax = plt.gca()

    # Calculate grid of the features, padded by one unit on each side
    feature_1_min, feature_1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    feature_2_min, feature_2_max = X[:, 1].min() - 1, X[:, 1].max() + 1

    # Create a fine grid of points
    feature_1_values, feature_2_values = np.meshgrid(
        np.arange(feature_1_min, feature_1_max, resolution),
        np.arange(feature_2_min, feature_2_max, resolution)
    )

    # Flatten and transpose the grid so each row is one grid point
    feature_grid = np.array([feature_1_values.ravel(), feature_2_values.ravel()]).T

    # Use the classifier to predict labels for the grid
    label_per_point = classifier.predict(feature_grid)
    label_per_point = label_per_point.reshape(feature_1_values.shape)

    # Plot the decision surface
    ax.contourf(feature_1_values, feature_2_values, label_per_point,
                alpha=0.4, cmap=cmap)

    # Set plot limits
    ax.set_xlim(feature_1_values.min(), feature_1_values.max())
    ax.set_ylim(feature_2_values.min(), feature_2_values.max())

    # Plot class samples
    for idx, cl in enumerate(class_values):
        marker = markers[idx]
        # Unfilled markers such as 'x' have no separate edge: matplotlib
        # ignores an edge color for them and emits a warning, so only pass
        # it for filled markers.
        edge_kwargs = {'edgecolors': 'black'} if MarkerStyle(marker).is_filled() else {}
        ax.scatter(
            x=X[y == cl, 0],
            y=X[y == cl, 1],
            alpha=0.8,
            c=[cmap(idx)],
            marker=marker,
            label=cl,
            **edge_kwargs
        )

# Run the analysis with the function's built-in default configuration and keep the returned list of trained models
models = run_adaline_analysis(X, y)

In [None]:
# Same learning rates and epoch counts as the five hand-written experiments
# in the "Test different learning rates" section, one list entry per experiment
config = {
    "learning_rates": [0.001, 0.0006, 0.0005, 0.0002, 0.00009],
    "n_iters": [10, 10, 40, 500, 1000],
    "markers": ['o', 'x', 'o', '+', '+']
}


# Run the analysis with this configuration instead of the defaults
models = run_adaline_analysis(X, y, custom_config=config)



In [None]:
# Hold the learning rate at 0.0005 and vary only the number of epochs (5 to 100),
# one list entry per experiment
config = {
    "learning_rates": [0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005],
    "n_iters": [5, 7, 8, 40, 50, 100],
    "markers": ['o', 'x', 'o', '+', '+', '+']
}


# Run the analysis with this configuration instead of the defaults
models = run_adaline_analysis(X, y, custom_config=config)
