In [26]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [27]:
class Perceptron:
    """Binary perceptron classifier trained with the classic online update rule.

    Labels are expected as 0/1. Internally they are mapped to -1/+1 for
    training and mapped back to 0/1 by ``predict``.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size applied to every weight/bias correction.
    max_iterations : int, default 1000
        Maximum number of full passes over the training data.

    Attributes
    ----------
    weights : numpy.ndarray or None
        One weight per feature; ``None`` until ``fit`` has been called.
    bias : float or None
        Intercept term; ``None`` until ``fit`` has been called.
    errors_history : list of int
        Number of corrected (misclassified) samples in each pass of the
        most recent ``fit`` call.
    """

    def __init__(self, learning_rate=0.01, max_iterations=1000):
        self.learning_rate = learning_rate
        self.max_iterations = max_iterations
        self.weights = None
        self.bias = None
        self.errors_history = []

    def fit(self, X, y):
        """Train on samples ``X`` (n_samples x n_features) with 0/1 labels ``y``.

        Training stops early as soon as one full pass produces no
        corrections. Returns ``self`` so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        X = np.asarray(X, dtype=float)
        # Convert labels from 0,1 to -1,1
        y = np.where(np.asarray(y).ravel() == 0, -1, 1)
        if X.shape[0] != y.shape[0]:
            # zip() below would otherwise silently drop the surplus entries.
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {X.shape[0]} and {y.shape[0]}"
            )

        # Start every training run from a clean state. Without resetting the
        # history, calling fit() again would append to the previous run's
        # epochs and corrupt the error plot.
        n_features = X.shape[1]
        self.weights = np.zeros(n_features)
        self.bias = 0.0
        self.errors_history = []

        for _ in range(self.max_iterations):
            errors = 0
            for xi, target in zip(X, y):
                prediction = self.predict_one(xi)

                # prediction is 0 when the sample lies exactly on the decision
                # boundary; "<= 0" deliberately counts that as a mistake.
                if target * prediction <= 0:
                    update = self.learning_rate * target
                    self.weights += update * xi
                    self.bias += update
                    errors += 1

            self.errors_history.append(errors)

            if errors == 0:
                break

        return self

    def predict_one(self, x):
        """Return the sign (-1, 0 or +1) of the activation for one sample."""
        activation = np.dot(x, self.weights) + self.bias
        return np.sign(activation)

    def predict(self, X):
        """Return 0/1 class labels for every row of ``X``.

        A negative activation maps to class 0; zero or positive maps to 1.
        """
        X = np.asarray(X, dtype=float)
        if X.size == 0:
            # Nothing to classify; keep the "empty in, empty out" behaviour.
            return np.zeros(0, dtype=int)
        activations = X @ self.weights + self.bias
        # Convert back to 0,1 from -1,1 (sign == -1 exactly when activation < 0)
        return np.where(activations < 0, 0, 1)

    def score(self, X, y):
        """Calculate accuracy: the fraction of rows whose prediction equals ``y``."""
        return np.mean(self.predict(X) == np.asarray(y))

    def plot_error_history(self):
        """Plot the number of corrections made in each training pass."""
        plt.figure(figsize=(10, 6))
        plt.plot(range(len(self.errors_history)), self.errors_history, marker='o')
        plt.xlabel('Iteration')
        plt.ylabel('Number of Misclassifications')
        plt.title('Perceptron Training Error History')
        plt.grid(True)
        plt.show()


In [30]:
# Load and prepare the data
def prepare_data(test_size=0.2, random_state=42):
    """Load the breast cancer dataset and return a standardised train/test split.

    Prints the dataset shape, target names, feature names and the first
    rows of the feature table as a quick sanity check.

    Parameters
    ----------
    test_size : float, default 0.2
        Passed to ``train_test_split`` as the share of samples held out.
    random_state : int, default 42
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, feature_names)``. The feature
        matrices are scaled with a ``StandardScaler`` fitted on the training
        split only, so no test-set statistics leak into training.
    """
    # Load breast cancer dataset
    cancer = load_breast_cancer()
    X = cancer.data
    y = cancer.target
    print("cancer shape", cancer.data.shape)
    print("cancer target names", cancer.target_names)
    print("cancer feature names", cancer.feature_names)

    df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
    print(df.head())

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Scale the features (fit on train only, then apply to test)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # The caller unpacks five values; without this return the function
    # yields None and the unpacking fails with a TypeError.
    return X_train, X_test, y_train, y_test, cancer.feature_names

In [31]:
# Main execution
if __name__ == "__main__":
    # Fetch the scaled train/test split together with the column labels
    X_train, X_test, y_train, y_test, feature_names = prepare_data()

    # Build the classifier and train it (fit hands back the same instance)
    perceptron = Perceptron(learning_rate=0.01, max_iterations=1000).fit(
        X_train, y_train
    )

    # Accuracy on the data seen during training, then on the held-out data
    train_accuracy, test_accuracy = (
        perceptron.score(features, labels)
        for features, labels in ((X_train, y_train), (X_test, y_test))
    )
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Testing Accuracy: {test_accuracy:.4f}")

    # Corrections per training pass
    perceptron.plot_error_history()

    # Rank features by weight magnitude, smallest first
    feature_importance = np.abs(perceptron.weights)
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(len(sorted_idx)) + 0.5

    # Horizontal bar chart of the ranked magnitudes
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.barh(pos, feature_importance[sorted_idx])
    ax.set_yticks(pos)
    ax.set_yticklabels(feature_names[sorted_idx])
    ax.set_xlabel('Absolute Weight Value (Feature Importance)')
    ax.set_title('Feature Importance in Breast Cancer Classification')
    fig.tight_layout()
    plt.show()

    # The five largest magnitudes sit at the tail of the ascending ranking;
    # walk them backwards so the strongest feature is printed first.
    top_features_idx = sorted_idx[-5:]
    print("\nTop 5 Most Important Features:")
    for idx in top_features_idx[::-1]:
        print(f"{feature_names[idx]}: {perceptron.weights[idx]:.4f}")

FileNotFoundError: [Errno 2] No such file or directory: '/home/yubelgg/personal/notes/zettelkasten/ML-CSCI325/.venv/lib/python3.12/site-packages/sklearn/datasets/data/breast_cancer.csv'