In [None]:
# ### 1. Import Libraries
# Import necessary libraries for data manipulation, numerical operations, and machine learning.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# ### 2. Helper Function
# Define a helper function to drop rows with missing values and reset the DataFrame index.
def dropresetidx(df):
    """Return a copy of ``df`` without incomplete rows, re-indexed from 1.

    Rows containing any missing value are dropped, the old index is
    discarded, and the surviving rows are numbered 1..n. The caller's
    DataFrame is left untouched.
    """
    cleaned = df.dropna().reset_index(drop=True)
    # Shift the fresh 0-based index so that row labels start at 1.
    cleaned.index = cleaned.index + 1
    return cleaned

In [None]:
# ### 3. Load Data
# Define the file path for the dataset and load it into a pandas DataFrame.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
file_path = r'C:\Users\flavia\Downloads\cancerdiag\wdbc.data'

# wdbc.data ships without a header row. header=None stops pandas from
# consuming the first record as column names (which would silently drop
# one sample); the real column names are assigned in the next section.
# The helper then removes incomplete rows and re-indexes from 1.
df = dropresetidx(pd.read_csv(file_path, header=None))

# Last expression of the cell so the preview is actually rendered.
df.head()

In [None]:
# ### 4. Data Preparation
# Define the column names based on the dataset's documentation.
# Column layout: ID, Diagnosis, then every base measurement repeated for
# three statistics (mean, standard error, worst), in that order.
base_features = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave_points', 'symmetry', 'fractal_dimension'
]
column_name = ['ID', 'Diagnosis']
column_name.extend(
    feature + suffix
    for suffix in ['_mean', '_se', '_worst']
    for feature in base_features
)

# Assign the created list of names to the DataFrame's columns.
df.columns = column_name

# Convert the categorical 'Diagnosis' column to numerical format (Malignant: 1, Benign: 0).
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas 2.x and
# does not update `df` once Copy-on-Write is enabled.
# Any label other than 'M'/'B' maps to NaN, so the integer cast fails loudly
# instead of letting bad labels through.
df['Diagnosis'] = df['Diagnosis'].map({'M': 1, 'B': 0}).astype(int)

In [None]:
# ### 5. Data Splitting
# Split the dataset into training (80%) and testing (20%) sets to evaluate the model's performance on unseen data.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# ### 6. Feature and Target Definition
# The 30 feature columns are every base measurement combined with each of the
# three statistics, ordered mean -> se -> worst. The target is the diagnosis.
_measurements = (
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave_points', 'symmetry', 'fractal_dimension',
)
_statistics = ('mean', 'se', 'worst')
features = [
    f'{measurement}_{statistic}'
    for statistic in _statistics
    for measurement in _measurements
]
target = ['Diagnosis']

In [None]:
# ### 7. Feature Scaling
# Scale the features using StandardScaler to ensure all features have a mean of 0 and a standard deviation of 1.
# This helps the model converge faster and perform better.
# Fit the scaler on the training split only, then reuse the same fitted
# statistics for the test split. Both matrices are transposed so that rows
# are features (or the single target row) and columns are samples.
scaler = StandardScaler()

train_scaled = scaler.fit_transform(train_df[features])
X_train = train_scaled.T
Y_train = train_df[target].to_numpy().T

test_scaled = scaler.transform(test_df[features])
X_test = test_scaled.T
Y_test = test_df[target].to_numpy().T

In [None]:
# ### 8. Neural Network Implementation
# Define the components of the neural network from scratch.

# Define the size of the input, hidden, and output layers.
def layer_sizes(X, Y, n_h=8):
    """Return the sizes ``(n_x, n_h, n_y)`` of the three network layers.

    Parameters
    ----------
    X : array with features along axis 0; ``X.shape[0]`` is the input size.
    Y : array with outputs along axis 0; ``Y.shape[0]`` is the output size.
    n_h : int, optional
        Number of hidden units. Defaults to 8, the value that used to be
        hard-coded here, so existing two-argument calls behave as before.

    Returns
    -------
    tuple of int
        ``(n_x, n_h, n_y)``.
    """
    n_x = X.shape[0]  # Number of features
    n_y = Y.shape[0]  # Number of output units
    return n_x, n_h, n_y

# Initialize the model's parameters (weights and biases).
def initialize_parameters(n_x, n_h, n_y):
    """Create the initial weights and biases of the two-layer network.

    The global NumPy random generator is re-seeded with 2 on every call, so
    the returned weights are identical from call to call for a given shape.
    Weights are small Gaussian values (scaled by 0.01); biases start at zero.

    Returns a dict with keys ``"W1"`` (n_h, n_x), ``"b1"`` (n_h, 1),
    ``"W2"`` (n_y, n_h) and ``"b2"`` (n_y, 1).
    """
    np.random.seed(2)
    scale = 0.01

    # Draw order matters for reproducibility: hidden weights first, then output.
    hidden_weights = scale * np.random.randn(n_h, n_x)
    output_weights = scale * np.random.randn(n_y, n_h)

    return {
        "W1": hidden_weights,
        "b1": np.zeros((n_h, 1)),
        "W2": output_weights,
        "b2": np.zeros((n_y, 1)),
    }

# Implement the sigmoid activation and the forward propagation process.
def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    Accepts a scalar or array-like ``z`` and returns a NumPy scalar or an
    array of the same shape.

    Only ``exp(-|z|)`` is ever evaluated, which lies in (0, 1], so large
    negative inputs no longer overflow ``exp`` (the naive formula emits a
    RuntimeWarning there). For z >= 0 the result is ``1 / (1 + e)``; for
    z < 0 it is the algebraically equal ``e / (1 + e)``.
    """
    z = np.asarray(z, dtype=float)
    e = np.exp(-np.abs(z))
    result = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # np.where returns a 0-d array for scalar input; unwrap it to a scalar.
    return result[()] if result.ndim == 0 else result

def forward_propagation(X, parameters):
    """Run one forward pass through the two-layer network.

    The hidden layer uses tanh and the output layer uses the sigmoid, giving
    a probability for binary classification.

    Returns ``(A2, cache)`` where ``A2`` is the output activation and
    ``cache`` holds ``Z1``, ``A1``, ``Z2`` and ``A2`` for backpropagation.
    """
    # Hidden layer: affine transform followed by tanh.
    hidden_pre = np.dot(parameters['W1'], X) + parameters['b1']
    hidden_act = np.tanh(hidden_pre)

    # Output layer: affine transform followed by sigmoid.
    output_pre = np.dot(parameters['W2'], hidden_act) + parameters['b2']
    output_act = sigmoid(output_pre)

    return output_act, {
        "Z1": hidden_pre,
        "A1": hidden_act,
        "Z2": output_pre,
        "A2": output_act,
    }

# Compute the cross-entropy cost function to measure the model's error.
def compute_cost(A2, Y):
    """Mean binary cross-entropy between predictions ``A2`` and labels ``Y``.

    Parameters
    ----------
    A2 : array of predicted probabilities, same shape as ``Y``.
    Y : array of 0/1 labels; ``Y.shape[1]`` is used as the sample count.

    Returns
    -------
    float
        ``-(1/m) * sum(Y*log(A2) + (1-Y)*log(1-A2))``.

    Probabilities are clipped to ``[eps, 1 - eps]`` before the logarithm.
    Without this, an output that saturates to exactly 0 or 1 produces
    ``log(0) = -inf`` and the cost becomes ``inf`` or ``nan``. Values
    strictly inside the interval are unaffected.
    """
    m = Y.shape[1]
    eps = 1e-15
    probs = np.clip(A2, eps, 1 - eps)
    logprobs = np.multiply(np.log(probs), Y) + np.multiply(np.log(1 - probs), 1 - Y)
    return float(-np.sum(logprobs) / m)

# Implement the backward propagation algorithm to calculate gradients.
def backward_propagation(parameters, cache, X, Y):
    """Compute the cost gradients for every weight and bias.

    Uses the output-layer weights from ``parameters`` and the activations
    ``A1``/``A2`` stored in ``cache`` by the forward pass. Gradients are
    averaged over the ``X.shape[1]`` samples.

    Returns a dict with keys ``"dW1"``, ``"db1"``, ``"dW2"`` and ``"db2"``.
    """
    n_samples = X.shape[1]
    hidden_act = cache['A1']
    output_weights = parameters['W2']

    # Output layer: for sigmoid + cross-entropy the error is simply A2 - Y.
    output_err = cache['A2'] - Y
    grad_W2 = output_err.dot(hidden_act.T) / n_samples
    grad_b2 = output_err.sum(axis=1, keepdims=True) / n_samples

    # Hidden layer: propagate the error back through tanh (derivative 1 - A1^2).
    hidden_err = output_weights.T.dot(output_err) * (1 - hidden_act ** 2)
    grad_W1 = hidden_err.dot(X.T) / n_samples
    grad_b1 = hidden_err.sum(axis=1, keepdims=True) / n_samples

    return {"dW1": grad_W1, "db1": grad_b1, "dW2": grad_W2, "db2": grad_b2}

# Update the model's parameters using gradient descent.
def update_parameters(parameters, grads, learning_rate=1.2):
    """Apply one gradient-descent step and return the new parameter dict.

    Each of ``W1``, ``b1``, ``W2`` and ``b2`` is moved against its gradient
    (``grads["d" + name]``) by ``learning_rate``. The input dicts are not
    modified.
    """
    return {
        name: parameters[name] - learning_rate * grads['d' + name]
        for name in ("W1", "b1", "W2", "b2")
    }

# Combine all functions into a single model training function.
def nn_model(X, Y, n_h, num_iterations=10000, learning_rate=1.2, print_cost=False):
    """Train the two-layer network with batch gradient descent.

    Parameters
    ----------
    X : input matrix; ``X.shape[0]`` is the number of features.
    Y : label matrix; ``Y.shape[0]`` is the number of outputs.
    n_h : int
        Number of hidden units (the hidden size reported by ``layer_sizes``
        is ignored in favour of this argument).
    num_iterations : int
        Number of gradient-descent steps.
    learning_rate : float
        Step size passed to ``update_parameters``.
    print_cost : bool
        When True, print the cost every 1000 iterations.

    Returns
    -------
    dict
        Trained parameters ``W1``, ``b1``, ``W2``, ``b2``.

    Initialisation is deterministic because ``initialize_parameters`` seeds
    NumPy's global generator itself. The previous ``np.random.seed(3)`` call
    here was always overridden by that seed before any number was drawn, so
    it has been removed.
    """
    n_x, _, n_y = layer_sizes(X, Y)
    parameters = initialize_parameters(n_x, n_h, n_y)

    for i in range(num_iterations):
        A2, cache = forward_propagation(X, parameters)

        # The cost is only needed for reporting, so it is evaluated on the
        # reporting iterations alone. It uses the activations from before
        # this step's update, so the printed values are unchanged.
        if print_cost and i % 1000 == 0:
            cost = compute_cost(A2, Y)
            print(f"Cost after iteration {i}: {cost:.4f}")

        grads = backward_propagation(parameters, cache, X, Y)
        parameters = update_parameters(parameters, grads, learning_rate)

    return parameters

# Define a function to make predictions on new data.
def predict(parameters, X):
    """Return 0/1 class predictions for the samples in ``X``.

    Runs a forward pass with ``parameters`` and labels a sample 1 when its
    output probability is strictly greater than 0.5, otherwise 0.
    """
    probabilities = forward_propagation(X, parameters)[0]
    is_positive = probabilities > 0.5
    return is_positive.astype(int)

In [None]:
# ### 9. Model Training
# Train the neural network on the training data with specified hyperparameters.
# 9 hidden units, 10,000 gradient steps at a learning rate of 0.1; the cost is
# printed every 1000 iterations to monitor convergence.
parameters = nn_model(
    X_train,
    Y_train,
    n_h=9,
    num_iterations=10000,
    learning_rate=0.1,
    print_cost=True,
)


In [None]:
# ### 10. Model Evaluation
# Make predictions on both the training and test sets.
train_preds = predict(parameters, X_train)
test_preds = predict(parameters, X_test)

# Accuracy = share of predictions matching the labels, as a percentage.
train_acc = 100 * (train_preds == Y_train).mean()
test_acc = 100 * (test_preds == Y_test).mean()
print(f"\nTrain Accuracy: {train_acc:.2f}%")
print(f"Test Accuracy: {test_acc:.2f}%")

# Per-class precision / recall / F1 on the held-out split; the report
# expects 1-D label vectors, hence the flattening.
print("\nClassification Report on Test Set:")
print(classification_report(Y_test.ravel(), test_preds.ravel()))

In [None]:
# ### 11. Prediction on Test DataFrame
# Add the model's predictions as a new column to the test DataFrame for comparison.
# test_df comes out of train_test_split, i.e. it is derived from `df`.
# Assigning a column onto it in place can raise SettingWithCopyWarning,
# depending on the pandas version. assign() builds a new frame instead and
# stays safe when the cell is re-run.
# test_preds has shape (1, n_samples); ravel() turns it into one label per row.
test_df = test_df.assign(Pred_Label=test_preds.ravel())
print(test_df[['Diagnosis','Pred_Label']])

# Final accuracy check directly on the DataFrame.
accuracy_test = (test_df['Diagnosis'] == test_df['Pred_Label']).mean()
print(f'Final Accuracy Check: {accuracy_test:.2%}')