In [16]:

# ****** IMPORTANT: PLEASE READ THIS ******
# This script uses the 'Breast Cancer Wisconsin (Diagnostic)' dataset.
# The data file is in the .data format and is named wdbc.data in my project folder.
# Please supply that same file, or an equivalent file with the .data extension (not .csv).
# Thank you for understanding, and please update data_path below to point at your copy.


# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical

# Load the WDBC dataset.
# The file is read without a header row, so column names are supplied here:
# an ID, the diagnosis label, then 30 feature columns (assumed from the
# dataset's typical structure).
data_path = '/Users/tejasbk/Documents/1 Fall 2024/Assignments/Data mining/wdbc.data'
columns = ['ID', 'Diagnosis', *(f'Feature_{i}' for i in range(1, 31))]
data = pd.read_csv(data_path, names=columns, header=None)

# Preprocess dataset
def preprocess_data(data):
    """Split the raw frame into a feature matrix and an encoded target.

    Args:
        data: DataFrame containing an 'ID' column, a 'Diagnosis' column whose
            values are 'B' or 'M', and the feature columns. The caller's
            frame is not modified.

    Returns:
        Tuple ``(X, y)`` where ``X`` holds every column except 'ID' and
        'Diagnosis', and ``y`` is 'Diagnosis' encoded as B = 0, M = 1.

    Raises:
        ValueError: if 'Diagnosis' contains a value other than 'B' or 'M'
            (including missing values).
    """
    label_codes = {'B': 0, 'M': 1}

    # Drop ID column as it's not useful for classification; drop() returns a
    # new frame, so the input is left untouched.
    without_id = data.drop('ID', axis=1)
    labels = without_id['Diagnosis']

    # Series.map() would silently turn unknown labels into NaN, which only
    # surfaces later as an obscure failure in the splitter or the models.
    # Fail here instead, naming the offending values.
    unexpected = labels[~labels.isin(list(label_codes))]
    if not unexpected.empty:
        bad_values = sorted(map(str, unexpected.unique()))
        raise ValueError(
            f"Unexpected Diagnosis value(s) {bad_values}; expected 'B' or 'M'"
        )

    # Encode the 'Diagnosis' column (B = 0, M = 1)
    y = labels.map(label_codes)

    # Everything that is left after removing the target is a feature
    X = without_id.drop('Diagnosis', axis=1)
    return X, y

# Derive the feature matrix and the encoded target from the raw frame
X, y = preprocess_data(data)

# Hold out 20% of the rows as a test set (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# 10-fold stratified cross-validation, shuffled with a fixed seed
kf = StratifiedKFold(shuffle=True, n_splits=10, random_state=42)

# Helper function to calculate metrics
def calculate_metrics(y_true, y_pred):
    """Compute binary classification metrics from true and predicted labels.

    Labels are expected to be 0 (negative) and 1 (positive).

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        Dict with keys 'Accuracy', 'Precision', 'Recall', 'FPR' and 'FNR'.
        Any ratio whose denominator is zero is reported as 0.
    """
    # Pin the label order so the matrix is always 2x2. Without `labels`, a
    # fold in which only one class appears in both y_true and y_pred yields
    # a 1x1 matrix and the four-way unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    total = tp + tn + fp + fn
    # Guard the denominator the same way the other ratios are guarded
    accuracy = (tp + tn) / total if total > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'FPR': fpr,
        'FNR': fnr
    }

# Random Forest Implementation
# Evaluate the classifier on every fold of the stratified split and collect
# one metrics dict per fold.
rf_model = RandomForestClassifier(random_state=42)
rf_metrics = []
for train_index, test_index in kf.split(X, y):
    # kf.split yields positional indices, so both X and y are selected with
    # .iloc. Plain y[...] is label-based and only matches positions while y
    # happens to carry a default 0..n-1 index.
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    rf_model.fit(X_train_fold, y_train_fold)
    y_pred_fold = rf_model.predict(X_test_fold)
    rf_metrics.append(calculate_metrics(y_test_fold, y_pred_fold))
rf_table = pd.DataFrame(rf_metrics)


print("\n\n")

# SVM Implementation
# Evaluate a linear-kernel SVM on every fold of the stratified split and
# collect one metrics dict per fold.
svm_model = SVC(kernel='linear', random_state=42)
svm_metrics = []
for train_index, test_index in kf.split(X, y):
    # kf.split yields positional indices, so both X and y are selected with
    # .iloc. Plain y[...] is label-based and only matches positions while y
    # happens to carry a default 0..n-1 index.
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    svm_model.fit(X_train_fold, y_train_fold)
    y_pred_fold = svm_model.predict(X_test_fold)
    svm_metrics.append(calculate_metrics(y_test_fold, y_pred_fold))
svm_table = pd.DataFrame(svm_metrics)


print("\n\n")

# LSTM Implementation
lstm_metrics = []
# Reshape to (samples, 1, features): each row becomes a single-step sequence
X_lstm = X.values.reshape(X.shape[0], 1, X.shape[1])  # Reshape for LSTM


def _build_lstm_model(input_shape):
    """Return a newly constructed, compiled LSTM binary classifier."""
    model = Sequential([
        LSTM(64, input_shape=input_shape, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


for train_index, test_index in kf.split(X_lstm, y):

    X_train_fold, X_test_fold = X_lstm[train_index], X_lstm[test_index]
    # kf.split yields positional indices, so y is selected with .iloc
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    # Build a new model for every fold. Calling fit() repeatedly on one model
    # keeps training the same weights, so later folds would be scored on
    # samples the network had already been trained on in earlier folds.
    # NOTE(review): each fold now trains for 3 epochs only; raise `epochs`
    # if the per-fold models underfit.
    lstm_model = _build_lstm_model((X_lstm.shape[1], X_lstm.shape[2]))

    # Train the model
    lstm_model.fit(X_train_fold, y_train_fold, epochs=3, batch_size=32, verbose=1)

    # Predict and calculate metrics: threshold the sigmoid output at 0.5
    y_pred_fold = (lstm_model.predict(X_test_fold) > 0.5).astype(int).flatten()
    lstm_metrics.append(calculate_metrics(y_test_fold, y_pred_fold))


lstm_table = pd.DataFrame(lstm_metrics)

# Print the per-fold metric tables once training (and its epoch logs) is done
per_fold_reports = (
    ("\nRandom Forest Metrics:\n", rf_table),
    ("SVM Metrics:\n", svm_table),
    ("LSTM Metrics:\n", lstm_table),
)
for heading, fold_table in per_fold_reports:
    print(heading, fold_table.to_markdown(index=False))
    print("\n\n")

# results: average each metric over the folds, one column per algorithm
rf_avg_metrics = rf_table.mean().to_dict()
svm_avg_metrics = svm_table.mean().to_dict()
lstm_avg_metrics = lstm_table.mean().to_dict()

averages_by_model = {
    'Random Forest': rf_avg_metrics,
    'SVM': svm_avg_metrics,
    'LSTM': lstm_avg_metrics,
}
results = pd.DataFrame(averages_by_model)

print("Comparison of Classification Algorithms:\n", results.to_markdown())








Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3

Random Forest Metrics:
 |   Accuracy |   Precision |   Recall |       FPR |       FNR |
|-----------:|------------:|---------:|----------:|----------:|
|   0.947368 |    0.913043 | 0.954545 | 0.0571429 | 0.0454545 |
|   0.982456 |    0.956522 | 1        | 0.0285714 | 0         |
|   0.964912 |    0.952381 | 0.952381 | 0.0277778 | 0.047619  |
|   0.912281 |    0.944444 | 0.809524 | 0.0277778 | 0.190476  |
|   0.947368 |    0.95     | 0.904762 | 0.0277778 | 0.0952381 |
|   0.964912 |    1        | 0.904762 | 0         | 0.0952381 |
|   0.964912 |    0.952381 | 0.952381 | 0.0277778 | 0.047619  |
|   0.947368 |    0.875    | 1        | 0.0833333 | 0         |
|   0.929825 |    0.947368 | 