In [1]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2  # OpenCV for image processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.utils import resample


In [2]:
# Set the image size
IMAGE_SIZE = (64, 64)  # Adjust as necessary
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Constants
IMAGE_SIZE = (64, 64)  # Update to the desired size for resizing images

def load_data(data_dir):
    images = []
    labels = []
    
    for label in ['malignant', 'benign']:
        folder_path = os.path.join(data_dir, label)
        for filename in os.listdir(folder_path):
            img_path = os.path.join(folder_path, filename)
            img = cv2.imread(img_path)
            if img is not None:
                img = cv2.resize(img, IMAGE_SIZE)
                img = img.flatten()  # Flatten to 1D array
                images.append(img)
                labels.append(1 if label == 'malignant' else 0)  # 1 for malignant, 0 for benign

    images = np.array(images)
    labels = np.array(labels)
    
    return images, labels


# Load images and labels
data_dir = r'C:\Users\sanji\OneDrive\Desktop\abhi assi\BreaKHis_Total_dataset'  # Update with your dataset path
X, y = load_data(data_dir)

In [3]:
# Assuming X is your feature matrix and y is your target vector
# Convert X and y to a DataFrame for easier manipulation
data = pd.DataFrame(X)
data['target'] = y

# Separate data by class
class_0 = data[data['target'] == 0]
class_1 = data[data['target'] == 1]

# Define the size of the test set for each class
test_size_per_class = min(len(class_0), len(class_1)) // 5  # e.g., 20% of the smaller class
m = int(0.1 * (len(class_0) + len(class_1)))
# Sample equal number of instances from each class for the test set
test_0 = class_0.sample(n= m, random_state=42)
test_1 = class_1.sample(n= m, random_state=42)

# Combine the samples to form the test set
test_data = pd.concat([test_0, test_1])

# Drop the test instances from the original data to form the train set
train_data = data.drop(test_data.index)

# Separate features and target for train and test sets
X_train = train_data.drop(columns='target')
y_train = train_data['target']
X_test = test_data.drop(columns='target')
y_test = test_data['target']

# Separate majority and minority classes in the training set
malignant_images = X_train[y_train == 1]
benign_images = X_train[y_train == 0]

# Determine the number of samples to match classes in the training set
n_samples = max(len(malignant_images), len(benign_images))

# Resample minority class in the training set to match the majority class
malignant_images_resampled = resample(
    malignant_images, replace=True, n_samples=n_samples, random_state=42
)
benign_images_resampled = resample(
    benign_images, replace=True, n_samples=n_samples, random_state=42
)

# Combine resampled images and labels for the training set
X_train_balanced = np.vstack((malignant_images_resampled, benign_images_resampled))
y_train_balanced = np.hstack((
    np.ones(len(malignant_images_resampled)),
    np.zeros(len(benign_images_resampled))
))

# Shuffle the balanced training set
indices = np.arange(X_train_balanced.shape[0])
np.random.shuffle(indices)
X_train_balanced = X_train_balanced[indices]
y_train_balanced = y_train_balanced[indices]

# The final training and testing sets
X_train, y_train = X_train_balanced, y_train_balanced


In [14]:
from sklearn.decomposition import PCA

pca = PCA(n_components=5)

 # Fit PCA on the training features and transform both train and test sets
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)  # Only transform on test set
 # Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_pca)
X_test_scaled = scaler.transform(X_test_pca)

In [15]:
# Support Vector Machine, after hyperparameter tuning
svm_model = SVC(kernel='rbf', C=1.0,max_iter=10000, gamma='scale')
svm_model.fit(X_train_scaled, y_train)
y_pred = svm_model.predict(X_test_scaled)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("confusion matrix:\n",confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.81      0.83       778
           1       0.82      0.86      0.84       778

    accuracy                           0.83      1556
   macro avg       0.83      0.83      0.83      1556
weighted avg       0.83      0.83      0.83      1556

confusion matrix:
 [[627 151]
 [109 669]]


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# Initialize and train the model
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_classifier.fit(X_train_scaled, y_train)

# Make predictions
y_pred = rf_classifier.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Feature importance
importances = rf_classifier.feature_importances_
print("Feature Importances:", importances)

Accuracy: 0.8354755784061697
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.80      0.83       778
           1       0.81      0.87      0.84       778

    accuracy                           0.84      1556
   macro avg       0.84      0.84      0.84      1556
weighted avg       0.84      0.84      0.84      1556

Feature Importances: [0.20068307 0.4255214  0.10911647 0.18970275 0.07497631]


In [9]:
from sklearn.decomposition import PCA

pca = PCA(n_components=100)

 # Fit PCA on the training features and transform both train and test sets
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)  # Only transform on test set
 # Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_pca)
X_test_scaled = scaler.transform(X_test_pca)

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# Initialize and train the model
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_classifier.fit(X_train_scaled, y_train)

# Make predictions
y_pred = rf_classifier.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Feature importance
importances = rf_classifier.feature_importances_
print("Feature Importances:", importances)

Accuracy: 0.781491002570694
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.69      0.76       778
           1       0.74      0.88      0.80       778

    accuracy                           0.78      1556
   macro avg       0.79      0.78      0.78      1556
weighted avg       0.79      0.78      0.78      1556

Feature Importances: [0.06960598 0.24494289 0.02523403 0.12250307 0.0085763  0.00765283
 0.00653668 0.00594069 0.00472892 0.00572319 0.00536718 0.00717297
 0.0050676  0.00431546 0.00447507 0.006577   0.00644891 0.00534798
 0.00482633 0.00659712 0.00552294 0.00565997 0.00528607 0.00579762
 0.00656071 0.00549012 0.00405657 0.0049937  0.00546873 0.00563109
 0.00696395 0.00444805 0.00450713 0.00476362 0.00428044 0.00531696
 0.00527993 0.0049596  0.00817602 0.00461288 0.00624581 0.00535857
 0.00527879 0.00473282 0.0053972  0.0045435  0.00521697 0.00538399
 0.00510754 0.00426865 0.00513883 0.00469178 0.00536866 0.006361