# Library

In [1]:
import argparse
import os
import pickle
import random
import shutil
import time

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imutils import paths
from skimage import exposure, feature
from skimage.feature import hog
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score, log_loss

# Local Binary Pattern Setup

In [2]:
# Local Binary Patterns (LBP) descriptor
class LocalBinaryPatterns:
    """Describe an image by a normalized histogram of its "uniform" LBP codes."""

    def __init__(self, numPoints, radius):
        """Remember the two parameters forwarded to local_binary_pattern.

        numPoints -- number of neighbour points sampled around each pixel
        radius    -- radius at which those neighbours are sampled
        """
        self.radius = radius
        self.numPoints = numPoints

    def describe(self, image, eps=1e-7):
        """Return the LBP histogram of `image`, scaled to sum to (almost) 1.

        image -- array handed unchanged to skimage's local_binary_pattern
        eps   -- small constant added to the histogram total before dividing,
                 so an all-zero histogram does not cause a division by zero

        The result is a float array with numPoints + 2 bins.
        """
        codes = feature.local_binary_pattern(
            image, self.numPoints, self.radius, method="uniform"
        )

        # Integer bin edges 0 .. numPoints + 2 give numPoints + 2 bins
        n_bins = self.numPoints + 2
        counts, _edges = np.histogram(
            codes.ravel(),
            bins=np.arange(0, n_bins + 1),
            range=(0, n_bins),
        )

        # Turn raw counts into relative frequencies
        counts = counts.astype("float")
        return counts / (counts.sum() + eps)

In [3]:
# Build the LBP descriptor shared by every cell below:
# 8 sampling points on a circle of radius 1
desc = LocalBinaryPatterns(numPoints=8, radius=1)

# Histogram of Oriented Gradients Setup

In [4]:
# HOG settings; unpacked as keyword arguments into hog() wherever features
# are extracted
hog_params = dict(
    orientations=9,           # number of orientation bins
    pixels_per_cell=(8, 8),   # cell size in pixels
    cells_per_block=(2, 2),   # cells grouped into each block
    block_norm='L2-Hys',      # block normalization method
)

# Dataset

In [5]:
# Root folders of the training and testing splits (class labels are later read
# from the name of each image's parent folder)
training_path, testing_path = "Dataset/Train", "Dataset/Test"

## Training Data Feature Extraction

In [6]:
# Empty accumulators for the training set: one feature vector and one class
# label are appended per image
data, labels = [], []

In [7]:
# Time the whole training feature-extraction pass
start = time.time()

# Start from empty accumulators inside this cell so it can be re-run safely:
# otherwise a second run appends every sample again, or fails outright once a
# later cell has rebound `data` / `labels` to NumPy arrays (no .append).
data = []
labels = []

# Loop over the training images (training_path comes from the Dataset cell)
for imagePath in paths.list_images(training_path):
    # cv2.imread returns None instead of raising when a file cannot be
    # decoded; skip such files rather than crash inside cvtColor
    image = cv2.imread(imagePath)
    if image is None:
        print(f"Skipping unreadable image: {imagePath}")
        continue
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Extract LBP features (normalized histogram)
    lbp_hist = desc.describe(gray)

    # Extract HOG features
    hog_features = hog(gray, **hog_params)

    # Combine into one vector: HOG first, then LBP. Every vector passed to the
    # trained models must use this same layout.
    features = np.hstack((hog_features, lbp_hist))

    # The class label is the name of the image's parent folder
    labels.append(imagePath.split(os.path.sep)[-2])
    data.append(features)

# Stop time measurement
end = time.time()

# Calculate the elapsed time
elapsed_time = end - start
print(f"Time taken to process the training images: {elapsed_time:.2f} seconds")

In [None]:
# Turn the Python lists into NumPy arrays: `labels` becomes a 1-D array of
# class names and `data` stacks the per-image feature vectors row by row
labels = np.array(labels)
data = np.array(data)

In [None]:
# Report the shapes of the feature matrix and the label vector
print(f"Data shape: {data.shape}")
print(f"Labels shape: {labels.shape}")

# Unique class labels together with the number of samples carrying each one.
# Counting on the label vector avoids `data[labels == label]`, which copies
# every matching feature row just to take its length.
unique_labels, label_counts = np.unique(labels, return_counts=True)

# Print the size of each class
for label, count in zip(unique_labels, label_counts):
    print(f"Number of images with label {label}: {count}")

Data shape: (32000, 26254)
Labels shape: (32000,)
Number of images with label Negative: 16000
Number of images with label Positive: 16000


# Modelling

## Training Model Random Forest Gini Criterion

In [None]:
# Start the clock for the training run
start = time.time()

# Random Forest that scores candidate splits with Gini impurity; the fixed
# random_state keeps the fitted forest reproducible between runs
model_RFG = RandomForestClassifier(random_state=42, criterion='gini')
model_RFG.fit(X=data, y=labels)

# Stop the clock
end = time.time()

# Report how long fitting took
elapsed_time = end - start
print(f"Time taken to train the Random Forest Gini Criterion: {elapsed_time:.2f} seconds")

## Training Model Random Forest Entropy Criterion

In [None]:
# Start the clock for the training run
start = time.time()

# Random Forest that scores candidate splits with the entropy criterion; the
# fixed random_state keeps the fitted forest reproducible between runs
model_RFE = RandomForestClassifier(random_state=42, criterion='entropy')
model_RFE.fit(X=data, y=labels)

# Stop the clock
end = time.time()

# Report how long fitting took
elapsed_time = end - start
print(f"Time taken to train the Random Forest Information Gain Criterion: {elapsed_time:.2f} seconds")

## Training Model Random Forest Log Loss Criterion

In [None]:
# Start the clock for the training run
start = time.time()

# Random Forest that scores candidate splits with the log-loss criterion; the
# fixed random_state keeps the fitted forest reproducible between runs.
# NOTE(review): scikit-learn documents 'entropy' and 'log_loss' as both being
# the Shannon information gain, so this forest is expected to match model_RFE
# - confirm before reporting them as distinct models.
model_RFL = RandomForestClassifier(random_state=42, criterion='log_loss')
model_RFL.fit(X=data, y=labels)

# Stop the clock
end = time.time()

# Report how long fitting took
elapsed_time = end - start
print(f"Time taken to train the Random Forest Log Loss Criterion: {elapsed_time:.2f} seconds")


# Confusion Matrix & Classification Report Random Forest Gini

In [None]:
# Empty accumulators for the evaluation pass: the ground-truth label and the
# model's prediction are appended once per test image
true_labels, predicted_labels = [], []

In [None]:
# Reset the accumulators inside this cell so re-running it does not append a
# second copy of every test result to the lists
true_labels = []
predicted_labels = []

# Add time measurement
start = time.time()

# Loop over the testing images
for imagePath in paths.list_images(testing_path):
    # cv2.imread returns None instead of raising when a file cannot be
    # decoded; skip such files rather than crash inside cvtColor
    image = cv2.imread(imagePath)
    if image is None:
        print(f"Skipping unreadable image: {imagePath}")
        continue
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Extract LBP features
    lbp_hist = desc.describe(gray)

    # Extract HOG features
    hog_features = hog(gray, **hog_params)

    # Combine HOG and LBP features in the same order used for training
    features = np.hstack((hog_features, lbp_hist))

    # Predict the label with the Random Forest (Gini) model; predict expects a
    # 2-D array, hence the reshape to a single-row matrix
    prediction = model_RFG.predict(features.reshape(1, -1))

    # The true label is the name of the image's parent folder
    true_label = imagePath.split(os.path.sep)[-2]

    # Update the lists
    true_labels.append(true_label)
    predicted_labels.append(prediction[0])

# Stop the timer before plotting so the reported time covers feature
# extraction and prediction only
end = time.time()
elapsed_time = end - start

# Create the confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels)

# confusion_matrix orders its rows/columns by the sorted labels seen in either
# list, so derive the tick labels from both lists to keep them aligned
class_names = np.unique(true_labels + predicted_labels)

# Display the confusion matrix using a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix Random Forest Gini Criterion')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

print(f"Time taken to predict the testing images: {elapsed_time:.2f} seconds")

In [None]:
# Per-class precision / recall / F1 table, shown to four decimal places
class_report = classification_report(y_true=true_labels, y_pred=predicted_labels, digits=4)
print("Classification Report:\n", class_report)

# Overall accuracy of the Gini forest, kept for the final comparison table
acc_score_RFG_n8r1 = accuracy_score(y_true=true_labels, y_pred=predicted_labels)
print("Accuracy Score:", acc_score_RFG_n8r1)

# Support-weighted average of the per-class F1 scores, also kept for the table
f1_score_RFG_n8r1 = f1_score(y_true=true_labels, y_pred=predicted_labels, average='weighted')
print("F1 Score:", f1_score_RFG_n8r1)

# Confusion Matrix & Classification Report Random Forest Entropy

In [None]:
# Initialize empty lists for true labels and predicted labels
true_labels = []
predicted_labels = []

# Add time measurement
start = time.time()

# Loop over the testing images
for imagePath in paths.list_images(testing_path):
    # cv2.imread returns None instead of raising when a file cannot be
    # decoded; skip such files rather than crash inside cvtColor
    image = cv2.imread(imagePath)
    if image is None:
        print(f"Skipping unreadable image: {imagePath}")
        continue
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Extract LBP features
    lbp_hist = desc.describe(gray)

    # Extract HOG features
    hog_features = hog(gray, **hog_params)

    # The model was fit on HOG + LBP vectors, so the test vector must be built
    # the same way; passing the LBP histogram alone makes predict() fail with
    # a feature-count mismatch
    features = np.hstack((hog_features, lbp_hist))

    # Predict the label with the Random Forest (entropy) model; predict
    # expects a 2-D array, hence the reshape to a single-row matrix
    prediction = model_RFE.predict(features.reshape(1, -1))

    # The true label is the name of the image's parent folder
    true_label = imagePath.split(os.path.sep)[-2]

    # Update the lists
    true_labels.append(true_label)
    predicted_labels.append(prediction[0])

# Stop the timer before plotting so the reported time covers feature
# extraction and prediction only
end = time.time()
elapsed_time = end - start

# Create the confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels)

# confusion_matrix orders its rows/columns by the sorted labels seen in either
# list, so derive the tick labels from both lists to keep them aligned
class_names = np.unique(true_labels + predicted_labels)

# Display the confusion matrix using a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix Random Forest Entropy Criterion')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

print(f"Time taken to predict the testing images: {elapsed_time:.2f} seconds")

In [None]:
# Per-class precision / recall / F1 table, shown to four decimal places
class_report = classification_report(y_true=true_labels, y_pred=predicted_labels, digits=4)
print("Classification Report:\n", class_report)

# Overall accuracy of the entropy forest, kept for the final comparison table
acc_score_RFE_n8r1 = accuracy_score(y_true=true_labels, y_pred=predicted_labels)
print("Accuracy Score:", acc_score_RFE_n8r1)

# Support-weighted average of the per-class F1 scores, also kept for the table
f1_score_RFE_n8r1 = f1_score(y_true=true_labels, y_pred=predicted_labels, average='weighted')
print("F1 Score:", f1_score_RFE_n8r1)

# Confusion Matrix & Classification Report Random Forest Log Loss

In [None]:
# Initialize empty lists for true labels and predicted labels
true_labels = []
predicted_labels = []

# Add time measurement
start = time.time()

# Loop over the testing images
for imagePath in paths.list_images(testing_path):
    # cv2.imread returns None instead of raising when a file cannot be
    # decoded; skip such files rather than crash inside cvtColor
    image = cv2.imread(imagePath)
    if image is None:
        print(f"Skipping unreadable image: {imagePath}")
        continue
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Extract LBP features
    lbp_hist = desc.describe(gray)

    # Extract HOG features
    hog_features = hog(gray, **hog_params)

    # The model was fit on HOG + LBP vectors, so the test vector must be built
    # the same way; passing the LBP histogram alone makes predict() fail with
    # a feature-count mismatch
    features = np.hstack((hog_features, lbp_hist))

    # Predict the label with the Random Forest (log loss) model; predict
    # expects a 2-D array, hence the reshape to a single-row matrix
    prediction = model_RFL.predict(features.reshape(1, -1))

    # The true label is the name of the image's parent folder
    true_label = imagePath.split(os.path.sep)[-2]

    # Update the lists
    true_labels.append(true_label)
    predicted_labels.append(prediction[0])

# Stop the timer before plotting so the reported time covers feature
# extraction and prediction only
end = time.time()
elapsed_time = end - start

# Create the confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels)

# confusion_matrix orders its rows/columns by the sorted labels seen in either
# list, so derive the tick labels from both lists to keep them aligned
class_names = np.unique(true_labels + predicted_labels)

# Display the confusion matrix using a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix Random Forest Log Loss Criterion')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

print(f"Time taken to predict the testing images: {elapsed_time:.2f} seconds")

In [None]:
# Per-class precision / recall / F1 table, shown to four decimal places
class_report = classification_report(y_true=true_labels, y_pred=predicted_labels, digits=4)
print("Classification Report:\n", class_report)

# Overall accuracy of the log-loss forest, kept for the final comparison table
acc_score_RFL_n8r1 = accuracy_score(y_true=true_labels, y_pred=predicted_labels)
print("Accuracy Score:", acc_score_RFL_n8r1)

# Support-weighted average of the per-class F1 scores, also kept for the table
f1_score_RFL_n8r1 = f1_score(y_true=true_labels, y_pred=predicted_labels, average='weighted')
print("F1 Score:", f1_score_RFL_n8r1)

# Wrapping Up

In [None]:
# Fitted models keyed by a human-readable configuration name
model = {
    "Random Forest Gini NumPoint = 8 Radius = 1": model_RFG,
    "Random Forest Entropy NumPoint = 8 Radius = 1": model_RFE,
    "Random Forest Log Loss NumPoint = 8 Radius = 1": model_RFL
}

# Accuracy scores keyed by the same configuration names
acc_score = {
    "Random Forest Gini NumPoint = 8 Radius = 1": acc_score_RFG_n8r1,
    "Random Forest Entropy NumPoint = 8 Radius = 1": acc_score_RFE_n8r1,
    "Random Forest Log Loss NumPoint = 8 Radius = 1": acc_score_RFL_n8r1
}

# F1 scores keyed by the same configuration names. The dict is called
# `f1_scores` (plural) on purpose: naming it `f1_score` would shadow the
# sklearn function of that name, and re-running any metrics cell afterwards
# would fail with "'dict' object is not callable".
f1_scores = {
    "Random Forest Gini NumPoint = 8 Radius = 1": f1_score_RFG_n8r1,
    "Random Forest Entropy NumPoint = 8 Radius = 1": f1_score_RFE_n8r1,
    "Random Forest Log Loss NumPoint = 8 Radius = 1": f1_score_RFL_n8r1
}

# Build the comparison table by looking both metrics up by model name, so each
# row is aligned by key rather than by the insertion order of two dicts
model_names = list(acc_score)
df = pd.DataFrame({
    'Model': model_names,
    'Accuracy Score': [acc_score[name] for name in model_names],
    'F1 Score': [f1_scores[name] for name in model_names],
})

# Best F1 score first; reassign instead of sorting in place
df = df.sort_values(by=['F1 Score'], ascending=False)

# Display the DataFrame
print(df)

# Save Model

In [None]:
# Save all three fitted forests as pickle files inside the "model" folder
model_dir = "model"

# open(..., 'wb') does not create missing parent folders, so make sure the
# output folder exists first
os.makedirs(model_dir, exist_ok=True)

# Build the paths with os.path.join rather than "model\..." literals: the
# backslash form relies on an invalid "\R" escape sequence and only names a
# file inside the folder on Windows
model_RFG_pkl_file = os.path.join(model_dir, "RFG_n8r1.pkl")
model_RFE_pkl_file = os.path.join(model_dir, "RFE_n8r1.pkl")
model_RFL_pkl_file = os.path.join(model_dir, "RFL_n8r1.pkl")

# Write each model to its own file
for pkl_path, fitted_model in (
    (model_RFG_pkl_file, model_RFG),
    (model_RFE_pkl_file, model_RFE),
    (model_RFL_pkl_file, model_RFL),
):
    with open(pkl_path, 'wb') as file:
        pickle.dump(fitted_model, file)