In [1]:
from google.colab import files

# Prompt user to upload files
uploaded = files.upload()

Saving archive (1).zip to archive (1).zip


In [2]:
import zipfile
import os

# Specify the path to the zip file
zip_file_path = "archive (1).zip"  # Replace with the name of your zip file

# Specify the directory where you want to extract the contents
extracted_folder_path = "images"  # Replace "images" with your desired folder name

# Create the target directory if it doesn't exist
os.makedirs(extracted_folder_path, exist_ok=True)

# Unzip the file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_folder_path)

print("Folder extracted successfully.")


Folder extracted successfully.


## Random Forest Model

# 1- Features Extraction


In [8]:
import numpy as np
from skimage.feature import hog, local_binary_pattern
from skimage import io, color, exposure
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Function to extract HOG features shape from an image
def extract_hog_features(image):
    gray_image = color.rgb2gray(image)
    orientations = 9
    pixels_per_cell = (8, 8)
    cells_per_block = (2, 2)
    features = hog(gray_image, orientations=orientations, pixels_per_cell=pixels_per_cell,
                   cells_per_block=cells_per_block, block_norm='L2-Hys', visualize=False)
    return features

# Function to extract LBP features texture from an image
def extract_lbp_features(image):
    gray_image = color.rgb2gray(image)
    radius = 3
    n_points = 8 * radius
    lbp = local_binary_pattern(gray_image, n_points, radius, method='uniform')
    hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, n_points + 3), range=(0, n_points + 2))
    hist = hist.astype("float")
    hist /= (hist.sum() + 1e-7)
    return hist

# Function to extract color histogram features from an image
def extract_color_histogram(image):
    # Convert the image to HSV color space
    hsv_image = color.rgb2hsv(image)

    # Split the image into channels
    h, s, v = hsv_image[:,:,0], hsv_image[:,:,1], hsv_image[:,:,2]

    # Compute histograms for each channel
    hist_h, _ = np.histogram(h.ravel(), bins=256, range=(0, 1))
    hist_s, _ = np.histogram(s.ravel(), bins=256, range=(0, 1))
    hist_v, _ = np.histogram(v.ravel(), bins=256, range=(0, 1))

    # Concatenate histograms
    hist = np.concatenate((hist_h, hist_s, hist_v))
    return hist

# Function to extract combined features from an image
def extract_combined_features(image_path):
    # Load the image
    image = io.imread(image_path)

    # Extract individual features
    hog_features = extract_hog_features(image)
    lbp_features = extract_lbp_features(image)
    color_hist_features = extract_color_histogram(image)

    # Combine features into a single feature vector
    combined_features = np.concatenate((hog_features, lbp_features, color_hist_features))

    return combined_features

# Function to load dataset and extract features
def load_dataset_and_extract_features(dataset_path):
    X = []
    y = []

    for class_name in os.listdir(dataset_path):
        class_path = os.path.join(dataset_path, class_name)
        if os.path.isdir(class_path):
            if class_name.lower() == 'augmented_benign':
                label = 0
            elif class_name.lower() == 'augmented_malignant':
                label = 1
            else:
                continue

            for image_name in os.listdir(class_path):
                image_path = os.path.join(class_path, image_name)
                features = extract_combined_features(image_path)
                X.append(features)
                y.append(label)

    return np.array(X), np.array(y)

# Load dataset and extract features
dataset_path = "/content/images/Oral Images Dataset/augmented_data"
X, y = load_dataset_and_extract_features(dataset_path)

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



Accuracy: 0.775330396475771
F1 Score: 0.7524271844660194


# 2- Model Training and Evaluation

In [11]:
# Initialize and train Random Forest classifier
clf = RandomForestClassifier(n_estimators=100,max_depth=40, random_state=42)
clf.fit(X_train, y_train)

# Make predictions on test set
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)

Accuracy: 0.9008810572687225
F1 Score: 0.8965517241379312


## using ResNet-101 which is a Convolutional Neural Network  CNN


In [4]:
import torch
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from sklearn.model_selection import train_test_split
import torchvision.models as models
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score

# Define data transformations including data augmentation
train_transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomResizedCrop(64, scale=(0.8, 1.2)),
    transforms.RandomAffine(degrees=0, translate=(0.2, 0.2)),
    transforms.RandomRotation(degrees=20),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

test_transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load the dataset
dataset = ImageFolder(root='/content/images/Oral Images Dataset/augmented_data', transform=train_transform)

# Split the dataset into training and testing sets
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

# Define data loaders with reduced batch size
train_loader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=64, shuffle=False)

# Load pre-trained ResNet-101 model
model = models.resnet101(pretrained=True)

# Modify the model for binary classification
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 2)

# Set up loss function, optimizer, and scheduler
class_weights = torch.tensor([0.4, 0.6])  # Weighted loss for class imbalance
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=0.005)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)  # Learning rate decay

# Train the model
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss}")
    scheduler.step()

# Evaluation
model.eval()
correct = 0
total = 0
true_labels = []
predicted_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        true_labels.extend(labels.tolist())
        predicted_labels.extend(predicted.tolist())

accuracy = correct / total
print(f"Accuracy: {accuracy}")

# Calculate F1 score
f1 = f1_score(true_labels, predicted_labels)
print(f"F1 Score: {f1}")


Epoch [1/3], Loss: 0.5235973612327408
Epoch [2/3], Loss: 0.21512269527376485
Epoch [3/3], Loss: 0.07501506880910386
Accuracy: 0.8810572687224669
F1 Score: 0.8805309734513275


# Conclusion

## Random Forest is better than the proposed CNN model as it acheive similar accuracy and F1-score with much simpler and computationally less expensive way