Implementing EfficientNetB0 for feature extraction. To visualize those features we use PCA, and then we train a random forest on the extracted features.

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.decomposition import PCA
from sklearn.utils import class_weight
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from google.colab import drive

# Step 2 settings: images are resized to IMG_SIZE (passed to cv2.resize as
# (width, height)) and load_data only reads files with these extensions.
ALLOWED_EXTENSIONS = {'.jpg', '.jpeg', '.png'}
IMG_SIZE = (224, 224)

# Step 1: the dataset lives on Google Drive, so mount it before reading it.
drive.mount('/content/drive')

# Folders holding the training images (one sub-folder per class) and the
# test images.
train_dir, test_dir = (
    '/content/drive/My Drive/Project 1 Data (2)/Project 1 Data/Train_Data',
    '/content/drive/My Drive/Project 1 Data (2)/Project 1 Data/Test_Data',
)

# Function to load training data
def load_data(train_dir):
    """Load and resize every labelled training image found under *train_dir*.

    Each immediate sub-folder named ``A``, ``B``, ``C``, ``D`` or ``S`` is
    treated as one class (labels 1-5); any other entry is ignored. Inside a
    class folder only files whose extension is in ``ALLOWED_EXTENSIONS`` are
    read, and files OpenCV cannot decode are skipped.

    Args:
        train_dir: Directory containing one sub-folder per class.

    Returns:
        A ``(images, labels)`` tuple of NumPy arrays. ``images`` holds the
        images resized to ``IMG_SIZE`` exactly as ``cv2.imread`` decoded them
        (OpenCV's BGR channel order); ``labels`` holds the matching 1-based
        class ids. Both arrays are empty when nothing could be loaded.
    """
    images = []
    labels = []
    label_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'S': 5}
    unreadable = 0

    # os.listdir() returns entries in arbitrary order; sorting keeps the sample
    # order stable between runs so that seeded splits of it are reproducible.
    for folder in sorted(os.listdir(train_dir)):
        folder_path = os.path.join(train_dir, folder)
        if folder not in label_map or not os.path.isdir(folder_path):
            continue
        for img_name in sorted(os.listdir(folder_path)):
            extension = os.path.splitext(img_name)[1].lower()
            if extension not in ALLOWED_EXTENSIONS:
                continue
            img = cv2.imread(os.path.join(folder_path, img_name))
            if img is None:
                # cv2.imread signals an undecodable file by returning None.
                unreadable += 1
                continue
            images.append(cv2.resize(img, IMG_SIZE))
            labels.append(label_map[folder])

    if unreadable:
        # Report skipped files once instead of dropping them silently.
        print(f"Skipped {unreadable} unreadable image file(s) in {train_dir}.")
    return np.array(images), np.array(labels)

# Load training data.
# Fail fast with a clear message: skipping the load when the directory is
# missing would only surface later as a NameError on `labels`.
if not os.path.exists(train_dir):
    raise FileNotFoundError(f"Training directory not found: {train_dir}")

images, labels = load_data(train_dir)
print(f"Loaded {len(images)} images.")
print(f"Shape of images array: {images.shape}")
print(f"Shape of labels array: {labels.shape}")
if len(images) == 0:
    raise ValueError(f"No readable training images found in: {train_dir}")

# Step 3: Prepare Data for Training
labels = labels - 1  # Adjust labels to be 0-based

# Split data into training and validation sets. Stratify so both sets keep the
# class proportions; stratification needs at least two samples per class, so
# fall back to a plain split otherwise.
_, class_counts = np.unique(labels, return_counts=True)
stratify_labels = labels if class_counts.min() >= 2 else None
X_train, X_val, y_train, y_val = train_test_split(
    images, labels, test_size=0.2, random_state=42, stratify=stratify_labels
)


def to_model_input(bgr_images):
    """Convert a batch of OpenCV-decoded images into EfficientNetB0 input.

    Keras' EfficientNet models rescale their input internally and expect
    pixel values in the [0, 255] range, so the images must NOT be divided by
    255. The ImageNet weights also expect RGB channel order, whereas
    ``cv2.imread`` yields BGR, so the last axis is reversed.

    Args:
        bgr_images: Array of shape (n, height, width, 3) in BGR order.

    Returns:
        A contiguous float32 array of the same shape in RGB order.
    """
    return np.ascontiguousarray(bgr_images[..., ::-1], dtype=np.float32)


X_train = to_model_input(X_train)
X_val = to_model_input(X_val)

# Step 4: Apply Data Augmentation to Training Data
# NOTE: this generator is not consumed anywhere in this cell (features are
# extracted from the un-augmented arrays below); it is kept in case later
# cells use it.
datagen = ImageDataGenerator(
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

train_generator = datagen.flow(X_train, y_train, batch_size=32)

# Step 5: Define the Feature Extraction Model Using EfficientNetB0
base_model = EfficientNetB0(input_shape=(224, 224, 3), include_top=False, weights='imagenet')

# The network is never trained in this cell (only predict() is called), so
# keep the pretrained weights frozen and use it purely as a feature extractor.
base_model.trainable = False

model = Sequential([
    base_model,
    Flatten(),  # Flattens the final feature map into one feature vector per image
])

# Extract features for training and validation data
train_features = model.predict(X_train)
val_features = model.predict(X_val)

# Step 6: Handle Class Imbalance Using Class Weights
# Key the weights by the actual class label (not by position) so the mapping
# stays right even if a class is missing from the training split.
present_classes = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight(
    class_weight='balanced', classes=present_classes, y=y_train
)
class_weights = {int(c): float(w) for c, w in zip(present_classes, balanced_weights)}
print(f"Balanced class weights: {class_weights}")

# Standardize the extracted features for the classical machine learning models
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
val_features = scaler.transform(val_features)

# Step 7: Visualize Extracted Features Using PCA
pca = PCA(n_components=2)
train_features_pca = pca.fit_transform(train_features)

# Plot the PCA-reduced features
plt.figure(figsize=(10, 6))
plt.scatter(train_features_pca[:, 0], train_features_pca[:, 1], c=y_train, cmap='viridis', s=10)
plt.colorbar(label='Class')
plt.title('PCA of Extracted Features from EfficientNetB0')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

# Print information on the extracted features
print("The extracted features hold information related to high-level representations of the input images, such as edges, textures, and patterns. These features help distinguish between the different image classes. The PCA plot shows how these features cluster based on the class labels, providing insights into their separability.")

# Step 8: Try Random Forest for Classification
# class_weight='balanced' applies the same weighting as `class_weights` above
# but recomputes it on every fit, so it is also correct inside each CV fold.
rf_classifier = RandomForestClassifier(
    n_estimators=100, random_state=42, class_weight='balanced'
)

# Perform cross-validation and store the accuracy scores
n_folds = 5
rf_cross_val_scores = cross_val_score(
    rf_classifier, train_features, y_train, cv=n_folds, scoring='accuracy'
)

# Plot Cross-Validation Accuracy
plt.figure(figsize=(8, 6))
plt.plot(range(1, n_folds + 1), rf_cross_val_scores, marker='o', linestyle='-', color='b', label='Random Forest Accuracy')
plt.title('Cross-Validation Accuracy of Random Forest')
plt.xlabel('Fold')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
plt.legend()
plt.grid(True)
plt.show()

# Print Cross-Validation Results
print(f"Random Forest Cross-Validation Accuracy Scores: {rf_cross_val_scores}")
print(f"Random Forest Mean Validation Accuracy: {np.mean(rf_cross_val_scores)}")

# Step 9: Train the Random Forest Classifier on Full Training Data
rf_classifier.fit(train_features, y_train)

# Step 10: Predict on Validation Set
val_predictions = rf_classifier.predict(val_features)
val_accuracy = np.mean(val_predictions == y_val)
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Step 11: Visualize Feature Importances (for Random Forest)
importances = rf_classifier.feature_importances_
indices = np.argsort(importances)[::-1]

# Plot the most important features (at most 10)
top_k = min(10, len(importances))
plt.figure(figsize=(10, 6))
plt.title('Feature Importance - Random Forest')
plt.bar(range(top_k), importances[indices[:top_k]], align='center')
plt.xticks(range(top_k), indices[:top_k])
plt.tight_layout()
plt.show()


# Step 12: Load and Preprocess Test Data
def load_test_data(test_dir):
    """Load and resize every test image found directly inside *test_dir*.

    Only files whose extension is in ``ALLOWED_EXTENSIONS`` are read, in
    sorted order so the submission rows are reproducible. Files OpenCV cannot
    decode are skipped.

    Args:
        test_dir: Directory containing the test image files.

    Returns:
        A ``(test_images, test_ids)`` tuple: a NumPy array of the images
        resized to ``IMG_SIZE`` (BGR order, as decoded by ``cv2.imread``) and
        a list with each image's file name without its extension.
    """
    test_images = []
    test_ids = []
    for img_name in sorted(os.listdir(test_dir)):
        # splitext removes only the final extension, so a dotted name such as
        # "scan.01.jpg" keeps its full stem ("scan.01") as the ID.
        stem, extension = os.path.splitext(img_name)
        if extension.lower() not in ALLOWED_EXTENSIONS:
            continue
        img = cv2.imread(os.path.join(test_dir, img_name))
        if img is not None:
            test_images.append(cv2.resize(img, IMG_SIZE))
            test_ids.append(stem)

    print(f"Loaded {len(test_images)} test images.")
    return np.array(test_images), test_ids


# Load test data
test_images, test_ids = load_test_data(test_dir)
if len(test_images) == 0:
    raise ValueError(f"No readable test images found in: {test_dir}")

# Apply exactly the same input conversion as for the training images
test_images = to_model_input(test_images)

# Extract features from test data using EfficientNetB0
test_features = model.predict(test_images)

# Standardize the test features
test_features = scaler.transform(test_features)

# Step 13: Make Predictions on Test Data using Random Forest
rf_test_predictions = rf_classifier.predict(test_features)

# Adjust predictions to match original label indices
rf_predicted_classes = rf_test_predictions + 1

# Step 14: Create Submission File
rf_submission = pd.DataFrame({
    'ID': test_ids,
    'Predictions': rf_predicted_classes
})

# Save submission file
rf_submission.to_csv('rf_submission.csv', index=False)

# Step 15: Download Submission File
from google.colab import files
files.download('rf_submission.csv')
