In [1]:
#!pip install imbalanced-learn

In [2]:
import os
import numpy as np
import cv2
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.utils import to_categorical
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Function to load and preprocess data
def load_data_from_folder(folder_path, target_size=(256, 256)):
    images = []
    labels = []
    for subdir in os.listdir(folder_path):
        subdir_path = os.path.join(folder_path, subdir)
        if os.path.isdir(subdir_path):
            for filename in os.listdir(subdir_path):
                img_path = os.path.join(subdir_path, filename)
                if filename.lower().endswith(('.png', '.jpg', '.jpeg')):
                    image = load_img(img_path, target_size=target_size)
                    image = img_to_array(image) / 255.0  # Normalize to [0, 1]
                    images.append(image)
                    labels.append(subdir)  # Use subdirectory name as label
    return np.array(images), np.array(labels)

# Extract features using ResNet50
def extract_resnet50_features(images):
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(256, 256, 3))
    features = base_model.predict(images)
    return features.reshape((features.shape[0], -1))  # Flatten features

# Handcrafted features (edge detection, texture, etc.)
def extract_handcrafted_features(images):
    feature_list = []
    for img in images:
        gray = cv2.cvtColor((img * 255).astype('uint8'), cv2.COLOR_BGR2GRAY)
        
        # Edge detection (Canny)
        edges = cv2.Canny(gray, 100, 200)
        
        # Texture detection (Laplacian)
        laplacian = cv2.Laplacian(gray, cv2.CV_64F).var()
        
        # Hough Line detection
        lines = cv2.HoughLines(edges, 1, np.pi / 180, 200)
        num_lines = len(lines) if lines is not None else 0
        
        # Combine handcrafted features
        handcrafted_features = [laplacian, num_lines]
        feature_list.append(handcrafted_features)
    return np.array(feature_list)

# Function to balance the dataset using SMOTE
def balance_dataset(X, y):
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    return X_resampled, y_resampled

# Load train data
train_data_path = r"C:\Users\ganga\Documents\IISc Coursework\ML4CPS\Project1\Project 1 Data\Project 1 Data\Train_Data"
train_images, train_labels = load_data_from_folder(train_data_path)

# Encode labels
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)

# Split into training and validation sets
train_images_split, val_images_split, train_labels_split, val_labels_split = train_test_split(
    train_images, train_labels_encoded, test_size=0.2, random_state=42)

# Extract ResNet50 features
resnet50_train_features = extract_resnet50_features(train_images_split)
resnet50_val_features = extract_resnet50_features(val_images_split)

# Extract handcrafted features (e.g., edges, lines)
handcrafted_train_features = extract_handcrafted_features(train_images_split)
handcrafted_val_features = extract_handcrafted_features(val_images_split)

# Combine features (ResNet50 + handcrafted)
X_train_combined = np.hstack([resnet50_train_features, handcrafted_train_features])
X_val_combined = np.hstack([resnet50_val_features, handcrafted_val_features])

# Balance the dataset using SMOTE
X_train_resampled, y_train_resampled = balance_dataset(X_train_combined, train_labels_split)

# Train a Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_resampled, y_train_resampled)

# Predict on validation data
val_predictions = rf.predict(X_val_combined)

# Evaluate the model
print(f'Validation Accuracy: {accuracy_score(val_labels_split, val_predictions)}')
print(classification_report(val_labels_split, val_predictions))


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m366s[0m 6s/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0m 12s/step
Validation Accuracy: 0.4107142857142857
              precision    recall  f1-score   support

           0       0.39      0.47      0.43        58
           1       0.15      0.09      0.11        76
           2       0.37      0.36      0.36       146
           3       0.51      0.57      0.54       187
           4       0.37      0.41      0.38        37

    accuracy                           0.41       504
   macro avg       0.36      0.38      0.37       504
weighted avg       0.39      0.41      0.40       504



In [8]:
test_dir = r"C:\Users\ganga\Documents\IISc Coursework\ML4CPS\Project1\Project 1 Data\Project 1 Data\Test_Data"

def load_test_data(test_dir):
    test_images = []
    test_ids = []
    for img_name in os.listdir(test_dir):
        img_path = os.path.join(test_dir, img_name)
        img = cv2.imread(img_path)
        if img is not None:
            img = cv2.resize(img, (256, 256))
            test_images.append(img)
            test_ids.append(img_name.split('.')[0])

    print(f"Loaded {len(test_images)} images.")

    return np.array(test_images), test_ids

# Load test data
test_images, test_ids = load_test_data(test_dir)

# Normalize test data
test_images = test_images / 255.0




Loaded 478 images.


In [9]:
#Test data feature extraction
# Extract ResNet50 features
resnet50_test_features = extract_resnet50_features(test_images)

# Extract handcrafted features (e.g., edges, lines)
handcrafted_test_features = extract_handcrafted_features(test_images)

# Combine features (ResNet50 + handcrafted)
X_test_combined = np.hstack([resnet50_test_features, handcrafted_test_features])

# Predict on test data
test_predictions = rf.predict(X_test_combined)


[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 2s/step


In [12]:
import pandas as pd

In [13]:
predicted_classes = test_predictions + 1  # Adjust to match original labels

# Ensure predicted_classes has 478 entries
if len(predicted_classes) < 478:
    print(f"Expected 478 images, but loaded {len(test_ids)}. Adding placeholder rows for missing images.")

    # Initialize predicted_classes if not already done
    predicted_classes = np.array(predicted_classes)

    # Add placeholder predictions (e.g., class 1) to match the expected number of rows
    for missing_img in missing_images:
        test_ids.append(missing_img.split('.')[0])
        predicted_classes = np.append(predicted_classes, [1])  # Default prediction as class 1
        
        # Step 8: Create Submission File
submission = pd.DataFrame({
    'ID': test_ids,
    'Predictions': predicted_classes
})

# Save submission variable as a .csv file in the current working directory
submission.to_csv('submission5.csv', index=False)
print("Submission file created successfully.")

Submission file created successfully.
