In [1]:
# TODO: Consider cropping the image in training to only include the bounding box
# Seperate into classes and train the model on that
# Then use a sliding window approach to detect predictions and bounding boxes
# Then calculate the IoU/AP between the bounding boxes and the ground truth bounding boxes

In [12]:
# Imports
import pickle
import os
import cv2
import json
import numpy as np
from PIL import Image
import warnings
warnings.filterwarnings("ignore")

from sklearn.svm import SVC
from skimage.feature import hog
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler

WINDOW_IMG_WIDTH = 96
WINDOW_IMG_HEIGHT= 96
CLASSES = ['Dent', 'Scratch', 'Crack', 'Glass shatter', 'Lamp broken', 'Tire flat']
COLORS = {1: (255,0,0), 2: (0, 255, 0), 3: (0, 0, 255), 4: (255, 255, 0), 5: (255, 0, 255), 6: (0, 255, 255)}

## Train the model

In [13]:
#############
# FUNCTIONS #
#############

def load_train_data(image_dir, labels_dir):
    # Generate the training and testing image paths
    train_image_path = os.path.join(image_dir, 'train')
    train_labels_path = os.path.join(labels_dir, 'train.json')
    test_labels_path = os.path.join(labels_dir, 'test.json')
    
    # Process the labels from the JSON file
    train_labels_dict = _extract_labels(train_labels_path)
    test_labels_dict = _extract_labels(test_labels_path)
    
    # Get the training images and labels
    train_images = []
    train_labels = []
    for image_file in os.listdir(train_image_path):
        # Get the image ID
        image_id = int(image_file.split('.')[0])
        
        # Load the image
        image = cv2.imread(os.path.join(train_image_path, image_file))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # Ensure it is RGB
        
        # Crop the image at the bounding box for each label
        for i, unique_label in enumerate(train_labels_dict['classification'][image_id]):
            # Get the bounding box for the label
            (x, y, w, h) = train_labels_dict['bbox'][image_id][i]

            # Crop the image at the bounding box
            cropped_image = image[int(np.floor(x)) : int(np.ceil(x+w)), int(np.floor(y)) : int(np.ceil(y+h))]
            
            # Ensure the width and height are big enough, otherwise it will cause problems
            if(cropped_image.shape[0] <= 0 or cropped_image.shape[1] <= 0):
                continue
            # Resize image to a standard size
            cropped_image = cv2.resize(cropped_image, (WINDOW_IMG_WIDTH, WINDOW_IMG_HEIGHT))
            
            # Extract the HOG features from the cropped image
            img_features = hog(cropped_image, orientations=9, pixels_per_cell=(16,16), cells_per_block=(2,2), channel_axis=2)
            
            # Append the image and labels to the datasets
            train_images.append(img_features)
            train_labels.append(unique_label)  
       
    # Return the data 
    return np.array(train_images), np.array(train_labels), test_labels_dict

def _extract_labels(labels_file):
    with(open(labels_file)) as label_data:
        annot_df = json.load(label_data)
        
    labels = {
        "classification": {}, # Store image id and list of labels for that image
        'bbox' : {},          # Store image id and list of bounding boxes for that image
        'segementation' : {}  # Store image id and list of segementations for that image
    }
    
    # Add labels to the dictionary
    for annotations in annot_df['annotations']:
        image_id = annotations['image_id']
        
        # Classification
        if(image_id in labels['classification'].keys()):
            labels['classification'][image_id].append(annotations['category_id'])
        else:
            labels['classification'][image_id] = [annotations['category_id']]
            
        # Bounding Box
        if(image_id in labels['bbox'].keys()):
            labels['bbox'][image_id].append(annotations['bbox'])
        else:
            labels['bbox'][image_id] = [annotations['bbox']]
        
        # Segmentation
        if(image_id in labels['segementation'].keys()):
            labels['segementation'][image_id].append(annotations['segmentation'])
        else:
            labels['segementation'][image_id] = [annotations['segmentation']]
    
    # Return the labels
    return labels


def train_model(images, labels, random_state=0):
    # Instatite the model
    model = SVC(probability=True, random_state=random_state)
    
    # Train the model
    model.fit(images, labels)
    
    # Return the model
    return model

In [4]:
train_images, train_labels, test_labels = load_train_data(image_dir='../data/images', labels_dir='../data/images/annotations')

In [14]:
train_images.shape
#train_labels.shape

(6023, 900)

In [11]:
model = train_model(train_images, train_labels)

In [None]:
# Save the model
import pickle
model_file = '../models/SVM_MODEL.pkl'
pickle.dump(model, open(model_file, 'wb'))


2

## Use the model for prediction

In [15]:
#############
# FUNCTIONS #
#############

class Heatmap():
    
    def __init__(self,original_image):
        
        # Mask attribute is the heatmap initialized with zeros
        self.mask = np.zeros(original_image.shape[:2])
        self.labels = np.zeros(original_image.shape[:2])
    
    # Increase value of region function will add some heat to heatmap
    def incValOfReg(self,coords, label):
        w1,w2,h1,h2 = coords
        self.mask[h1:h2,w1:w2] = self.mask[h1:h2,w1:w2] + 30
        self.labels[h1:h2, w1:w2] = label
    
    # Decrease value of region function will remove some heat from heatmap
    # We'll use this function if a region considered negative
    def decValOfReg(self,coords):
        w1,w2,h1,h2 = coords
        self.mask[h1:h2,w1:w2] = self.mask[h1:h2,w1:w2] - 30
    
    def compileHeatmap(self):
        
        # As you know,pixel values must be between 0 and 255 (uint8)
        # Now we'll scale our values between 0 and 255 and convert it to uint8
        
        # Scaling between 0 and 1 
        scaler = MinMaxScaler()
        
        self.mask = scaler.fit_transform(self.mask)
        
        
        # Scaling between 0 and 255
        self.mask = np.asarray(self.mask * 255).astype(np.uint8)
        
        # Now we'll threshold our mask, if a value is higher than 170, it will be white else
        # it will be black
        self.mask = cv2.inRange(self.mask,170,255)
        
        return self.mask
    
# Adapted from https://www.kaggle.com/code/mehmetlaudatekman/support-vector-machine-object-detection
def slideExtract(image, windowSize=(WINDOW_IMG_WIDTH, WINDOW_IMG_HEIGHT), step=12):
    
    # We'll store coords and features in these lists
    coords = []
    features = []
    
    hIm,wIm = image.shape[:2] 

    
    # W1 will start from 0 to end of image - window size
    # W2 will start from window size to end of image
    # We'll use step (stride) like convolution kernels.
    for w1,w2 in zip(range(0,wIm-windowSize[0],step),range(windowSize[0],wIm,step)):
       
        for h1,h2 in zip(range(0,hIm-windowSize[1],step),range(windowSize[1],hIm,step)):
            window = image[h1:h2,w1:w2]
            features_of_window = hog(window,orientations=9,pixels_per_cell=(16,16),
                                     cells_per_block=(2,2), channel_axis=2
                                    )
            
            coords.append((w1,w2,h1,h2))
            features.append(features_of_window)
    
    return coords, np.asarray(features)

def detection(model, val_images_dir, step=24, threshold=0.5):
    pred_classes = {}
    pred_bboxes = {}
    
    for count, file in enumerate(os.listdir(val_images_dir)):
        # Open the image
        image_id = int(file.split('.')[0])
        image = np.asarray(Image.open(os.path.join(val_images_dir, file)))
        
        # Extracting features and initalizing heatmap
        coords,features = slideExtract(image, step=step)
        htmp = Heatmap(image)
        
        
        for i in range(len(features)):
            # If region is positive then add some heat
            pred = model.predict_proba([features[i]])
            if(max(pred[0]) > threshold):
                htmp.incValOfReg(coords[i], np.argmax(pred[0]) + 1)
            else:
                htmp.decValOfReg(coords[i])
        
        # Compiling heatmap
        mask = htmp.compileHeatmap()
        
        cont,_ = cv2.findContours(mask,1,2)[:2]
        for c in cont:
            # If a contour is small don't consider it
            if cv2.contourArea(c) < 70*70:
                continue
            
            (x,y,w,h) = cv2.boundingRect(c)
            
            if (htmp.labels[y,x] != 0):
                if image_id in pred_bboxes.keys():
                    pred_bboxes[image_id].append([x,y,w,h])
                else:
                    pred_bboxes[image_id] = [[x,y,w,h]]
                    
                if image_id in pred_classes.keys():
                    pred_classes[image_id].append(htmp.labels[y,x])
                else:
                    pred_classes[image_id] = [htmp.labels[y,x]]
        
        if(count % 50 == 0):
            print(f"{count}/374 images processed")
                
    return pred_classes, pred_bboxes

In [17]:
# Load the model
model = pickle.load(open('../models/SVM_MODEL.pkl', 'rb'))

In [None]:
pred_classes, pred_bboxes = detection(model, val_images_dir='../data/images/test', step=36)

0/374 images processed
50/374 images processed
100/374 images processed
150/374 images processed
200/374 images processed
250/374 images processed
300/374 images processed
350/374 images processed


In [None]:
x = []
for val in pred_classes.values():
    x += val
    
np.unique(np.array(x), return_counts=True)

(array([1., 2., 4., 5., 6.]), array([717, 811,  32,   1,   4], dtype=int64))

In [None]:
val_images_dir = "../data/images/test"
val_images = {int(file.split('.')[0]):np.asarray(Image.open(os.path.join(val_images_dir, file))) for file in os.listdir(val_images_dir)}

TypeError: only integer scalar arrays can be converted to a scalar index

In [None]:
val_labels = []
for label in test_labels['classification'].values():
    val_labels = val_labels + label

val_labels = np.array(val_labels)

## Calculate the AP

In [18]:
def IoU(boxA, boxB):
    # Resource: https://pyimagesearch.com/2016/11/07/intersection-over-union-iou-for-object-detection/
    # determine the (x, y)-coordinates of the intersection rectangle
	xA = max(boxA[0], boxB[0])
	yA = max(boxA[1], boxB[1])
	xB = min(boxA[2], boxB[2])
	yB = min(boxA[3], boxB[3])
	# compute the area of intersection rectangle
	interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
	# compute the area of both the prediction and ground-truth
	# rectangles
	boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
	boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
	# compute the intersection over union by taking the intersection
	# area and dividing it by the sum of prediction + ground-truth
	# areas - the interesection area
	iou = interArea / float(boxAArea + boxBArea - interArea)
	# return the intersection over union value
	return iou

def calculate_ap(true_classes, pred_classes, true_bboxes, pred_bboxes, threshold=0.5):
    # Store the aps
    class_aps = np.zeros(len(CLASSES))
        
        # Loop through the images
          # Loop through each category in the image
           # Loop through predited bounding boxes
            # Loop through real bounding boxes
                # If they match IoU and predition --> TP++
                # Otherwise FP++
           # Remaining are FN (need to multiply by 2) 
        # Calculate AP
        
        
    # Track TP, FP, and FN for each class
    class_TPS = np.zeros(len(CLASSES))
    class_FPS = np.zeros(len(CLASSES))
    class_FNS = np.zeros(len(CLASSES))
        
    # Loop through the images
    for image_id in pred_classes.keys():
        
        # Get the true and predicted classes for this image
        true_classes_image = true_classes[image_id]
        pred_classes_image = pred_classes[image_id]
        
        # Get the true and predicted bboxes for this image
        true_bboxes_image = true_bboxes[image_id]
        pred_bboxes_image = pred_bboxes[image_id]
    
        # For each true category in the image
        for cat in true_classes_image:
            # If there are no predictions
            if(len(pred_classes_image) == 0):
                class_FNS[cat - 1] += len(true_classes_image)
                continue
            
            # Calculate number of FNS
            if(len(true_classes_image) > len(pred_classes_image)):
                class_FNS[cat - 1] += len(true_classes_image) - len(pred_classes_image)
            
            # Calculate TPS and FPS
            for j in range(len((true_bboxes_image))):
                for k in range(j, len(pred_bboxes_image)):
                    true_bbox = true_bboxes_image[j]
                    pred_bbox = pred_bboxes_image[k]
                    
                    # Calculate IoU
                    iou = IoU(true_bbox, pred_bbox)
                    
                    # If the IoU is greater than the threshold
                    if(iou > threshold):
                        # If the predicted class is the same as the true class
                        if(int(pred_classes_image[k]) == cat):
                            class_TPS[cat - 1] += 1
                        else:
                            class_FPS[cat - 1] += 1
                    else:
                        class_FPS[cat - 1] += 1
        
    for cat in range(len(CLASSES)):
        # Calculate Precision and Recall 
        precision = np.nan_to_num(class_TPS[cat] / (class_TPS[cat] + class_FPS[cat]), nan=0.1, posinf=1, neginf=0)
        recall = np.nan_to_num(class_TPS[cat] / (class_TPS[cat] + class_FNS[cat]), nan=0.1, posinf=1, neginf=0)
        class_aps[cat] += np.nan_to_num(precision*recall, nan=0.1, posinf=1, neginf=0)
    
    # Return the mean AP for each class
    return class_aps
    
    

In [None]:
# Calculate the Average Precision (AP) for each class
class_aps = calculate_ap(true_classes=test_labels['classification'], pred_classes=pred_classes, true_bboxes=test_labels['bbox'], pred_bboxes=pred_bboxes)

In [19]:
class_aps = np.zeros(len(CLASSES))

for threshold in np.arange(0.1, 1, 0.2):
    print(f"Threshold: {threshold}")
    pred_classes, pred_bboxes = detection(model, val_images_dir='../data/images/test', step=36, threshold=threshold)
    class_aps += (calculate_ap(true_classes=test_labels['classification'], pred_classes=pred_classes, true_bboxes=test_labels['bbox'], pred_bboxes=pred_bboxes, threshold=threshold)) * 100

Threshold: 0.1
0/374 images processed
50/374 images processed
100/374 images processed
150/374 images processed
200/374 images processed
250/374 images processed
300/374 images processed
350/374 images processed
Threshold: 0.30000000000000004
0/374 images processed
50/374 images processed
100/374 images processed
150/374 images processed
200/374 images processed
250/374 images processed
300/374 images processed
350/374 images processed
Threshold: 0.5000000000000001
0/374 images processed
50/374 images processed
100/374 images processed
150/374 images processed
200/374 images processed
250/374 images processed
300/374 images processed
350/374 images processed
Threshold: 0.7000000000000001
0/374 images processed
50/374 images processed
100/374 images processed
150/374 images processed
200/374 images processed
250/374 images processed
300/374 images processed
350/374 images processed
Threshold: 0.9000000000000001
0/374 images processed
50/374 images processed
100/374 images processed
150/

In [22]:
class_aps * 100 / 5

array([23.0516841 , 26.37819468, 20.        , 20.45913682, 20.        ,
       20.        ])

In [None]:
pred_classes

{}

In [None]:
test_labels['classification'][12]

[6, 6]