In [1]:
import os
import pandas as pd
import numpy as np
import itertools

https://madewithml.com/courses/mlops/splitting/

In [2]:
class LabelEncoder(object):
    """Label encoder for tag labels."""
    def __init__(self, class_to_index={}):
        self.class_to_index = class_to_index
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())

    def __len__(self):
        return len(self.class_to_index)

    def __str__(self):
        return f"<LabelEncoder(num_classes={len(self)})>"

    def fit(self, y):
        classes = np.unique(list(itertools.chain.from_iterable(y)))
        for i, class_ in enumerate(classes):
            self.class_to_index[class_] = i
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())
        return self

    def encode(self, y):
        y_one_hot = np.zeros((len(y), len(self.class_to_index)), dtype=int)
        for i, item in enumerate(y):
            for class_ in item:
                y_one_hot[i][self.class_to_index[class_]] = 1
        return y_one_hot

    def decode(self, y):
        classes = []
        for i, item in enumerate(y):
            indices = np.where(item == 1)[0]
            classes.append([self.index_to_class[index] for index in indices])
        return classes

    def save(self, fp):
        with open(fp, "w") as fp:
            contents = {"class_to_index": self.class_to_index}
            json.dump(contents, fp, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        with open(fp, "r") as fp:
            kwargs = json.load(fp=fp)
        return cls(**kwargs)

In [3]:
import xml.etree.ElementTree as ET
def read_content(xml_file: str):

    tree = ET.parse(xml_file)
    root = tree.getroot()

    list_boxes = []
    list_labels = []
    filename = root.find('filename').text
    for boxes in root.iter('object'):
        label = str(boxes.find('name').text)
        ymin, xmin, ymax, xmax = None, None, None, None

        ymin = int(boxes.find("bndbox/ymin").text)
        xmin = int(boxes.find("bndbox/xmin").text)
        ymax = int(boxes.find("bndbox/ymax").text)
        xmax = int(boxes.find("bndbox/xmax").text)

        list_with_single_boxes = [xmin, ymin, xmax, ymax]
        list_boxes.append(list_with_single_boxes)
        list_labels.append(label)

    return filename, list_boxes,list_labels

In [4]:
data_dir = os.path.join('datasets','car_dent_voc_comb')

In [5]:
fname = data_dir+'/1--1-_jpeg.rf.4e797e339487b67b9581f01fb5482963.xml'

In [6]:
read_content(fname)

('1--1-_jpeg.rf.4e797e339487b67b9581f01fb5482963.jpg',
 [[75, 89, 638, 641]],
 ['front-bumper-dent'])

### Reading loading file names and corresponding labels

In [7]:
files = []
labels = []
for file in os.listdir(data_dir):
    
    if file.rsplit('.')[-1] == 'xml':
        path = os.path.join(data_dir,file)
        img,bbox,label = read_content(path)
        files.append(img)
        labels.append(label)

In [22]:
train_df = pd.DataFrame({'filename':files,'labels':labels})

### Checking class representations

In [23]:
train_df.explode('labels').value_counts('labels')

labels
front-bumper-dent          1014
doorouter-dent              776
bonnet-dent                 557
rear-bumper-dent            554
fender-dent                 480
quaterpanel-dent            417
Headlight-Damage            285
Rear-windscreen-Damage      253
Taillight-Damage            233
roof-dent                   232
RunningBoard-Dent           216
Sidemirror-Damage           185
Front-Windscreen-Damage     140
Signlight-Damage             56
pillar-dent                  42
medium-Bodypanel-Dent         3
Major-Rear-Bumper-Dent        1
dtype: int64

### dropping classes with very less representation

In [24]:
drop_labels = ['medium-Bodypanel-Dent','Major-Rear-Bumper-Dent']

In [25]:
train_df['labels'] = train_df['labels'].apply(lambda x:[v for v in x if v not in drop_labels])

In [26]:
train_df.explode('labels').value_counts('labels')

labels
front-bumper-dent          1014
doorouter-dent              776
bonnet-dent                 557
rear-bumper-dent            554
fender-dent                 480
quaterpanel-dent            417
Headlight-Damage            285
Rear-windscreen-Damage      253
Taillight-Damage            233
roof-dent                   232
RunningBoard-Dent           216
Sidemirror-Damage           185
Front-Windscreen-Damage     140
Signlight-Damage             56
pillar-dent                  42
dtype: int64

In [27]:
print(len(train_df))
train_df.dropna(inplace=True)
len(train_df)

3747


3747

### Splitting dataset

In [28]:
X = train_df.filename.to_numpy()
y = train_df.labels.to_numpy()

In [29]:
label_encoder = LabelEncoder()
label_encoder.fit(y)
num_classes = len(label_encoder)
num_classes

17

In [30]:
Y = label_encoder.encode(y)

In [33]:
# Split sizes
train_size = 0.7
val_size = 0.15
test_size = 0.15

In [41]:
from skmultilearn.model_selection import IterativeStratification
from collections import Counter
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix

def iterative_train_test_split(X, y, train_size):
    """Custom iterative train test split which
    'maintains balanced representation with respect
    to order-th label combinations.'
    """
    stratifier = IterativeStratification(
        n_splits=2, order=1, sample_distribution_per_fold=[1.0-train_size, train_size, ])
    train_indices, test_indices = next(stratifier.split(X, y))
    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]
    return X_train, X_test, y_train, y_test

In [42]:
X_train, X_, y_train, y_ = iterative_train_test_split(
    X, Y, train_size=train_size)
X_val, X_test, y_val, y_test = iterative_train_test_split(
    X_, y_, train_size=0.5)

print(f"train: {len(X_train)} ({len(X_train)/len(X):.2f})\n"
      f"val: {len(X_val)} ({len(X_val)/len(X):.2f})\n"
      f"test: {len(X_test)} ({len(X_test)/len(X):.2f})")

train: 2612 (0.70)
val: 558 (0.15)
test: 577 (0.15)


In [56]:
from shutil import copy

output_dir = os.path.join('datasets','car_dent_voc_new','train')
for file in X_train:
    src1 = os.path.join(data_dir,file)
    src2 = os.path.join(data_dir,file.rsplit('.',maxsplit=1)[0]+'.xml')
    copy(src1,output_dir)
    copy(src2,output_dir)
    

In [57]:
from shutil import copy

output_dir = os.path.join('datasets','car_dent_voc_new','valid')
for file in X_val:
    src1 = os.path.join(data_dir,file)
    src2 = os.path.join(data_dir,file.rsplit('.',maxsplit=1)[0]+'.xml')
    copy(src1,output_dir)
    copy(src2,output_dir)
    

In [58]:
from shutil import copy

output_dir = os.path.join('datasets','car_dent_voc_new','test')
for file in X_test:
    src1 = os.path.join(data_dir,file)
    src2 = os.path.join(data_dir,file.rsplit('.',maxsplit=1)[0]+'.xml')
    copy(src1,output_dir)
    copy(src2,output_dir)
    