In [1]:
import cv2
import glob
import os
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [2]:
# Root directory of the license-plate recognition dataset; the training folders,
# the test folder and test.csv used in this notebook are all resolved against it.
base_dir = '/project/lt900038-ai23tn/kaggle/im_license_plate_recognition'

# Training images

## Exclude images whose labels contain errors

In [3]:
# The labelled training images are split across two folders under base_dir.
# NOTE(review): the second folder is spelled '2-LRP-950' (not 'LPR') — presumably
# this matches the directory name on disk; confirm before "fixing" it.
train1_folder = os.path.join(base_dir, '1-LPR-950/1-LPR-950/1')
train2_folder = os.path.join(base_dir, '2-LRP-950/2-LRP-950/2')

# glob.glob() returns matches in arbitrary (filesystem) order; sort so that every
# downstream step sees the same file order on every run and every machine.
train1_files = sorted(glob.glob(os.path.join(train1_folder, '*.jpg')))
train2_files = sorted(glob.glob(os.path.join(train2_folder, '*.jpg')))
len(train1_files), len(train2_files)

(934, 1037)

In [5]:
def valid_image(src_path):
    """Return True if the label file belonging to ``src_path`` exists and looks sane.

    The label file is looked up at ``<image dir>/labels/<image name>`` with
    ``.jpg`` replaced by ``.txt``. Every non-blank row is split on whitespace and
    fields 1 and 2 (presumably the normalized x/y box centre of a YOLO label —
    TODO confirm) must parse as floats inside ``[0, 1]``.

    Args:
        src_path: Path to a ``.jpg`` image.

    Returns:
        False if the label file is missing or unreadable, if a row has fewer
        than three fields or a non-numeric value, or if a checked coordinate is
        NaN or outside ``[0, 1]``; True otherwise (including an empty label file).
    """
    label_name = os.path.basename(src_path).replace('.jpg', '.txt')
    label_path = os.path.join(os.path.dirname(src_path), 'labels', label_name)
    try:
        with open(label_path, 'r') as label_file:
            rows = [line.split() for line in label_file]
    except (OSError, UnicodeDecodeError):
        # Missing or unreadable label: treat the image as unusable, but do not
        # swallow unrelated errors (e.g. KeyboardInterrupt) like a bare except would.
        return False

    for bbox in rows:
        if not bbox:
            # Blank line (e.g. trailing newline at end of file) carries no box.
            continue
        try:
            x, y = float(bbox[1]), float(bbox[2])
        except (IndexError, ValueError):
            # Truncated or non-numeric row: the label is malformed.
            return False
        # Written as a positive range test so that NaN is rejected as well.
        if not (0 <= x <= 1 and 0 <= y <= 1):
            return False

    return True

In [6]:
# Collect every training image whose label file passes valid_image().
# Each source folder is scanned in turn, with its own progress bar.
valid_files = []

for folder_files in (train1_files, train2_files):
    valid_files.extend(p for p in tqdm(folder_files) if valid_image(p))

100%|██████████| 934/934 [00:05<00:00, 184.85it/s]
100%|██████████| 1037/1037 [00:06<00:00, 169.65it/s]


In [7]:
# Number of training images (both folders combined) that passed valid_image().
len(valid_files)

1929

## Copy images into folders following YOLO data format

In [16]:
# Hold out 15% of the valid images for validation.
# The input is sorted and random_state is fixed so that the split is identical
# after a kernel restart (the input order alone is not guaranteed to be stable).
train_files, val_files = train_test_split(sorted(valid_files), test_size=0.15, random_state=42)

In [18]:
# Sizes of the training and validation splits, in that order.
len(train_files), len(val_files)

(1639, 290)

In [38]:
# Copy the training split into <save_dir>/images and <save_dir>/labels.
# Both target folders are derived from save_dir so the root is written only once.
save_dir = '/scratch/lt900002-ai2301/heart/image/yolo/train'
img_dir = os.path.join(save_dir, 'images')
label_dir = os.path.join(save_dir, 'labels')
os.makedirs(img_dir, exist_ok=True)
os.makedirs(label_dir, exist_ok=True)

# NOTE(review): files from both source folders land in the same target folder,
# so two images sharing a file name would overwrite each other — confirm that
# names are unique across the source folders.
for path in tqdm(train_files):
    img_name = os.path.basename(path)
    label_name = img_name.replace('.jpg', '.txt')
    # The source label sits in a 'labels' subfolder next to the image.
    label_path = os.path.join(os.path.dirname(path), 'labels', label_name)
    shutil.copyfile(path, os.path.join(img_dir, img_name))
    shutil.copyfile(label_path, os.path.join(label_dir, label_name))

100%|██████████| 1639/1639 [00:16<00:00, 98.10it/s] 


In [None]:
# Copy the validation split into <save_dir>/images and <save_dir>/labels.
# Both target folders are derived from save_dir so the root is written only once.
save_dir = '/scratch/lt900002-ai2301/heart/image/yolo/val'
img_dir = os.path.join(save_dir, 'images')
label_dir = os.path.join(save_dir, 'labels')
os.makedirs(img_dir, exist_ok=True)
os.makedirs(label_dir, exist_ok=True)

# NOTE(review): files from both source folders land in the same target folder,
# so two images sharing a file name would overwrite each other — confirm that
# names are unique across the source folders.
for path in tqdm(val_files):
    img_name = os.path.basename(path)
    label_name = img_name.replace('.jpg', '.txt')
    # The source label sits in a 'labels' subfolder next to the image.
    label_path = os.path.join(os.path.dirname(path), 'labels', label_name)
    shutil.copyfile(path, os.path.join(img_dir, img_name))
    shutil.copyfile(label_path, os.path.join(label_dir, label_name))

100%|██████████| 290/290 [00:01<00:00, 203.28it/s]


# Test images

In [10]:
def read_image(path):
    """Load the image at ``path`` with OpenCV and return it converted from BGR to RGB.

    Args:
        path: Path of the image file to read.

    Returns:
        The image array after ``cv2.COLOR_BGR2RGB`` conversion.

    Raises:
        OSError: If ``cv2.imread`` could not read the file. OpenCV signals this
            by returning ``None`` instead of raising, which would otherwise
            surface as an opaque error inside ``cv2.cvtColor``.
    """
    img = cv2.imread(path)
    if img is None:
        raise OSError(f'Could not read image: {path}')
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

def write_image(path, img):
    """Save an RGB image array to ``path`` with OpenCV.

    The array is converted from RGB to BGR first, because that is the channel
    order ``cv2.imwrite`` expects.

    Args:
        path: Destination file path; the extension selects the image format.
        img: Image array in RGB channel order.

    Raises:
        OSError: If ``cv2.imwrite`` reports that the file was not written
            (it returns ``False`` rather than raising, e.g. for a missing folder).
    """
    bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    if not cv2.imwrite(path, bgr):
        raise OSError(f'Could not write image: {path}')

In [8]:
# Folder with the test images; files are later looked up here as '<image_id>.jpg'.
test_folder = os.path.join(base_dir, 'test/test')

In [9]:
# Test metadata table. The cropping step below reads its 'image_id', 'x1', 'y1',
# 'x2' and 'y2' columns.
test_df = pd.read_csv(os.path.join(base_dir, 'test.csv'))

In [11]:
# Crop only the license plate and save to another folder
save_folder = '/scratch/lt900002-ai2301/heart/image/test'
os.makedirs(save_folder, exist_ok=True)

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    file_name = row['image_id'] + '.jpg'
    img = read_image(os.path.join(test_folder, file_name))
    # Clamp to 0: a negative bound would make numpy count from the opposite edge
    # and silently produce a wrong (or empty) crop.
    x1, y1 = max(int(row['x1']), 0), max(int(row['y1']), 0)
    # +1 because slice ends are exclusive; x2/y2 are presumably inclusive pixel
    # coordinates — TODO confirm against the dataset description.
    x2_end, y2_end = max(int(row['x2']) + 1, 0), max(int(row['y2']) + 1, 0)
    write_image(os.path.join(save_folder, file_name), img[y1:y2_end, x1:x2_end])

100%|██████████| 1991/1991 [00:16<00:00, 117.93it/s]
