In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import imageio
import shutil
import os

# from Definitions_TSRD
from Definitions_TSRD import T, DATA_PATH, MODELS_PATH, TEACHER_TO_STUDENTS

<font size=7><b>CPSC471 Project: Data Preprocessing</b></font>

**Ruomu (Felix) Zou**

This notebook reads the original dataset in `../Data/OriginalFormat/` and converts it to a PyTorch-dataloader-friendly format in `../Data/PytorchFriendlyFormat/` (one sub-folder per category under `Train/` and `Test/`). It also crops each image so that only the sign is kept.

# Read in All Requisite Info:

In [2]:
# load in raw label files
labels_train = pd.read_csv(os.path.join(DATA_PATH, "OriginalFormat", "Labels_Train.txt"), sep=";",
                           usecols=[0,1,2,3,4,5,6,7],
                           names=["ImageFile", "Width", "Height", "x1", "y1", "x2", "y2", "Category"])
labels_test = pd.read_csv(os.path.join(DATA_PATH, "OriginalFormat", "Labels_Test.txt"), sep=";",
                          usecols=[0,1,2,3,4,5,6,7],
                          names=["ImageFile", "Width", "Height", "x1", "y1", "x2", "y2", "Category"])

# load in english label names
category_names_english = pd.read_csv(os.path.join(DATA_PATH, "OriginalFormat", "Category_Names_English.txt"), sep=",",
                                     names=["CategoryNumber", "CategoryName_English"])

# merge label names into labels
labels_train = labels_train.merge(category_names_english, left_on="Category", right_on="CategoryNumber")
labels_test = labels_test.merge(category_names_english, left_on="Category", right_on="CategoryNumber")

# Reorganize the Data:

In [3]:
### make the new folder (flush if it already exists) ###
new_folder_path = os.path.join(DATA_PATH, "PytorchFriendlyFormat/")
if os.path.exists(new_folder_path):
    shutil.rmtree(new_folder_path)
os.makedirs(new_folder_path)  # this is now an empty folder


def _crop_to_sign(img, x1, y1, x2, y2):
    """Return the bounding-box crop of an image array.

    Parameters
    ----------
    img : array as returned by ``imageio.imread``; indexed as
        (row, column) with an optional trailing channel axis.
    x1, x2 : horizontal (column) bounds of the box.
    y1, y2 : vertical (row) bounds of the box.

    Notes
    -----
    Rows are the vertical axis, so the y-bounds must slice the first axis and
    the x-bounds the second. The channel axis is intentionally not indexed so
    that 2-D (grayscale) images are handled as well as colour ones.
    """
    return img[int(y1):int(y2), int(x1):int(x2)]


def _reorganize_split(labels, split_name):
    """Create ``<new_folder_path>/<split_name>/<category>/`` and fill it with cropped images.

    Parameters
    ----------
    labels : DataFrame with columns "ImageFile", "CategoryName_English",
        "x1", "y1", "x2", "y2" (one row per image).
    split_name : name of the split sub-folder; used both under
        ``OriginalFormat/`` (source) and ``PytorchFriendlyFormat/`` (target).

    Returns
    -------
    str : path of the newly created split folder.
    """
    # make the split folder
    split_path = os.path.join(new_folder_path, f"{split_name}/")
    os.makedirs(split_path)

    # make one folder per category, even if the split has no image for it
    for cat_name in category_names_english["CategoryName_English"]:
        os.makedirs(os.path.join(split_path, f"{cat_name}/"))

    # crop each image to its bounding box and write it into its category folder
    orig_split_path = os.path.join(DATA_PATH, f"OriginalFormat/{split_name}/")
    columns = ["ImageFile", "CategoryName_English", "x1", "y1", "x2", "y2"]
    for img_file, category_name, x1, y1, x2, y2 in labels[columns].itertuples(index=False, name=None):
        orig_img_path = os.path.join(orig_split_path, img_file)
        new_img_path = os.path.join(split_path, f"{category_name}/", img_file)
        img = imageio.imread(orig_img_path)
        imageio.imwrite(new_img_path, _crop_to_sign(img, x1, y1, x2, y2))

    return split_path


### reorganize training samples ###
train_path = _reorganize_split(labels_train, "Train")

### reorganize testing samples ###
test_path = _reorganize_split(labels_test, "Test")