# Car Bounding Box Dataset Pre-processing

# Import required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import cv2
import os
import shutil
import datetime
import pytz

from tqdm import tqdm

%matplotlib inline

# Define root directory path

In [2]:
# Root folder of the original dataset. This is a machine-specific absolute
# Windows path; edit it to match the local setup. check_orig_data() looks under
# this folder for names_orig.csv, annot_train_orig.csv, annot_test_orig.csv,
# car_data/train and car_data/test.
dataset_path = os.path.normpath(r'E:\Sync_With_NAS_Ext\Datasets\Image_Datasets\Stanford_Car_Dataset')

# Define Functions

## Function: Check original data first

In [3]:
def _count_tree(img_root):
    """Return (top-level entry count, total file count) for an image tree.

    Returns (0, 0) when img_root is not an existing directory, so callers can
    report a missing tree instead of crashing on os.listdir().
    """
    if not os.path.isdir(img_root):
        return 0, 0
    num_entries = len(os.listdir(img_root))
    num_files = sum(len(files) for _, _, files in os.walk(img_root))
    return num_entries, num_files


def check_orig_data(dataset_path):
    """Print a summary of the original dataset found under dataset_path.

    Checks for the three original CSV files (names_orig.csv,
    annot_train_orig.csv, annot_test_orig.csv) and for the car_data/train and
    car_data/test image directories, then prints, for each image directory,
    the number of top-level entries (reported as "folders") and the total
    number of files found by walking the tree.

    Missing files or directories are reported with a WARNING line and counted
    as 0; they do not raise.

    Args:
        dataset_path: Root directory of the original dataset.

    Returns:
        None. All results are printed to stdout.
    """
    # Report the CSV files first, in a fixed order.
    for csv_name in ('names_orig.csv', 'annot_train_orig.csv', 'annot_test_orig.csv'):
        if os.path.isfile(os.path.join(dataset_path, csv_name)):
            print(f"{csv_name} file found in dataset_path...")
        else:
            print(f"WARNING: {csv_name} file NOT found in dataset_path...")

    train_img_orig_path = os.path.join(dataset_path, 'car_data/train')
    test_img_orig_path = os.path.join(dataset_path, 'car_data/test')
    for set_name, img_path in (("Training", train_img_orig_path),
                               ("Test", test_img_orig_path)):
        if os.path.isdir(img_path):
            print(f"{set_name} set images path found in dataset_path...")
        else:
            print(f"WARNING: {set_name} set images path NOT found in dataset_path...")

    # Counting happens only after the existence checks, and tolerates a
    # missing directory, so the report above is always produced.
    train_img_orig_num_fold, num_train_set_images = _count_tree(train_img_orig_path)
    test_img_orig_num_fold, num_test_set_images = _count_tree(test_img_orig_path)

    print()
    print(f"{train_img_orig_num_fold} folders found in training set images path")
    print(f"{test_img_orig_num_fold} folders found in test set images path")
    print()
    print(f"{num_train_set_images} images found in training set images path")
    print(f"{num_test_set_images} images found in test set images path")
    print()

## Function: Create folders

In [4]:
def create_cons_dset_folders(dataset_path):
    """Recreate an empty Consolidated_Dataset folder tree under dataset_path.

    Any existing Consolidated_Dataset folder and Consolidated_Dataset.zip file
    are deleted first. Afterwards Consolidated_Dataset/, with empty
    train_images/ and test_images/ sub-folders, is created. Progress is
    printed to stdout.

    Args:
        dataset_path: Root directory of the dataset.
    """
    root_dir = os.path.join(dataset_path, "Consolidated_Dataset")
    archive_path = os.path.join(dataset_path, "Consolidated_Dataset.zip")

    # Remove leftovers from a previous run (if any)
    if os.path.isdir(root_dir):
        print("Consolidated_Dataset folder found: Deleting...")
        shutil.rmtree(root_dir)
    if os.path.isfile(archive_path):
        print("Consolidated_Dataset.zip file found: Deleting...")
        os.remove(archive_path)

    print()
    # Parent first, then its two image sub-folders
    steps = (
        ("Creating Consolidated_Dataset folder...", root_dir),
        ("Creating Consolidated_Dataset/train_images folder...",
         os.path.join(root_dir, "train_images")),
        ("Creating Consolidated_Dataset/test_images folder...",
         os.path.join(root_dir, "test_images")),
    )
    for message, folder in steps:
        print(message)
        os.mkdir(folder)
    print()

In [5]:
def create_red_dset_folders(dataset_path):
    """Recreate an empty Reduced_Dataset folder tree under dataset_path.

    Any existing Reduced_Dataset folder and Reduced_Dataset.zip file are
    deleted first. Afterwards Reduced_Dataset/, with empty train_images/ and
    test_images/ sub-folders, is created. Progress is printed to stdout.

    Args:
        dataset_path: Root directory of the dataset.
    """
    root_dir = os.path.join(dataset_path, "Reduced_Dataset")
    archive_path = os.path.join(dataset_path, "Reduced_Dataset.zip")

    # Remove leftovers from a previous run (if any)
    if os.path.isdir(root_dir):
        print("Reduced_Dataset folder found: Deleting...")
        shutil.rmtree(root_dir)
    if os.path.isfile(archive_path):
        print("Reduced_Dataset.zip file found: Deleting...")
        os.remove(archive_path)

    print()
    # Parent first, then its two image sub-folders
    steps = (
        ("Creating Reduced_Dataset folder...", root_dir),
        ("Creating Reduced_Dataset/train_images folder...",
         os.path.join(root_dir, "train_images")),
        ("Creating Reduced_Dataset/test_images folder...",
         os.path.join(root_dir, "test_images")),
    )
    for message, folder in steps:
        print(message)
        os.mkdir(folder)
    print()

## Function: Consolidate all training and test set images

In [6]:
def _consolidate_split(src_root, dest_dir):
    """Copy every file in each sub-folder of src_root into dest_dir.

    Entries of src_root that are not directories, and entries of a sub-folder
    that are not regular files, are skipped. Returns the number of entries in
    dest_dir after copying.
    """
    for folder in os.listdir(src_root):
        class_dir = os.path.join(src_root, folder)
        if not os.path.isdir(class_dir):
            # Stray file next to the class folders: nothing to descend into.
            continue
        for file_name in os.listdir(class_dir):
            file_path = os.path.join(class_dir, file_name)
            if os.path.isfile(file_path):
                shutil.copy(file_path, dest_dir)
    return len(os.listdir(dest_dir))


def cons_img_files(dataset_path):
    """Flatten the per-class image folders into the Consolidated_Dataset folders.

    Copies every file from each sub-folder of car_data/train into
    Consolidated_Dataset/train_images/ and from each sub-folder of
    car_data/test into Consolidated_Dataset/test_images/, printing the number
    of entries in each destination afterwards. The destination folders must
    already exist.

    Files that share a name across sub-folders overwrite one another in the
    destination, so the printed count is the number of distinct names there.

    Args:
        dataset_path: Root directory of the dataset.
    """
    for split, set_name in (('train', 'training'), ('test', 'test')):
        src_root = os.path.join(dataset_path, f'car_data/{split}')
        dest_dir = os.path.join(dataset_path, f'Consolidated_Dataset/{split}_images/')
        print(f"Consolidating {set_name} set images to "
              f"Consolidated_Dataset/{split}_images... ", end='')
        num_consolidated = _consolidate_split(src_root, dest_dir)
        print("%d images consolidated." % num_consolidated)
    print()

## Function: Create label-class dictionary mapping

In [7]:
def create_lbl_cls_dict(dataset_path):
    """Build the label -> class-name mapping from names_orig.csv.

    The CSV is read without a header row into a single 'class' column. Each
    row's index plus one becomes the label key, and the row's 'class' value
    becomes the class name.

    Args:
        dataset_path: Root directory of the dataset (contains names_orig.csv).

    Returns:
        dict mapping label to class name.
    """
    names_csv = os.path.join(dataset_path, 'names_orig.csv')
    names_df = pd.read_csv(names_csv, header = None, names = ['class'])

    print("Creating label_class_dict...")
    # Keys are the row index shifted by one
    mapping = {
        row_idx + 1: class_name
        for row_idx, class_name in zip(names_df.index, names_df['class'])
    }

    print()
    return mapping

## Function: Update train annotation file

In [8]:
def upd_train_ann(dataset_path, label_map=None):
    """Add image size and class name to the training annotations and save them.

    Reads annot_train_orig.csv (no header row; columns are named filename,
    xmin, ymin, xmax, ymax, label), loads every referenced image from
    Consolidated_Dataset/train_images/ to get its height and width, looks up
    the class name of each label, and writes the extended table to
    Consolidated_Dataset/annot_train_cons.csv.

    Args:
        dataset_path: Root directory of the dataset.
        label_map: Optional mapping from label value to class name. When
            omitted, the module-level label_class_dict is used, so existing
            one-argument calls keep working.

    Returns:
        The annotation DataFrame with the added img_h, img_w and class columns.

    Raises:
        KeyError: If a label in the annotation file is not in the mapping.
        NameError: If label_map is omitted and label_class_dict is not defined.
    """
    if label_map is None:
        # Fall back to the global mapping created by the run cell.
        label_map = label_class_dict

    annot_train_orig_csv_path = os.path.join(dataset_path, 'annot_train_orig.csv')
    df_cols = ['filename', 'xmin', 'ymin', 'xmax', 'ymax', 'label']
    annot_train_df = pd.read_csv(annot_train_orig_csv_path, header = None, names = df_cols,
                                 index_col = False)
    train_img_cons_path = os.path.join(dataset_path, 'Consolidated_Dataset/train_images/')

    print("Adding image height, width and image class to annot_train_df...")
    # Collect the new values in plain lists and attach them as whole columns;
    # this avoids growing the frame one cell at a time.
    heights, widths, class_names = [], [], []
    for filename, label in zip(annot_train_df['filename'], annot_train_df['label']):
        img_arr = mpimg.imread(os.path.join(train_img_cons_path, filename))
        heights.append(img_arr.shape[0])
        widths.append(img_arr.shape[1])
        class_names.append(label_map[label])

    # float64 keeps the saved CSV text the same as the per-cell assignment
    # produced (e.g. "360.0").
    annot_train_df['img_h'] = pd.Series(heights, index = annot_train_df.index,
                                        dtype = 'float64')
    annot_train_df['img_w'] = pd.Series(widths, index = annot_train_df.index,
                                        dtype = 'float64')
    annot_train_df['class'] = pd.Series(class_names, index = annot_train_df.index,
                                        dtype = 'object')

    annot_train_cons_csv_path = os.path.join(dataset_path,
                                             'Consolidated_Dataset/annot_train_cons.csv')
    annot_train_df.to_csv(annot_train_cons_csv_path, index = False)
    print("annot_train_df saved in path Consolidated_Dataset/annot_train_cons.csv...")
    print()
    return annot_train_df

## Function: Update test annotation file

In [9]:
def upd_test_ann(dataset_path, label_map=None):
    """Add image size and class name to the test annotations and save them.

    Reads annot_test_orig.csv (no header row; columns are named filename,
    xmin, ymin, xmax, ymax, label), loads every referenced image from
    Consolidated_Dataset/test_images/ to get its height and width, looks up
    the class name of each label, and writes the extended table to
    Consolidated_Dataset/annot_test_cons.csv.

    Args:
        dataset_path: Root directory of the dataset.
        label_map: Optional mapping from label value to class name. When
            omitted, the module-level label_class_dict is used, so existing
            one-argument calls keep working.

    Returns:
        The annotation DataFrame with the added img_h, img_w and class columns.

    Raises:
        KeyError: If a label in the annotation file is not in the mapping.
        NameError: If label_map is omitted and label_class_dict is not defined.
    """
    if label_map is None:
        # Fall back to the global mapping created by the run cell.
        label_map = label_class_dict

    annot_test_orig_csv_path = os.path.join(dataset_path, 'annot_test_orig.csv')
    df_cols = ['filename', 'xmin', 'ymin', 'xmax', 'ymax', 'label']
    annot_test_df = pd.read_csv(annot_test_orig_csv_path, header = None, names = df_cols,
                                index_col = False)
    test_img_cons_path = os.path.join(dataset_path, 'Consolidated_Dataset/test_images/')

    print("Adding image height, width and image class to annot_test_df...")
    # Collect the new values in plain lists and attach them as whole columns;
    # this avoids growing the frame one cell at a time.
    heights, widths, class_names = [], [], []
    for filename, label in zip(annot_test_df['filename'], annot_test_df['label']):
        img_arr = mpimg.imread(os.path.join(test_img_cons_path, filename))
        heights.append(img_arr.shape[0])
        widths.append(img_arr.shape[1])
        class_names.append(label_map[label])

    # float64 keeps the saved CSV text the same as the per-cell assignment
    # produced (e.g. "360.0").
    annot_test_df['img_h'] = pd.Series(heights, index = annot_test_df.index,
                                       dtype = 'float64')
    annot_test_df['img_w'] = pd.Series(widths, index = annot_test_df.index,
                                       dtype = 'float64')
    annot_test_df['class'] = pd.Series(class_names, index = annot_test_df.index,
                                       dtype = 'object')

    annot_test_cons_csv_path = os.path.join(dataset_path,
                                            'Consolidated_Dataset/annot_test_cons.csv')
    annot_test_df.to_csv(annot_test_cons_csv_path, index = False)
    print("annot_test_df saved in path Consolidated_Dataset/annot_test_cons.csv...")
    print()
    return annot_test_df

## Function: Copy image files to Reduced_Dataset

In [10]:
def _copy_first_n(annot_df, num_images, src_dir, dest_dir):
    """Copy the files named in the first num_images rows of annot_df to dest_dir.

    Rows are taken by position, so the frame's index labels do not matter, and
    a num_images larger than the frame simply copies every row.
    """
    for filename in annot_df['filename'].iloc[:max(num_images, 0)]:
        shutil.copy(os.path.join(src_dir, filename), dest_dir)


def copy_img_red_dset(dataset_path, annot_train_df, annot_test_df, num_train_img_red,
                      num_test_img_red):
    """Copy the first N annotated images of each split into Reduced_Dataset.

    Source images are read from Consolidated_Dataset/train_images and
    Consolidated_Dataset/test_images; copies go to Reduced_Dataset/train_images/
    and Reduced_Dataset/test_images/, which must already exist.

    Args:
        dataset_path: Root directory of the dataset.
        annot_train_df: Training annotations; only the 'filename' column is read.
        annot_test_df: Test annotations; only the 'filename' column is read.
        num_train_img_red: Number of leading training rows whose images are copied.
        num_test_img_red: Number of leading test rows whose images are copied.
    """
    train_img_root_path = os.path.join(dataset_path, 'Consolidated_Dataset/train_images')
    train_img_dest_path = os.path.join(dataset_path, 'Reduced_Dataset/train_images/')
    test_img_root_path = os.path.join(dataset_path, 'Consolidated_Dataset/test_images')
    test_img_dest_path = os.path.join(dataset_path, 'Reduced_Dataset/test_images/')

    print("Copying training set images to Reduced_Dataset/train_images...")
    _copy_first_n(annot_train_df, num_train_img_red, train_img_root_path,
                  train_img_dest_path)

    print("Copying test set images to Reduced_Dataset/test_images...")
    _copy_first_n(annot_test_df, num_test_img_red, test_img_root_path,
                  test_img_dest_path)
    print()

## Function: Create annotation files for reduced dataset

In [11]:
def _first_n_per_class(annot_df, class_list, num_per_class):
    """Return the first num_per_class rows of each class, in class_list order.

    Uses pd.concat; DataFrame.append is not available in pandas 2.x. With an
    empty class_list, an empty frame with annot_df's columns is returned.
    """
    parts = [annot_df[annot_df['class'] == class_val].iloc[0:num_per_class]
             for class_val in class_list]
    if not parts:
        return annot_df.iloc[0:0]
    return pd.concat(parts)


def create_ann_red_dset(dataset_path, num_train_img_per_class, num_test_img_per_class):
    """Write reduced annotation files holding the first N rows of every class.

    Reads Consolidated_Dataset/annot_train_cons.csv and
    Consolidated_Dataset/annot_test_cons.csv, keeps the first
    num_train_img_per_class / num_test_img_per_class rows of each class, and
    writes the results to Consolidated_Dataset/annot_train_red.csv and
    Consolidated_Dataset/annot_test_red.csv.

    The class list comes from the training annotations (in order of first
    appearance) and is applied to both splits, so a class that occurs only in
    the test annotations is not included.

    Args:
        dataset_path: Root directory of the dataset.
        num_train_img_per_class: Rows kept per class for the training set.
        num_test_img_per_class: Rows kept per class for the test set.
    """
    # Define paths of consolidated annotation files for training and test sets
    annot_train_cons_csv_path = os.path.join(dataset_path,
                                             'Consolidated_Dataset/annot_train_cons.csv')
    annot_test_cons_csv_path = os.path.join(dataset_path,
                                            'Consolidated_Dataset/annot_test_cons.csv')
    # Read consolidated train and test annotation files into dataframes
    annot_train_cons_df = pd.read_csv(annot_train_cons_csv_path)
    annot_test_cons_df = pd.read_csv(annot_test_cons_csv_path)

    # Get list of unique classes (taken from the training annotations)
    class_list = annot_train_cons_df['class'].unique()

    annot_train_red_df = _first_n_per_class(annot_train_cons_df, class_list,
                                            num_train_img_per_class)
    annot_test_red_df = _first_n_per_class(annot_test_cons_df, class_list,
                                           num_test_img_per_class)

    annot_train_red_csv_path = os.path.join(dataset_path,
                                            'Consolidated_Dataset/annot_train_red.csv')
    annot_test_red_csv_path = os.path.join(dataset_path,
                                           'Consolidated_Dataset/annot_test_red.csv')
    annot_train_red_df.to_csv(annot_train_red_csv_path, index = False)
    print("annot_train_red_df saved in path Consolidated_Dataset/annot_train_red.csv...")
    annot_test_red_df.to_csv(annot_test_red_csv_path, index = False)
    print("annot_test_red_df saved in path Consolidated_Dataset/annot_test_red.csv...")
    print()

## Function: Copy class names file (names_orig.csv -> Consolidated_Dataset/class_names.csv)

In [12]:
def copy_names_files(dataset_path):
    """Copy names_orig.csv into Consolidated_Dataset as class_names.csv.

    Args:
        dataset_path: Root directory of the dataset; the Consolidated_Dataset
            folder must already exist inside it.
    """
    source_csv = os.path.join(dataset_path, 'names_orig.csv')
    target_csv = os.path.join(dataset_path, 'Consolidated_Dataset/class_names.csv')

    print("Copying names.csv file to Consolidated_Dataset...")
    shutil.copy(src=source_csv, dst=target_csv)
    print()

## Function: Create zip files

In [13]:
def create_zip_files(dataset_path):
    """Zip the Consolidated_Dataset folder into Consolidated_Dataset.zip.

    The archive is written next to the folder (inside dataset_path) and holds
    the folder's contents relative to the folder itself.

    Args:
        dataset_path: Root directory of the dataset.
    """
    # The same path is both the archive base name (".zip" is appended by
    # make_archive) and the directory whose contents are archived.
    cons_dset_path = os.path.join(dataset_path, "Consolidated_Dataset")

    print("Creating Consolidated_Dataset.zip...")
    shutil.make_archive(base_name=cons_dset_path, format='zip', root_dir=cons_dset_path)
    print()

# Run Preprocessing

In [14]:
# Run the complete pre-processing pipeline

# Parameters: number of images kept per class in the reduced annotation files
num_train_img_per_class = 7  # reduced training-set annotation file
num_test_img_per_class = 7  # reduced test-set annotation file

# Start the run timer; timestamps are reported in Asia/Kolkata time
run_tz = pytz.timezone('Asia/Kolkata')
start_time = datetime.datetime.now(run_tz)
print(f"Started preprocessing at {start_time:%H:%M:%S}. "
      "This process will take about 10 - 15 minutes...")
print()

# Step 1: report what is present in the original dataset
check_orig_data(dataset_path)
# Step 2: (re)create the Consolidated_Dataset folder tree
create_cons_dset_folders(dataset_path)
# Step 3: flatten training and test images into the Consolidated_Dataset folders
cons_img_files(dataset_path)
# Step 4: build the label -> class-name dictionary
label_class_dict = create_lbl_cls_dict(dataset_path)
# Step 5: add image width, height and class to the consolidated training annotation CSV
annot_train_df = upd_train_ann(dataset_path)
# Step 6: add image width, height and class to the consolidated test annotation CSV
annot_test_df = upd_test_ann(dataset_path)
# Step 7: write the reduced training and test annotation CSV files
create_ann_red_dset(dataset_path, num_train_img_per_class, num_test_img_per_class)
# Step 8: copy the class names file into the Consolidated_Dataset folder
copy_names_files(dataset_path)
# Step 9: zip the Consolidated_Dataset folder
create_zip_files(dataset_path)

# Stop the timer and report the elapsed time in minutes
end_time = datetime.datetime.now(run_tz)
elap_time = (end_time - start_time).total_seconds() / 60
print(f"Completed preprocessing at {end_time:%H:%M:%S}. "
      f"Elapsed time = {elap_time:0.1f} minutes.")

Started preprocessing at 14:36:29. This process will take about 10 - 15 minutes...

names_orig.csv file found in dataset_path...
annot_train_orig.csv file found in dataset_path...
annot_test_orig.csv file found in dataset_path...
Training set images path found in dataset_path...
Test set images path found in dataset_path...

196 folders found in training set images path
196 folders found in test set images path

8144 images found in training set images path
8041 images found in test set images path

Consolidated_Dataset folder found: Deleting...

Creating Consolidated_Dataset folder...
Creating Consolidated_Dataset/train_images folder...
Creating Consolidated_Dataset/test_images folder...

Consolidating training set images to Consolidated_Dataset/train_images... 8144 images consolidated.
Consolidating test set images to Consolidated_Dataset/test_images... 8041 images consolidated.

Creating label_class_dict...

Adding image height, width and image class to annot_train_df...
annot_train