Labelling.ipynb
====
Preprocess data to create .csv file.

Although PyTorch's (torchvision) `ImageFolder` dataset class is convenient, here we practice processing the data with a pandas DataFrame.

**Created by vim_hjk**

## Import library

In [10]:
import os
import pandas as pd

from glob import glob
from prettyprinter import cpprint
from IPython.display import display

## Set root directory

In [11]:
# Directory constants, built with os.path.join so they use the platform's own
# separator: the values are unchanged on Windows ('.\\fruits-360\\', ...) and
# become './fruits-360/' etc. elsewhere.
# The trailing '' keeps the trailing separator, which later cells rely on when
# they concatenate sub-paths (e.g. ROOT_DIR + 'Training').
ROOT_DIR = os.path.join('.', 'fruits-360', '')        # dataset root (holds Training and Test)
DATA_SAVE_DIR = os.path.join('.', 'data', '')         # where train.csv / test.csv are written
SUBMISSION_DIR = os.path.join('.', 'prediction', '')  # where info.csv is written

## os : Use it when we want to create a directory or get a list of the files within a specific directory (copying files is handled by `shutil`)

In [12]:
# Class names are the sub-directory names of the Training folder; a label's
# integer id is its position in this list.
# os.listdir() returns entries in arbitrary order and also returns plain files,
# so keep only directories and sort them (case-insensitively, ties broken by
# the exact name) so the label ids do not depend on the listing order.
train_dir = ROOT_DIR + 'Training'
label_list = sorted(
    (name for name in os.listdir(train_dir)
     if os.path.isdir(os.path.join(train_dir, name))),
    key=lambda name: (name.upper(), name),
)
cpprint(label_list)

[
    'Apple Braeburn',
    'Apple Crimson Snow',
    'Apple Golden 1',
    'Apple Golden 2',
    'Apple Golden 3',
    'Apple Granny Smith',
    'Apple Pink Lady',
    'Apple Red 1',
    'Apple Red 2',
    'Apple Red 3',
    'Apple Red Delicious',
    'Apple Red Yellow 1',
    'Apple Red Yellow 2',
    'Apricot',
    'Avocado',
    'Avocado ripe',
    'Banana',
    'Banana Lady Finger',
    'Banana Red',
    'Beetroot',
    'Blueberry',
    'Cactus fruit',
    'Cantaloupe 1',
    'Cantaloupe 2',
    'Carambula',
    'Cauliflower',
    'Cherry 1',
    'Cherry 2',
    'Cherry Rainier',
    'Cherry Wax Black',
    'Cherry Wax Red',
    'Cherry Wax Yellow',
    'Chestnut',
    'Clementine',
    'Cocos',
    'Corn',
    'Corn Husk',
    'Cucumber Ripe',
    'Cucumber Ripe 2',
    'Dates',
    'Eggplant',
    'Fig',
    'Ginger Root',
    'Granadilla',
    'Grape Blue',
    'Grape Pink',
    'Grape White',
    'Grape White 2',
    'Grape White 3',
    'Grape White 4',
    'Grapefruit Pink',

## glob : Used to extract a list of files that match a path pattern

In [13]:
# Small demo of glob: list the Python files in the current directory.
example = glob(os.path.join('.', '*.py'))
print(f'Example) Python file list : {example}')

# Total number of training images across all class folders; printed below as a
# sanity check against the number of rows collected per label.
train_root = os.path.join(ROOT_DIR, 'Training')
num_train_imgs = len(glob(os.path.join(train_root, '*', '*.jpg')))

# Column-oriented records: one entry per image in each list.
data = {'image_path' : [], 'label' : []}

# enumerate() yields each label's position directly, avoiding a linear
# label_list.index(label) search for every single image.
for label_idx, label in enumerate(label_list):
    train_img_path_list = glob(os.path.join(train_root, label, '*.jpg'))
    data['image_path'].extend(train_img_path_list)
    data['label'].extend([label_idx] * len(train_img_path_list))

# check
print(f"\nLength of whole train data : {num_train_imgs}\nLength of label column : {len(data['label'])}\nLength of image_path column : {len(data['image_path'])}")
    

Example) Python file list : ['.\\dataset.py', '.\\inference.py', '.\\loss.py', '.\\main.py', '.\\model.py', '.\\train.py', '.\\unit_t.py', '.\\utils.py']

Length of whole train data : 67692
Length of label column : 67692
Length of image_path column : 67692


## pandas : Data structure handling tool.

## Create train.csv

In [14]:
# Turn the collected {'image_path': [...], 'label': [...]} dict into a table
# and preview its last rows.
train_data = pd.DataFrame(data=data)
display(train_data.tail(5))
print(f'\nLength of train data : {len(train_data)}')

# make save directory
os.makedirs(DATA_SAVE_DIR, exist_ok=True)

# save train.csv file
# os.path.join avoids the redundant './' prefix and doubled separators that
# plain string formatting produced around DATA_SAVE_DIR; the target file is
# the same.
train_data.to_csv(os.path.join(DATA_SAVE_DIR, 'train.csv'), index=False)

Unnamed: 0,image_path,label
67687,.\fruits-360\Training\Watermelon\r_6_100.jpg,130
67688,.\fruits-360\Training\Watermelon\r_7_100.jpg,130
67689,.\fruits-360\Training\Watermelon\r_81_100.jpg,130
67690,.\fruits-360\Training\Watermelon\r_8_100.jpg,130
67691,.\fruits-360\Training\Watermelon\r_9_100.jpg,130



Length of train data : 67692


## Create test.csv

In [15]:
# Total number of test images across all class folders; printed below as a
# sanity check against the number of rows collected per label.
test_root = os.path.join(ROOT_DIR, 'Test')
num_test_imgs = len(glob(os.path.join(test_root, '*', '*.jpg')))

# Column-oriented records for the test split (rebinds `data`, which previously
# held the training records).
data = {'image_path' : [], 'label' : []}

# Label ids come from the position in label_list (built from the Training
# folder), so train and test share the same mapping. enumerate() provides the
# position without a linear label_list.index(label) search per image.
for label_idx, label in enumerate(label_list):
    test_img_path_list = glob(os.path.join(test_root, label, '*.jpg'))
    data['image_path'].extend(test_img_path_list)
    data['label'].extend([label_idx] * len(test_img_path_list))

# check
print(f"\nLength of whole test data : {num_test_imgs}\nLength of label column : {len(data['label'])}\nLength of image_path column : {len(data['image_path'])}")
    


Length of whole test data : 22688
Length of label column : 22688
Length of image_path column : 22688


In [16]:
# Turn the collected test records into a table and preview its last rows.
test_data = pd.DataFrame(data=data)
display(test_data.tail(5))

print(f'\nLength of test data : {len(test_data)}')

# save test.csv file
# os.path.join avoids the redundant './' prefix and doubled separators that
# plain string formatting produced around DATA_SAVE_DIR; the target file is
# the same.
test_data.to_csv(os.path.join(DATA_SAVE_DIR, 'test.csv'), index=False)

Unnamed: 0,image_path,label
22683,.\fruits-360\Test\Watermelon\r_95_100.jpg,130
22684,.\fruits-360\Test\Watermelon\r_96_100.jpg,130
22685,.\fruits-360\Test\Watermelon\r_97_100.jpg,130
22686,.\fruits-360\Test\Watermelon\r_98_100.jpg,130
22687,.\fruits-360\Test\Watermelon\r_99_100.jpg,130



Length of test data : 22688


## Create Submission Format

In [18]:
# Submission template: the test image paths with an empty 'label' column.
# Take the paths from test_data rather than the generic `data` dict, which is
# rebound for both the train and the test split and would silently yield
# training paths if the training cell had been re-run last.
submission_format = pd.DataFrame(data={'image_path' : test_data['image_path']})
submission_format['label'] = ''
display(submission_format.head(5))

os.makedirs(SUBMISSION_DIR, exist_ok=True)
# os.path.join avoids the redundant './' prefix and doubled separators that
# plain string formatting produced around SUBMISSION_DIR; the target file is
# the same.
submission_format.to_csv(os.path.join(SUBMISSION_DIR, 'info.csv'), index=False)

Unnamed: 0,image_path,label
0,.\fruits-360\Test\Apple Braeburn\321_100.jpg,
1,.\fruits-360\Test\Apple Braeburn\322_100.jpg,
2,.\fruits-360\Test\Apple Braeburn\323_100.jpg,
3,.\fruits-360\Test\Apple Braeburn\324_100.jpg,
4,.\fruits-360\Test\Apple Braeburn\325_100.jpg,
