# Logo Detection - Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import os
import shutil

In [5]:
# Paths are relative to the working directory; adjust them if the notebook runs elsewhere.
# Folder holding all the images.
IMAGES_PATH = 'train'
# Original csv file holding all the annotations.
ANNOT_PATH = 'annot_train.csv'

In [6]:
# Read the full annotation table and list everything currently in the image folder.
annot_data = pd.read_csv(ANNOT_PATH)
files_name = os.listdir(IMAGES_PATH)
# Displayed as (number of image files, number of annotation rows).
(len(files_name), len(annot_data))

(38913, 46163)

Note: there are more annotation rows (46,163) than image files in the folder (38,913). We simply discard the annotations in excess and take the 38,913 image files as the baseline.

Move a random ~20% of the images from the `train` folder into a new folder called `test`, giving an approximate 80/20 train/test split. Note: make sure the `test` folder has been created beforehand.

In [None]:
# Randomly move roughly 20% of the images listed in files_name from IMAGES_PATH to 'test'.
test_dir = 'test'
os.makedirs(test_dir, exist_ok=True)  # no-op when the folder already exists

np.random.seed(123)
# os.listdir returns names in arbitrary order, so iterate in sorted order:
# otherwise the fixed seed would not pick the same files on every run/machine.
for f in sorted(files_name):
    # Draw for every file (even ones skipped below) so that the random
    # assignment of the remaining files never shifts.
    goes_to_test = np.random.rand() < 0.2
    if not goes_to_test:
        continue
    # os.path.join inserts the separator; plain IMAGES_PATH+f would yield
    # 'train<name>' when IMAGES_PATH has no trailing slash.
    src = os.path.join(IMAGES_PATH, f)
    dst = os.path.join(test_dir, f)
    if not os.path.exists(src) and os.path.exists(dst):
        continue  # already moved by a previous run of this cell
    shutil.move(src, dst)

In [7]:
# Re-list both image folders now that the split has been made.
train_files, test_files = os.listdir('train'), os.listdir('test')
# Displayed as (number of train images, number of test images).
(len(train_files), len(test_files))

(31154, 7759)

Target labels - Logos to be predicted

In [4]:
# Target labels: the logo classes to be predicted.
logos = [
    'Adidas',
    'Apple Inc.',
    'Chanel',
    'Coca-Cola',
    'Emirates',
    'Hard Rock Cafe',
    'Mercedes-Benz',
    'NFL',
    'Nike',
    'Pepsi',
    'Puma',
    'Starbucks',
    'The North Face',
    'Toyota',
    'Under Armour',
]

Splitting the annotations from the `.csv` file according to the two image sets created. Any logo that is not present in the `logos` list is relabelled as `Other`. Afterwards, the new train and test annotations are saved to csv files.

In [8]:
def _annotations_for(files, with_image_id=False):
    """Return the rows of annot_data whose filename is in `files`.

    Classes that are not listed in `logos` are relabelled as 'Other'.
    When `with_image_id` is True, an 'image_id' column numbering the rows
    0..len-1 is appended (column required for running detecto).
    """
    # .copy() gives an independent frame, so the assignments below do not
    # operate on a slice of annot_data (avoids SettingWithCopyWarning).
    subset = annot_data[annot_data.filename.isin(files)].copy()
    if with_image_id:
        subset['image_id'] = list(range(len(subset)))
    subset.loc[~subset['class'].isin(logos), 'class'] = 'Other'
    return subset


train_annot_csv = 'annot_train.csv'
test_annot_csv = 'annot_test.csv'
full_annot_backup = 'annot_all_backup.csv'

annot_train = _annotations_for(train_files, with_image_id=True)
annot_test = _annotations_for(test_files)

# The train output path is the same file as ANNOT_PATH, so saving it replaces
# the original full annotation file. Keep a one-time backup of the original;
# an existing backup is never overwritten, so it always holds the first version.
if (os.path.abspath(ANNOT_PATH) == os.path.abspath(train_annot_csv)
        and not os.path.exists(full_annot_backup)):
    shutil.copyfile(ANNOT_PATH, full_annot_backup)

# The DataFrame index is written as the first (unnamed) column, as before.
annot_train.to_csv(train_annot_csv)
annot_test.to_csv(test_annot_csv)

In [9]:
# Row counts of the two annotation tables, to compare against the number of image files in each folder.
tuple(len(frame) for frame in (annot_train, annot_test))

(31154, 7759)