**This notebook contains the code that loads the training dataset and creates the subsets of data used for feature and model development in the project.**

In [15]:
import numpy as np
import pandas as pd
import collections
import csv
import pathlib
import shutil

In [2]:
# Load the training label file; it is downloaded from
# https://www.kaggle.com/competitions/landmark-recognition-2021/data?select=train.csv
# The path is relative to the directory the notebook is run from.
TRAINING_FILE = '../Data/landmark-recognition-2021/train.csv'
df_train = pd.read_csv(TRAINING_FILE)

# Print the column names, then the total number of images
# (the length of the landmark_id column, i.e. the number of rows).
print(df_train.columns)
print(f'Total Images: {len(df_train.landmark_id.values)}')

Index(['id', 'landmark_id'], dtype='object')
Total Images: 1580470


In [3]:
# count images per class
def extract_class_statistics(df_train):
    """Count how many images belong to each landmark class.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame exposing a ``landmark_id`` column; each row is counted once.

    Returns
    -------
    dict
        Maps each distinct ``landmark_id`` value to the number of rows that
        carry it. Keys appear in order of first occurrence in the column.
    """
    # Counter does the tallying in one pass; convert back to a plain dict so
    # callers keep receiving the same type as before.
    return dict(collections.Counter(df_train.landmark_id.values))

# find classes with images >= threshold
def classes_more_than_threshold(classes, threshold):
    """Return the subset of ``classes`` whose count is at least ``threshold``.

    Parameters
    ----------
    classes : dict
        Mapping of class id to image count.
    threshold : int
        Minimum count (inclusive) a class needs in order to be kept.

    Returns
    -------
    dict
        New mapping with the qualifying entries, in their original order.
    """
    return {
        class_id: count
        for class_id, count in classes.items()
        if count >= threshold
    }

In [4]:
# Per-class image counts for the full training set, plus a copy ordered from
# the most to the least frequent class.
classes = extract_class_statistics(df_train)
sorted_classes = dict(
    sorted(classes.items(), key=lambda pair: pair[1], reverse=True)
)

In [5]:
# Keep only the classes that have at least `threshold` images, then build a
# copy ordered from the most to the least frequent class.
threshold = 500
reduced_classes = classes_more_than_threshold(classes, threshold)
sorted_reduced_classes = dict(
    sorted(reduced_classes.items(), key=lambda pair: pair[1], reverse=True)
)
print(f'Total Clases with atleast {threshold} images: {len(reduced_classes)}')

Total Clases with atleast 500 images: 51


In [6]:
# Save the retained classes and their image counts to TargetClasses.csv
# (columns: landmark_id, Frequency) in the current working directory.
with open('TargetClasses.csv', "w", newline='') as filetc:
    writertc = csv.writer(filetc, delimiter=',')
    writertc.writerow(('landmark_id','Frequency'))
    # Each (landmark_id, count) item of the dict becomes one CSV row.
    writertc.writerows(reduced_classes.items())

In [8]:
# Create the subset of images that belong to the retained landmark classes,
# as a mapping of image id -> landmark id.
# The two columns are paired positionally with zip() rather than reading
# df_train['landmark_id'][index] with a hand-maintained counter: that lookup is
# label-based, so it is only correct while df_train keeps its default 0..n-1
# index, and it pays an index lookup for every one of the rows.
target_data = {
    image_id: landmark_id
    for image_id, landmark_id in zip(df_train['id'].values, df_train['landmark_id'].values)
    if landmark_id in sorted_reduced_classes
}

# Write the subset to TargetData.csv (columns: id, landmark_id) in the current
# working directory; newline='' leaves line endings to the csv module.
with open('TargetData.csv', "w", newline='') as filetd:
    writertd = csv.writer(filetd, delimiter=',')
    writertd.writerow(('id','landmark_id'))
    writertd.writerows(target_data.items())

print(f'Total Images (Reduced Data): {len(target_data)}')

Total Images (Reduced Data): 45579


In [11]:
# Recursively collect every .jpg file under the training image folder.
# NOTE(review): this is an absolute, machine-specific path; confirm it matches
# the local checkout before running.
image_root_dir = 'C:/W281-Project/Data/landmark-recognition-2021/train'
image_paths = list(pathlib.Path(image_root_dir).rglob('*.jpg'))

In [None]:
# Copy every image that belongs to the reduced dataset into one flat folder.
target_image_root_dir = 'C:/W281-Project/Notebook/TargetImages'
# shutil.copy only treats the destination as a directory when it already
# exists; otherwise every image would be written to a single file with this
# name, each copy overwriting the previous one. Create the folder up front.
pathlib.Path(target_image_root_dir).mkdir(parents=True, exist_ok=True)
for path in image_paths:
    # The image id is the file name without its extension. path.stem removes
    # only the final suffix, whereas replace('.jpg', '') would also strip
    # '.jpg' occurring elsewhere in the name and would miss other casings.
    image_id = path.stem
    if image_id in target_data:
        shutil.copy(path, target_image_root_dir)