## Import

In [1]:
import numpy as np
import pandas as pd
import collections

## Dataset

In [2]:
# CSV read into df_train; the path is relative to the notebook's working directory.
TRAINING_FILE = '../Data/landmark-recognition-2021/train.csv'
df_train = pd.read_csv(TRAINING_FILE)

# Show the column names and the number of entries in the landmark_id column.
print(df_train.columns)
print(f'Total Images: {len(df_train.landmark_id.values)}')

Index(['id', 'landmark_id'], dtype='object')
Total Images: 1580470


In [3]:
def extract_class_statistics(df_train):
    """Tally how often each ``landmark_id`` value occurs in ``df_train``.

    Args:
        df_train: Object exposing a ``landmark_id`` attribute whose ``.values``
            can be iterated (the notebook passes the DataFrame read from the
            training CSV).

    Returns:
        A plain ``dict`` mapping each landmark id to its number of
        occurrences; keys appear in order of first occurrence.
    """
    # Counter keeps first-seen key order; convert back so callers still get a
    # regular dict.
    return dict(collections.Counter(df_train.landmark_id.values))


def classes_more_than_threshold(classes, threshold):
    """Select the classes whose count reaches ``threshold``.

    The comparison is inclusive (``count >= threshold``), so a class with
    exactly ``threshold`` items is kept even though the function name says
    "more than".

    Args:
        classes: Mapping of class id to item count.
        threshold: Minimum count a class needs in order to be kept.

    Returns:
        A new dict with the qualifying ``{class id: count}`` pairs, in the
        same relative order as in ``classes``. The input is not modified.
    """
    return {
        label: total
        for label, total in classes.items()
        if total >= threshold
    }

In [4]:
# Per-landmark image counts, followed by the same mapping reordered from the
# largest count to the smallest (ties keep their original relative order).
classes = extract_class_statistics(df_train)
sorted_classes = dict(sorted(classes.items(), key=lambda pair: pair[1], reverse=True))

In [5]:
# Number of distinct landmark_id values found in df_train.
print(f'Total Clases: {len(classes)}')

Total Clases: 81313


In [6]:
# Count the classes that have at least `threshold` images (the filter is inclusive).
threshold = 200
reduced_classes = classes_more_than_threshold(classes, threshold)
print(f'Total Clases with atleast {threshold} images: {len(reduced_classes)}')

Total Clases with atleast 200 images: 491


In [7]:
# Sweep the minimum-image threshold from 1000 down to 100 in steps of 100.
# NOTE: `reduced_classes` is rebound on every iteration, so after this cell it
# holds the result for the last threshold (100), not the threshold-200 result
# computed earlier.
for i in range(1000, 0, -100):
    reduced_classes = classes_more_than_threshold(classes, i)
    print(f'Total Clases with atleast {i} images: {len(reduced_classes)}')

Total Clases with atleast 1000 images: 7
Total Clases with atleast 900 images: 12
Total Clases with atleast 800 images: 19
Total Clases with atleast 700 images: 27
Total Clases with atleast 600 images: 35
Total Clases with atleast 500 images: 51
Total Clases with atleast 400 images: 81
Total Clases with atleast 300 images: 196
Total Clases with atleast 200 images: 491
Total Clases with atleast 100 images: 2015


In [8]:
# Same sweep at a finer scale: thresholds from 100 down to 10 in steps of 10.
# NOTE: `reduced_classes` is rebound on every iteration; after this cell it
# holds the result for the last threshold (10).
for i in range(100, 0, -10):
    reduced_classes = classes_more_than_threshold(classes, i)
    print(f'Total Clases with atleast {i} images: {len(reduced_classes)}')

Total Clases with atleast 100 images: 2015
Total Clases with atleast 90 images: 2458
Total Clases with atleast 80 images: 3026
Total Clases with atleast 70 images: 3801
Total Clases with atleast 60 images: 4916
Total Clases with atleast 50 images: 6501
Total Clases with atleast 40 images: 8876
Total Clases with atleast 30 images: 13018
Total Clases with atleast 20 images: 20731
Total Clases with atleast 10 images: 39676


In [9]:
# Finest sweep: thresholds from 10 down to 1 in steps of 1.
# NOTE: `reduced_classes` is rebound on every iteration; after this cell it
# holds the result for the last threshold (1).
for i in range(10, 0, -1):
    reduced_classes = classes_more_than_threshold(classes, i)
    print(f'Total Clases with atleast {i} images: {len(reduced_classes)}')

Total Clases with atleast 10 images: 39676
Total Clases with atleast 9 images: 43069
Total Clases with atleast 8 images: 46912
Total Clases with atleast 7 images: 51471
Total Clases with atleast 6 images: 57042
Total Clases with atleast 5 images: 64016
Total Clases with atleast 4 images: 72322
Total Clases with atleast 3 images: 76563
Total Clases with atleast 2 images: 81313
Total Clases with atleast 1 images: 81313


In [10]:
import cv2
import os

def load_images_from_folder(folder):
    """Count the images in ``folder`` by their pixel dimensions.

    Despite its name, this function does not return any images: every
    directory entry is read with ``cv2.imread`` only to inspect its shape.
    Entries for which ``cv2.imread`` returns ``None`` are skipped.

    Args:
        folder: Path of the directory whose entries are scanned
            (non-recursively, via ``os.listdir``).

    Returns:
        dict mapping ``"<shape[0]>-<shape[1]>"`` strings (rows-columns of the
        decoded array) to the number of images with that shape, keyed in
        order of first occurrence.
    """
    shape_counts = {}
    for entry in os.listdir(folder):
        image = cv2.imread(os.path.join(folder, entry))
        if image is None:
            continue

        rows, cols = image.shape[0], image.shape[1]
        shape_key = f"{rows}-{cols}"
        shape_counts[shape_key] = shape_counts.get(shape_key, 0) + 1

    return shape_counts

In [13]:
# Histogram of image shapes for a single training sub-folder.
# NOTE(review): this is an absolute, machine-specific path, whereas TRAINING_FILE
# is given relative to the notebook. Presumably both point into the same
# landmark-recognition-2021 directory -- confirm, and consider deriving this
# path from a shared data-directory constant.
folder = 'C:/W281-Project/Data/landmark-recognition-2021/train/0/0/0'
size_dict = load_images_from_folder(folder)

In [14]:
# Number of distinct "rows-cols" shape keys found in the folder.
len(size_dict)

106

In [15]:
# Shape histogram reordered from the most common shape to the least common
# (ties keep their original relative order).
sorted_size_dict = dict(sorted(size_dict.items(), key=lambda pair: pair[1], reverse=True))

In [16]:
# Display the shape histogram, most common shape first.
sorted_size_dict

{'600-800': 126,
 '533-800': 54,
 '800-600': 27,
 '800-533': 22,
 '531-800': 14,
 '480-640': 9,
 '450-800': 7,
 '800-532': 6,
 '599-800': 5,
 '800-535': 5,
 '600-799': 5,
 '529-800': 5,
 '597-800': 4,
 '532-800': 4,
 '535-800': 4,
 '537-800': 3,
 '800-531': 3,
 '448-800': 3,
 '486-800': 3,
 '534-800': 3,
 '516-800': 2,
 '491-800': 2,
 '530-800': 2,
 '480-800': 2,
 '449-800': 2,
 '800-551': 2,
 '800-455': 2,
 '619-800': 2,
 '635-800': 2,
 '800-618': 1,
 '800-529': 1,
 '800-575': 1,
 '799-632': 1,
 '552-800': 1,
 '425-640': 1,
 '800-625': 1,
 '800-640': 1,
 '800-561': 1,
 '639-800': 1,
 '245-800': 1,
 '800-673': 1,
 '471-800': 1,
 '367-800': 1,
 '800-407': 1,
 '377-800': 1,
 '800-522': 1,
 '612-800': 1,
 '800-713': 1,
 '800-800': 1,
 '250-800': 1,
 '512-800': 1,
 '800-620': 1,
 '480-720': 1,
 '577-800': 1,
 '800-556': 1,
 '800-603': 1,
 '800-670': 1,
 '608-800': 1,
 '800-595': 1,
 '321-800': 1,
 '326-441': 1,
 '588-800': 1,
 '280-800': 1,
 '518-800': 1,
 '328-800': 1,
 '496-800': 1,
 '80

In [17]:
# Display the per-landmark image counts, largest first.
# NOTE(review): this dumps one entry per class (81313 classes per the total
# printed earlier); consider showing only the first few entries instead.
sorted_classes

{138982: 6272,
 126637: 2231,
 20409: 1758,
 83144: 1741,
 113209: 1135,
 177870: 1088,
 194914: 1073,
 149980: 971,
 139894: 966,
 1924: 944,
 176528: 920,
 9070: 900,
 20120: 888,
 46705: 888,
 168098: 888,
 161902: 861,
 36748: 839,
 120734: 820,
 176018: 820,
 10419: 776,
 45428: 757,
 14915: 754,
 41648: 742,
 165596: 734,
 47378: 731,
 187779: 730,
 38482: 704,
 40088: 679,
 25093: 663,
 162833: 662,
 173511: 656,
 189907: 621,
 190822: 614,
 76303: 604,
 191292: 600,
 109169: 597,
 51856: 576,
 192931: 572,
 41808: 570,
 107164: 542,
 189811: 540,
 101399: 538,
 64792: 531,
 80177: 527,
 27190: 520,
 152708: 514,
 143710: 508,
 31531: 505,
 27: 504,
 113838: 503,
 85633: 502,
 19605: 492,
 28139: 486,
 132969: 485,
 115821: 482,
 147897: 479,
 73300: 477,
 107801: 474,
 80272: 473,
 29794: 463,
 171683: 460,
 199450: 459,
 137203: 456,
 39865: 452,
 31361: 449,
 51272: 437,
 165900: 429,
 15445: 428,
 190956: 428,
 98993: 427,
 201840: 427,
 136302: 426,
 70644: 425,
 103899: 42

In [18]:
# Walk the classes from most to fewest images (sorted_classes is ordered by
# descending count) and record the running position against each count value.
# Classes with equal counts overwrite one another, so every count ends up
# mapped to the position of the LAST class having it -- i.e. the number of
# classes with at least that many images. This is a cumulative tally, not the
# number of classes with exactly that count.
frequency = {}
count  = 0
for key, value in sorted_classes.items():  # key (the landmark id) is unused
    count+= 1    
    frequency[value] = count
    

In [19]:
# Display the mapping of image count -> number of classes with at least that many images.
frequency

{6272: 1,
 2231: 2,
 1758: 3,
 1741: 4,
 1135: 5,
 1088: 6,
 1073: 7,
 971: 8,
 966: 9,
 944: 10,
 920: 11,
 900: 12,
 888: 15,
 861: 16,
 839: 17,
 820: 19,
 776: 20,
 757: 21,
 754: 22,
 742: 23,
 734: 24,
 731: 25,
 730: 26,
 704: 27,
 679: 28,
 663: 29,
 662: 30,
 656: 31,
 621: 32,
 614: 33,
 604: 34,
 600: 35,
 597: 36,
 576: 37,
 572: 38,
 570: 39,
 542: 40,
 540: 41,
 538: 42,
 531: 43,
 527: 44,
 520: 45,
 514: 46,
 508: 47,
 505: 48,
 504: 49,
 503: 50,
 502: 51,
 492: 52,
 486: 53,
 485: 54,
 482: 55,
 479: 56,
 477: 57,
 474: 58,
 473: 59,
 463: 60,
 460: 61,
 459: 62,
 456: 63,
 452: 64,
 449: 65,
 437: 66,
 429: 67,
 428: 69,
 427: 71,
 426: 72,
 425: 73,
 423: 74,
 418: 75,
 413: 77,
 409: 78,
 408: 79,
 402: 80,
 400: 81,
 398: 82,
 396: 83,
 394: 84,
 393: 85,
 390: 86,
 387: 87,
 386: 88,
 384: 89,
 381: 90,
 379: 91,
 376: 92,
 375: 93,
 368: 95,
 367: 96,
 366: 98,
 364: 100,
 363: 101,
 361: 102,
 358: 103,
 356: 104,
 354: 105,
 352: 109,
 351: 111,
 350: 114,
 34