In [None]:
import pandas as pd

# Directory prefix (including the trailing slash) for every CSV read or written below.
CSV_DIR = 'csv/big_csv/'

# Fraction of each (species, park) group that is sampled into the training split.
PROPORTION = 0.8
# Seed used for the shuffle and for the per-group sampling, so runs are repeatable.
RANDOM_STATE = 42
# Multiplier applied to the rounded mean image count per species.
MEAN_WEIGHT = 1
# Per-species image cap; 0 means "use the rounded mean instead of a fixed cap".
OVERRIDE_TOP = 0

# Commands that only displayed progress have been commented out.

Load the dataframe from the CSV file.

In [None]:
# Read the source listing; later cells rely on its 'species', 'park' and 'path' columns.
orig_dataset = pd.read_csv(CSV_DIR + 'filtered_datetime.csv')
# Alternative input kept for reference: re-split a previously saved train_data.csv
# (presumably to carve a validation set out of the training data -- TODO confirm).
#orig_dataset = pd.read_csv(CSV_DIR + 'train_data.csv')
# orig_dataset.head()

Shuffle

In [None]:
# Randomly permute all rows (frac=1 keeps every row) with a fixed seed, then
# renumber them 0..n-1 so the index no longer reflects the file order.
orig_dataset = (
    orig_dataset
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)
# orig_dataset.head()

In [None]:
def get_table(df, print_table = False):
    """Summarise image counts as a species x park table with totals.

    Rows are species and columns are parks; each cell is the number of rows
    of ``df`` for that pair (0 when the pair does not occur). A 'Total'
    column (row sums) and a 'Total' row (column sums, including the 'Total'
    column) are appended.

    Args:
        df: DataFrame with 'species' and 'park' columns.
        print_table: when True, also print the table.

    Returns:
        The summary DataFrame.
    """
    pair_sizes = df.groupby(['species', 'park']).size()
    summary = pair_sizes.unstack(fill_value=0)
    # Row totals first, so the 'Total' row below also sums the 'Total' column.
    summary['Total'] = summary.sum(axis=1)
    summary.loc['Total'] = summary.sum()
    if print_table:
        print(summary)
    return summary


def mean_images_per_species(x):
    """Return the mean number of images per species, ignoring the 'emp' class.

    Rows whose 'species' is 'emp' are dropped, the non-null 'path' values are
    counted per remaining species, and the mean of those counts is returned.
    """
    without_emp = x[x['species'] != 'emp']
    per_species = without_emp.groupby('species')['path'].count()
    return per_species.mean()


def rounded_mean(x):
    """Return the per-species mean bumped to a multiple of 1000, times MEAN_WEIGHT.

    The rounding is ``(mean + 999) // 1000 * 1000``: an integer-style ceiling,
    so a fractional mean less than 1 above a multiple of 1000 stays on that
    multiple instead of moving to the next one.
    """
    mean_count = mean_images_per_species(x)
    thousands = (mean_count + 999) // 1000
    return (thousands * 1000) * MEAN_WEIGHT

In [None]:
# get_table(orig_dataset, print_table=True)
# print(rounded_mean(orig_dataset))

For each (species, park) group, take a random sample of a fraction `PROPORTION` of the images (e.g. 0.8 = 80%).

In [None]:
# generate train dataset: a PROPORTION sample of every (species, park) group.
# GroupBy.sample keeps the original row labels, which the complement below needs.
train_dataset = orig_dataset.groupby(['species', 'park']).sample(frac=PROPORTION, random_state=RANDOM_STATE)
# test dataset are the discarded rows, roughly 1-PROPORTION of the original dataset.
# The complement has to be taken against the ORIGINAL index labels: if the train
# index were reset first it would be 0..len(train)-1, and the "test" split would
# just be the tail of orig_dataset, sharing rows with the training split.
test_dataset = orig_dataset[~orig_dataset.index.isin(train_dataset.index)].reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)
# Sanity check: the two splits must partition the original rows.
assert len(train_dataset) + len(test_dataset) == len(orig_dataset)
# print(rounded_mean(train_dataset), rounded_mean(test_dataset))

# DISPLAY THE DATAFRAMES
# df1 = get_table(orig_dataset)
# df2 = get_table(train_dataset)
# df1 = pd.concat([df1, df2], axis=1)
# df2 = get_table(test_dataset)
# concatenated_df = pd.concat([df1, df2], axis=1)
# display(concatenated_df)

`get_even_images` keeps at most `top` images per species (`top` is the rounded mean unless `OVERRIDE_TOP` is set). It tries to split those images 50/50 between parks; if one park does not have enough images, more are taken from the other park to compensate.

In [None]:
def get_even_images(dataset, top):
    """Keep at most ``top`` rows per species, spread evenly over its parks.

    For each species the parks are visited from the one with the fewest rows
    to the one with the most. Every park receives an equal share of the budget
    that is still unspent (``remaining // parks_left``); when a park cannot
    fill its share, the shortfall is passed on to the parks visited after it.
    With two parks this is a 50/50 split with compensation; with one park it
    is simply the first ``top`` rows of that park. Within a park, rows are
    taken in their existing order (``head``).

    Args:
        dataset: DataFrame with at least 'species' and 'park' columns.
        top: maximum number of rows to keep per species. Converted with
            ``int``; negative values are treated as 0.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. If nothing is selected
        (for example an empty input), an empty frame with the same columns
        as ``dataset`` is returned.
    """
    top = max(int(top), 0)
    # Collect the pieces and concatenate once; concat inside the loop is quadratic.
    selected_parts = []
    for species in dataset['species'].unique():
        images_of_species = dataset[dataset['species'] == species]

        # Smallest parks first (stable on ties), so whatever they cannot
        # supply is handed on to the larger parks processed after them.
        park_sizes = images_of_species.groupby('park').size()
        parks_by_size = park_sizes.sort_values(kind='mergesort').index

        remaining = top
        for parks_left, park in zip(range(len(parks_by_size), 0, -1), parks_by_size):
            # Equal share of the unspent budget; the last park gets all of it.
            quota = remaining // parks_left
            images_taken = images_of_species[images_of_species['park'] == park].head(quota)
            remaining -= images_taken.shape[0]
            selected_parts.append(images_taken)

    if not selected_parts:
        # pd.concat rejects an empty list; keep the column layout of the input.
        return dataset.iloc[0:0].reset_index(drop=True)
    return pd.concat(selected_parts).reset_index(drop=True)


In [None]:
# Per-species cap for each split: OVERRIDE_TOP when it is non-zero, otherwise the
# rounded mean computed from that split itself.
train_top = OVERRIDE_TOP if OVERRIDE_TOP != 0 else rounded_mean(train_dataset)
test_top = OVERRIDE_TOP if OVERRIDE_TOP != 0 else rounded_mean(test_dataset)

# Note: both names are overwritten in place, so re-running this cell caps the
# already-capped frames again.
train_dataset = get_even_images(train_dataset, train_top)
test_dataset = get_even_images(test_dataset, test_top)

# df1 = get_table(train_dataset)
# df2 = get_table(test_dataset)
# concatenated_df = pd.concat([df1, df2], axis=1)
# display(concatenated_df)

In [None]:
# Save both splits as CSV files under CSV_DIR, without the index column.
# NOTE(review): the held-out split is written to 'val_data.csv'. The original note
# here said to change the filename to val_data and rerun the last two cells for the
# val split -- confirm whether a separate test file is expected from a first run.
train_dataset.to_csv(CSV_DIR + 'train_data.csv', index=False)
test_dataset.to_csv(CSV_DIR + 'val_data.csv', index=False) # held-out split; see note above