In [2]:
import pandas as pd
import os
import json

def load_parquets(path_to_test_parquets, class_mapping_path):
    dfs = []
    for filename in os.listdir(path_to_test_parquets):
        if filename.endswith(".parquet"):
            file_path = os.path.join(path_to_test_parquets, filename)
            print(f"Processing: {file_path}")
            df = pd.read_parquet(file_path)
            dfs.append(df)
        full_df_test = pd.concat(dfs, ignore_index=True)
    with open(class_mapping_path) as f:
        species_dict = json.load(f)
        i = 0
        map_from_df_to_map = {}
        for key in species_dict:
            map_from_df_to_map[i] = key
            i+=1
        full_df_test['species_true'] = full_df_test['label'].map(lambda x: species_dict.get(map_from_df_to_map.get(x)))
    return full_df_test

In [3]:
df = load_parquets('/Users/wdgstl/UVA/PlantEdibilityClassification-/data/test_parquets', '/Users/wdgstl/UVA/PlantEdibilityClassification-/backend/data/class_mapping/plantnet300K_species_names.json')

Processing: /Users/wdgstl/UVA/PlantEdibilityClassification-/data/test_parquets/test-00007-of-00008-8caa262cecd65437.parquet
Processing: /Users/wdgstl/UVA/PlantEdibilityClassification-/data/test_parquets/test-00000-of-00008-ccdf9a8395f11a02.parquet
Processing: /Users/wdgstl/UVA/PlantEdibilityClassification-/data/test_parquets/test-00002-of-00008-952e223123a49903.parquet
Processing: /Users/wdgstl/UVA/PlantEdibilityClassification-/data/test_parquets/test-00001-of-00008-4cc7b1523923ba5e.parquet
Processing: /Users/wdgstl/UVA/PlantEdibilityClassification-/data/test_parquets/test-00006-of-00008-5932eb3f032283e1.parquet
Processing: /Users/wdgstl/UVA/PlantEdibilityClassification-/data/test_parquets/test-00004-of-00008-6882acfdcab0bbdd.parquet
Processing: /Users/wdgstl/UVA/PlantEdibilityClassification-/data/test_parquets/test-00005-of-00008-f6f2fd539fae591e.parquet
Processing: /Users/wdgstl/UVA/PlantEdibilityClassification-/data/test_parquets/test-00003-of-00008-4e3b69c54dfa5948.parquet


In [4]:
species_counts = df['species_true'].value_counts()
species_counts

species_true
Daucus_carota              902
Alliaria_petiolata         858
Hypericum_perforatum       767
Centranthus_ruber          668
Cirsium_vulgare            648
                          ... 
Schinopsis_balansae          1
Smilax_china                 1
Barringtonia_acutangula      1
Thesium_linophyllon          1
Trifolium_spumosum           1
Name: count, Length: 1019, dtype: int64

In [5]:
most_prevalent_species = species_counts.head(1)
least_prevalent_species = species_counts.tail(1)

In [6]:
print(f'Most prevalent: {most_prevalent_species}')
print(f'Least prevalent: {least_prevalent_species}')

Most prevalent: species_true
Daucus_carota    902
Name: count, dtype: int64
Least prevalent: species_true
Trifolium_spumosum    1
Name: count, dtype: int64


In [7]:
species_counts.to_csv('../output/species_counts.csv')

In [8]:
df['genus'] = df['species_true'].str.split('_').str[0]

In [9]:
genus_counts = df['genus'].value_counts()
genus_counts

genus
Trifolium      1855
Sedum          1721
Anemone        1628
Lamium         1552
Cirsium        1525
               ... 
Zaleya            1
Vepris            1
Freycinetia       1
Stemodia          1
Alibertia         1
Name: count, Length: 303, dtype: int64

In [10]:
genus_counts.to_csv('../output/genus_counts.csv')

In [11]:
most_prevalent_genus = genus_counts.head(1)
least_prevalent_genus = genus_counts.tail(1)
print(f'Most prevalent: {most_prevalent_genus}')
print(f'Least prevalent: {least_prevalent_genus}')

Most prevalent: genus
Trifolium    1855
Name: count, dtype: int64
Least prevalent: genus
Alibertia    1
Name: count, dtype: int64


In [12]:
from PIL import Image
from io import BytesIO
import numpy as np

widths = []
heights = []
total_pixels = []

for idx, row in df.iterrows():
    image_bytes = row['image']['bytes']
    with Image.open(BytesIO(image_bytes)) as img:
        width, height = img.size
        widths.append(width)
        heights.append(height)
        total_pixels.append(width * height)

avg_width = np.mean(widths)
avg_height = np.mean(heights)
avg_pixels = np.mean(total_pixels)

print(f"Number of images: {len(df)}")
print(f"Average width: {avg_width:.2f} pixels")
print(f"Average height: {avg_height:.2f} pixels")
print(f"Average total pixels per image: {avg_pixels:.2f}")

        

Number of images: 31112
Average width: 569.72 pixels
Average height: 569.92 pixels
Average total pixels per image: 328006.98
