In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Species codes kept for the analysis; 'emp' is the code used for empty images.
INTERESTED = [
    'emp',
    'mus', 'rara', 'ory', 'fsi', 'lyn', 'lut', 'sus', 'mel',
    'vul', 'lep', 'equ', 'cer', 'bos', 'gen', 'her', 'dam',
    'fel', 'can', 'ovar', 'mafo', 'capi', 'caae', 'ovor', 'caca',
]

# Human-readable park names, keyed by the values of the 'park' column.
PARKNAME = {
    'snieves': "Sierra de las Nieves",
    'donana': "Doñana",
}

# Base plot colors: 'all' for whole-dataset figures, the rest per park key.
COLORS = {
    'all': "#8D32EB",
    'snieves': "#888888",
    'donana': "#C2A031",
}

# Raw table, read relative to the notebook's working directory.
DATAFRAME = pd.read_csv('raw.csv')

In [None]:
# Keep only the rows whose 'species' code appears in INTERESTED
DATAFRAME = DATAFRAME.loc[lambda frame: frame['species'].isin(INTERESTED)]

In [None]:
# Generic labelled pie chart, saved to images/ and shown inline
def get_pie_chart(pie_data, title, colors):
    """Draw a pie chart with in-wedge labels, save it as a PNG and display it.

    Parameters
    ----------
    pie_data : pandas.DataFrame
        One row per wedge, with a 'Category' column (label text) and a
        'Count' column (wedge size).
    title : str
        Stem of the output file: the figure is written to
        ``images/{title}_comparison.png``. It is not drawn on the figure.
    colors : sequence
        Wedge colors, passed straight through to ``Axes.pie``.

    Returns
    -------
    None
    """
    fig, ax = plt.subplots(figsize=(8, 6))

    wedges, _ = ax.pie(pie_data['Count'], startangle=90, autopct=None, textprops={'fontsize': 10}, colors=colors)

    # The total is loop-invariant, so compute it once for the percentage labels.
    total = pie_data['Count'].sum()
    labels = [
        f"{category}\n{round(count / 1000, 1)}k ({count / total * 100:.1f}%)"
        for category, count in zip(pie_data['Category'], pie_data['Count'])
    ]

    for wedge, label in zip(wedges, labels):
        # Place the label halfway along the radius, on the wedge's angular
        # bisector, measured from the wedge's own center.
        center_x, center_y = wedge.center
        mid_angle = np.radians((wedge.theta1 + wedge.theta2) / 2)
        label_x = center_x + 0.5 * wedge.r * np.cos(mid_angle)
        label_y = center_y + 0.5 * wedge.r * np.sin(mid_angle)
        ax.text(label_x, label_y, label, ha='center', va='center', fontsize=12, color='white')

    fig.tight_layout()

    # Create the output directory on demand so a fresh checkout can run this.
    out_path = Path('images') / f'{title}_comparison.png'
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path, bbox_inches='tight', dpi=300)

    plt.show()
    # Release the figure explicitly; this function is called repeatedly.
    plt.close(fig)

# Pie chart for Species Images vs Empty Images
empty_mask = DATAFRAME['species'] == 'emp'
speciesvempty = pd.DataFrame({
    'Category': ['Species', 'Empty'],
    'Count': [int((~empty_mask).sum()), int(empty_mask.sum())]
})
# Two darkest shades of the 4-step light palette, darkest first.
colors = sns.light_palette(COLORS['all'], as_cmap=False, n_colors=4)[-2:][::-1]
get_pie_chart(speciesvempty, 'empty_species', colors)

# Pie chart for Park Images
# Count once so that labels, sizes and colors all come from the same ordering.
park_counts = DATAFRAME['park'].value_counts()
parks = park_counts.index
parks_data = pd.DataFrame({
    'Category': [PARKNAME.get(park, park) for park in parks],
    'Count': park_counts.values
})
# Mirror the PARKNAME fallback: a park without a configured color gets the
# 'all' color instead of raising KeyError.
colors = [COLORS.get(park, COLORS['all']) for park in parks]
get_pie_chart(parks_data, 'parks', colors)

In [None]:
# Log-scale bar chart of image counts per species, saved to images/ and shown inline
def plot_species_distribution(dataframe, title, color):
    """Plot how many rows of ``dataframe`` belong to each non-empty species.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows to count; must have a 'species' column. Rows with the 'emp'
        code, or with codes not listed in INTERESTED, are not plotted.
    title : str
        Stem of the output file: the figure is written to
        ``images/{title}_distribution.png``. It is not drawn on the figure.
    color : color
        Base color; bars alternate between the two darkest shades of a
        4-step seaborn light palette built from it.

    Returns
    -------
    None
    """
    # Exclude the empty-image code by name rather than by position in the list.
    species = [code for code in INTERESTED if code != 'emp']
    species_counts = dataframe['species'].value_counts().reindex(species, fill_value=0)

    fig, ax = plt.subplots(figsize=(12, 6))

    # Alternate the two shades for exactly as many bars as there are species.
    shades = sns.light_palette(color, as_cmap=False, n_colors=4)[-2:]
    colors = [shades[i % len(shades)] for i in range(len(species_counts))]
    # hue mirrors x so that each bar takes its own entry from the palette list.
    sns.barplot(x=species_counts.index, y=species_counts.values, palette=colors,
                hue=species_counts.index, legend=False, ax=ax)

    ax.set_yscale('log')
    # Keep the upper limit above the lower one even when every count is zero.
    ax.set_ylim(1, max(species_counts.max(), 1) * 5)

    ax.set_xlabel('Species')
    ax.set_ylabel('Count (Log Scale)')
    ax.tick_params(axis='x', labelrotation=45)

    for position, count in enumerate(species_counts.values):
        if count < 1:
            continue
        label = f'{count / 1000:.1f}k' if count >= 1000 else str(count)
        # 10% above the bar top, which is a constant visual gap on a log axis.
        ax.text(position, count * 1.1, label, ha='center', va='bottom', fontsize=10, rotation=0)

    fig.tight_layout()

    # Create the output directory on demand so a fresh checkout can run this.
    out_path = Path('images') / f'{title}_distribution.png'
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path, bbox_inches='tight', dpi=300)

    plt.show()
    # Release the figure explicitly; this function is called once per park.
    plt.close(fig)
    
# Plot species distribution
plot_species_distribution(DATAFRAME[DATAFRAME['species'] != 'emp'], "species", COLORS['all'])

# Plot species distribution for each park
# dropna(): a missing park value would otherwise select no rows and fail the color lookup.
for park in DATAFRAME['park'].dropna().unique():
    # Parks without a configured color fall back to the 'all' color.
    plot_species_distribution(DATAFRAME[DATAFRAME['park'] == park], park, COLORS.get(park, COLORS['all']))