In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Species codes kept for the analysis. 'emp' is the code the later cells
# treat as an empty image; it is listed first.
INTERESTED = [
    'emp',
    'mus', 'rara', 'ory', 'fsi', 'lyn', 'lut', 'sus', 'mel',
    'vul', 'lep', 'equ', 'cer', 'bos', 'gen', 'her', 'dam',
    'fel', 'can', 'ovar', 'mafo', 'capi', 'caae', 'ovor', 'caca',
]

# Park code -> display name used in chart titles and labels.
PARKNAME = {
    'snieves': "Sierra de las Nieves",
    'donana': "Doñana",
}

# Base colour per chart: 'all' for the combined dataset, one per park code.
COLORS = {
    'all': "#8D32EB",
    'snieves': "#888888",
    'donana': "#C2A031",
}

# Raw table; later cells read its 'species' and 'park' columns.
DATAFRAME = pd.read_csv('raw.csv')

In [None]:
# Drop every row whose species code is not one of the INTERESTED codes
DATAFRAME = DATAFRAME.loc[DATAFRAME['species'].isin(INTERESTED)]

In [None]:
# Count rows tagged 'emp' versus every other species code
is_empty = DATAFRAME['species'] == 'emp'
pie_data = pd.DataFrame({
    'Category': ['Species', 'Empty'],
    'Count': [int((~is_empty).sum()), int(is_empty.sum())],
})

# One label per slice: name, count in thousands, share of the total
total = pie_data['Count'].sum()
labels = [
    f"{category}\n{round(count/1000,1)}k ({count / total * 100:.1f}%)"
    for category, count in zip(pie_data['Category'], pie_data['Count'].tolist())
]

# Slice colours: the last two shades of a four-step palette, in reverse order
shades = sns.light_palette(COLORS['all'], as_cmap=False, n_colors=4)
cmap = [shades[-1], shades[-2]]

fig, ax = plt.subplots(figsize=(8, 6))
wedges, texts = ax.pie(
    pie_data['Count'],
    startangle=90,
    autopct=None,
    textprops={'fontsize': 10},
    colors=cmap,
)

# Write each label at half the wedge radius, along the wedge's bisecting angle
for wedge, label in zip(wedges, labels):
    mid_angle = np.radians((wedge.theta2 + wedge.theta1) / 2)
    x = 0.5 * wedge.r * np.cos(mid_angle)
    y = 0.5 * wedge.r * np.sin(mid_angle)
    ax.text(x, y, label, ha='center', va='center', fontsize=12, color='white')

ax.set_title('Comparison of Empty Images vs Species Images')
fig.tight_layout()
plt.show()


In [None]:
def plot_species_distribution(dataframe, title, color):
    """Draw a log-scale bar chart of row counts per species code.

    Every code in ``INTERESTED`` other than ``'emp'`` gets a slot on the
    x-axis, in list order; codes absent from ``dataframe`` are counted as 0.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows to count. Only the ``'species'`` column is read.
    title : str
        Text shown as the chart title.
    color : str
        Base colour handed to ``sns.light_palette``; the last two shades of
        a four-step palette are used for the bars.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Exclude 'emp' by value rather than by position, so the chart stays
    # correct even if the order of INTERESTED changes.
    species_order = [code for code in INTERESTED if code != 'emp']
    species_counts = (
        dataframe['species']
        .value_counts()
        .reindex(species_order, fill_value=0)
    )

    fig, ax = plt.subplots(figsize=(12, 6))
    # Two shades for many bars: seaborn is expected to cycle them (newer
    # versions may warn about this) - NOTE(review): confirm on your version.
    cmap = sns.light_palette(color, as_cmap=False, n_colors=4)[-2:]
    sns.barplot(x=species_counts.index, y=species_counts.values, palette=cmap, ax=ax)
    ax.set_yscale('log')
    ax.set_title(title)
    ax.set_xlabel('Species')
    ax.set_ylabel('Count (Log Scale)')
    ax.tick_params(axis='x', labelrotation=45)

    # Head-room above the tallest bar for the count labels. The lower limit
    # sits below 1 so that a bar of height 1 is still visible on the log
    # axis, and the upper limit falls back to a positive value when every
    # count is 0 (a log axis cannot take a non-positive limit).
    has_data = bool((species_counts > 0).any())
    upper = species_counts.max() * 5 if has_data else 5
    ax.set_ylim(0.5, upper)

    # Print the count just above each non-empty bar ("1.2k" from 1000 upward)
    for position, count in enumerate(species_counts.values):
        if count < 1:
            continue
        text = f'{count / 1000:.1f}k' if count >= 1000 else str(count)
        ax.text(position, count + count * 0.1, text,
                ha='center', va='bottom', fontsize=10, rotation=0)

    fig.tight_layout()
    plt.show()
    
# Combined chart over all rows that are not tagged 'emp'
plot_species_distribution(
    DATAFRAME.loc[DATAFRAME['species'].ne('emp')],
    "Species Distribution (Excluding 'emp')",
    COLORS['all'],
)

In [None]:
# Row count per park code, most frequent first
park_counts = DATAFRAME['park'].value_counts()
pie_data = pd.DataFrame({
    'Park': park_counts.index,
    'Count': park_counts.values,
})

# One label per slice: display name, count in thousands, share of the total
total = pie_data['Count'].sum()
labels = [
    f"{PARKNAME[park]}\n{round(count/1000,1)}k ({count / total * 100:.1f}%)"
    for park, count in zip(pie_data['Park'], pie_data['Count'].tolist())
]

# Slice colour for each park, in the same order as the slices
colors = [COLORS[park] for park in pie_data['Park']]

fig, ax = plt.subplots(figsize=(8, 6))
wedges, texts = ax.pie(
    pie_data['Count'],
    startangle=90,
    autopct=None,
    textprops={'fontsize': 10},
    colors=colors,
)

# Write each label at half the wedge radius, along the wedge's bisecting angle
for wedge, label in zip(wedges, labels):
    mid_angle = np.radians((wedge.theta2 + wedge.theta1) / 2)
    x = 0.5 * wedge.r * np.cos(mid_angle)
    y = 0.5 * wedge.r * np.sin(mid_angle)
    ax.text(x, y, label, ha='center', va='center', fontsize=12, color='white')

ax.set_title('Distribution of Parks')
fig.tight_layout()
plt.show()

In [None]:
# Distinct park codes present in the data
parks = DATAFRAME['park'].unique()

# One species chart per park, titled and coloured from the lookup tables
for park in parks:
    park_rows = DATAFRAME.loc[DATAFRAME['park'].eq(park)]
    plot_species_distribution(park_rows, PARKNAME[park], COLORS[park])