In [7]:
import os
import pandas as pd

# Define the habitat types and taxa
habitats = ["forest", "natural_land", "pasture", "cropland"]
taxa = ["Amphibians", "Bird", "Mammals"]

# Define a dictionary to hold the counts
species_counts = {taxon: {} for taxon in taxa}
species_counts["Total"] = {habitat: 0 for habitat in habitats}
species_counts["Total"]["total_species"] = 0

# Path to your CSV files
data_folder = "/storage/homefs/ch21o450/scripts/climate_Hari_etal_inprep/habitat_counts/"  # Update this path

# Read each file and count species per habitat and taxa
for taxon in taxa:
    total_taxon_count = 0  # Total for each taxon
    for habitat in habitats:
        filename = f"habitat_{habitat}_{taxon}.csv"
        file_path = os.path.join(data_folder, filename)
        
        # Read the CSV file assuming each row is a unique species
        try:
            species_df = pd.read_csv(file_path)
            species_count = species_df.shape[0]
            
            # Store the species count
            species_counts[taxon][habitat] = species_count
            species_counts["Total"][habitat] += species_count
            total_taxon_count += species_count
        except FileNotFoundError:
            print(f"File not found: {filename}")
            species_counts[taxon][habitat] = 0  # If file is missing, assume 0 species
            
    # Add total species count for the taxon
    species_counts[taxon]["total_species"] = total_taxon_count
    species_counts["Total"]["total_species"] += total_taxon_count

# Calculate percentages
total_species = species_counts["Total"]["total_species"]
for taxon in taxa:
    for habitat in habitats:
        count = species_counts[taxon][habitat]
        percentage = (count / species_counts[taxon]["total_species"] * 100) if species_counts[taxon]["total_species"] > 0 else 0
        species_counts[taxon][habitat] = f"{count} ({percentage:.1f}%)"
    species_counts[taxon]["total_species"] = f"{species_counts[taxon]['total_species']}"

# Total percentages
for habitat in habitats:
    count = species_counts["Total"][habitat]
    percentage = (count / total_species * 100) if total_species > 0 else 0
    species_counts["Total"][habitat] = f"{count} ({percentage:.1f}%)"
species_counts["Total"]["total_species"] = f"{total_species}"

# Create DataFrame for better visualization
table_df = pd.DataFrame(species_counts).T

# Display the table
print(table_df)


                   forest  natural_land       pasture      cropland  \
Amphibians   2235 (39.2%)  2211 (38.8%)   642 (11.3%)   615 (10.8%)   
Bird         5770 (39.8%)  4644 (32.0%)  2096 (14.5%)  1987 (13.7%)   
Mammals      2907 (47.7%)  2210 (36.3%)    436 (7.2%)    542 (8.9%)   
Total       10912 (41.5%)  9065 (34.5%)  3174 (12.1%)  3144 (12.0%)   

           total_species  
Amphibians          5703  
Bird               14497  
Mammals             6095  
Total              26295  


In [8]:
# Dictionary to store unique species for each taxon and habitat
unique_species = {taxon: {habitat: set() for habitat in habitats} for taxon in taxa}
total_species_per_taxon = {taxon: set() for taxon in taxa}  # To track unique species across habitats for each taxon


# Process each file and store unique species
for taxon in taxa:
    for habitat in habitats:
        filename = f"habitat_{habitat}_{taxon}.csv"
        file_path = os.path.join(data_folder, filename)
        
        # Read the CSV file assuming each row represents a unique species
        try:
            species_df = pd.read_csv(file_path)
            species_list = species_df["Species"].unique()  # Replace "species" with the actual column name for species
            unique_species[taxon][habitat].update(species_list)
            
            # Update total unique species for the taxon across habitats
            total_species_per_taxon[taxon].update(species_list)
        except FileNotFoundError:
            print(f"File not found: {filename}")

# Calculate counts and percentages
species_counts = {}
for taxon in taxa:
    species_counts[taxon] = {}
    total_count_taxon = len(total_species_per_taxon[taxon])  # Total unique species for this taxon
    
    for habitat in habitats:
        count = len(unique_species[taxon][habitat])
        percentage = (count / total_count_taxon * 100) if total_count_taxon > 0 else 0
        species_counts[taxon][habitat] = f"{count} ({percentage:.1f}%)"
    
    # Total species count for each taxon
    species_counts[taxon]["total_species"] = total_count_taxon

# Calculate overall totals
overall_total = sum(len(species) for species in total_species_per_taxon.values())
species_counts["Total"] = {}
for habitat in habitats:
    total_count_habitat = sum(len(unique_species[taxon][habitat]) for taxon in taxa)
    percentage = (total_count_habitat / overall_total * 100) if overall_total > 0 else 0
    species_counts["Total"][habitat] = f"{total_count_habitat} ({percentage:.1f}%)"
species_counts["Total"]["total_species"] = overall_total

# Create DataFrame for better visualization
table_df = pd.DataFrame(species_counts).T

# Display the table
print(table_df)

                   forest  natural_land       pasture      cropland  \
Amphibians   2235 (82.7%)  2211 (81.8%)   642 (23.7%)   615 (22.7%)   
Bird         5770 (79.5%)  4644 (64.0%)  2096 (28.9%)  1987 (27.4%)   
Mammals      2907 (73.8%)  2210 (56.1%)   436 (11.1%)   542 (13.8%)   
Total       10912 (78.5%)  9065 (65.2%)  3174 (22.8%)  3144 (22.6%)   

           total_species  
Amphibians          2704  
Bird                7260  
Mammals             3939  
Total              13903  


In [21]:
import os
import pandas as pd
from collections import Counter, defaultdict

habitats = ["forest", "natural_land", "pasture", "cropland"]
taxa = ["Amphibians", "Bird", "Mammals"]

# Dictionary to store unique species for each habitat and taxon
unique_species = {taxon: {habitat: set() for habitat in habitats} for taxon in taxa}
total_species_per_taxon = {taxon: set() for taxon in taxa}  # Unique species across all habitats for each taxon

# Dictionary to track the number of habitats each species occurs in per taxon
species_habitat_counts = {taxon: defaultdict(int) for taxon in taxa}

# Define a dictionary to hold the counts
species_counts = {taxon: {} for taxon in taxa}
species_counts["Total"] = {habitat: 0 for habitat in habitats}
species_counts["Total"]["total_species"] = 0

# Path to your CSV files
data_folder = "/storage/homefs/ch21o450/scripts/climate_Hari_etal_inprep/habitat_counts/"  # Update this path

# Process each file and store unique species
for taxon in taxa:
    for habitat in habitats:
        filename = f"habitat_{habitat}_{taxon}.csv"
        file_path = os.path.join(data_folder, filename)
        
        # Read the CSV file assuming each row represents a unique species
        try:
            species_df = pd.read_csv(file_path)
            species_list = species_df["Species"].unique()  # Replace "species" with the actual column name for species
            unique_species[taxon][habitat].update(species_list)
            
            # Update the habitat count for each species in this taxon
            for species in species_list:
                species_habitat_counts[taxon][species] += 1
                
            # Update total unique species for the taxon across habitats
            total_species_per_taxon[taxon].update(species_list)
        except FileNotFoundError:
            print(f"File not found: {filename}")

# Calculate total number of species in multiple habitats
multiple_habitat_counts = {taxon: Counter() for taxon in taxa}
overall_habitat_counts = Counter()

for taxon in taxa:
    for species, count in species_habitat_counts[taxon].items():
        multiple_habitat_counts[taxon][count] += 1
        overall_habitat_counts[count] += 1

# Create the main table for habitat counts and percentages
main_table = []
for taxon in taxa:
    row = {}
    total_count_taxon = len(total_species_per_taxon[taxon])  # Total unique species for this taxon
    row["Total # of species"] = total_count_taxon

    for habitat in habitats:
        count = len(unique_species[taxon][habitat])
        percentage = (count / total_count_taxon * 100) if total_count_taxon > 0 else 0
        row[habitat.capitalize() + " (count, %)"] = f"{count} ({percentage:.1f}%)"
    
    # Add number of species in multiple habitats
    one_habitat = multiple_habitat_counts[taxon][1]
    row["Only 1 habitat"] = one_habitat
    row["Multiple habitats"] = total_count_taxon - one_habitat
    
    # Add taxon name and append row
    row["Taxon"] = taxon.capitalize()
    main_table.append(row)

# Add overall totals to the table
overall_total_count = sum(len(total_species_per_taxon[taxon]) for taxon in taxa)
overall_row = {"Taxon": "Overall", "Total # of species": overall_total_count}
for habitat in habitats:
    count = sum(len(unique_species[taxon][habitat]) for taxon in taxa)
    percentage = (count / overall_total_count * 100) if overall_total_count > 0 else 0
    overall_row[habitat.capitalize() + " (count, %)"] = f"{count} ({percentage:.1f}%)"
overall_row["Only 1 habitat"] = overall_habitat_counts[1]
overall_row["Multiple habitats"] = overall_total_count - overall_habitat_counts[1]
main_table.append(overall_row)

# Convert the table to a DataFrame
combined_table_df = pd.DataFrame(main_table)

# Create an additional table showing species counts in 1, 2, 3, etc., habitats per taxon and overall
appendix_data = {}
for taxon in taxa:
    appendix_data[taxon.capitalize()] = dict(multiple_habitat_counts[taxon])
appendix_data["Overall"] = dict(overall_habitat_counts)
appendix_table_df = pd.DataFrame(appendix_data).fillna(0).astype(int)
# Calculate percentages for "Only 1 habitat" and "Multiple habitats"
# Calculate percentages for "Only 1 habitat" and "Multiple habitats"
for index, row in combined_table_df.iterrows():
    total_species = row["Total # of species"]
    only_one_habitat = row["Only 1 habitat"]
    multiple_habitats = row["Multiple habitats"]
    
    # Add percentage values using loc to update the DataFrame directly
    combined_table_df.loc[index, "Only 1 habitat"] = f"{only_one_habitat} ({only_one_habitat / total_species * 100:.1f}%)"
    combined_table_df.loc[index, "Multiple habitats"] = f"{multiple_habitats} ({multiple_habitats / total_species * 100:.1f}%)"

# Save the combined table to a CSV file
combined_table_df.to_csv("/storage/homefs/ch21o450/scripts/climate_Hari_etal_inprep/functions/figures/main_figures/combined_table.csv", index=False)



  combined_table_df.loc[index, "Only 1 habitat"] = f"{only_one_habitat} ({only_one_habitat / total_species * 100:.1f}%)"
  combined_table_df.loc[index, "Multiple habitats"] = f"{multiple_habitats} ({multiple_habitats / total_species * 100:.1f}%)"


In [22]:
combined_table_df

Unnamed: 0,Total # of species,"Forest (count, %)","Natural_land (count, %)","Pasture (count, %)","Cropland (count, %)",Only 1 habitat,Multiple habitats,Taxon
0,2704,2235 (82.7%),2211 (81.8%),642 (23.7%),615 (22.7%),660 (24.4%),2044 (75.6%),Amphibians
1,7260,5770 (79.5%),4644 (64.0%),2096 (28.9%),1987 (27.4%),2622 (36.1%),4638 (63.9%),Bird
2,3939,2907 (73.8%),2210 (56.1%),436 (11.1%),542 (13.8%),2372 (60.2%),1567 (39.8%),Mammals
3,13903,10912 (78.5%),9065 (65.2%),3174 (22.8%),3144 (22.6%),5654 (40.7%),8249 (59.3%),Overall
