# Decision Tree Results & QAQC

In [None]:
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot as plt
sys.path.append('../src/')
import image_availability as img
import clean_raw_data as clean

%load_ext autoreload
%autoreload 2

In [None]:
# Load the decision results CSV into the DataFrame used by every cell below.
# NOTE(review): the file name is spelled "decisons" — confirm it matches the file on disk.
df = pd.read_csv("../data/results/decisons_yr0.csv")

In [None]:
# (number of rows, number of columns) of the loaded table.
df.shape

In [None]:
# Print the column names, dtypes and non-null counts to spot missing data.
df.info()

In [None]:
# List the distinct target-system values (a missing value shows up as NaN).
df['target_sys'].unique()

In [None]:
# Show every row that has no target system recorded.
missing_target_sys = df['target_sys'].isna()
df.loc[missing_target_sys]

### What is the tree cover distribution for each target system?

In [None]:
# Plot one tree-cover ('ttc') histogram per target system on a two-column grid,
# colouring the values at or below the 40% split differently from those above it.

# Rows without a target system are left out: `df['target_sys'] == NaN` never
# matches anything, so a NaN entry would only produce an empty panel titled "nan".
target_sys_list = df['target_sys'].dropna().unique()

# Determine the number of rows needed for 2 columns
n_cols = 2
# Keep at least one row: plt.subplots rejects a grid with zero rows.
n_rows = max(1, int(np.ceil(len(target_sys_list) / n_cols)))

# Set up the figure and axes for subplots with 2 columns
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 5 * n_rows))

# Flatten axes in case it's a 2D array (needed for easy iteration)
axes = axes.flatten()

# Shared by every panel, so defined once instead of on each iteration.
bins = np.linspace(0, 100, 11)  # 10 bins from 0 to 100
ttc_split = 40.0

# Loop through each target_sys and create a histogram
for i, target_sys in enumerate(target_sys_list):
    ax = axes[i]

    # Tree-cover values for the current target_sys only
    ttc = df.loc[df['target_sys'] == target_sys, 'ttc']

    # Split at the threshold; a value of exactly 40 belongs to the lower group
    ttc_low = ttc[ttc <= ttc_split]
    ttc_high = ttc[ttc > ttc_split]

    # Draw both groups on the same bins so the bars line up
    ax.hist(ttc_low, bins=bins, color='blue', label=f'ttc <= {ttc_split:g}', alpha=0.7)
    ax.hist(ttc_high, bins=bins, color='orange', label=f'ttc > {ttc_split:g}', alpha=0.7)

    # Add titles and labels
    ax.set_title(f'{target_sys}')
    ax.set_xlabel('tree cover (%)')
    ax.set_ylabel('Frequency')

    # Add a legend to differentiate the colors
    ax.legend()

# Remove any extra empty subplots (if the number of target_sys is odd)
for j in range(len(target_sys_list), len(axes)):
    fig.delaxes(axes[j])

# Adjust layout for clarity
plt.tight_layout()
plt.show()

### How many projects are closed canopy but don't have enough baseline images for remote verification?

### What does the distribution of images look like?

In [None]:
# Draw a 15-bin histogram of the 'baseline_img', 'ev_img' and 'ttc' columns,
# one panel per column on a two-row grid.
columns = ['baseline_img', 'ev_img', 'ttc']
num_plots = len(columns)

# Two rows, with enough columns to hold every panel (num_plots / 2, rounded up).
fig, axes = plt.subplots(2, (num_plots + 1) // 2, figsize=(15, 10))
axes = axes.flatten()

for i, col in enumerate(columns):
    panel = axes[i]
    df[col].plot(kind='hist', bins=15, ax=panel, title=f'{col} availability', color='#2a9d8f')
    panel.set_xlabel(col)
    panel.set_ylabel('Frequency')

# Drop the grid cells that did not receive a histogram.
for unused_panel in axes[num_plots:]:
    fig.delaxes(unused_panel)

plt.tight_layout()

### What does the distribution of target systems and practices look like?

In [None]:
# Horizontal bar charts of how many rows fall under each value of
# 'target_sys' and 'practice', one panel per column.
columns = ['target_sys', 'practice']
num_plots = len(columns)

# Two rows, with enough columns to hold every panel (num_plots / 2, rounded up).
fig, axes = plt.subplots(2, (num_plots + 1) // 2, figsize=(13, 7))
axes = axes.flatten()

colors = ['#2a9d8f', '#e76f51', '#f4a261', '#264653']

for i, col in enumerate(columns):
    ax = axes[i]
    panel_title = col.replace("_", " ").title()
    bar_color = colors[i % len(colors)]  # cycle through the palette

    df[col].value_counts().plot(kind='barh', ax=ax, title=panel_title, color=bar_color)
    ax.set_xlabel('Polygon count', fontsize=12)
    ax.set_ylabel('')

    # Larger tick labels and dashed vertical gridlines for readability
    ax.tick_params(axis='both', which='major', labelsize=10)
    ax.grid(True, axis='x', linestyle='--', alpha=0.7)

    # Print each bar's count at its tip
    for bars in ax.containers:
        ax.bar_label(bars, label_type='edge', padding=3, fontsize=10, color='black')

# Drop the grid cells that did not receive a chart.
for unused_ax in axes[num_plots:]:
    fig.delaxes(unused_ax)

# Adjust layout and spacing
plt.tight_layout(pad=3.0)
plt.subplots_adjust(wspace=0.3, hspace=0.5)
plt.show()

In [None]:
# NOTE(review): neither `method_piechart_perprj` nor `final` is defined or
# imported in the cells above — confirm both exist before running this cell,
# otherwise it raises NameError.
method_piechart_perprj(final)