## Essential Functions and Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from tifffile import imread
from sklearn.model_selection import train_test_split

# Root folder of the dataset; images and masks are read from its
# 'images' and 'masks' sub-folders respectively.
dataset_dir = 'spotlite_dataset_loca1_date1_dir1'
images_dataset_dir, masks_dataset_dir = (
    os.path.join(dataset_dir, subfolder) for subfolder in ('images', 'masks')
)

def load_dataset_stage(images_dir, masks_dir, stage=1, max_patches=None, seed=42, min_forest_pixels=200):
    """Load the image/mask patch pairs used by one stage of the pipeline.

    Images are the ``*.tif`` files in ``images_dir`` that do not end in
    ``_mask.tif``; masks are the ``*_mask.tif`` files in ``masks_dir``. A pair
    is formed when the image stem equals the mask stem without its trailing
    ``_mask``.

    Stage 1 (background vs. forest):
        * a patch is "forest" when it has at least ``min_forest_pixels``
          pixels of classes 2, 3 or 4;
        * otherwise it is "background" when it has more than 100 pixels of
          classes 1 or 5 (this skips patches that are almost pure class 0);
        * both groups are shuffled and truncated to the size of the smaller
          one so the selection is balanced.

    Stage 2 (species): every patch whose mask contains class 3 or class 4.

    Parameters
    ----------
    images_dir, masks_dir : str
        Directories holding the image and mask TIFF files.
    stage : int
        1 or 2, see above.
    max_patches : int or None
        When truthy, the selection is shuffled and cut to this many patches.
        ``None`` (or 0) keeps everything.
    seed : int
        Seed for the shuffles. NOTE: this reseeds the *global* ``random`` and
        ``numpy.random`` generators, which is visible to the caller.
    min_forest_pixels : int
        Stage 1 threshold for a patch to count as forest.

    Returns
    -------
    (images, masks) : tuple of numpy.ndarray
        Stacked along a new first axis. Images that are channel-first with 3
        or 4 channels are transposed to channel-last; 2-D images are
        replicated to 3 channels.

    Raises
    ------
    ValueError
        If ``stage`` is not 1 or 2, or if no patch ends up selected.
    """
    # Fail early and clearly instead of falling through to an empty selection.
    if stage not in (1, 2):
        raise ValueError(f"stage must be 1 or 2, got {stage!r}")

    random.seed(seed)
    np.random.seed(seed)

    mask_suffix = '_mask'
    img_bases = {
        os.path.splitext(f)[0]: f
        for f in os.listdir(images_dir)
        if f.endswith('.tif') and not f.endswith('_mask.tif')
    }
    # Strip only the trailing '_mask'. Removing every occurrence would break
    # the pairing for images whose own name contains '_mask'.
    msk_bases = {
        os.path.splitext(f)[0][:-len(mask_suffix)]: f
        for f in os.listdir(masks_dir)
        if f.endswith('_mask.tif')
    }
    # Sorted so the order fed to the seeded shuffles is deterministic.
    common = sorted(set(img_bases) & set(msk_bases))

    forest, background = [], []

    for base in common:
        img_path = os.path.join(images_dir, img_bases[base])
        msk_path = os.path.join(masks_dir, msk_bases[base])
        unique_classes, counts = np.unique(imread(msk_path), return_counts=True)
        class_count = dict(zip(unique_classes, counts))

        if stage == 1:
            # Stage 1 - Background vs Forest
            forest_pixels = sum(class_count.get(c, 0) for c in [2, 3, 4])
            background_pixels = sum(class_count.get(c, 0) for c in [1, 5])

            if forest_pixels >= min_forest_pixels:
                forest.append((img_path, msk_path))
            elif background_pixels > 100:  # avoid patches that are completely empty (pure class 0)
                background.append((img_path, msk_path))
        elif 3 in unique_classes or 4 in unique_classes:
            # Stage 2 - only patches containing class 3 or 4 (pine or eucalyptus)
            forest.append((img_path, msk_path))

    if stage == 1:
        # Balance the two groups by truncating both to the smaller size.
        min_len = min(len(forest), len(background))
        random.shuffle(forest)
        random.shuffle(background)
        selected = forest[:min_len] + background[:min_len]
    else:
        selected = forest

    # Optional cap on the number of patches.
    if max_patches:
        random.shuffle(selected)
        selected = selected[:max_patches]

    if not selected:
        # np.stack([]) would raise an unhelpful "need at least one array".
        raise ValueError(
            f"No patches selected for stage {stage} "
            f"({len(common)} image/mask pairs found, "
            f"{len(forest)} forest, {len(background)} background)"
        )

    imgs, msks = [], []
    for img_path, msk_path in selected:
        img = imread(img_path)
        msk = imread(msk_path)

        # Channel-first (3 or 4 bands) -> channel-last.
        if img.ndim == 3 and img.shape[0] in (3, 4):
            img = img.transpose(1, 2, 0)
        # Single-band image -> replicate to 3 channels.
        if img.ndim == 2:
            img = np.stack([img] * 3, axis=-1)

        imgs.append(img)
        msks.append(msk)

    return np.stack(imgs, axis=0), np.stack(msks, axis=0)


## Feature extraction and train/test split

In [6]:
def extract_pixel_samples(X_patches, y_patches, classes=(1, 3, 4), max_per_class=None, seed=42):
    """Turn patches into per-pixel samples for the requested classes.

    For each value in ``classes`` (in the given order) every position where
    ``y_patches`` equals that value is collected from ``X_patches``. When
    ``max_per_class`` is set and a class has more samples than that, a random
    subset of ``max_per_class`` samples is drawn without replacement.

    Parameters
    ----------
    X_patches : array-like
        Feature patches. Its leading dimensions must match ``y_patches``;
        any remaining trailing dimensions are kept as the per-sample feature
        shape.
    y_patches : array-like
        Label patches.
    classes : iterable of int
        Class values to extract. May be empty, in which case empty arrays are
        returned.
    max_per_class : int or None
        Upper bound on samples kept per class; ``None`` (or 0) keeps all.
    seed : int
        Seed for the subsampling. NOTE: this reseeds the *global*
        ``numpy.random`` generator, which is visible to the caller.

    Returns
    -------
    (X_final, y_final) : tuple of numpy.ndarray
        Samples concatenated class by class, and their labels as ``uint8``.

    Raises
    ------
    ValueError
        If the leading shape of ``X_patches`` does not match ``y_patches``.
    """
    X_patches = np.asarray(X_patches)
    y_patches = np.asarray(y_patches)

    # A misaligned mask would otherwise silently select the wrong pixels.
    if X_patches.shape[:y_patches.ndim] != y_patches.shape:
        raise ValueError(
            f"X_patches leading shape {X_patches.shape[:y_patches.ndim]} "
            f"does not match y_patches shape {y_patches.shape}"
        )

    np.random.seed(seed)

    # Start from empty arrays of the right shape/dtype so that an empty
    # `classes` still concatenates cleanly without affecting the result dtype.
    feature_shape = X_patches.shape[y_patches.ndim:]
    X_parts = [np.empty((0,) + feature_shape, dtype=X_patches.dtype)]
    y_parts = [np.empty(0, dtype=np.uint8)]

    for cls in classes:
        # Boolean indexing over the leading axes yields one row per pixel.
        X_cls = X_patches[y_patches == cls]
        y_cls = np.full(X_cls.shape[0], cls, dtype=np.uint8)

        # Subsampling (class balancing)
        if max_per_class and X_cls.shape[0] > max_per_class:
            sel = np.random.choice(X_cls.shape[0], size=max_per_class, replace=False)
            X_cls = X_cls[sel]
            y_cls = y_cls[sel]

        X_parts.append(X_cls)
        y_parts.append(y_cls)

    return np.concatenate(X_parts, axis=0), np.concatenate(y_parts, axis=0)


In [None]:
# Step 1: load the (balanced) Stage 1 patches.
X1_patches, y1_patches = load_dataset_stage(
    images_dataset_dir,
    masks_dataset_dir,
    stage=1,
)

# Step 2: split at patch level, so pixels of one patch never end up in both
# the train and the test set.
from sklearn.model_selection import train_test_split

idx = np.arange(X1_patches.shape[0])
train_idx, test_idx = train_test_split(idx, test_size=0.2, random_state=42)

X1_train_patches = X1_patches[train_idx]
y1_train_patches = y1_patches[train_idx]
X1_test_patches = X1_patches[test_idx]
y1_test_patches = y1_patches[test_idx]

# Step 3: per-pixel samples for classes 1, 3 and 4, capped per class.
X1_train, y1_train = extract_pixel_samples(
    X1_train_patches,
    y1_train_patches,
    classes=[1, 3, 4],
    max_per_class=50000,
)
X1_test, y1_test = extract_pixel_samples(
    X1_test_patches,
    y1_test_patches,
    classes=[1, 3, 4],
    max_per_class=20000,
)


### Train Random Forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import numpy as np
import matplotlib.pyplot as plt

def train_two_stage_rf(X1, y1, X2, y2, forest_params, species_params):
    """Fit the two Random Forests of the two-stage classifier.

    Stage 1 is trained on ``X1`` against a binary target derived from ``y1``:
    label 1 (background) becomes 0 and every other label becomes 1 (forest).
    Stage 2 is trained on ``X2`` against ``y2`` unchanged; the caller is
    expected to pass only the species labels (3 and 4) there.

    ``forest_params`` and ``species_params`` are forwarded as keyword
    arguments to the respective ``RandomForestClassifier`` constructors.

    Returns the fitted ``(forest_clf, species_clf)`` pair.
    """
    # Stage 1: forest vs. background (1 -> 0, anything else -> 1).
    is_forest_target = (y1 != 1).astype(int)
    stage1_model = RandomForestClassifier(**forest_params).fit(X1, is_forest_target)

    # Stage 2: species classifier, labels used as given.
    stage2_model = RandomForestClassifier(**species_params).fit(X2, y2)

    return stage1_model, stage2_model

def predict_two_stage_rf(forest_clf, species_clf, X):
    """Predict final labels by chaining the two classifiers.

    ``forest_clf.predict(X)`` decides, per sample, whether it is forest
    (truthy) or background. Background samples get label 1; forest samples
    get whatever ``species_clf.predict`` returns for them. ``species_clf`` is
    not called at all when no sample is flagged as forest.

    Returns an integer array with one label per row of ``X``.
    """
    forest_mask = forest_clf.predict(X).astype(bool)

    # Everything starts out as background (label 1).
    labels = np.ones(X.shape[0], dtype=int)
    if not forest_mask.any():
        return labels

    labels[forest_mask] = species_clf.predict(X[forest_mask])
    return labels


In [None]:
# Hyper-parameters for the Stage 1 forest (background vs. forest).
forest_params = dict(
    n_estimators=100,
    max_depth=20,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)

# Hyper-parameters for the Stage 2 forest (species); currently the same
# values as Stage 1, kept as a separate dict so they can be tuned separately.
species_params = dict(
    n_estimators=100,
    max_depth=20,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)



# 1. Load patches for Stage 2 (patches whose mask contains class 3 or 4).
#    X2_patches / y2_patches are required by the steps below.
X2_patches, y2_patches = load_dataset_stage(images_dataset_dir, masks_dataset_dir, stage=2)

# 2. Split the patches into train/test (patch-level split)
idx2 = np.arange(X2_patches.shape[0])
train_idx2, test_idx2 = train_test_split(idx2, test_size=0.2, random_state=42)

X2_train_patches = X2_patches[train_idx2]
y2_train_patches = y2_patches[train_idx2]
X2_test_patches  = X2_patches[test_idx2]
y2_test_patches  = y2_patches[test_idx2]

# 3. Extract per-pixel vectors (classes 3 and 4 only)
X2_train, y2_train = extract_pixel_samples(
    X2_train_patches, y2_train_patches,
    classes=[3, 4], max_per_class=30000
)

X2_test, y2_test = extract_pixel_samples(
    X2_test_patches, y2_test_patches,
    classes=[3, 4], max_per_class=15000
)

# 4. Train both stages
forest_clf, species_clf = train_two_stage_rf(
    X1_train, y1_train,  # Stage 1: background vs forest
    X2_train, y2_train,  # Stage 2: pine vs eucalyptus
    forest_params, species_params
)

# Full two-stage prediction on the Stage 1 test pixels (computed once).
y_pred_full = predict_two_stage_rf(forest_clf, species_clf, X1_test)

# Stage 1 - Evaluation
y_forest_true = np.isin(y1_test, [2, 3, 4]).astype(int)
y_forest_pred = forest_clf.predict(X1_test)
print("=== Stage 1: Background vs. Forest ===")
# `labels` is passed explicitly so the report still lines up with
# `target_names` when one class is missing from the test sample.
print(classification_report(
    y_forest_true, y_forest_pred,
    labels=[0, 1],
    target_names=['Background', 'Forest'], digits=4
))

# Stage 2 - Evaluation (isolated where ground truth is forest 3 or 4)
mask_fg = np.isin(y2_test, [3, 4])
X_test_fg = X2_test[mask_fg]
y_test_fg = y2_test[mask_fg]
y_pred_fg = species_clf.predict(X_test_fg)
print("=== Stage 2: Pinus vs. Eucalyptus (isolated) ===")
print(classification_report(
    y_test_fg, y_pred_fg,
    labels=[3, 4],
    target_names=['Pinus', 'Eucalyptus'], digits=4
))

# Final Evaluation  (3 classes)
print("=== Full Pipeline: 3 classes (1=BG, 3=Pinus, 4=Eucalipto) ===")
print(classification_report(
    y1_test, y_pred_full,
    labels=[1, 3, 4],
    target_names=['Background', 'Pinus', 'Eucalyptus'],
    digits=4
))

# Row-normalized confusion matrix of the full pipeline.
ConfusionMatrixDisplay.from_predictions(
    y1_test, y_pred_full,
    labels=[1, 3, 4],
    display_labels=['Background', 'Pinus', 'Eucalyptus'],
    normalize='true',
    values_format='.2f'
)
plt.title("Confusion Matrix — Random Forest 3-class (Normalized)")
plt.show()