In [None]:
import json
import math

import geopandas
import numpy
import pandas
import rasterio
import rasterio.plot
import shapely.geometry
import sklearn.ensemble

import analysis
import data
import datasets
import reproducibility
import satellite

In [None]:
# --- Layer windows ----------------------------------------------------------
# Each START/COUNT pair is forwarded unchanged to satellite.create_features
# and satellite.create_summary_data_frame.
PIXEL_LAYER_START = 0
PIXEL_LAYER_COUNT = 3
AGGREGATE_LAYER_START = 1
AGGREGATE_LAYER_COUNT = 3

# --- Seed sweep -------------------------------------------------------------
# Consumed as range(SEED_MIN, SEED_MAX): SEED_MIN is included, SEED_MAX is not.
SEED_MIN = 0
SEED_MAX = 100

# --- Measurement disambiguation ---------------------------------------------
# Callback handed to satellite.create_summary_data_frame.
# NOTE(review): `target_column` is not a parameter of this lambda. It is looked
# up in the notebook's global namespace at call time, i.e. it is the loop
# variable of the per-target loop, so the callback is only usable while that
# loop is running at global scope. Deliberately kept as a lambda: how the
# consumer uses the callable (including its name) is not visible from here.
MEASUREMENT_DISAMBIGUATION_MODE = lambda series: series.max([target_column])

# --- Targets ----------------------------------------------------------------
# Column names that are evaluated one at a time, in this order.
TARGET_COLUMNS = [
    "SC", "V", "CR", "CO", "NI", "CU", "ZN", "GA", "MO", "W",
    "SN", "SB", "RB", "SR", "Y", "NB", "ZR", "CS", "CD", "BA",
    "LA", "CE", "PR", "ND", "SM", "EU", "GD", "TB", "DY", "HO",
    "ER", "TM", "YB", "LU", "HF", "TA", "PB", "TH", "U", "AU",
    "AG", "S", "AS_", "SE", "TE", "GE", "BI", "TL", "BE", "B",
    "F", "CL", "INDIUM",
]

In [None]:
# Load the satellite image a single time; the per-target loop passes this same
# object to the dataset builder and to both feature/summary helpers.
satellite_image = datasets.satellite_image()

In [None]:
# For every target column: build the feature table, then sweep the train/test
# split seed and record the seed whose test-split F1 score is highest.
#
# Result: target_performance_map[target_column] ->
#     {"max_f1": ..., "max_f1_seed": ..., "positive_threshold": ...}
#
# NOTE(review): the winning seed is selected on the same test split it is
# scored on, so "max_f1" is a best-case figure rather than an unbiased estimate.
target_performance_map = {}

# Same for every target, so it is set once. It is passed to the dataset builder
# and to the evaluator, and stored next to each result.
positive_threshold = 200

# This loop has to stay at notebook (global) scope with the loop variable named
# `target_column`: MEASUREMENT_DISAMBIGUATION_MODE reads that global when called.
for target_column in TARGET_COLUMNS:
    # Skip a target that already has a result. The map is re-created at the top
    # of this cell, so this only triggers for a duplicate entry in
    # TARGET_COLUMNS; it does not resume an interrupted run.
    if target_column in target_performance_map:
        continue

    satellite_data_frame = datasets.geochemical_satellite_analysis(satellite_image, target_column, positive_threshold)
    # Return value is not used; presumably this adds the feature columns to
    # satellite_data_frame in place -- TODO confirm against satellite.create_features.
    satellite.create_features(satellite_data_frame, satellite_image, PIXEL_LAYER_START, PIXEL_LAYER_COUNT, AGGREGATE_LAYER_START, AGGREGATE_LAYER_COUNT)

    index_data_frame = satellite.create_summary_data_frame(
        satellite_data_frame,
        satellite_image,
        target_column,
        MEASUREMENT_DISAMBIGUATION_MODE,
        PIXEL_LAYER_START,
        PIXEL_LAYER_COUNT,
        AGGREGATE_LAYER_START,
        AGGREGATE_LAYER_COUNT,
    )
    x_labels = index_data_frame.x_labels()
    y_labels = index_data_frame.y_labels()

    max_f1_seed = None
    max_f1 = None

    for seed in range(SEED_MIN, SEED_MAX):
        # The same seed drives both the split and the model's random state.
        x_train, y_train, x_test, y_test = data.split_train_test(index_data_frame, x_labels, y_labels, seed=seed)

        histogram = sklearn.ensemble.HistGradientBoostingRegressor(
            loss="squared_error",
            validation_fraction=0.2,
            min_samples_leaf=5,
            random_state=seed,
        ).fit(x_train, y_train)

        y_test_predicted = histogram.predict(x_test)
        # `comparison` is not used in this cell; the name is kept as-is.
        comparison, confusion_matrix = analysis.evaluate_performance(y_test, y_test_predicted, positive_threshold)
        seed_f1 = confusion_matrix.f1_score

        # A NaN best is always replaced, so one undefined F1 cannot block later
        # seeds; a NaN candidate never displaces a numeric best because
        # `nan > x` is False.
        if max_f1 is None or math.isnan(max_f1) or seed_f1 > max_f1:
            max_f1_seed = seed
            max_f1 = seed_f1
            # Report only when the running best changes. Printing after every
            # seed repeated the same line up to SEED_MAX - SEED_MIN times per
            # target without adding information.
            print(f"Target: {target_column} | Max F1: {max_f1} | Seed: {max_f1_seed}")

    target_performance_map[target_column] = {
        "max_f1": max_f1,
        "max_f1_seed": max_f1_seed,
        "positive_threshold": positive_threshold
    }

In [None]:
# Persist the per-target results as JSON in the current working directory.
#
# Serialise to a string first: if a stored value cannot be encoded, json.dumps
# raises before the output file is opened, so an existing results file is not
# truncated by a failed run.
#
# NOTE(review): a float NaN is written as the bare token `NaN` (json's default
# allow_nan=True). Python's json module reads that back, but strict JSON
# parsers reject it; "max_f1" may be NaN if no seed produced a defined F1.
serialized_performance_map = json.dumps(target_performance_map)
with open("Target Performance Map.json", "w", encoding="utf-8") as file:
    file.write(serialized_performance_map)