# Soil parameters estimation

**Thoughts**

Use the mean value of each band per patch as features?

Predict from the mean features, or take the mean of the per-pixel predictions?

Test an SVM as an alternative regressor?

In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from torch.utils.data import DataLoader
from xgboost import XGBRegressor

from src.consts import MAX_PATH
from src.data.dataset import HyperviewDataset
from src.models.modeller import Modeller

## Read model and dataset

In [2]:
# Notebook configuration.
IMG_SIZE = 10  # spatial side length passed to both Modeller and HyperviewDataset
CHANNELS = 150  # number of image channels; also the number of per-image features
K = 5  # passed to Modeller; K * NUM_PARAMS feature maps are aggregated from its output
NUM_PARAMS = 3
MAX_VAL = 6000  # upper clip applied to the loaded maxima and handed to the dataset
BATCH_SIZE = 8

# The checkpoint name encodes k, so it is derived from K to keep the two in sync.
# NOTE(review): assumes the "k=" in the file name is the same K given to Modeller - confirm.
MODEL_PATH = f"output/modeller_var=GaussianRenderer_bias=Mean_k={K}.pth"

In [3]:
# Directory handed to HyperviewDataset, and the CSV holding the ground-truth table.
# NOTE(review): `dir` shadows the Python builtin of the same name for the rest of the
# notebook; renaming it requires a matching change in the cell that builds the dataset.
dir = "data/hyperview/train_data/train_data"
gt_path = "data/hyperview/train_data/train_gt.csv"

In [4]:
# Rebuild the network with the configured geometry before restoring its weights.
model = Modeller(IMG_SIZE, CHANNELS, K, NUM_PARAMS)
# The notebook only runs inference, so switch dropout / batch-norm style layers (if any)
# to evaluation behaviour. Not the last expression, so nothing extra is displayed.
model.eval()
# map_location="cpu": inputs stay on the CPU and outputs are converted with .numpy(),
# so the checkpoint must load on the CPU even if it was saved from a GPU.
model.load_state_dict(torch.load(MODEL_PATH, map_location="cpu"))

<All keys matched successfully>

In [5]:
# Read the stored array of maxima from MAX_PATH.
with open(MAX_PATH, "rb") as max_file:
    maxx = np.load(max_file)
# Cap every entry above MAX_VAL at MAX_VAL (in place); other entries are left untouched.
np.putmask(maxx, maxx > MAX_VAL, MAX_VAL)

In [6]:
# Dataset over the training directory, configured with the patch size, the clip value
# and the clipped maxima loaded earlier.
dataset = HyperviewDataset(
    dir,
    IMG_SIZE,
    MAX_VAL,
    0,
    maxx,
)
# shuffle=False keeps the dataset order, and drop_last=True discards a final incomplete
# batch so that every batch holds exactly BATCH_SIZE samples.
# NOTE(review): the ground-truth rows are later matched by position, so the dataset order
# is presumably the sample_index order - verify.
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=True,
)

In [7]:
def prepare_datasets(dataloader: DataLoader, net=None) -> tuple[list[np.ndarray], list[np.ndarray]]:
    """Run the network over every batch and collect inputs and outputs as NumPy arrays.

    Args:
        dataloader: Iterable yielding one input tensor per batch.
        net: Callable applied to each batch. When omitted, the notebook-level
            ``model`` is used, which keeps existing one-argument calls working.

    Returns:
        A pair ``(imgs, preds)`` of equally long lists: the raw batches and the
        corresponding network outputs, one array per batch.
    """
    if net is None:
        net = model  # network loaded earlier in the notebook
    imgs = []
    preds = []
    # Inference only: disabling autograd avoids building a graph for every batch.
    with torch.no_grad():
        for data in dataloader:
            imgs.append(data.detach().numpy())
            preds.append(net(data).detach().numpy())
    return imgs, preds

Features: the mean of each channel over the patch

In [8]:
def aggregate_features(feature_list: list[np.ndarray], features_num: int, batch_size: int, img_size: int) -> np.ndarray:
    """Collapse per-batch feature maps into one mean value per sample and feature.

    Batches are joined along the sample axis, viewed as
    ``(samples, features_num, img_size, img_size)`` and averaged over the two
    spatial axes. Entries equal to zero are treated as missing and ignored.

    Args:
        feature_list: One array per batch, with samples along the first axis.
        features_num: Number of feature maps per sample.
        batch_size: Kept for backward compatibility. The sample count is taken
            from the data, so a shorter final batch is accepted.
        img_size: Side length of the square feature maps.

    Returns:
        Array of shape ``(samples, features_num)``. A feature map made up
        entirely of zeros yields NaN.
    """
    if len(feature_list) == 0:
        # Same result the stacked-and-reshaped empty array used to give.
        return np.empty((0, features_num), dtype=np.float64)
    # Joining along axis 0 (rather than stacking) tolerates a partial last batch.
    features = np.concatenate([np.asarray(batch) for batch in feature_list], axis=0)
    features = features.reshape(-1, features_num, img_size, img_size)
    if not np.issubdtype(features.dtype, np.floating):
        # NaN cannot be stored in integer arrays.
        features = features.astype(np.float64)
    # `features` is a fresh array here, so the caller's batches are not modified.
    features[features == 0] = np.nan
    return np.nanmean(features, axis=(2, 3))

In [9]:
# One pass over the data yields both the raw batches and the model outputs.
imgs, preds = prepare_datasets(dataloader)

# Per-sample channel means of the input images.
img_means = aggregate_features(
    feature_list=imgs,
    features_num=CHANNELS,
    batch_size=BATCH_SIZE,
    img_size=IMG_SIZE,
)
# Per-sample means of the K * NUM_PARAMS maps produced by the model.
pred_means = aggregate_features(
    feature_list=preds,
    features_num=K * NUM_PARAMS,
    batch_size=BATCH_SIZE,
    img_size=IMG_SIZE,
)

In [10]:
# Load the ground-truth table from CSV and preview its first rows.
gt = pd.read_csv(gt_path)
gt.head()

Unnamed: 0,sample_index,P,K,Mg,pH
0,0,45.1,188.0,179.0,7.2
1,1,44.8,205.0,188.0,7.0
2,2,44.4,207.0,145.0,6.8
3,3,46.5,204.0,143.0,6.8
4,4,52.0,212.0,167.0,6.7


In [11]:
# Keep only as many ground-truth rows as there are feature rows (matched by position) and
# drop the identifier column so that only the target columns remain.
# errors="ignore" makes the cell safe to re-run: a second execution would otherwise raise
# KeyError because "sample_index" has already been removed.
gt = gt.iloc[: img_means.shape[0]].drop(columns=["sample_index"], errors="ignore")
assert len(gt) == img_means.shape[0], "fewer ground-truth rows than feature rows"

In [15]:
# Two alternative feature matrices (model outputs vs. raw images) sharing one target table.
x_pred, x_img, y = pred_means, img_means, gt

## Predictions using XGBoost

In [16]:
def predict_params(
    x: np.ndarray,
    y: np.ndarray,
    test_size: float = 0.2,
    random_state: int = 42,
    n_estimators: int = 100,
    learning_rate: float = 1e-3,
) -> np.ndarray:
    """Fit one XGBoost regressor per target and report the hold-out MSE per target.

    Args:
        x: Feature matrix, one row per sample.
        y: Targets, one row per sample and one column per target (array-like).
        test_size: Fraction of the samples held out for evaluation.
        random_state: Seed for the train/test split.
        n_estimators: Number of boosting rounds for each regressor.
        learning_rate: Shrinkage applied to every boosting round.

    Returns:
        One mean squared error per target column, in column order.

    NOTE(review): with the default learning_rate=1e-3 and only 100 rounds the booster can
    move just a small way from its initial prediction, so results for different feature
    sets will look alike - consider raising one of the two.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    # Named `regressor` so it does not shadow the notebook-level torch `model`.
    regressor = MultiOutputRegressor(
        XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate)
    )
    regressor.fit(x_train, y_train)
    predictions = regressor.predict(x_test)
    # multioutput="raw_values" keeps one error per target instead of averaging them.
    return mean_squared_error(y_test, predictions, multioutput="raw_values")

In [22]:
# Same evaluation for both feature sets: raw-image means first, then model-output means.
mse_img, mse_pred = [predict_params(feature_matrix, y) for feature_matrix in (x_img, x_pred)]

In [26]:
# Build the comparison table in one step (one row per feature set, one column per target)
# rather than growing an empty frame row by row, so the columns get a numeric dtype.
results = pd.DataFrame(
    np.vstack([mse_img, mse_pred]),
    index=["Original image", "Modeller output"],
    columns=gt.columns,
)
results

Unnamed: 0,P,K,Mg,pH
Original image,589.001784,3256.663636,1575.077108,0.064819
Modeller output,583.282931,3234.273224,1580.202594,0.065491
