In [None]:
# Vegetation index dataset construction

In [19]:
import pandas as pd
import numpy as np
import os
import re
import time
from tqdm import tqdm

# ==================== Path configuration ====================
input_dir = r"E:\jupyter_data\Hapke\Submit\data\processed\analyzes"
output_dir = r"E:\jupyter_data\Hapke\Submit\data\processed\vegetation_indices"
os.makedirs(output_dir, exist_ok=True)

start_time = time.time()

# Wavelength to column name mapping (optional)
wavelength_to_band = {wl: f'{wl}nm' for wl in range(390, 966, 4)}

# ==================== Get band column name accurately ====================
def get_band_column(df, wavelength, prefix):
    """Exact match: prefix_390nm"""
    target_col = f"{prefix}_{int(wavelength)}nm"
    if target_col in df.columns:
        return target_col

    # Fuzzy match to the nearest wavelength
    pattern = re.compile(rf"^{re.escape(prefix)}_(\d+)nm$")
    available = []
    for col in df.columns:
        match = pattern.match(col)
        if match:
            wl = int(match.group(1))
            available.append((wl, col))

    if not available:
        raise ValueError(f"Prefix '{prefix}' has no matching band column")

    closest_wl, closest_col = min(available, key=lambda x: abs(x[0] - wavelength))
    return closest_col

# ==================== Range mean ====================
def get_range_mean(df, start_wl, end_wl, prefix):
    pattern = re.compile(rf"^{re.escape(prefix)}_(\d+)nm$")
    cols_in_range = []
    for col in df.columns:
        match = pattern.match(col)
        if match:
            wl = int(match.group(1))
            if start_wl <= wl <= end_wl:
                cols_in_range.append(col)
    if not cols_in_range:
        raise ValueError(f"Prefix '{prefix}' has no band in {start_wl}-{end_wl}nm")
    return df[cols_in_range].mean(axis=1)

# ==================== Vegetation index formulas ====================
vegetation_indices = [
    {'name': 'NDVI', 'formula': lambda df, p: (df[get_band_column(df, 860, p)] - df[get_band_column(df, 670, p)]) / (df[get_band_column(df, 860, p)] + df[get_band_column(df, 670, p)] + 1e-10)},
    {'name': 'ND800_680', 'formula': lambda df, p: (df[get_band_column(df, 800, p)] - df[get_band_column(df, 680, p)]) / (df[get_band_column(df, 800, p)] + df[get_band_column(df, 680, p)] + 1e-10)},
    {'name': 'mND_705', 'formula': lambda df, p: (df[get_band_column(df, 750, p)] - df[get_band_column(df, 705, p)]) / (df[get_band_column(df, 750, p)] + df[get_band_column(df, 705, p)] - 2 * df[get_band_column(df, 445, p)] + 1e-10)},
    {'name': 'DD', 'formula': lambda df, p: (df[get_band_column(df, 749, p)] - df[get_band_column(df, 720, p)]) - (df[get_band_column(df, 701, p)] - df[get_band_column(df, 672, p)])},
    {'name': 'mSR_705', 'formula': lambda df, p: (df[get_band_column(df, 750, p)] - df[get_band_column(df, 445, p)]) / (df[get_band_column(df, 705, p)] - df[get_band_column(df, 445, p)] + 1e-10)},
    {'name': 'DATT', 'formula': lambda df, p: df[get_band_column(df, 672, p)] / (df[get_band_column(df, 708, p)] * df[get_band_column(df, 550, p)] + 1e-10)},
    {'name': 'Chl_Rred_edge', 'formula': lambda df, p: ((get_range_mean(df, 750, 800, p) - get_range_mean(df, 430, 470, p)) / (get_range_mean(df, 695, 740, p) - get_range_mean(df, 430, 470, p) + 1e-10)) - 1},
    {'name': 'TBI_13', 'formula': lambda df, p: (df[get_band_column(df, 914, p)] - df[get_band_column(df, 822, p)]) / (df[get_band_column(df, 822, p)] - df[get_band_column(df, 774, p)] + 1e-10)},
    {'name': 'TBI_24', 'formula': lambda df, p: (df[get_band_column(df, 822, p)] - df[get_band_column(df, 778, p)]) / (df[get_band_column(df, 822, p)] - 2 * df[get_band_column(df, 778, p)] + df[get_band_column(df, 918, p)] + 1e-10)},  
    {'name': 'TBI_25', 'formula': lambda df, p: (df[get_band_column(df, 918, p)] - df[get_band_column(df, 778, p)]) / (df[get_band_column(df, 918, p)] - 2 * df[get_band_column(df, 778, p)] + df[get_band_column(df, 822, p)] + 1e-10)},
    ]

# ==================== Calculate indices ====================
def calculate_vegetation_indices(df, prefix):
    results = {}
    for idx in vegetation_indices:
        name = idx['name']
        try:
            val = idx['formula'](df, prefix)
            val = val.replace([np.inf, -np.inf], np.nan).fillna(0)
            results[name] = val
        except Exception as e:
            print(f"  [Warning] {prefix}_{name} calculation failed: {e}")
            results[name] = pd.Series(0, index=df.index)
    return pd.DataFrame(results)

# ==================== Main program ====================
csv_files = ['ave.csv', 'ave_a.csv', 'ave_b.csv', 'sun.csv', 'sun_a.csv', 'sun_b.csv', 'w.csv']
print(f"Processing {len(csv_files)} files: {csv_files}")

for input_filename in tqdm(csv_files, desc="Processing files"):
    input_file = os.path.join(input_dir, input_filename)
    output_file = os.path.join(output_dir, input_filename.replace('.csv', '_indices.csv'))

    if not os.path.exists(input_file):
        print(f"File does not exist: {input_filename}")
        continue

    try:
        df = pd.read_csv(input_file)
    except Exception as e:
        print(f"Failed to read {input_filename}: {e}")
        continue

    if 'New_Tree_ID' not in df.columns or 'AGB' not in df.columns:
        print(f"Missing required columns: {input_filename}")
        continue

    # Accurately extract all prefixes (e.g., sun_a)
    band_columns = [col for col in df.columns if re.search(r'_\d+nm$', col)]
    if not band_columns:
        print(f"No band columns: {input_filename}")
        continue

    prefixes = set(col.rsplit('_', 1)[0] for col in band_columns)
    print(f"\nProcessing: {input_filename} | Prefixes: {prefixes}")

    output_df = df[['New_Tree_ID', 'AGB']].copy()

    for prefix in prefixes:
        try:
            indices_df = calculate_vegetation_indices(df, prefix)
            indices_df.columns = [f"{prefix}_{col}" for col in indices_df.columns]
            output_df = pd.concat([output_df, indices_df], axis=1)
            print(f"  {prefix} index calculation completed")
        except Exception as e:
            print(f"  {prefix} failed: {e}")

    if len(output_df.columns) > 2:
        output_df.to_csv(output_file, index=False)
        print(f"Saved: {output_file}")
    else:
        print(f"No valid indices: {input_filename}")

total_time = time.time() - start_time
print(f"\nTotal time: {total_time:.2f} seconds")

Processing 7 files: ['ave.csv', 'ave_a.csv', 'ave_b.csv', 'sun.csv', 'sun_a.csv', 'sun_b.csv', 'w.csv']


Processing files: 100%|██████████| 7/7 [00:00<00:00, 73.92it/s]


Processing: ave.csv | Prefixes: {'ave'}
  ave index calculation completed
Saved: E:\jupyter_data\Hapke\Submit\data\processed\vegetation_indices\ave_indices.csv

Processing: ave_a.csv | Prefixes: {'ave_a'}
  ave_a index calculation completed
Saved: E:\jupyter_data\Hapke\Submit\data\processed\vegetation_indices\ave_a_indices.csv

Processing: ave_b.csv | Prefixes: {'ave_b'}
  ave_b index calculation completed
Saved: E:\jupyter_data\Hapke\Submit\data\processed\vegetation_indices\ave_b_indices.csv

Processing: sun.csv | Prefixes: {'sun'}
  sun index calculation completed
Saved: E:\jupyter_data\Hapke\Submit\data\processed\vegetation_indices\sun_indices.csv

Processing: sun_a.csv | Prefixes: {'sun_a'}
  sun_a index calculation completed
Saved: E:\jupyter_data\Hapke\Submit\data\processed\vegetation_indices\sun_a_indices.csv

Processing: sun_b.csv | Prefixes: {'sun_b'}
  sun_b index calculation completed
Saved: E:\jupyter_data\Hapke\Submit\data\processed\vegetation_indices\sun_b_indices.csv

P




In [20]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ==============================
# 1. Read data
# ==============================
file_path = r"E:\jupyter_data\Hapke\Submit\data\processed\vegetation_indices\w_indices.csv"
df = pd.read_csv(file_path)

# ==============================
# 2. Select independent variables (all columns starting with w_) and dependent variable
# ==============================
veg_cols = [col for col in df.columns if col.startswith('w_')]
X_dict = {col: df[[col]] for col in veg_cols}
y = df['AGB'].values  # convert to numpy array for easier calculation later

# ==============================
# 3. Cross-validation settings
# ==============================
kf = KFold(n_splits=5, shuffle=True, random_state=80)
model = LinearRegression()
results = []

# ==============================
# 4. Perform 5-fold CV for each vegetation index
# ==============================
for idx_name, X in X_dict.items():
    r2_scores = []
    rmse_scores = []
    mae_scores = []

    # Manual loop to obtain predictions
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        r2_scores.append(r2_score(y_test, y_pred))
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae_scores.append(mean_absolute_error(y_test, y_pred))

    # Calculate mean and standard deviation
    r2_mean, r2_std = np.mean(r2_scores), np.std(r2_scores)
    rmse_mean, rmse_std = np.mean(rmse_scores), np.std(rmse_scores)
    mae_mean, mae_std = np.mean(mae_scores), np.std(mae_scores)
    rrmse = (rmse_mean / np.mean(y)) * 100

    # Fit regression equation on all data
    model.fit(X, y)
    coef = model.coef_[0]
    intercept = model.intercept_
    equation = f"AGB = {coef:.4f} * {idx_name} + {intercept:.4f}"

    results.append({
        'Vegetation Index': idx_name,
        'R² (mean ± std)': f"{r2_mean:.4f} ± {r2_std:.4f}",
        'RMSE (mean ± std)': f"{rmse_mean:.4f} ± {rmse_std:.4f}",
        'MAE (mean ± std)': f"{mae_mean:.4f} ± {mae_std:.4f}",
        'RRMSE (%)': f"{rrmse:.2f}",
        'Regression Equation': equation
    })

# ==============================
# 5. Aggregate and print results
# ==============================
results_df = pd.DataFrame(results)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)

print("\n===== 5-fold Cross Validation Results for Each Vegetation Index (R², RMSE, MAE) =====")
print(results_df.to_string(index=False))
print("===============================================================================\n")

# ==============================
# 6. Save as CSV file (auto-create directory)
# ==============================
output_csv_path = r"E:\jupyter_data\Hapke\Submit\data\processed\CV_results\w_indices_CV_results.csv"
output_dir = os.path.dirname(output_csv_path)
os.makedirs(output_dir, exist_ok=True)  # Auto-create directory

results_df.to_csv(output_csv_path, index=False, encoding='utf_8_sig')

print(f"Results have been successfully saved to:\n{output_csv_path}\n")


===== 5-fold Cross Validation Results for Each Vegetation Index (R², RMSE, MAE) =====
Vegetation Index  R² (mean ± std) RMSE (mean ± std) MAE (mean ± std) RRMSE (%)                         Regression Equation
          w_NDVI -0.1020 ± 0.4019  28.2421 ± 4.8085 22.8255 ± 3.2976     42.08          AGB = -93.0395 * w_NDVI + 133.1445
     w_ND800_680 -0.0555 ± 0.2918  27.8363 ± 4.1097 22.6741 ± 3.1391     41.47     AGB = -89.7302 * w_ND800_680 + 127.2230
       w_mND_705  0.0633 ± 0.1635  26.4479 ± 3.9292 21.7691 ± 3.5933     39.40      AGB = -130.7876 * w_mND_705 + 119.8107
            w_DD  0.0971 ± 0.1747  25.9509 ± 4.0845 21.3733 ± 3.8164     38.66            AGB = -133.7566 * w_DD + 77.8593
       w_mSR_705  0.0933 ± 0.1836  26.0287 ± 4.3947 21.4282 ± 4.1623     38.78       AGB = -25.9205 * w_mSR_705 + 129.3041
          w_DATT  0.1587 ± 0.1188  25.1100 ± 3.5626 20.8628 ± 3.3698     37.41          AGB = -30.3898 * w_DATT + 124.1484
 w_Chl_Rred_edge  0.1198 ± 0.1781  25.6384 ± 4.3051 

In [21]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ==============================
# 1. Read data
# ==============================
file_path = r"E:\jupyter_data\Hapke\Submit\data\processed\vegetation_indices\ave_indices.csv"
df = pd.read_csv(file_path)

# ==============================
# 2. Select independent variables (all columns starting with ave_) and dependent variable
# ==============================
veg_cols = [col for col in df.columns if col.startswith('ave_')]
X_dict = {col: df[[col]] for col in veg_cols}
y = df['AGB'].values  # Convert to numpy array for later computation

# ==============================
# 3. Cross-validation settings
# ==============================
kf = KFold(n_splits=5, shuffle=True, random_state=40)
model = LinearRegression()
results = []

# ==============================
# 4. 5-fold CV for each vegetation index
# ==============================
for idx_name, X in X_dict.items():
    r2_scores = []
    rmse_scores = []
    mae_scores = []

    # Manually loop to get predictions
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        r2_scores.append(r2_score(y_test, y_pred))
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae_scores.append(mean_absolute_error(y_test, y_pred))

    # Calculate mean and standard deviation
    r2_mean, r2_std = np.mean(r2_scores), np.std(r2_scores)
    rmse_mean, rmse_std = np.mean(rmse_scores), np.std(rmse_scores)
    mae_mean, mae_std = np.mean(mae_scores), np.std(mae_scores)
    rrmse = (rmse_mean / np.mean(y)) * 100

    # Fit regression equation on all data
    model.fit(X, y)
    coef = model.coef_[0]
    intercept = model.intercept_
    equation = f"AGB = {coef:.4f} * {idx_name} + {intercept:.4f}"

    results.append({
        'Vegetation Index': idx_name,
        'R² (mean ± std)': f"{r2_mean:.4f} ± {r2_std:.4f}",
        'RMSE (mean ± std)': f"{rmse_mean:.4f} ± {rmse_std:.4f}",
        'MAE (mean ± std)': f"{mae_mean:.4f} ± {mae_std:.4f}",
        'RRMSE (%)': f"{rrmse:.2f}",
        'Regression Equation': equation
    })

# ==============================
# 5. Aggregate and print results
# ==============================
results_df = pd.DataFrame(results)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)

print("\n===== 5-fold Cross Validation Results for Each Vegetation Index (R², RMSE, MAE) =====")
print(results_df.to_string(index=False))
print("===============================================================================\n")

# ==============================
# 6. Save as CSV file (auto-create directory)
# ==============================
output_csv_path = r"E:\jupyter_data\Hapke\Submit\data\processed\CV_results\ave_indices_CV_results.csv"
output_dir = os.path.dirname(output_csv_path)
os.makedirs(output_dir, exist_ok=True)  # Auto-create directory

results_df.to_csv(output_csv_path, index=False, encoding='utf_8_sig')

print(f"Results have been successfully saved to:\n{output_csv_path}\n")


===== 5-fold Cross Validation Results for Each Vegetation Index (R², RMSE, MAE) =====
 Vegetation Index  R² (mean ± std) RMSE (mean ± std) MAE (mean ± std) RRMSE (%)                           Regression Equation
         ave_NDVI  0.0623 ± 0.1165  26.0041 ± 7.7979 22.0249 ± 8.1663     38.74         AGB = -339.3562 * ave_NDVI + 358.0411
    ave_ND800_680  0.0113 ± 0.1098  26.6602 ± 7.7307 22.6308 ± 8.1832     39.72    AGB = -202.7852 * ave_ND800_680 + 233.2780
      ave_mND_705 -0.0554 ± 0.0862  27.4673 ± 7.5733 23.8255 ± 7.6338     40.92      AGB = -109.3745 * ave_mND_705 + 129.9792
           ave_DD -0.1187 ± 0.1171  28.0197 ± 6.6613 24.5545 ± 6.9585     41.74              AGB = 70.5895 * ave_DD + 63.0776
      ave_mSR_705 -0.0458 ± 0.0839  27.3431 ± 7.5160 23.6487 ± 7.6001     40.74       AGB = -11.0341 * ave_mSR_705 + 108.1784
         ave_DATT  0.0026 ± 0.1195  26.4297 ± 6.2533 23.5867 ± 6.2966     39.38           AGB = -2.9561 * ave_DATT + 112.3888
ave_Chl_Rred_edge -0.0009 ± 0.1

In [22]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ==============================
# 1. Read data
# ==============================
file_path = r"E:\jupyter_data\Hapke\Submit\data\processed\vegetation_indices\ave_a_indices.csv"
df = pd.read_csv(file_path)

# ==============================
# 2. Select independent variables (all columns starting with ave_a_) and dependent variable
# ==============================
veg_cols = [col for col in df.columns if col.startswith('ave_a_')]
X_dict = {col: df[[col]] for col in veg_cols}
y = df['AGB'].values  # Convert to numpy array for later calculations

# ==============================
# 3. Cross-validation settings
# ==============================
kf = KFold(n_splits=5, shuffle=True, random_state=40)
model = LinearRegression()
results = []

# ==============================
# 4. 5-fold CV for each vegetation index
# ==============================
for idx_name, X in X_dict.items():
    r2_scores = []
    rmse_scores = []
    mae_scores = []

    # Manual loop to also get predictions
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        r2_scores.append(r2_score(y_test, y_pred))
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae_scores.append(mean_absolute_error(y_test, y_pred))

    # Calculate mean and standard deviation
    r2_mean, r2_std = np.mean(r2_scores), np.std(r2_scores)
    rmse_mean, rmse_std = np.mean(rmse_scores), np.std(rmse_scores)
    mae_mean, mae_std = np.mean(mae_scores), np.std(mae_scores)
    rrmse = (rmse_mean / np.mean(y)) * 100

    # Fit regression on all data
    model.fit(X, y)
    coef = model.coef_[0]
    intercept = model.intercept_
    equation = f"AGB = {coef:.4f} * {idx_name} + {intercept:.4f}"

    results.append({
        'Vegetation Index': idx_name,
        'R² (mean ± std)': f"{r2_mean:.4f} ± {r2_std:.4f}",
        'RMSE (mean ± std)': f"{rmse_mean:.4f} ± {rmse_std:.4f}",
        'MAE (mean ± std)': f"{mae_mean:.4f} ± {mae_std:.4f}",
        'RRMSE (%)': f"{rrmse:.2f}",
        'Regression Equation': equation
    })

# ==============================
# 5. Aggregate and print results
# ==============================
results_df = pd.DataFrame(results)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)

print("\n===== 5-fold Cross Validation Results for Each Vegetation Index (R², RMSE, MAE) =====")
print(results_df.to_string(index=False))
print("===============================================================================\n")

# ==============================
# 6. Save as CSV file (auto-create directory)
# ==============================
output_csv_path = r"E:\jupyter_data\Hapke\Submit\data\processed\CV_results\ave_a_indices_CV_results.csv"
output_dir = os.path.dirname(output_csv_path)
os.makedirs(output_dir, exist_ok=True)  # Auto-create directory

results_df.to_csv(output_csv_path, index=False, encoding='utf_8_sig')

print(f"Results have been successfully saved to:\n{output_csv_path}\n")


===== 5-fold Cross Validation Results for Each Vegetation Index (R², RMSE, MAE) =====
   Vegetation Index  R² (mean ± std) RMSE (mean ± std) MAE (mean ± std) RRMSE (%)                             Regression Equation
         ave_a_NDVI  0.1791 ± 0.0797  24.1986 ± 6.6944 20.4418 ± 6.9209     36.05         AGB = -425.4891 * ave_a_NDVI + 426.7436
    ave_a_ND800_680  0.1576 ± 0.1038  24.5717 ± 7.0897 20.7041 ± 7.1869     36.61    AGB = -339.5800 * ave_a_ND800_680 + 340.7989
      ave_a_mND_705  0.1342 ± 0.1761  24.9390 ± 7.8387 20.7374 ± 7.5849     37.15      AGB = -294.6016 * ave_a_mND_705 + 232.4720
           ave_a_DD -0.0329 ± 0.1441  27.1232 ± 7.4328 22.8596 ± 7.2009     40.41            AGB = -433.1204 * ave_a_DD + 91.8282
      ave_a_mSR_705  0.1336 ± 0.1776  24.9220 ± 7.7405 20.7002 ± 7.5185     37.13       AGB = -28.7186 * ave_a_mSR_705 + 170.1083
         ave_a_DATT  0.0128 ± 0.1058  26.3210 ± 6.3317 23.2842 ± 6.8069     39.21            AGB = -1.9870 * ave_a_DATT + 95.2829
ave

In [23]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ==============================
# 1. Load data
# ==============================
file_path = r"E:\jupyter_data\Hapke\Submit\data\processed\vegetation_indices\ave_b_indices.csv"
df = pd.read_csv(file_path)

# ==============================
# 2. Select independent variables (columns starting with ave_b_) and dependent variable
# ==============================
veg_cols = [col for col in df.columns if col.startswith('ave_b_')]
X_dict = {col: df[[col]] for col in veg_cols}
y = df['AGB'].values  # convert to numpy array for later calculation

# ==============================
# 3. Set up cross-validation
# ==============================
kf = KFold(n_splits=5, shuffle=True, random_state=40)
model = LinearRegression()
results = []

# ==============================
# 4. 5-fold CV for each vegetation index
# ==============================
for idx_name, X in X_dict.items():
    r2_scores = []
    rmse_scores = []
    mae_scores = []

    # Manual loop to get prediction values
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        r2_scores.append(r2_score(y_test, y_pred))
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae_scores.append(mean_absolute_error(y_test, y_pred))

    # Calculate mean and standard deviation
    r2_mean, r2_std = np.mean(r2_scores), np.std(r2_scores)
    rmse_mean, rmse_std = np.mean(rmse_scores), np.std(rmse_scores)
    mae_mean, mae_std = np.mean(mae_scores), np.std(mae_scores)
    rrmse = (rmse_mean / np.mean(y)) * 100

    # Fit regression on all data
    model.fit(X, y)
    coef = model.coef_[0]
    intercept = model.intercept_
    equation = f"AGB = {coef:.4f} * {idx_name} + {intercept:.4f}"

    results.append({
        'Vegetation Index': idx_name,
        'R² (mean ± std)': f"{r2_mean:.4f} ± {r2_std:.4f}",
        'RMSE (mean ± std)': f"{rmse_mean:.4f} ± {rmse_std:.4f}",
        'MAE (mean ± std)': f"{mae_mean:.4f} ± {mae_std:.4f}",
        'RRMSE (%)': f"{rrmse:.2f}",
        'Regression Equation': equation
    })

# ==============================
# 5. Aggregate and output results
# ==============================
results_df = pd.DataFrame(results)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)

print("\n===== 5-fold Cross Validation Results for Each Vegetation Index (R², RMSE, MAE) =====")
print(results_df.to_string(index=False))
print("===============================================================================\n")

# ==============================
# 6. Save as CSV file (auto-create directory)
# ==============================
output_csv_path = r"E:\jupyter_data\Hapke\Submit\data\processed\CV_results\ave_b_indices_CV_results.csv"
output_dir = os.path.dirname(output_csv_path)
os.makedirs(output_dir, exist_ok=True)  # Auto-create directory

results_df.to_csv(output_csv_path, index=False, encoding='utf_8_sig')

print(f"Results have been successfully saved to:\n{output_csv_path}\n")


===== 5-fold Cross Validation Results for Each Vegetation Index (R², RMSE, MAE) =====
   Vegetation Index  R² (mean ± std) RMSE (mean ± std) MAE (mean ± std) RRMSE (%)                            Regression Equation
         ave_b_NDVI -0.0846 ± 0.0577  27.6469 ± 6.6575 24.4647 ± 6.6258     41.19         AGB = 151.9578 * ave_b_NDVI + -65.5581
    ave_b_ND800_680 -0.0711 ± 0.0723  27.3576 ± 6.0822 24.2566 ± 5.9064     40.76   AGB = 231.0097 * ave_b_ND800_680 + -126.0662
      ave_b_mND_705 -0.0738 ± 0.1623  27.2491 ± 5.7479 23.8882 ± 5.7429     40.60     AGB = 293.4918 * ave_b_mND_705 + -106.2535
           ave_b_DD -0.0566 ± 0.1844  27.0693 ± 6.1264 23.5014 ± 6.0711     40.33            AGB = 593.2969 * ave_b_DD + 32.9726
      ave_b_mSR_705 -0.0739 ± 0.1718  27.2360 ± 5.7443 23.8776 ± 5.7259     40.58       AGB = 25.2319 * ave_b_mSR_705 + -31.4940
         ave_b_DATT -0.0150 ± 0.0745  26.8247 ± 6.8095 23.4800 ± 6.4564     39.96           AGB = -1.2782 * ave_b_DATT + 89.3577
ave_b_Chl_

In [24]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ==============================
# 1. Read data
# ==============================
file_path = r"E:\jupyter_data\Hapke\Submit\data\processed\vegetation_indices\sun_indices.csv"
df = pd.read_csv(file_path)

# ==============================
# 2. Select independent variables (all columns starting with sun_) and dependent variable
# ==============================
veg_cols = [col for col in df.columns if col.startswith('sun_')]
X_dict = {col: df[[col]] for col in veg_cols}
y = df['AGB'].values  # Convert to numpy array for easier computation

# ==============================
# 3. Cross-validation settings
# ==============================
kf = KFold(n_splits=5, shuffle=True, random_state=40)
model = LinearRegression()
results = []

# ==============================
# 4. 5-fold cross-validation for each vegetation index
# ==============================
for idx_name, X in X_dict.items():
    r2_scores = []
    rmse_scores = []
    mae_scores = []

    # Manual loop to also get predictions
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        r2_scores.append(r2_score(y_test, y_pred))
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae_scores.append(mean_absolute_error(y_test, y_pred))

    # Calculate mean and std
    r2_mean, r2_std = np.mean(r2_scores), np.std(r2_scores)
    rmse_mean, rmse_std = np.mean(rmse_scores), np.std(rmse_scores)
    mae_mean, mae_std = np.mean(mae_scores), np.std(mae_scores)
    rrmse = (rmse_mean / np.mean(y)) * 100

    # Fit regression on all data
    model.fit(X, y)
    coef = model.coef_[0]
    intercept = model.intercept_
    equation = f"AGB = {coef:.4f} * {idx_name} + {intercept:.4f}"

    results.append({
        'Vegetation Index': idx_name,
        'R² (mean ± std)': f"{r2_mean:.4f} ± {r2_std:.4f}",
        'RMSE (mean ± std)': f"{rmse_mean:.4f} ± {rmse_std:.4f}",
        'MAE (mean ± std)': f"{mae_mean:.4f} ± {mae_std:.4f}",
        'RRMSE (%)': f"{rrmse:.2f}",
        'Regression Equation': equation
    })

# ==============================
# 5. Aggregate and output results
# ==============================
results_df = pd.DataFrame(results)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)

print("\n===== 5-fold Cross Validation Results for Each Vegetation Index (R², RMSE, MAE) =====")
print(results_df.to_string(index=False))
print("===============================================================================\n")

# ==============================
# 6. Save as CSV file (auto-create directory)
# ==============================
output_csv_path = r"E:\jupyter_data\Hapke\Submit\data\processed\CV_results\sun_indices_CV_results.csv"
output_dir = os.path.dirname(output_csv_path)
os.makedirs(output_dir, exist_ok=True)  # Auto-create directory

results_df.to_csv(output_csv_path, index=False, encoding='utf_8_sig')

print(f"Results have been successfully saved to:\n{output_csv_path}\n")


===== 5-fold Cross Validation Results for Each Vegetation Index (R², RMSE, MAE) =====
 Vegetation Index  R² (mean ± std) RMSE (mean ± std) MAE (mean ± std) RRMSE (%)                           Regression Equation
         sun_NDVI -0.0113 ± 0.1038  26.9504 ± 7.7607 23.1002 ± 8.0447     40.15         AGB = -190.3439 * sun_NDVI + 228.0485
    sun_ND800_680 -0.0392 ± 0.0978  27.2969 ± 7.7349 23.6322 ± 7.9170     40.67    AGB = -109.7930 * sun_ND800_680 + 156.5718
      sun_mND_705 -0.0927 ± 0.0920  27.9156 ± 7.5974 24.3989 ± 7.5119     41.59        AGB = -50.5279 * sun_mND_705 + 96.4522
           sun_DD -0.1279 ± 0.1006  28.2450 ± 7.1801 24.6603 ± 7.3297     42.08             AGB = -39.7215 * sun_DD + 70.1443
      sun_mSR_705 -0.0855 ± 0.0862  27.8277 ± 7.5535 24.2767 ± 7.4545     41.46         AGB = -5.1382 * sun_mSR_705 + 86.5770
         sun_DATT -0.0941 ± 0.0979  27.7428 ± 6.6832 24.7357 ± 6.6887     41.33            AGB = -1.7932 * sun_DATT + 87.4373
sun_Chl_Rred_edge -0.0559 ± 0.1

In [25]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ==============================
# 1. Load data
# ==============================
file_path = r"E:\jupyter_data\Hapke\Submit\data\processed\vegetation_indices\sun_a_indices.csv"
df = pd.read_csv(file_path)

# ==============================
# 2. Select independent variables (all columns starting with sun_a_) and dependent variable
# ==============================
veg_cols = [col for col in df.columns if col.startswith('sun_a_')]
X_dict = {col: df[[col]] for col in veg_cols}
y = df['AGB'].values  # Convert to numpy array for later calculation

# ==============================
# 3. Cross-validation setup
# ==============================
kf = KFold(n_splits=5, shuffle=True, random_state=40)
model = LinearRegression()
results = []

# ==============================
# 4. 5-fold CV for each vegetation index
# ==============================
for idx_name, X in X_dict.items():
    r2_scores = []
    rmse_scores = []
    mae_scores = []

    # Manually loop to obtain predictions
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        r2_scores.append(r2_score(y_test, y_pred))
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae_scores.append(mean_absolute_error(y_test, y_pred))

    # Calculate mean and standard deviation
    r2_mean, r2_std = np.mean(r2_scores), np.std(r2_scores)
    rmse_mean, rmse_std = np.mean(rmse_scores), np.std(rmse_scores)
    mae_mean, mae_std = np.mean(mae_scores), np.std(mae_scores)
    rrmse = (rmse_mean / np.mean(y)) * 100

    # Fit regression on all data
    model.fit(X, y)
    coef = model.coef_[0]
    intercept = model.intercept_
    equation = f"AGB = {coef:.4f} * {idx_name} + {intercept:.4f}"

    results.append({
        'Vegetation Index': idx_name,
        'R² (mean ± std)': f"{r2_mean:.4f} ± {r2_std:.4f}",
        'RMSE (mean ± std)': f"{rmse_mean:.4f} ± {rmse_std:.4f}",
        'MAE (mean ± std)': f"{mae_mean:.4f} ± {mae_std:.4f}",
        'RRMSE (%)': f"{rrmse:.2f}",
        'Regression Equation': equation
    })

# ==============================
# 5. Aggregate and output results
# ==============================
results_df = pd.DataFrame(results)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)

print("\n===== 5-fold Cross Validation Results for Each Vegetation Index (R², RMSE, MAE) =====")
print(results_df.to_string(index=False))
print("===============================================================================\n")

# ==============================
# 6. Save as CSV file (auto-create directory)
# ==============================
output_csv_path = r"E:\jupyter_data\Hapke\Submit\data\processed\CV_results\sun_a_indices_CV_results.csv"
output_dir = os.path.dirname(output_csv_path)
os.makedirs(output_dir, exist_ok=True)  # Auto-create directory

results_df.to_csv(output_csv_path, index=False, encoding='utf_8_sig')

print(f"Results have been successfully saved to:\n{output_csv_path}\n")


===== 5-fold Cross Validation Results for Each Vegetation Index (R², RMSE, MAE) =====
   Vegetation Index  R² (mean ± std) RMSE (mean ± std) MAE (mean ± std) RRMSE (%)                             Regression Equation
         sun_a_NDVI  0.1237 ± 0.0922  25.0103 ± 7.0384 20.8031 ± 7.4213     37.26         AGB = -393.3330 * sun_a_NDVI + 394.1878
    sun_a_ND800_680  0.1283 ± 0.1062  24.9872 ± 7.2486 20.7111 ± 7.5288     37.23    AGB = -322.1714 * sun_a_ND800_680 + 323.9924
      sun_a_mND_705  0.1208 ± 0.1596  25.1016 ± 7.6194 20.7090 ± 7.3989     37.40      AGB = -292.3530 * sun_a_mND_705 + 231.0329
           sun_a_DD  0.0875 ± 0.2203  25.5071 ± 7.8860 20.9236 ± 7.4735     38.00           AGB = -605.4328 * sun_a_DD + 109.8593
      sun_a_mSR_705  0.1214 ± 0.1625  25.0714 ± 7.5545 20.5862 ± 7.3638     37.35       AGB = -28.6392 * sun_a_mSR_705 + 169.6258
         sun_a_DATT -0.0892 ± 0.0717  27.6985 ± 6.6404 24.3558 ± 6.8475     41.27            AGB = -0.9460 * sun_a_DATT + 77.3342
sun

In [26]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ==============================
# 1. Read data
# ==============================
file_path = r"E:\jupyter_data\Hapke\Submit\data\processed\vegetation_indices\sun_b_indices.csv"
df = pd.read_csv(file_path)

# ==============================
# 2. Select independent variables (all columns starting with sun_b_) and dependent variable
# ==============================
veg_cols = [col for col in df.columns if col.startswith('sun_b_')]
X_dict = {col: df[[col]] for col in veg_cols}
y = df['AGB'].values  # Convert to numpy array for subsequent calculations

# ==============================
# 3. Cross-validation setting
# ==============================
kf = KFold(n_splits=5, shuffle=True, random_state=40)
model = LinearRegression()
results = []

# ==============================
# 4. 5-fold cross-validation for each vegetation index
# ==============================
for idx_name, X in X_dict.items():
    r2_scores = []
    rmse_scores = []
    mae_scores = []

    # Manual loop to also collect predictions
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        r2_scores.append(r2_score(y_test, y_pred))
        rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae_scores.append(mean_absolute_error(y_test, y_pred))

    # Calculate mean and standard deviation
    r2_mean, r2_std = np.mean(r2_scores), np.std(r2_scores)
    rmse_mean, rmse_std = np.mean(rmse_scores), np.std(rmse_scores)
    mae_mean, mae_std = np.mean(mae_scores), np.std(mae_scores)
    rrmse = (rmse_mean / np.mean(y)) * 100

    # Fit regression model on all data
    model.fit(X, y)
    coef = model.coef_[0]
    intercept = model.intercept_
    equation = f"AGB = {coef:.4f} * {idx_name} + {intercept:.4f}"

    results.append({
        'Vegetation Index': idx_name,
        'R² (mean ± std)': f"{r2_mean:.4f} ± {r2_std:.4f}",
        'RMSE (mean ± std)': f"{rmse_mean:.4f} ± {rmse_std:.4f}",
        'MAE (mean ± std)': f"{mae_mean:.4f} ± {mae_std:.4f}",
        'RRMSE (%)': f"{rrmse:.2f}",
        'Regression Equation': equation
    })

# ==============================
# 5. Aggregate and output results
# ==============================
results_df = pd.DataFrame(results)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)

print("\n===== 5-fold Cross Validation Results for Each Vegetation Index (R², RMSE, MAE) =====")
print(results_df.to_string(index=False))
print("===============================================================================\n")

# ==============================
# 6. Save as CSV file (auto-create directory)
# ==============================
output_csv_path = r"E:\jupyter_data\Hapke\Submit\data\processed\CV_results\sun_b_indices_CV_results.csv"
output_dir = os.path.dirname(output_csv_path)
os.makedirs(output_dir, exist_ok=True)  # Auto-create directory

results_df.to_csv(output_csv_path, index=False, encoding='utf_8_sig')

print(f"Results have been successfully saved to:\n{output_csv_path}\n")


===== 5-fold Cross Validation Results for Each Vegetation Index (R², RMSE, MAE) =====
   Vegetation Index  R² (mean ± std) RMSE (mean ± std) MAE (mean ± std) RRMSE (%)                            Regression Equation
         sun_b_NDVI -0.0710 ± 0.0723  27.3784 ± 6.2007 24.2328 ± 6.1030     40.79        AGB = 232.8575 * sun_b_NDVI + -133.2331
    sun_b_ND800_680 -0.0648 ± 0.0952  27.2069 ± 5.8189 24.0607 ± 5.6385     40.53   AGB = 247.1743 * sun_b_ND800_680 + -138.6543
      sun_b_mND_705 -0.0923 ± 0.1652  27.4288 ± 5.5903 24.1233 ± 5.7114     40.86      AGB = 269.6215 * sun_b_mND_705 + -94.7473
           sun_b_DD -0.0982 ± 0.2481  27.3317 ± 5.4907 23.7664 ± 5.4346     40.72            AGB = 670.8335 * sun_b_DD + 12.4061
      sun_b_mSR_705 -0.0958 ± 0.1869  27.4268 ± 5.5381 24.0878 ± 5.6989     40.86       AGB = 22.3195 * sun_b_mSR_705 + -22.8727
         sun_b_DATT -0.0780 ± 0.0704  27.5654 ± 6.5999 24.4898 ± 6.3890     41.07           AGB = -2.5732 * sun_b_DATT + 97.8822
sun_b_Chl_

In [None]:
# PLS

In [11]:
import pandas as pd
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# ----- 1. Read data -----
file_path = r"E:\jupyter_data\Hapke\Submit\data\processed\analyzes\w.csv"
df = pd.read_csv(file_path)

X = df.filter(regex='^w_')          # all hyperspectral bands
y = df['AGB'].values                # target variable

# ----- 2. Model & Cross-validation -----
pls = PLSRegression(n_components=1, scale=True)
cv  = KFold(n_splits=5, shuffle=True, random_state=37)

r2_list   = []
rmse_list = []
mae_list  = []

for _, (tr_idx, te_idx) in enumerate(cv.split(X), 1):
    X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
    y_tr, y_te = y[tr_idx], y[te_idx]

    pls.fit(X_tr, y_tr)
    y_pred = pls.predict(X_te)

    r2   = r2_score(y_te, y_pred)
    rmse = np.sqrt(mean_squared_error(y_te, y_pred))
    mae  = mean_absolute_error(y_te, y_pred)

    r2_list.append(r2)
    rmse_list.append(rmse)
    mae_list.append(mae)

# ----- 3. Output only mean CV results -----
mean_r2   = np.mean(r2_list)
mean_rmse = np.mean(rmse_list)
mean_mae  = np.mean(mae_list)

results = pd.DataFrame({
    "Mean R2":   [f"{mean_r2:.4f}"],
    "Mean RMSE": [f"{mean_rmse:.4f}"],
    "Mean MAE":  [f"{mean_mae:.4f}"]
})

print("\n===== 5-Fold Cross-Validation Mean Results =====")
print(results.to_string(index=False))
print("===============================================\n")


===== 5-Fold Cross-Validation Mean Results =====
Mean R2 Mean RMSE Mean MAE
 0.1568   25.1403  21.0676



In [12]:
import pandas as pd
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# 1. Load data
file_path = r"E:\jupyter_data\Hapke\Submit\data\processed\analyzes\w.csv"
df = pd.read_csv(file_path)

# Select only 774, 822, 914nm bands, column names like w_{wl}nm
target_wavelengths = [774, 822, 914]
cols = [f"w_{wl}nm" for wl in target_wavelengths]

# Check if all columns exist
missing = [c for c in cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}")

X = df[cols].copy()
y = df['AGB'].values

# 2. Model & Cross-Validation
pls = PLSRegression(n_components=1, scale=True)
cv = KFold(n_splits=5, shuffle=True, random_state=37)

r2_list = []
rmse_list = []
mae_list = []

for _, (tr_idx, te_idx) in enumerate(cv.split(X), 1):
    X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
    y_tr, y_te = y[tr_idx], y[te_idx]

    pls.fit(X_tr, y_tr)
    y_pred = pls.predict(X_te)

    r2 = r2_score(y_te, y_pred)
    rmse = np.sqrt(mean_squared_error(y_te, y_pred))
    mae = mean_absolute_error(y_te, y_pred)

    r2_list.append(r2)
    rmse_list.append(rmse)
    mae_list.append(mae)

# 3. Output 5-fold mean CV results as table only
mean_r2 = np.mean(r2_list)
mean_rmse = np.mean(rmse_list)
mean_mae = np.mean(mae_list)

results = pd.DataFrame({
    "Mean R2":   [f"{mean_r2:.4f}"],
    "Mean RMSE": [f"{mean_rmse:.4f}"],
    "Mean MAE":  [f"{mean_mae:.4f}"]
})

print(results.to_string(index=False))

Mean R2 Mean RMSE Mean MAE
 0.1993   24.4692  20.3151


In [None]:
# lasso

In [16]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import warnings
from sklearn.exceptions import ConvergenceWarning

# ========== 1. Load data ==========
file_path = r"E:\jupyter_data\Hapke\Submit\data\processed\analyzes\w.csv"
df = pd.read_csv(file_path)

# Use all hyperspectral bands
X = df.filter(regex='^w_').values  # All columns starting with "w_"
y = df['AGB'].values

def run_fixed_seed(seed=37):
    """
    Use Pipeline for standardization + LassoCV to auto-select alpha.
    Suppress ConvergenceWarnings for clarity, but recommend mitigation.
    """
    # Suppress only sklearn's ConvergenceWarning
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        
        model = Pipeline([
            ('scaler', StandardScaler()),
            ('lasso', LassoCV(
                cv=5,                  # 5-fold CV inside LassoCV for alpha selection
                n_alphas=100,          # Generate 100 alphas
                max_iter=20000,        # <-- Increase max_iter to help convergence
                random_state=42        # Fix the random seed inside LassoCV
            ))
        ])
        
        # Outer CV for final validation
        cv_outer = KFold(n_splits=5, shuffle=True, random_state=seed)
        
        r2s, rmses, maes = [], [], []
        
        for train_idx, test_idx in cv_outer.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            
            model.fit(X_train, y_train)
            pred = model.predict(X_test)
            
            r2s.append(r2_score(y_test, pred))
            rmses.append(np.sqrt(mean_squared_error(y_test, pred)))
            maes.append(mean_absolute_error(y_test, pred))
        
        return {
            "Mean R2": np.mean(r2s),
            "Mean RMSE": np.mean(rmses),
            "Mean MAE": np.mean(maes)
        }


summary = run_fixed_seed(seed=37)

# Output 5-fold mean CV results (R2, RMSE, MAE) as table
results = pd.DataFrame({
    "Mean R2":   [f"{summary['Mean R2']:.4f}"],
    "Mean RMSE": [f"{summary['Mean RMSE']:.4f}"],
    "Mean MAE":  [f"{summary['Mean MAE']:.4f}"]
})

print(results.to_string(index=False))


Mean R2 Mean RMSE Mean MAE
 0.2298   23.7558  19.7559


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# ==================== 1. Read data ====================
file_path = r"E:\jupyter_data\Hapke\Submit\data\processed\analyzes\w.csv"
df = pd.read_csv(file_path)

# Only select 774, 822, 914nm bands
target_wavelengths = [774, 822, 914]
cols = [f"w_{wl}nm" for wl in target_wavelengths]
X = df[cols].values
y = df['AGB'].values

def run_fixed_seed(seed=37):
    """
    Use Pipeline to automatically standardize + LassoCV auto-selects alpha.
    """
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('lasso', LassoCV(
            cv=5,
            n_alphas=100,
            max_iter=10000,
            random_state=37
        ))
    ])

    cv_outer = KFold(n_splits=5, shuffle=True, random_state=seed)

    r2s, rmses, maes = [], [], []

    for train_idx, test_idx in cv_outer.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model.fit(X_train, y_train)
        pred = model.predict(X_test)

        r2s.append(r2_score(y_test, pred))
        rmses.append(np.sqrt(mean_squared_error(y_test, pred)))
        maes.append(mean_absolute_error(y_test, pred))

    return {
        "Mean R2":   np.mean(r2s),
        "Mean RMSE": np.mean(rmses),
        "Mean MAE":  np.mean(maes)
    }


summary = run_fixed_seed(seed=37)

# Output 5-fold mean CV results (R2, RMSE, MAE) as table
results = pd.DataFrame({
    "Mean R2":   [f"{summary['Mean R2']:.4f}"],
    "Mean RMSE": [f"{summary['Mean RMSE']:.4f}"],
    "Mean MAE":  [f"{summary['Mean MAE']:.4f}"]
})

print(results.to_string(index=False))

Mean R2 Mean RMSE Mean MAE
 0.3948   21.0642  17.7139


In [None]:
# RF

In [17]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# ==================== 1. Load data ====================
file_path = r"E:\jupyter_data\Hapke\Submit\data\processed\analyzes\w.csv"
df = pd.read_csv(file_path)

X = df.filter(regex='^w_').values          # All hyperspectral bands → numpy array
y = df['AGB'].values

print(f"Number of samples: {X.shape[0]}, Number of bands: {X.shape[1]}\n")

# ==================== 2. RF single seed evaluation ==================
def run_single_seed(seed):
    """
    Apply RF regression with 5-fold CV and return metrics
    """
    model = RandomForestRegressor(
        n_estimators=100,
        n_jobs=-1,
        random_state=seed
    )
    cv_outer = KFold(n_splits=5, shuffle=True, random_state=seed)
    
    r2s, rmses, maes = [], [], []
    
    for train_idx, test_idx in cv_outer.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        
        r2s.append(r2_score(y_test, pred))
        rmses.append(np.sqrt(mean_squared_error(y_test, pred)))
        maes.append(mean_absolute_error(y_test, pred))
        
    return {
        "Mean R2": np.mean(r2s),
        "Mean RMSE": np.mean(rmses),
        "Mean MAE": np.mean(maes)
    }

# ==================== 3. Fixed random_state = 37 ====================
seed = 37
summary = run_single_seed(seed)

# Output 5-fold mean CV results (R2, RMSE, MAE) as table
results = pd.DataFrame({
    "Mean R2":   [f"{summary['Mean R2']:.4f}"],
    "Mean RMSE": [f"{summary['Mean RMSE']:.4f}"],
    "Mean MAE":  [f"{summary['Mean MAE']:.4f}"]
})

print(results.to_string(index=False))

Number of samples: 75, Number of bands: 144

Mean R2 Mean RMSE Mean MAE
 0.3865   21.4274  17.6906


In [18]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# ==================== 1. Read data ====================
file_path = r"E:\jupyter_data\Hapke\Submit\data\processed\analyzes\w.csv"
df = pd.read_csv(file_path)

# Select only 774, 822, 914 nm bands
target_wavelengths = [774, 822, 914]
cols = [f"w_{wl}nm" for wl in target_wavelengths]
X = df[cols].values
y = df['AGB'].values

print(f"Number of samples: {X.shape[0]}, Number of bands: {X.shape[1]}\n")

# ==================== 2. RF single seed evaluation ==================
def run_single_seed(seed):
    """
    Apply RF regression with 5-fold CV and return metrics
    """
    model = RandomForestRegressor(
        n_estimators=100,
        n_jobs=-1,
        random_state=seed
    )
    cv_outer = KFold(n_splits=5, shuffle=True, random_state=seed)
    
    r2s, rmses, maes = [], [], []
    
    for train_idx, test_idx in cv_outer.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        
        r2s.append(r2_score(y_test, pred))
        rmses.append(np.sqrt(mean_squared_error(y_test, pred)))
        maes.append(mean_absolute_error(y_test, pred))
        
    return {
        "Mean R2": np.mean(r2s),
        "Mean RMSE": np.mean(rmses),
        "Mean MAE": np.mean(maes)
    }

# ==================== 3. Fixed random_state = 37 ====================
seed = 37
summary = run_single_seed(seed)

# Output 5-fold mean CV results (R2, RMSE, MAE) as table
results = pd.DataFrame({
    "Mean R2":   [f"{summary['Mean R2']:.4f}"],
    "Mean RMSE": [f"{summary['Mean RMSE']:.4f}"],
    "Mean MAE":  [f"{summary['Mean MAE']:.4f}"]
})

print(results.to_string(index=False))

Number of samples: 75, Number of bands: 3

Mean R2 Mean RMSE Mean MAE
 0.1553   25.1125  20.8274
