In [10]:
import pandas as pd
import numpy as np
from scipy.signal import savgol_filter
import os

# Define file paths
input_file = r"E:\jupyter_data\Hapke\Submit\data\processed\Hapke_result\hapke_parameters_merged.csv"
shice_file = r"E:\jupyter_data\Hapke\Submit\data\Original\shice_merged.csv"
output_dir = r"E:\jupyter_data\Hapke\Submit\data\processed\analyzes"
output_file = os.path.join(output_dir, "w.csv")
os.makedirs(output_dir, exist_ok=True)

# Load main data
df = pd.read_csv(input_file)

# Identify wavelength band columns (named like "w_<...>nm")
band_cols = [col for col in df.columns if col.startswith('w_') and col.endswith('nm')]
print(f"Found {len(band_cols)} wavelength band columns")

# Extract w matrix: rows=samples, columns=bands.
# Force a float dtype so that smoothed values are never truncated if the CSV
# columns happened to be parsed as integers.
w_matrix = df[band_cols].to_numpy(dtype=float)  # shape: (n_samples, n_bands)

# Set SG (Savitzky-Golay) parameters
window_size = 5
poly_order = 2

# Ensure window does not exceed number of bands (the reduced window is kept odd)
n_bands = w_matrix.shape[1]
if window_size > n_bands:
    window_size = n_bands if n_bands % 2 == 1 else n_bands - 1
if window_size < 3:
    raise ValueError("Too few bands to apply smoothing")

# Apply SG smoothing along the band axis for all samples in one vectorised call
# (equivalent to filtering each row separately, without the Python-level loop).
smoothed_matrix = savgol_filter(w_matrix, window_size, poly_order, axis=1)

# Insert smoothed data into a copy so the raw frame `df` stays untouched
df_smoothed = df.copy()
df_smoothed[band_cols] = smoothed_matrix

# Load shice_merged.csv, trying each candidate encoding in order and keeping
# the first one that decodes without a UnicodeDecodeError.
encodings_to_try = ['utf-8', 'gbk', 'gb2312', 'latin1']
shice_encoding_tried = False
for enc in encodings_to_try:
    try:
        df_shice = pd.read_csv(shice_file, encoding=enc)
    except UnicodeDecodeError:
        # This codec cannot decode the file; move on to the next candidate.
        continue
    else:
        print(f"Successfully read shice_merged.csv using encoding {enc}")
        shice_encoding_tried = True
        break

if shice_encoding_tried is False:
    raise RuntimeError("Unable to decode shice_merged.csv, please check the file encoding. Tried: " + ", ".join(encodings_to_try))

# Fail early if any column needed for the ID/AGB mapping is absent
needed_cols = {'Original_Tree_ID', 'New_Tree_ID', 'AGB'}
missing_in_shice = needed_cols.difference(df_shice.columns)
if len(missing_in_shice) > 0:
    raise KeyError(f"shice_merged.csv is missing required fields: {missing_in_shice}")

# Use Original_Tree_ID to map New_Tree_ID and AGB from shice_merged.
# The New_Tree_ID in the result always comes from shice_merged.csv.
if 'Original_Tree_ID' not in df_smoothed.columns:
    raise KeyError("Input data is missing required field: 'Original_Tree_ID'")

# Series.map requires a uniquely indexed lookup; with duplicate IDs it raises.
# Keep the first occurrence of each ID and tell the user that rows were dropped.
duplicate_ids = df_shice['Original_Tree_ID'].duplicated(keep='first')
if duplicate_ids.any():
    print(f"Warning: {int(duplicate_ids.sum())} duplicate Original_Tree_ID rows in shice_merged.csv, keeping the first occurrence")

# Set Original_Tree_ID as the index in shice_merged for fast lookup
shice_lookup = df_shice.loc[~duplicate_ids].set_index('Original_Tree_ID')

# Series.map keeps the row order of df_smoothed and yields NaN for unmatched IDs
tree_ids = df_smoothed['Original_Tree_ID']
unmatched_count = int((~tree_ids.isin(shice_lookup.index)).sum())
if unmatched_count:
    print(f"Warning: {unmatched_count} samples have no matching Original_Tree_ID in shice_merged.csv, New_Tree_ID/AGB set to NaN")

df_smoothed = df_smoothed.assign(
    New_Tree_ID=tree_ids.map(shice_lookup['New_Tree_ID']),
    AGB=tree_ids.map(shice_lookup['AGB']),
)

# Build final column order: identifiers and target first, then the smoothed bands
final_cols = ['New_Tree_ID', 'Original_Tree_ID', 'AGB'] + band_cols
df_output = df_smoothed[final_cols]

# Save output
df_output.to_csv(output_file, index=False)

print(f"Smoothing complete! Processed {w_matrix.shape[0]} samples, {n_bands} bands")
print(f"Saved to: {output_file}")

Found 144 wavelength band columns
Successfully read shice_merged.csv using encoding gbk
Smoothing complete! Processed 75 samples, 144 bands
Saved to: E:\jupyter_data\Hapke\Submit\data\processed\analyzes\w.csv


In [None]:
import numpy as np
import pandas as pd
import os
import re
import time
from tqdm import tqdm
from joblib import Parallel, delayed
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import itertools
import warnings

# ==================== Configure Paths (Please make sure the paths are correct) ====================
# Input: the smoothed table "w.csv" (same path the smoothing cell writes to).
INPUT_FILE_PATH = r"E:\jupyter_data\Hapke\Submit\data\processed\analyzes\w.csv"
# Output directory for the TBI ranking file; created below if it does not exist.
OUTPUT_DIR = r"E:\jupyter_data\Hapke\Submit\data\processed\TBI"

os.makedirs(OUTPUT_DIR, exist_ok=True)
# Silence every RuntimeWarning raised in this process from here on.
# NOTE(review): presumably aimed at numpy divide/invalid warnings from the index formulas - confirm.
warnings.filterwarnings('ignore', category=RuntimeWarning)

# Wall-clock reference used by the run section to report the total runtime.
start_total_time = time.time()

# ==================== 37 TBI Formulas ====================
# Constant added to every denominator below (guards against an exactly-zero denominator).
_EPS = 1e-10


def _band_sum(Ii, Ij, Ik):
    """Return Ii + Ij + Ik."""
    return Ii + Ij + Ik


def _second_difference(Ii, Ij, Ik):
    """Return Ii - 2*Ij + Ik."""
    return Ii - 2 * Ij + Ik


def _residual(Ii, Ij, Ik):
    """Return Ii - Ij - Ik."""
    return Ii - Ij - Ik


# ---- Simple ratios, products, sums and differences (TBI1-TBI12) ----
def tbi1(Ii, Ij, Ik):
    """(Ii - Ij) / Ik"""
    return (Ii - Ij) / (Ik + _EPS)


def tbi2(Ii, Ij, Ik):
    """Ik / (Ii - Ij)"""
    return Ik / (Ii - Ij + _EPS)


def tbi3(Ii, Ij, Ik):
    """Ii / (Ij + Ik)"""
    return Ii / (Ij + Ik + _EPS)


def tbi4(Ii, Ij, Ik):
    """(Ii + Ij) / Ik"""
    return (Ii + Ij) / (Ik + _EPS)


def tbi5(Ii, Ij, Ik):
    """(Ii + Ij) * Ik"""
    return (Ii + Ij) * Ik


def tbi6(Ii, Ij, Ik):
    """(Ii - Ij) * Ik"""
    return (Ii - Ij) * Ik


def tbi7(Ii, Ij, Ik):
    """Ii * Ij * Ik"""
    return Ii * Ij * Ik


def tbi8(Ii, Ij, Ik):
    """Ii * Ij / Ik"""
    return Ii * Ij / (Ik + _EPS)


def tbi9(Ii, Ij, Ik):
    """Ii / (Ij * Ik)"""
    return Ii / ((Ij * Ik) + _EPS)


def tbi10(Ii, Ij, Ik):
    """Ii + Ij + Ik"""
    return _band_sum(Ii, Ij, Ik)


def tbi11(Ii, Ij, Ik):
    """Ii - Ij - Ik"""
    return _residual(Ii, Ij, Ik)


def tbi12(Ii, Ij, Ik):
    """Ii - 2*Ij + Ik"""
    return _second_difference(Ii, Ij, Ik)


# ---- Ratios against a two-band denominator (TBI13-TBI16) ----
def tbi13(Ii, Ij, Ik):
    """(Ii - Ij) / (Ij - Ik)"""
    return (Ii - Ij) / (Ij - Ik + _EPS)


def tbi14(Ii, Ij, Ik):
    """(Ii + Ij) / (Ij + Ik)"""
    return (Ii + Ij) / (Ij + Ik + _EPS)


def tbi15(Ii, Ij, Ik):
    """(Ii - Ij) / (Ij + Ik)"""
    return (Ii - Ij) / (Ij + Ik + _EPS)


def tbi16(Ii, Ij, Ik):
    """(Ii + Ij) / (Ij - Ik)"""
    return (Ii + Ij) / (Ij - Ik + _EPS)


# ---- Ratios against Ii + Ij + Ik (TBI17-TBI21) ----
def tbi17(Ii, Ij, Ik):
    """Ii / (Ii + Ij + Ik)"""
    return Ii / (_band_sum(Ii, Ij, Ik) + _EPS)


def tbi18(Ii, Ij, Ik):
    """(Ii - Ij) / (Ii + Ij + Ik)"""
    return (Ii - Ij) / (_band_sum(Ii, Ij, Ik) + _EPS)


def tbi19(Ii, Ij, Ik):
    """(Ii + Ij) / (Ii + Ij + Ik)"""
    return (Ii + Ij) / (_band_sum(Ii, Ij, Ik) + _EPS)


def tbi20(Ii, Ij, Ik):
    """(Ii - Ij - Ik) / (Ii + Ij + Ik)"""
    return _residual(Ii, Ij, Ik) / (_band_sum(Ii, Ij, Ik) + _EPS)


def tbi21(Ii, Ij, Ik):
    """(Ii - 2*Ij + Ik) / (Ii + Ij + Ik)"""
    return _second_difference(Ii, Ij, Ik) / (_band_sum(Ii, Ij, Ik) + _EPS)


# ---- Ratios against Ii - 2*Ij + Ik (TBI22-TBI29) ----
def tbi22(Ii, Ij, Ik):
    """Ii / (Ii - 2*Ij + Ik)"""
    return Ii / (_second_difference(Ii, Ij, Ik) + _EPS)


def tbi23(Ii, Ij, Ik):
    """Ij / (Ii - 2*Ij + Ik)"""
    return Ij / (_second_difference(Ii, Ij, Ik) + _EPS)


def tbi24(Ii, Ij, Ik):
    """(Ii - Ij) / (Ii - 2*Ij + Ik)"""
    return (Ii - Ij) / (_second_difference(Ii, Ij, Ik) + _EPS)


def tbi25(Ii, Ij, Ik):
    """(Ii - Ik) / (Ii - 2*Ij + Ik)"""
    return (Ii - Ik) / (_second_difference(Ii, Ij, Ik) + _EPS)


def tbi26(Ii, Ij, Ik):
    """(Ii + Ij) / (Ii - 2*Ij + Ik)"""
    return (Ii + Ij) / (_second_difference(Ii, Ij, Ik) + _EPS)


def tbi27(Ii, Ij, Ik):
    """(Ii + Ik) / (Ii - 2*Ij + Ik)"""
    return (Ii + Ik) / (_second_difference(Ii, Ij, Ik) + _EPS)


def tbi28(Ii, Ij, Ik):
    """(Ii - Ij - Ik) / (Ii - 2*Ij + Ik)"""
    return _residual(Ii, Ij, Ik) / (_second_difference(Ii, Ij, Ik) + _EPS)


def tbi29(Ii, Ij, Ik):
    """(Ii + Ij + Ik) / (Ii - 2*Ij + Ik)"""
    return _band_sum(Ii, Ij, Ik) / (_second_difference(Ii, Ij, Ik) + _EPS)


# ---- Ratios against Ii - Ij - Ik (TBI30-TBI37) ----
def tbi30(Ii, Ij, Ik):
    """Ii / (Ii - Ij - Ik)"""
    return Ii / (_residual(Ii, Ij, Ik) + _EPS)


def tbi31(Ii, Ij, Ik):
    """Ij / (Ii - Ij - Ik)"""
    return Ij / (_residual(Ii, Ij, Ik) + _EPS)


def tbi32(Ii, Ij, Ik):
    """(Ii - Ij) / (Ii - Ij - Ik)"""
    return (Ii - Ij) / (_residual(Ii, Ij, Ik) + _EPS)


def tbi33(Ii, Ij, Ik):
    """(Ij - Ik) / (Ii - Ij - Ik)"""
    return (Ij - Ik) / (_residual(Ii, Ij, Ik) + _EPS)


def tbi34(Ii, Ij, Ik):
    """(Ii + Ij) / (Ii - Ij - Ik)"""
    return (Ii + Ij) / (_residual(Ii, Ij, Ik) + _EPS)


def tbi35(Ii, Ij, Ik):
    """(Ik + Ij) / (Ii - Ij - Ik)"""
    return (Ik + Ij) / (_residual(Ii, Ij, Ik) + _EPS)


def tbi36(Ii, Ij, Ik):
    """(Ii + Ij + Ik) / (Ii - Ij - Ik)"""
    return _band_sum(Ii, Ij, Ik) / (_residual(Ii, Ij, Ik) + _EPS)


def tbi37(Ii, Ij, Ik):
    """(Ii - 2*Ij + Ik) / (Ii - Ij - Ik)"""
    return _second_difference(Ii, Ij, Ik) / (_residual(Ii, Ij, Ik) + _EPS)


# Registry of all formulas; position n-1 holds tbi<n>, and formula_names[n-1] is "TBI<n>".
formulas = [
    tbi1, tbi2, tbi3, tbi4, tbi5, tbi6, tbi7, tbi8, tbi9, tbi10, tbi11, tbi12,
    tbi13, tbi14, tbi15, tbi16,
    tbi17, tbi18, tbi19, tbi20, tbi21,
    tbi22, tbi23, tbi24, tbi25, tbi26, tbi27, tbi28, tbi29,
    tbi30, tbi31, tbi32, tbi33, tbi34, tbi35, tbi36, tbi37,
]
formula_names = [f"TBI{i}" for i in range(1, len(formulas) + 1)]

# ==================== Parallel Calculation for TBI ====================
def compute_tbi(args):
    """Fit AGB ~ index for every formula on one ordered band triplet.

    Parameters
    ----------
    args : tuple
        ``(i, j, k, X, y, formulas, formula_names, y_mean)`` where ``i, j, k``
        are column positions in ``X``, ``y`` is the target vector, ``formulas``
        are callables taking three band vectors, ``formula_names`` are their
        labels and ``y_mean`` is the divisor used for the relative RMSE.

    Returns
    -------
    list of tuple
        One ``(i, j, k, formula_index, r2, rmse, rRMSE, mae, name, corr)``
        entry per formula that produced a usable fit. A formula is skipped when
        too few finite samples remain, when its index is constant, or when any
        exception is raised while evaluating it.
    """
    band_i, band_j, band_k, X, y, formulas, formula_names, y_mean = args
    col_i = X[:, band_i]
    col_j = X[:, band_j]
    col_k = X[:, band_k]

    scored = []
    for formula_idx, formula in enumerate(formulas):
        try:
            index_values = formula(col_i, col_j, col_k)

            # Keep only samples where both the index and the target are finite
            usable = np.isfinite(index_values) & np.isfinite(y)
            if np.sum(usable) < max(10, len(y) // 3):
                continue

            index_kept = index_values[usable]
            target_kept = y[usable]

            # A constant predictor cannot be regressed against the target
            if np.unique(index_kept).size <= 1:
                continue

            # Univariate least-squares fit, evaluated on the same samples
            features = index_kept.reshape(-1, 1)
            fitted = LinearRegression().fit(features, target_kept)
            predicted = fitted.predict(features)

            r2 = r2_score(target_kept, predicted)
            rmse = np.sqrt(mean_squared_error(target_kept, predicted))
            mae = mean_absolute_error(target_kept, predicted)
            if y_mean > 0:
                rRMSE = rmse / y_mean
            else:
                rRMSE = np.inf
            corr = np.corrcoef(index_kept, target_kept)[0, 1]

            scored.append(
                (band_i, band_j, band_k, formula_idx, r2, rmse, rRMSE, mae,
                 formula_names[formula_idx], corr)
            )
        except Exception:
            # Best-effort scan: a failing formula is skipped, not fatal
            continue
    return scored

# ==================== Main Function ====================
def process_single_file():
    """Scan all three-band index candidates in INPUT_FILE_PATH and rank them by R².

    Reads the CSV, uses column 'AGB' as the regression target and every column
    whose name ends in '<digits>nm' as a spectral band, evaluates every formula
    in `formulas` on every ordered triplet of distinct bands (in parallel via
    `compute_tbi`), writes the 100 best rows to OUTPUT_DIR/w_top100_R2.csv and
    prints a summary of the best model.

    Returns None. Problems with the input (missing file, unreadable data,
    missing 'AGB', non-numeric target/bands, no bands) are reported with a
    printed message followed by an early return rather than an exception.
    """
    if not os.path.exists(INPUT_FILE_PATH):
        print(f"Error: File does not exist!\n{INPUT_FILE_PATH}")
        return

    file_start_time = time.time()
    print(f"\nStart processing file: {INPUT_FILE_PATH}")

    try:
        # Do not cast the whole frame: identifier columns (e.g. tree IDs) may be
        # non-numeric. Only the target and the spectral columns are cast below.
        data = pd.read_csv(INPUT_FILE_PATH)
        print(f"Data shape: {data.shape}")
    except Exception as e:
        print(f"Failed to read data: {e}")
        return

    if 'AGB' not in data.columns:
        print("Error: Missing column 'AGB'")
        return

    try:
        y = data['AGB'].to_numpy(dtype=np.float32)
    except (ValueError, TypeError) as e:
        print(f"Failed to read data: {e}")
        return

    # Average only the finite targets: compute_tbi drops non-finite y values, and
    # a single NaN would otherwise turn the mean (and every rRMSE) into NaN/inf.
    finite_y = y[np.isfinite(y)]
    y_mean = finite_y.mean() if finite_y.size else np.nan
    print(f"Mean AGB: {y_mean:.2f}")

    # Extract spectral bands: the trailing "<digits>nm" of the name is the wavelength
    spectral_columns = []
    wavelengths = []
    for col in data.columns:
        match = re.match(r'.*?(\d+)nm$', col)
        if match:
            spectral_columns.append(col)
            wavelengths.append(int(match.group(1)))

    if not wavelengths:
        print("Error: No spectral bands detected")
        return

    wavelengths = np.array(wavelengths)
    print(f"Detected {len(wavelengths)} bands, range: {wavelengths.min()} - {wavelengths.max()} nm")

    try:
        X = data[spectral_columns].to_numpy(dtype=np.float32)
    except (ValueError, TypeError) as e:
        print(f"Failed to read data: {e}")
        return

    # All permutations of three different bands (i != j != k, order matters).
    # The count is computed in closed form and the tasks are streamed from a
    # generator, so the P(n,3) index tuples are never all held in memory at once.
    n_bands = len(wavelengths)
    n_combos = n_bands * (n_bands - 1) * (n_bands - 2) if n_bands >= 3 else 0
    print(f"Total combinations: {n_combos:,} (P(n,3) = n×(n-1)×(n-2))")

    # === Parallel Calculation ===
    tasks = (
        (i, j, k, X, y, formulas, formula_names, y_mean)
        for i, j, k in itertools.permutations(range(n_bands), 3)
    )
    results_parallel = Parallel(n_jobs=-1, backend='loky')(
        delayed(compute_tbi)(task)
        for task in tqdm(tasks, total=n_combos, desc="Calculating 37 types of TBI")
    )

    # === Merge Results ===
    # NOTE(review): every (triplet, formula) result is kept in memory here even
    # though only the top 100 rows are saved - worth revisiting for large band counts.
    all_data = []
    best_r2 = -1
    best_rmse = float('inf')
    best_rrmse = float('inf')
    best_mae = float('inf')
    best_combo = None
    best_tbi = None

    for res in results_parallel:
        for item in res:
            i, j, k, fidx, r2, rmse, rrmse, mae, tbi_name, corr = item
            wi, wj, wk = wavelengths[i], wavelengths[j], wavelengths[k]
            entry = {
                'Wavelength_i': wi, 'Wavelength_j': wj, 'Wavelength_k': wk,
                'TBI_Type': tbi_name, 'R2': r2, 'RMSE': rmse, 'rRMSE': rrmse,
                'MAE': mae, 'Correlation': corr
            }
            all_data.append(entry)

            # Higher R² wins; on a (near-)tie the lower rRMSE wins
            if r2 > best_r2 or (abs(r2 - best_r2) < 1e-6 and rrmse < best_rrmse):
                best_r2, best_rmse, best_rrmse, best_mae = r2, rmse, rrmse, mae
                # Plain ints so the summary prints "(450, 460, 470)" rather than
                # NumPy scalar reprs.
                best_combo = (int(wi), int(wj), int(wk))
                best_tbi = tbi_name

    base_name = os.path.splitext(os.path.basename(INPUT_FILE_PATH))[0]

    # === Only save Top100 summary file ===
    all_df = pd.DataFrame(all_data)
    if not all_df.empty:
        top100 = all_df.sort_values(by=['R2', 'rRMSE', 'MAE'], ascending=[False, True, True]).head(100)
        top100_file = os.path.join(OUTPUT_DIR, "w_top100_R2.csv")
        top100.to_csv(top100_file, index=False)
        print("Top100 has been saved as w_top100_R2.csv")
    else:
        print("No valid results, Top100 was not generated")

    # === Output results ===
    print("\n" + "="*70)
    print(f"File analysis finished: {base_name}")
    if best_combo:
        print(f"Best R²   : {best_r2:.4f}")
        print(f"Best RMSE : {best_rmse:.4f}")
        print(f"Best rRMSE: {best_rrmse:.4f} ({best_rrmse*100:.2f}%)")
        print(f"Best MAE  : {best_mae:.4f}")
        print(f"Best TBI  : {best_tbi}")
        print(f"Best bands: {best_combo}")
    else:
        print("No valid model found")
    print(f"Total valid combinations: {len(all_data)}")
    print(f"Processing time  : {time.time() - file_start_time:.2f} seconds")
    print("="*70)

# ==================== Run ====================
# Entry point: run the full scan once, then report the wall-clock time elapsed
# since start_total_time was recorded in the configuration section.
if __name__ == "__main__":
    process_single_file()
    total_time = time.time() - start_total_time  # seconds
    print(f"\nTotal runtime: {total_time:.2f} seconds ({total_time/3600:.2f} hours)")