Section 1(Normalization)

In [None]:
# Min-Max Normalization for Hyperspectral Data
import pandas as pd                       # For reading/writing Excel/CSV and handling DataFrames
from sklearn.preprocessing import MinMaxScaler   # For Min-Max scaling of features

#  Load the dataset
input_path = r"D:/Z Education/University/4-1 Course/Thesis/Hyper leaf dataset/hyperspectral.xlsx"  # Path to Excel file
df = pd.read_excel(input_path)            # Read the Excel file into a DataFrame

print("Original Columns:", df.columns.tolist())   # Show all column names for verification
print("Shape before:", df.shape)                  # Dataset shape (rows, columns) before preprocessing

#  Drop unwanted columns
drop_cols = ['Image ID', 'Minimum Reflectance', 'Otsu Threshold']  # Columns to remove
df = df.drop(columns=drop_cols, errors='ignore')  # errors='ignore': a listed column that is absent is skipped

#  Separate target column
target_col = 'Yield'                      # Target variable; it is carried through unscaled
y = df[target_col]                        # Raises KeyError if the sheet has no 'Yield' column
X = df.drop(columns=[target_col])         # Every remaining column is scaled as a feature

#  Min-Max normalization
# NOTE(review): the scaler is fitted on all rows, i.e. before the file written
# below is split into train/test later in this notebook. If that counts as
# leakage for the thesis, fit the scaler on the training rows only - confirm.
scaler = MinMaxScaler()                   # Default feature range is 0-1
X_scaled = scaler.fit_transform(X)        # NumPy array with the same row/column order as X

#  Reconstruct DataFrame
# Reuse X's index: the assignment of y below aligns on index labels, so a frame
# with a fresh 0..n-1 index would pick up NaN / wrong rows whenever df's index
# is not the default range (e.g. after filtering rows).
normalized_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
normalized_df[target_col] = y             # Add back the original (unscaled) Yield column

print("Shape after normalization:", normalized_df.shape)    # Shape after dropping columns and scaling

#  Save processed file
# NOTE(review): this is a relative path (current working directory), while the
# Kennard-Stone cell reads the same file name from the dataset folder by
# absolute path - make sure both point at the same location.
output_path = "normalized_minmax_with_target.csv"
normalized_df.to_csv(output_path, index=False)              # index=False: do not write the row index

print(f"✅ Normalized file saved as: {output_path}")         # Confirmation message after saving


Section 2(KS)

In [None]:
# Data partition using KS (Kennard–Stone) method
import pandas as pd                       # For handling CSV files and tabular data
import numpy as np                        # For numerical operations
from scipy.spatial.distance import cdist  # To compute pairwise Euclidean distances

# Load dataset from CSV file
input_path = r"D:/Z Education/University/4-1 Course/Thesis/Hyper leaf dataset/normalized_minmax_with_target.csv"
df = pd.read_csv(input_path)              # Read CSV into pandas DataFrame

# Columns that must NOT take part in the distance computation (identifier,
# target, metadata). 'Otsu Threshold' is the column name used by the
# normalization cell; the shorter 'Otsu' spelling is kept as well so files
# using either name are handled.
keep_cols = ['Image ID', 'Yield', 'Minimum Reflectance', 'Otsu Threshold', 'Otsu']

# Keep only the names that are actually present in this file
keep_cols_existing = [col for col in keep_cols if col in df.columns]

# Everything else is treated as a hyperspectral band and used for splitting
X_cols = [col for col in df.columns if col not in keep_cols_existing]
X = df[X_cols].values                     # Band values as a NumPy array for the KS algorithm

# Define Kennard–Stone algorithm function
def kennard_stone(X, n_samples):
    """Pick ``n_samples`` row indices of ``X`` using the Kennard-Stone rule.

    The two rows with the largest Euclidean distance are taken first; after
    that, each step adds the row whose distance to its nearest already-selected
    row is largest. Ties go to the lowest row index.

    Parameters
    ----------
    X : 2-D array-like, one row per sample (anything ``cdist`` accepts that
        also exposes ``.shape``).
    n_samples : int, number of indices to return (0 <= n_samples <= rows of X).

    Returns
    -------
    list of int
        Exactly ``n_samples`` distinct row indices, in selection order.

    Raises
    ------
    ValueError
        If ``n_samples`` is negative or larger than the number of rows.
    """
    n_total = X.shape[0]                              # Total number of samples
    if not 0 <= n_samples <= n_total:
        raise ValueError(
            f"n_samples must be between 0 and {n_total}, got {n_samples}"
        )
    if n_samples == 0:
        return []
    if n_total == 1:
        return [0]                                    # Only one row to choose from

    dist_matrix = cdist(X, X, metric='euclidean')     # Pairwise Euclidean distance matrix

    # Step 1: the two samples farthest apart
    i1, i2 = (int(i) for i in np.unravel_index(np.argmax(dist_matrix), dist_matrix.shape))
    if i1 == i2:
        # The maximum sits on the diagonal, i.e. every distance is zero (all
        # rows coincide). Take a different row so no index is returned twice.
        i2 = 1 if i1 == 0 else 0
    selected = [i1, i2][:n_samples]                   # Truncated when fewer than two are requested

    # available[j] is True while row j has not been selected yet
    available = np.ones(n_total, dtype=bool)
    available[selected] = False

    # min_dist[j] = distance from row j to its nearest selected row. It is
    # updated with one element-wise minimum per step instead of re-scanning
    # all selected columns.
    min_dist = np.minimum(dist_matrix[i1], dist_matrix[i2])

    # Step 2: repeatedly add the remaining sample farthest from the selected set
    while len(selected) < n_samples:
        candidates = np.where(available, min_dist, -np.inf)  # Hide rows already chosen
        next_idx = int(np.argmax(candidates))
        selected.append(next_idx)
        available[next_idx] = False
        np.minimum(min_dist, dist_matrix[next_idx], out=min_dist)

    return selected  # Selected indices (used as the training set by the caller)

# Split dataset into training and test sets
n_train = int(0.8 * X.shape[0])           # 80% of samples (rounded down) go to training
train_idx = kennard_stone(X, n_train)     # Training indices, in Kennard-Stone selection order
# Everything not selected is the test set. Sorting makes the row order of the
# test file deterministic instead of depending on set iteration order.
test_idx = sorted(set(range(X.shape[0])) - set(train_idx))

# Create DataFrames for train and test sets (all columns of df are kept)
df_train = df.iloc[train_idx].reset_index(drop=True)  # Training set
df_test = df.iloc[test_idx].reset_index(drop=True)    # Test set

# Save train and test sets to CSV files
df_train.to_csv(r"D:/Z Education/University/4-1 Course/Thesis/Hyper leaf dataset/train_ks.csv", index=False)
df_test.to_csv(r"D:/Z Education/University/4-1 Course/Thesis/Hyper leaf dataset/test_ks.csv", index=False)

# Print confirmation and dataset shapes
print("✅ Kennard–Stone split done!")
print("Train set shape:", df_train.shape)
print("Test set shape :", df_test.shape)


Section 3(PLSR)

In [None]:
# Train using PLSR (Partial Least Squares Regression) technique
import pandas as pd  # For data handling
import numpy as np  # For numerical operations
from sklearn.cross_decomposition import PLSRegression  # PLS regression model
from sklearn.preprocessing import StandardScaler  # Feature scaling
from sklearn.metrics import mean_squared_error, r2_score  # Evaluation metrics

# Load train and test datasets
train_df = pd.read_csv(r"D:/Z Education/University/4-1 Course/Thesis/Hyper leaf dataset/train_ks.csv")  # Load training CSV
test_df = pd.read_csv(r"D:/Z Education/University/4-1 Course/Thesis/Hyper leaf dataset/test_ks.csv")  # Load testing CSV

print("Train shape:", train_df.shape)  # Show training data shape
print("Test shape:", test_df.shape)  # Show testing data shape

# Separate features (X) and target (y); every non-target column is a feature
target_column = "Yield"  # Replace with your actual target column name
X_train = train_df.drop(columns=[target_column])  # Features for training
y_train = train_df[target_column]  # Target for training

X_test = test_df.drop(columns=[target_column])  # Features for testing
y_test = test_df[target_column]  # Target for testing

# Standardize the feature data; the scaler is fitted on the training rows only
scaler = StandardScaler()  # Initialize scaler
X_train_scaled = scaler.fit_transform(X_train)  # Fit and transform training features
X_test_scaled = scaler.transform(X_test)  # Transform testing features with same scaler

# Train PLSR model with up to 10 components.
# The rank of the feature matrix cannot exceed min(n_samples, n_features), so
# the component count is capped by both dimensions, not just the column count.
n_components = min(X_train.shape[0], X_train.shape[1], 10)
pls = PLSRegression(n_components=n_components)  # Initialize PLS model
pls.fit(X_train_scaled, y_train)  # Fit PLS model to training data

# Make predictions. np.ravel flattens the output so the metrics always see a
# 1-D vector, whether predict() returns shape (n,) or (n, 1).
y_pred_train = np.ravel(pls.predict(X_train_scaled))  # Predict on training set
y_pred_test = np.ravel(pls.predict(X_test_scaled))  # Predict on testing set

# Evaluate model performance
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))  # RMSE for training
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))  # RMSE for testing

train_r2 = r2_score(y_train, y_pred_train)  # R² for training
test_r2 = r2_score(y_test, y_pred_test)  # R² for testing

# RPD = sample standard deviation of the observed target (ddof=1) / RMSE
train_rpd = np.std(y_train, ddof=1) / train_rmse  # RPD for training
test_rpd = np.std(y_test, ddof=1) / test_rmse  # RPD for testing

print("\nPLSR Model Evaluation:")  # Print evaluation header
print(f"Train R²             : {train_r2:.4f}")  # Display training R²
print(f"Test R²              : {test_r2:.4f}")  # Display testing R²
print(f"Train RMSE           : {train_rmse:.4f}")  # Display training RMSE
print(f"Test RMSE            : {test_rmse:.4f}")  # Display testing RMSE
print(f"Train RPD            : {train_rpd:.4f}")  # Display training RPD
print(f"Test RPD             : {test_rpd:.4f}")  # Display testing RPD


Section 4(RF)

In [None]:
# Train using RF (Random Forest) model
import pandas as pd  # For data handling
import numpy as np  # For numerical operations
from sklearn.ensemble import RandomForestRegressor  # Random Forest Regressor
from sklearn.model_selection import train_test_split  # Train/test split if needed
from sklearn.preprocessing import StandardScaler  # Feature scaling
from sklearn.metrics import mean_squared_error, r2_score  # Evaluation metrics

# Read the Kennard-Stone train/test CSV files
train_df = pd.read_csv(r"D:/Z Education/University/4-1 Course/Thesis/Hyper leaf dataset/train_ks.csv")
test_df = pd.read_csv(r"D:/Z Education/University/4-1 Course/Thesis/Hyper leaf dataset/test_ks.csv")

# Name of the column to predict; all other columns are used as features
target_column = "Yield"

# Targets and feature matrices as NumPy arrays
y_train = train_df[target_column].values
X_train = train_df.drop(columns=[target_column]).values

y_test = test_df[target_column].values
X_test = test_df.drop(columns=[target_column]).values

# Standardize features: statistics come from the training rows only and are
# then applied unchanged to both sets
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# CPU Random Forest: 500 trees, depth capped at 30, sqrt(n_features) candidates
# per split, fixed seed, all CPU cores
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=30,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
)

# Fit on the scaled training data
rf.fit(X_train_scaled, y_train)

# Predictions for both sets
y_pred_train = rf.predict(X_train_scaled)
y_pred_test = rf.predict(X_test_scaled)

# Evaluation function
def evaluate(y_true, y_pred):
    """Return ``(r2, rmse, rpd)`` for predictions ``y_pred`` against ``y_true``.

    RMSE is the square root of the mean squared error; RPD is the sample
    standard deviation of ``y_true`` (``ddof=1``) divided by the RMSE.
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    observed_sd = np.std(y_true, ddof=1)
    return r2, rmse, observed_sd / rmse

# Score both sets with the same helper
train_r2, train_rmse, train_rpd = evaluate(y_train, y_pred_train)
test_r2, test_rmse, test_rpd = evaluate(y_test, y_pred_test)

# Report: a blank line, the header, then one metric per line (4 decimals)
print(
    "\nRandom Forest Model Evaluation:",
    f"Train R²   : {train_r2:.4f}",
    f"Test R²    : {test_r2:.4f}",
    f"Train RMSE : {train_rmse:.4f}",
    f"Test RMSE  : {test_rmse:.4f}",
    f"Train RPD  : {train_rpd:.4f}",
    f"Test RPD   : {test_rpd:.4f}",
    sep="\n",
)



Section 5(CNN-1D)

In [None]:
# CNN-1D (Convolutional Neural Network – One Dimensional)
import pandas as pd                                      # For handling CSV files and tabular data
import numpy as np                                       # For numerical operations
from sklearn.preprocessing import StandardScaler         # For feature standardization
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, ConfusionMatrixDisplay  # Evaluation metrics
import matplotlib.pyplot as plt                           # For plotting
from tensorflow.keras.models import Sequential, load_model  # Build and load Keras models
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization  # CNN layers
from tensorflow.keras.callbacks import EarlyStopping     # Early stopping during training
import pickle                                           # For saving and loading scaler objects

# Load CSV files containing KS-split data
train_path = r"D:/Z Education/University/4-1 Course/Thesis/Hyper leaf dataset/train_ks.csv"  # Path to training data
test_path = r"D:/Z Education/University/4-1 Course/Thesis/Hyper leaf dataset/test_ks.csv"    # Path to test data

train_df = pd.read_csv(train_path)                       # Load training data into a DataFrame
test_df = pd.read_csv(test_path)                         # Load test data into a DataFrame

# Define features (X) and target (y); every non-target column is a feature
target_column = "Yield"                                  # Target column to predict
X_train = train_df.drop(columns=[target_column]).values  # Extract features from training set
y_train = train_df[target_column].values                 # Extract target values from training set
X_test = test_df.drop(columns=[target_column]).values    # Extract features from test set
y_test = test_df[target_column].values                   # Extract target values from test set

# Standardize features to zero mean and unit variance
scaler_X = StandardScaler()                               # Initialize StandardScaler
X_train_scaled = scaler_X.fit_transform(X_train)          # Fit on training data and transform
X_test_scaled = scaler_X.transform(X_test)                # Transform test data with same scaler

# Save the scaler object for later reuse (written to the current working directory)
with open("scaler_X.pkl", "wb") as f:                     # Open file to save scaler
    pickle.dump(scaler_X, f)                              # Save scaler object

# Reshape data for 1D-CNN input: (samples, n_columns, 1)
X_train_scaled = X_train_scaled[..., np.newaxis]          # Add trailing channel dimension
X_test_scaled = X_test_scaled[..., np.newaxis]            # Add trailing channel dimension

# Build 1D-CNN model using Keras Sequential API
model = Sequential([                                      # Initialize sequential model
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train_scaled.shape[1],1)),  # Conv layer with 64 filters
    BatchNormalization(),                                 # Normalize activations
    MaxPooling1D(pool_size=2),                            # Downsample by factor of 2
    Dropout(0.2),                                         # Dropout for regularization

    Conv1D(128, kernel_size=3, activation='relu'),       # Second Conv layer with 128 filters
    BatchNormalization(),                                 # Normalize activations
    MaxPooling1D(pool_size=2),                            # Downsample
    Dropout(0.2),                                         # Dropout

    Flatten(),                                            # Flatten output to feed Dense layers
    Dense(128, activation='relu'),                        # Fully connected Dense layer
    Dropout(0.2),                                         # Dropout
    Dense(1, activation='linear')                         # Output layer for regression
])

# Compile model with optimizer and loss function
model.compile(optimizer='adam', loss='mse', metrics=['mae'])  # Use Adam optimizer and MSE loss
early_stop = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)  # Stop if val_loss doesn't improve

# Train the model
# NOTE(review): validation_split presumably takes the LAST 20% of the rows
# as given; train_ks.csv is written in Kennard-Stone selection order, so the
# validation rows would be the last-selected samples - confirm this is intended.
history = model.fit(
    X_train_scaled, y_train,           # Training features and target
    epochs=100,                        # Maximum number of epochs
    batch_size=32,                     # Batch size
    validation_split=0.2,              # Use 20% of training data as validation
    verbose=1,                         # Show progress
    callbacks=[early_stop]             # Early stopping callback
)

# Save final trained model
model.save("CNN1D_final_model.keras") # Save model for reuse
print("Model saved as CNN1D_final_model.keras")  # Print confirmation

# Reuse saved model
model_loaded = load_model("CNN1D_final_model.keras")  # Load saved model
# Load the scaler inside a context manager so the file handle is closed
# deterministically. pickle must only be used on files this notebook wrote itself.
with open("scaler_X.pkl", "rb") as f:
    scaler_loaded = pickle.load(f)

# Apply scaler and reshape data for prediction
X_train_scaled_loaded = scaler_loaded.transform(X_train)[..., np.newaxis]  # Scale and reshape training data
X_test_scaled_loaded = scaler_loaded.transform(X_test)[..., np.newaxis]    # Scale and reshape test data

# Predict target values; flatten() turns the (n, 1) network output into shape (n,)
y_pred_train = model_loaded.predict(X_train_scaled_loaded).flatten()  # Training predictions
y_pred_test = model_loaded.predict(X_test_scaled_loaded).flatten()    # Test predictions

# Function to evaluate predictions
def evaluate(y_true, y_pred):
    """Score regression predictions.

    Returns a tuple ``(r2, rmse, rpd)`` where ``rmse`` is the root of the mean
    squared error and ``rpd`` (ratio of performance to deviation) is the
    ``ddof=1`` standard deviation of ``y_true`` divided by ``rmse``.
    """
    root_mse = np.sqrt(mean_squared_error(y_true, y_pred))
    metrics = (
        r2_score(y_true, y_pred),
        root_mse,
        np.std(y_true, ddof=1) / root_mse,
    )
    return metrics

# Evaluate train and test predictions
train_r2, train_rmse, train_rpd = evaluate(y_train, y_pred_train)  # Train metrics
test_r2, test_rmse, test_rpd = evaluate(y_test, y_pred_test)       # Test metrics

# Print evaluation results
print("\nLoaded 1D-CNN Model Evaluation:")
print(f"Train R²   : {train_r2:.4f}")        # Print R² for training
print(f"Test R²    : {test_r2:.4f}")         # Print R² for testing
print(f"Train RMSE : {train_rmse:.4f}")      # Print RMSE for training
print(f"Test RMSE  : {test_rmse:.4f}")       # Print RMSE for testing
print(f"Train RPD  : {train_rpd:.4f}")       # Print RPD for training
print(f"Test RPD   : {test_rpd:.4f}")        # Print RPD for testing

# Create 2x2 grid plots for visualization
fig, axes = plt.subplots(2, 2, figsize=(14,12))  # Create figure with 2x2 subplots

# 1. Train scatter plot: predicted vs actual
axes[0,0].scatter(y_train, y_pred_train, color='blue', alpha=0.6, label='Train Predicted')  # Scatter plot
z_train = np.polyfit(y_train, y_pred_train, 1)      # Degree-1 least-squares fit of predicted on actual
p_train = np.poly1d(z_train)
axes[0,0].plot(y_train, p_train(y_train), "blue", linestyle='--', label="Train Best Fit")  # Plot best fit line
axes[0,0].plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], "green", linestyle='-', label="Ideal Fit")  # Ideal y=x line
axes[0,0].set_title("Training Set: Predicted vs Actual")  # Set title
axes[0,0].set_xlabel("Actual Values")                     # X-axis label
axes[0,0].set_ylabel("Predicted Values")                  # Y-axis label
axes[0,0].legend()                                        # Show legend
axes[0,0].grid(True)                                      # Show grid

# 2. Test scatter plot: predicted vs actual
axes[0,1].scatter(y_test, y_pred_test, color='red', alpha=0.6, label='Test Predicted')      # Scatter plot
z_test = np.polyfit(y_test, y_pred_test, 1)       # Degree-1 least-squares fit
p_test = np.poly1d(z_test)
axes[0,1].plot(y_test, p_test(y_test), "orange", linestyle='--', label="Test Best Fit")    # Best fit line
axes[0,1].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "green", linestyle='-', label="Ideal Fit")  # Ideal line
axes[0,1].set_title("Test Set: Predicted vs Actual")  # Title
axes[0,1].set_xlabel("Actual Values")                 # X label
axes[0,1].set_ylabel("Predicted Values")              # Y label
axes[0,1].legend()                                    # Legend
axes[0,1].grid(True)                                  # Grid

# 3. Training & validation loss over epochs
axes[1,0].plot(history.history['loss'], label='Training Loss', color='blue')  # Training loss plot
axes[1,0].plot(history.history['val_loss'], label='Validation Loss', color='orange')  # Validation loss
axes[1,0].set_title("1D-CNN Training & Validation Loss")   # Title
axes[1,0].set_xlabel("Epochs")                              # X-axis label
axes[1,0].set_ylabel("MSE Loss")                             # Y-axis label
axes[1,0].legend()                                           # Legend
axes[1,0].grid(True)                                         # Grid

# 4. Confusion matrix for regression-as-classification
# Both actual and predicted yields are thresholded at the median of the ACTUAL
# test yields: 1 = at or above the median ("High Yield"), 0 = below ("Low Yield").
median_yield = np.median(y_test)                                   # Computed once, shared by both thresholds
y_test_class = (y_test >= median_yield).astype(int)                # Convert regression target to binary
y_pred_test_class = (y_pred_test >= median_yield).astype(int)      # Convert predictions
# labels=[0, 1] forces a 2x2 matrix even if only one class occurs, so it
# always matches the two display labels below.
cm = confusion_matrix(y_test_class, y_pred_test_class, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Low Yield", "High Yield"])  # Display
disp.plot(ax=axes[1,1], cmap=plt.cm.Blues, values_format="d")     # Plot confusion matrix
axes[1,1].set_title("Confusion Matrix")                            # Set title

plt.tight_layout()  # Adjust layout to avoid overlap
plt.show()          # Show all plots


Test Section