In [None]:
# Mount Google Drive at /mnt/drive so the CSV files referenced by the cells
# below (under /mnt/drive/MyDrive/...) can be read. Requires Google Colab.
from google.colab import drive
drive.mount('/mnt/drive')

In [None]:
# Location of the dataset CSV on the mounted Drive. Nothing is loaded here;
# the hyperparameter-search cell reads this path with pd.read_csv.
file_path = '/mnt/drive/MyDrive/canola/canola_spectrum_index.csv'  # Replace with your file path


# Best Hyperparameters

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score

# Read the dataset from the path configured in the earlier cell
data = pd.read_csv(file_path)

# Target is 'salinity'; predictors are every column except 'salinity' and 'variety'
y = data['salinity']
X = data.drop(columns=['salinity', 'variety'])

# Hold out 20% of the rows as a test set (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7641,
)

# Define the model
model = RidgeClassifier(random_state=7641)

# Define the parameter space for the randomized search.
# 'lbfgs' is deliberately NOT offered as a solver: scikit-learn only accepts it
# when the estimator is created with positive=True, so every sampled 'lbfgs'
# candidate would fail to fit (NaN score + FitFailedWarning) and waste part of
# the n_iter budget.
param_distributions = {
    'alpha': np.logspace(-4, 4, 50),  # Ridge regularization strength
    'tol': [1e-4, 1e-3, 1e-2, 1e-1, 1],  # Tolerance for stopping criteria
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag'],  # Solvers valid with positive=False
}

# Perform Randomized Search with cross-validation
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    n_iter=100,  # Number of random configurations to sample
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',  # Evaluation metric
    n_jobs=-1,  # Use all available cores
    random_state=7641,  # For reproducibility
    verbose=1  # Show progress
)

# Run the randomized search on the training split only
random_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated accuracy
print(f"Best Parameters: {random_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {random_search.best_score_:.4f}")

# Score the best estimator found by the search on the held-out test split
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy on Test Set: {test_accuracy:.4f}")

# Importance of a feature = mean absolute coefficient over the rows of coef_
feature_importance = np.abs(best_model.coef_).mean(axis=0)

# Pair each predictor column with its importance, ranked from largest to smallest
importance_df = pd.DataFrame(
    {
        'Feature': X.columns,
        'Importance': feature_importance,
    }
).sort_values(by='Importance', ascending=False)

# Show the ten highest-ranked features
print(importance_df.head(10))


# Best Index

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
import itertools
import numpy as np

# Read the dataset CSV
data = pd.read_csv('/mnt/drive/MyDrive/canola/canola_spectrum_index.csv')

# Target is 'salinity'; predictors are every column except 'salinity' and 'variety'.
# Alternative predictor selection kept from the original cell:
# X = data.filter(regex="^Band")
y = data['salinity']
X = data.drop(columns=['salinity', 'variety'])

# Hold out 20% of the rows as a test set (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7641,
)

# Columns that are substituted for the X / Y / Z / W placeholders of the
# formula templates. Order matters: it fixes the order in which permutations
# are tried.
# Earlier candidate set, kept for reference:
#   ['CIRE', 'Band_180', 'Band_223', 'Band_247', 'Band_181']
important_features = [
    'Band_180',
    'Band_178',
    'Band_181',
    'Band_222',
    'Band_179',
    'Band_201',
    'Band_223',
]

# Formula templates with placeholders (X, Y, Z, W). Each placeholder is later
# replaced by a column name and the resulting expression is evaluated with
# DataFrame.eval.
# The template "(Z - X) / (Z + X + 0.16)" used to be listed twice; the second
# copy only repeated every model fit of the first, so it has been removed.
# NOTE(review): the templates calling np.sqrt depend on `np` being resolvable
# inside DataFrame.eval. If it is not, the NameError handler in the search loop
# skips them silently -- confirm these candidates are actually evaluated.
formulas = [
    "(1 / X) - (1 / Y)",
    "(Z - W) / (Z + X)",
    "((1 / X) - (1 / Y)) / Z",
    "((0.1 * Z) - W) / ((0.1 * Z) + W)",
    "(X * Y) / Z",
    "(Z) / (X) - 1",
    "(X / Y) - 1",
    "(1 / X) - (1 / Z)",
    "(1 / W) - (1 / Y)",
    "(Z * X) / (Y * Y)",
    "(X - Y) / (X - W)",
    "(Y * Z) / (X * X)",
    "2.5 * (Z - X) / (Z + 6 * X - 7.5 * W + 1)",
    "2 * X - Y - Z",
    "(2 * X - Y - Z) / (2 * X + Y + Z)",
    "(Z - (X + Y)) / (Z + (X + Y))",
    "(Z - X) / (Z + X + 0.16)",
    "(X - (Y + Z)) / (X + (Y + Z))",
    "(Z - Y) / (Z + X)",
    "((X - Y) - 0.2 * (X - Z)) * (X / Y)",
    "(X**2 - Y**2) / (X**2 + Y**2)",
    "(Z - X) / (Z + X - 2 * W)",
    "0.5 * (2 * Z + 1 - np.sqrt((2 * Z + 1)**2 - 8 * (Z - X)))",
    "(Z - X) / (Z + X)",
    "(Z - Y) / (Z + Y)",
    "(Y - X) / Z",
    "(Z - X) / np.sqrt(Z + X)",
    "(X**2 - Y * Z) / (X**2 + Y * Z)",
    "0.5 * (120 * (Z - X) - 200 * (Y - X))"
]

# Function to evaluate a given formula's contribution to model accuracy
def evaluate_formula(X_train, X_test, y_train, y_test, new_feature_train, new_feature_test, formula):
    """Return the test accuracy of a RidgeClassifier trained with one extra feature.

    The inputs are not modified: the candidate feature is added to copies of
    the training and test frames under the column name ``formula``.

    Parameters:
        X_train, X_test: predictor DataFrames for the training / test split.
        y_train, y_test: target values for the training / test split.
        new_feature_train, new_feature_test: values of the candidate feature
            for the training / test rows.
        formula: expression string the feature was computed from; used as the
            name of the added column.

    Returns:
        Accuracy of the fitted classifier on the extended test frame, as
        computed by ``accuracy_score``.
    """
    # Add the new feature to copies of the datasets
    X_train_extended = X_train.copy()
    X_test_extended = X_test.copy()
    X_train_extended[formula] = new_feature_train
    X_test_extended[formula] = new_feature_test

    # Train the RidgeClassifier with hyper-tuned parameters.
    # These statements must stay inside the function body: at module level they
    # break the indentation of the function and the cell cannot be parsed.
    model = RidgeClassifier(alpha=0.12648552168552957, tol=0.1, solver='cholesky', random_state=7641)
    model.fit(X_train_extended, y_train)

    # Predict and calculate accuracy
    y_pred = model.predict(X_test_extended)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

# Evaluate all assignments of the important features to X, Y, Z, W for every template.
# NOTE(review): candidates are ranked by their accuracy on X_test, so the
# reported "best" accuracy is selected on the test set and is optimistically
# biased; consider ranking on a validation split or with cross-validation.
best_formula = None
best_accuracy = 0

total_formulas_tried = 0  # distinct expressions that were actually fitted
skipped_formulas = 0      # expressions that raised or produced non-finite values
seen_formulas = set()     # expressions already handled

for formula in formulas:
    # Loop variables are named feat_* so they do not overwrite the target Series `y`.
    for feat_x, feat_y, feat_z, feat_w in itertools.permutations(important_features, 4):
        # Substitute all placeholders in a single pass. Chained str.replace
        # calls would also rewrite placeholder letters that occur inside an
        # already-inserted column name (e.g. a name containing 'W').
        adjusted_formula = formula.translate(
            str.maketrans({'X': feat_x, 'Y': feat_y, 'Z': feat_z, 'W': feat_w})
        )

        # Templates that use fewer than four placeholders yield the same
        # expression for many permutations; fit each distinct expression once.
        if adjusted_formula in seen_formulas:
            continue
        seen_formulas.add(adjusted_formula)

        # Calculate the new feature values for both training and testing sets
        try:
            new_feature_train = X_train.eval(adjusted_formula, engine='python')
            new_feature_test = X_test.eval(adjusted_formula, engine='python')
            # Division by zero on a Series yields inf/NaN rather than raising,
            # and the classifier would then fail inside evaluate_formula.
            is_finite = bool(
                np.isfinite(np.asarray(new_feature_train, dtype=float)).all()
                and np.isfinite(np.asarray(new_feature_test, dtype=float)).all()
            )
        except (ZeroDivisionError, TypeError, NameError, ValueError):
            skipped_formulas += 1
            continue  # Skip this formula if it cannot be evaluated

        if not is_finite:
            skipped_formulas += 1
            continue

        # Evaluate the accuracy of the new feature added to the model
        accuracy = evaluate_formula(X_train, X_test, y_train, y_test, new_feature_train, new_feature_test, adjusted_formula)

        # Update the best formula if this one is strictly better (first one wins ties)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_formula = adjusted_formula
            print(f"New Best Formula: {best_formula} -> Accuracy: {best_accuracy:.4f}")

        total_formulas_tried += 1

print(f"\nTotal Formulas Tried: {total_formulas_tried}")
print(f"Formulas Skipped (evaluation error or non-finite values): {skipped_formulas}")

if best_formula is None:
    raise RuntimeError("No candidate formula improved on the initial accuracy; cannot build the final model.")

print(f"Best Formula Found: {best_formula} -> Accuracy: {best_accuracy:.4f}")

# Final Model with Best Formula
X_train[best_formula] = X_train.eval(best_formula, engine='python')
X_test[best_formula] = X_test.eval(best_formula, engine='python')

# Use the same tuned hyperparameters as evaluate_formula so the final accuracy
# is comparable with the accuracy reported during the search.
model = RidgeClassifier(alpha=0.12648552168552957, tol=0.1, solver='cholesky', random_state=7641)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

final_accuracy = accuracy_score(y_test, y_pred)
print(f"\nFinal Model Accuracy with Best Formula: {final_accuracy:.4f}")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
import itertools
import numpy as np

# Read the dataset CSV that already contains the 'NEW_index' column
data = pd.read_csv('/mnt/drive/MyDrive/canola/canola_spectrum_index_AND_newindexafterhypertuning.csv')

# Target is 'salinity'; predictors are restricted to the hand-picked columns below.
# Previous selection, kept for reference:
# X = data.drop(columns=['salinity', 'variety'])
y = data['salinity']
X = data[
    [
        'NEW_index', 'CVI', 'RGBVI', 'CIG', 'BWDRVI',
        'Band_179', 'Band_180', 'Band_178', 'Band_181',
        'Band_222', 'Band_223', 'Band_201', 'Band_200',
        'Band_203', 'Band_202', 'Band_256', 'Band_238',
        'Band_207', 'Band_172', 'Band_206', 'Band_210',
    ]
]

# Hold out 20% of the rows as a test set (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7641,
)

# Columns that are substituted for the X / Y / Z / W placeholders of the
# formula templates. Order matters: it fixes the order in which permutations
# are tried.
important_features = [
    'Band_179', 'Band_180', 'Band_178', 'Band_181',
    'Band_222', 'Band_223', 'Band_201', 'Band_200',
    'Band_203', 'Band_202', 'Band_256', 'Band_238',
    'Band_207', 'Band_172', 'Band_206', 'Band_210',
]

# Formula templates with placeholders (X, Y, Z, W); identical or redundant
# templates have been removed. Each placeholder is later replaced by a column
# name and the resulting expression is evaluated with DataFrame.eval.
# The list order is significant: the search keeps the first expression that
# reaches a given accuracy, so reordering can change which formula is reported.
# NOTE(review): the templates calling np.sqrt depend on `np` being resolvable
# inside DataFrame.eval. If it is not, the NameError handler in the search loop
# skips them silently -- confirm these candidates are actually evaluated.
formulas = [
    "(1 / X) - (1 / Y)",
    "(Z - W) / (Z + X)",
    "((1 / X) - (1 / Y)) / Z",
    "((0.1 * Z) - W) / ((0.1 * Z) + W)",
    "(X * Y) / Z",
    "(Z / X) - 1",
    "(X / Y) - 1",
    "(1 / X) - (1 / Z)",
    "(1 / W) - (1 / Y)",
    "(Z * X) / (Y * Y)",
    "(X - Y) / (X - W)",
    "(Y * Z) / (X * X)",
    "2.5 * (Z - X) / (Z + 6 * X - 7.5 * W + 1)",
    "2 * X - Y - Z",
    "(2 * X - Y - Z) / (2 * X + Y + Z)",
    "(Z - (X + Y)) / (Z + (X + Y))",
    "(Z - X) / (Z + X + 0.16)",
    "(X - (Y + Z)) / (X + (Y + Z))",
    "(Z - Y) / (Z + X)",
    "((X - Y) - 0.2 * (X - Z)) * (X / Y)",
    "(X**2 - Y**2) / (X**2 + Y**2)",
    "(Z - X) / (Z + X - 2 * W)",
    "0.5 * (2 * Z + 1 - np.sqrt((2 * Z + 1)**2 - 8 * (Z - X)))",
    "(Z - Y) / (Z + Y)",
    "(Y - X) / Z",
    "(Z - X) / np.sqrt(Z + X)",
    "(X**2 - Y * Z) / (X**2 + Y * Z)",
    "0.5 * (120 * (Z - X) - 200 * (Y - X))"
]

# Score one candidate feature by the test accuracy it yields when added to the predictors
def evaluate_formula(X_train, X_test, y_train, y_test, new_feature_train, new_feature_test, formula):
    """Return the test accuracy of a RidgeClassifier trained with one extra feature.

    The candidate feature is added, under the column name ``formula``, to
    copies of ``X_train`` and ``X_test``; the caller's frames are left
    untouched. A RidgeClassifier with the tuned hyperparameters is fitted on
    the extended training frame and scored on the extended test frame.
    """
    # Work on copies so the caller's frames are not modified
    train_aug = X_train.copy()
    train_aug[formula] = new_feature_train
    test_aug = X_test.copy()
    test_aug[formula] = new_feature_test

    # Classifier configured with the tuned hyperparameters
    clf = RidgeClassifier(alpha=0.12648552168552957, tol=0.1, solver='cholesky', random_state=7641)
    clf.fit(train_aug, y_train)

    # Accuracy of the predictions on the extended test frame
    return accuracy_score(y_test, clf.predict(test_aug))

# Evaluate all assignments of the important features to X, Y, Z, W for every template.
# NOTE(review): candidates are ranked by their accuracy on X_test, so the
# reported "best" accuracy is selected on the test set and is optimistically
# biased; consider ranking on a validation split or with cross-validation.
best_formula = None
best_accuracy = 0

total_formulas_tried = 0  # distinct expressions that were actually fitted
skipped_formulas = 0      # expressions that raised or produced non-finite values
seen_formulas = set()     # expressions already handled

for formula in formulas:
    # Loop variables are named feat_* so they do not overwrite the target Series `y`.
    for feat_x, feat_y, feat_z, feat_w in itertools.permutations(important_features, 4):
        # Substitute all placeholders in a single pass. Chained str.replace
        # calls would also rewrite placeholder letters that occur inside an
        # already-inserted column name (e.g. 'BWDRVI' contains 'W').
        adjusted_formula = formula.translate(
            str.maketrans({'X': feat_x, 'Y': feat_y, 'Z': feat_z, 'W': feat_w})
        )

        # Templates that use fewer than four placeholders yield the same
        # expression for many permutations; fit each distinct expression once.
        if adjusted_formula in seen_formulas:
            continue
        seen_formulas.add(adjusted_formula)

        # Calculate the new feature values for both training and testing sets
        try:
            new_feature_train = X_train.eval(adjusted_formula, engine='python')
            new_feature_test = X_test.eval(adjusted_formula, engine='python')
            # Division by zero on a Series yields inf/NaN rather than raising,
            # and the classifier would then fail inside evaluate_formula.
            is_finite = bool(
                np.isfinite(np.asarray(new_feature_train, dtype=float)).all()
                and np.isfinite(np.asarray(new_feature_test, dtype=float)).all()
            )
        except (ZeroDivisionError, TypeError, NameError, ValueError):
            skipped_formulas += 1
            continue  # Skip this formula if it cannot be evaluated

        if not is_finite:
            skipped_formulas += 1
            continue

        # Evaluate the accuracy of the new feature added to the model
        accuracy = evaluate_formula(X_train, X_test, y_train, y_test, new_feature_train, new_feature_test, adjusted_formula)

        # Update the best formula if this one is strictly better (first one wins ties)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_formula = adjusted_formula
            print(f"New Best Formula: {best_formula} -> Accuracy: {best_accuracy:.4f}")

        total_formulas_tried += 1

print(f"\nTotal Formulas Tried: {total_formulas_tried}")
print(f"Formulas Skipped (evaluation error or non-finite values): {skipped_formulas}")

if best_formula is None:
    raise RuntimeError("No candidate formula improved on the initial accuracy; cannot build the final model.")

print(f"Best Formula Found: {best_formula} -> Accuracy: {best_accuracy:.4f}")

# Final Model with Best Formula
X_train[best_formula] = X_train.eval(best_formula, engine='python')
X_test[best_formula] = X_test.eval(best_formula, engine='python')

# Re-train the model with the best formula included
model = RidgeClassifier(alpha=0.12648552168552957, tol=0.1, solver='cholesky', random_state=7641)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

final_accuracy = accuracy_score(y_test, y_pred)
print(f"\nFinal Model Accuracy with Best Formula: {final_accuracy:.4f}")

In [None]:
# Add the new feature (best formula) to the loaded data, using the formula
# string itself as the column name
data[best_formula] = data.eval(best_formula, engine='python')

# Save the updated DataFrame to a new CSV file
output_path = '/mnt/drive/MyDrive/canola/canola_spectrum_index_new_index.csv'
data.to_csv(output_path, index=False)

# Report the path that was actually written (the message previously named a
# different, non-existent file)
print(f"The new feature '{best_formula}' has been added as the last column and saved to '{output_path}'.")
