In [26]:
# Imports
import os
import gc
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, confusion_matrix
)
from sklearn.metrics import balanced_accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC

print("All Imports OK")

All Imports OK


In [2]:
# Setup directories to save outputs
os.makedirs("confusion_matrices", exist_ok=True)
os.makedirs("results", exist_ok=True)

In [3]:
# Load dataframe
df = pd.read_csv('../../data/data-1.csv.gz')

In [4]:
# Set random state for reproducibility
RANDOM_STATE = 10291999

In [27]:
# Create X and Y columns and define train/test split

# Feature engineering
feature_cols = [col for col in df.columns if col.startswith('Band_')] + ['p_x', 'p_y']
X = df[feature_cols]
y = pd.factorize(df['land_type'])[0]  # Convert to numeric

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# Store class labels for proper evaluation
class_names = np.array(sorted(df['land_type'].unique()))  # for reference
classes = np.array(range(len(class_names)))  # for sklearn functions
print(f"Classes ({len(classes)}): {classes}")

Classes (8): [0 1 2 3 4 5 6 7]
Numeric labels range: 0 to 7
Class counts: [38424 50657 38900 24495 26314 15153 10128 11533]


In [6]:
# Pre-processing: Scaling and PCA setup

# Scaling for models that need it (LDA, Logistic, QDA, k-NN, SVM)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Unscaled for tree-based models (Random Forest, GBDT)
X_train_tree = X_train.copy()
X_test_tree = X_test.copy()

# PCA setup
pca = PCA(n_components=10, random_state=RANDOM_STATE)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [7]:
# Define functions for model evaluation and saving outputs

def evaluate_model(model, X_test, y_test, model_name, variant="raw", class_labels=None, auto_save=True, results_file="all_results.csv"):
    """

    Parameters:
    -----------
    model : fitted sklearn model
        The trained model to evaluate
    X_test : array-like
        Test features
    y_test : array-like
        True test labels
    model_name : str
        Name of the model (e.g., "LDA", "Random Forest")
    variant : str
        Either "raw" or "pca10" to track preprocessing variant
    class_labels : array-like, optional
        Class labels in order. If None, will be inferred from y_test

    Returns:
    --------
    dict : Dictionary with all required metrics
    """

    # Make predictions
    y_pred = model.predict(X_test)

    # Get class probabilities if available
    if hasattr(model, 'predict_proba'):
        y_proba = model.predict_proba(X_test)
    else:
        raise ValueError(f"Model {model_name} doesn't support predict_proba - needed for AUC calculation")

    # Get class labels if not provided
    if class_labels is None:
        class_labels = np.unique(y_test)

    # 1. Overall accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # 2. Overall misclassification error rate
    misclass_error = 1.0 - accuracy

    # 3. Average balanced accuracy (one-vs-rest for each class)
    balanced_acc = balanced_accuracy_score(y_test, y_pred)

    # 4. Average F1 score (macro-averaged)
    f1_macro = f1_score(y_test, y_pred, average='macro')

    # 5. Average AUC (one-vs-rest for each class, then averaged)
    # For multiclass: treat each class as positive vs all others, then average
    auc_scores = []
    y_test_array = np.array(y_test)

    for i, class_label in enumerate(class_labels):
        # Create binary labels: current class vs all others
        y_binary = (y_test_array == class_label).astype(int)

        # Get probability for this class
        if len(y_proba.shape) > 1 and y_proba.shape[1] > i:
            y_prob_class = y_proba[:, i]
        else:
            continue  # Skip if class not in predictions

        # Calculate AUC for this class
        try:
            auc_class = roc_auc_score(y_binary, y_prob_class)
            auc_scores.append(auc_class)
        except ValueError:
            # Skip if class has no positive samples in test set
            continue

    auc_macro = np.mean(auc_scores) if auc_scores else np.nan

    # 6. Confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

    # Compile results
    results = {
        'Model': model_name,
        'Variant': variant,
        'Accuracy': accuracy,
        'Misclass_Error': misclass_error,
        'Balanced_Accuracy': balanced_acc,
        'F1_Macro': f1_macro,
        'AUC_Macro': auc_macro,
        'Confusion_Matrix': conf_matrix
    }

    if auto_save:
        df = pd.DataFrame([{k: v for k, v in results.items() if k != 'Confusion_Matrix'}])
        results_path = os.path.join("results", results_file)
        df.to_csv(results_path, mode='a', header=not os.path.exists(results_path), index=False)
        save_confusion_matrix(results)

    # Print summary
    print(f"{model_name} ({variant}):")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Misclass Error: {misclass_error:.4f}")
    print(f"  Balanced Accuracy: {balanced_acc:.4f}")
    print(f"  F1 (macro): {f1_macro:.4f}")
    print(f"  AUC (macro): {auc_macro:.4f}")
    print()

    return results

def save_results_table(results_list, filename="results_summary.csv"):
    """
    Save all model results to a CSV table.

    Parameters:
    -----------
    results_list : list of dicts
        List of result dictionaries from evaluate_model()
    filename : str
        Output CSV filename
    """
    # Convert to DataFrame (excluding confusion matrices for the summary table)
    df_results = []
    for result in results_list:
        row = {k: v for k, v in result.items() if k != 'Confusion_Matrix'}
        df_results.append(row)

    df = pd.DataFrame(df_results)

    # Sort by variant and then by balanced accuracy
    df = df.sort_values(['Variant', 'Balanced_Accuracy'], ascending=[True, False])

    # Save to CSV
    df.to_csv(filename, index=False)
    print(f"Results saved to {filename}")

    return df

def save_confusion_matrix(result, output_dir="confusion_matrices"):
    """
    Save individual confusion matrix to CSV.

    Parameters:
    -----------
    result : dict
        Result dictionary from evaluate_model()
    output_dir : str
        Directory to save confusion matrices
    """
    os.makedirs(output_dir, exist_ok=True)

    model_name = result['Model'].replace(' ', '_')
    variant = result['Variant']

    # Convert confusion matrix to DataFrame with class labels
    conf_matrix = result['Confusion_Matrix']

    # You'll need to pass class_labels to this function or store them in results
    # For now, using generic labels
    n_classes = conf_matrix.shape[0]
    class_names = [f"Class_{i}" for i in range(n_classes)]

    conf_df = pd.DataFrame(
        conf_matrix,
        index=[f"True_{name}" for name in class_names],
        columns=[f"Pred_{name}" for name in class_names]
    )

    filename = f"{output_dir}/confmat_{model_name}_{variant}.csv"
    conf_df.to_csv(filename)
    print(f"Confusion matrix saved to {filename}")

'\n# After training your model:\nresults = evaluate_model(lda, X_test_scaled, y_test, "LDA", variant="raw")\n\n# Collect all results:\nall_results = []\nall_results.append(results)\n\n# Save summary table:\nsave_results_table(all_results, "model_comparison.csv")\n\n# Save confusion matrix:\nsave_confusion_matrix(results)\n'

In [8]:
# =============================================================================
# LINEAR MODELS (Raw Features) - Need Scaling
# =============================================================================

print("Training Linear Models (Raw Features)...")
print("=" * 50)

# 1. Linear Discriminant Analysis (LDA)
print("Training LDA...")
lda = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')
lda.fit(X_train_scaled, y_train)
evaluate_model(lda, X_test_scaled, y_test, "LDA", variant="raw", class_labels=classes)

# 2. Logistic Regression
print("Training Logistic Regression...")
logreg = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=RANDOM_STATE
)
logreg.fit(X_train_scaled, y_train)
evaluate_model(logreg, X_test_scaled, y_test, "Logistic Regression", variant="raw", class_labels=classes)

# 3. Quadratic Discriminant Analysis (QDA)
print("Training QDA...")
qda = QuadraticDiscriminantAnalysis(reg_param=0.1)  # regularization for stability
qda.fit(X_train_scaled, y_train)
evaluate_model(qda, X_test_scaled, y_test, "QDA", variant="raw", class_labels=classes)

print("Linear models (raw features) completed!")
print("=" * 50)

Training Linear Models (Raw Features)...
Training LDA...
Confusion matrix saved to confusion_matrices/confmat_LDA_raw.csv
LDA (raw):
  Accuracy: 0.8627
  Misclass Error: 0.1373
  Balanced Accuracy: 0.8597
  F1 (macro): 0.8611
  AUC (macro): 0.9910

Training Logistic Regression...
Confusion matrix saved to confusion_matrices/confmat_Logistic_Regression_raw.csv
Logistic Regression (raw):
  Accuracy: 0.9924
  Misclass Error: 0.0076
  Balanced Accuracy: 0.9917
  F1 (macro): 0.9917
  AUC (macro): 1.0000

Training QDA...
Confusion matrix saved to confusion_matrices/confmat_QDA_raw.csv
QDA (raw):
  Accuracy: 0.9532
  Misclass Error: 0.0468
  Balanced Accuracy: 0.9491
  F1 (macro): 0.9492
  AUC (macro): 0.9981

Linear models (raw features) completed!


In [9]:
# =============================================================================
# DISTANCE-BASED MODELS (Raw Features) - Need Scaling
# =============================================================================

print("Training Distance-Based Models (Raw Features)...")
print("=" * 50)

# 4. k-Nearest Neighbors (k-NN)
print("Training k-NN...")
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',  # weight by inverse distance
    n_jobs=-1  # use all cores for faster computation
)
knn.fit(X_train_scaled, y_train)
evaluate_model(knn, X_test_scaled, y_test, "k-NN", variant="raw", class_labels=classes)

# 4. RBF SVM
print("Training RBF SVM...")
# Set environment variables for better BLAS performance
os.environ['OMP_NUM_THREADS'] = '8'  # Use all CPU cores
os.environ['MKL_NUM_THREADS'] = '8'  # Intel MKL optimization
os.environ['OPENBLAS_NUM_THREADS'] = '8'  # OpenBLAS optimization

# 2. Memory cleanup
gc.collect()  # Free up any unused memory

# 3. Data type optimization (float32 uses half the memory of float64)
X_train_scaled_opt = X_train_scaled.astype(np.float32)
X_test_scaled_opt = X_test_scaled.astype(np.float32)

# 4. RBF SVM
svm_optimized = SVC(
    kernel='rbf',                # Keep RBF kernel as required
    C=1.0,                      # Standard C parameter
    gamma='scale',              # Standard gamma parameter
    probability=True,           # Needed for predict_proba
    cache_size=4500,            # Increase cache
    shrinking=True,             # Enable shrinking heuristic for faster convergence
    tol=1e-3,                   # Standard tolerance (explicit for clarity)
    max_iter=10000,                # Iteration limit
    random_state=RANDOM_STATE   # Reproducibility
)
svm_optimized.fit(X_train_scaled_opt, y_train)
evaluate_model(svm_optimized, X_test_scaled_opt, y_test, "SVM", variant="raw", class_labels=classes)

# 5. Clean up optimized arrays if memory is tight
del X_train_scaled_opt, X_test_scaled_opt
gc.collect()

print("Distance-based models (raw features) completed!")
print("=" * 50)

Training Distance-Based Models (Raw Features)...
Training k-NN...
Confusion matrix saved to confusion_matrices/confmat_k-NN_raw.csv
k-NN (raw):
  Accuracy: 0.9693
  Misclass Error: 0.0307
  Balanced Accuracy: 0.9693
  F1 (macro): 0.9697
  AUC (macro): 0.9984

Training optimized RBF SVM...
Converting data to float32 for memory efficiency...
Memory reduction: 289.5MB -> 144.8MB
Training optimized RBF SVM...
Starting SVM training (optimized)...
SVM training completed! Running evaluation...
Confusion matrix saved to confusion_matrices/confmat_SVM_raw.csv
SVM (raw):
  Accuracy: 0.9899
  Misclass Error: 0.0101
  Balanced Accuracy: 0.9895
  F1 (macro): 0.9897
  AUC (macro): 0.9999

SVM optimization completed!


In [11]:
# =============================================================================
# TREE-BASED MODELS (Raw Features) - No Scaling Needed
# =============================================================================

print("Training Tree-Based Models (Raw Features)...")
print("=" * 50)

# 6. Random Forest
print("Training Random Forest...")
rf = RandomForestClassifier(
    n_estimators=200,  # more trees for better performance
    max_depth=None,    # let trees grow deep
    min_samples_split=2,
    n_jobs=-1,  # use all cores
    random_state=RANDOM_STATE
)
rf.fit(X_train_tree, y_train)
evaluate_model(rf, X_test_tree, y_test, "Random Forest", variant="raw", class_labels=classes)

# 7. Gradient Boosting Decision Trees (GBDT)
print("Training GBDT...")
gbdt = GradientBoostingClassifier(
    n_estimators=100,  # reduce if too slow
    learning_rate=0.1,
    max_depth=3,
    random_state=RANDOM_STATE
)
gbdt.fit(X_train_tree, y_train)
evaluate_model(gbdt, X_test_tree, y_test, "GBDT", variant="raw", class_labels=classes)

print("Tree-based models (raw features) completed!")
print("=" * 50)

Training Tree-Based Models (Raw Features)...
Training Random Forest...
Confusion matrix saved to confusion_matrices/confmat_Random_Forest_raw.csv
Random Forest (raw):
  Accuracy: 0.9843
  Misclass Error: 0.0157
  Balanced Accuracy: 0.9830
  F1 (macro): 0.9829
  AUC (macro): 0.9999

Training GBDT (this may take longer)...
Confusion matrix saved to confusion_matrices/confmat_GBDT_raw.csv
GBDT (raw):
  Accuracy: 0.9608
  Misclass Error: 0.0392
  Balanced Accuracy: 0.9631
  F1 (macro): 0.9635
  AUC (macro): 0.9992

Tree-based models (raw features) completed!


In [12]:
# =============================================================================
# LINEAR MODELS (PCA Features) - 10 Components
# =============================================================================

print("Training Linear Models (PCA-10 Features)")
print("=" * 50)

# System optimizations
os.environ['OMP_NUM_THREADS'] = '8'
os.environ['MKL_NUM_THREADS'] = '8'
os.environ['OPENBLAS_NUM_THREADS'] = '8'
gc.collect()

# Convert PCA data to float32 for memory efficiency
X_train_pca_opt = X_train_pca.astype(np.float32)
X_test_pca_opt = X_test_pca.astype(np.float32)

# 1. LDA + PCA(10)
print("Training LDA + PCA(10)...")
lda_pca = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')
lda_pca.fit(X_train_pca_opt, y_train)
evaluate_model(lda_pca, X_test_pca_opt, y_test, "LDA", variant="pca10", class_labels=classes)

# 2. Logistic Regression + PCA(10)
print("Training Logistic Regression + PCA(10)...")
logreg_pca = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=RANDOM_STATE
)
logreg_pca.fit(X_train_pca_opt, y_train)
evaluate_model(logreg_pca, X_test_pca_opt, y_test, "Logistic Regression", variant="pca10", class_labels=classes)

# 3. QDA + PCA(10)
print("Training QDA + PCA(10)...")
qda_pca = QuadraticDiscriminantAnalysis(reg_param=0.1)
qda_pca.fit(X_train_pca_opt, y_train)
evaluate_model(qda_pca, X_test_pca_opt, y_test, "QDA", variant="pca10", class_labels=classes)

print("Linear models (PCA-10 features) completed!")
print("=" * 50)

Training Linear Models (PCA-10 Features) - Optimized...
Converting PCA data to float32...
PCA Memory reduction: 13.2MB -> 6.6MB
Training LDA + PCA(10)...
Confusion matrix saved to confusion_matrices/confmat_LDA_pca10.csv
LDA (pca10):
  Accuracy: 0.8397
  Misclass Error: 0.1603
  Balanced Accuracy: 0.8304
  F1 (macro): 0.8354
  AUC (macro): 0.9883

Training Logistic Regression + PCA(10)...
Confusion matrix saved to confusion_matrices/confmat_Logistic_Regression_pca10.csv
Logistic Regression (pca10):
  Accuracy: 0.9905
  Misclass Error: 0.0095
  Balanced Accuracy: 0.9907
  F1 (macro): 0.9907
  AUC (macro): 0.9999

Training QDA + PCA(10)...
Confusion matrix saved to confusion_matrices/confmat_QDA_pca10.csv
QDA (pca10):
  Accuracy: 0.9531
  Misclass Error: 0.0469
  Balanced Accuracy: 0.9497
  F1 (macro): 0.9492
  AUC (macro): 0.9981

Linear models (PCA-10 features) completed!


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
# =============================================================================
# DISTANCE-BASED MODELS (PCA Features) - 10 Components
# =============================================================================

print("Training Distance-Based Models (PCA-10 Features) - Optimized...")
print("=" * 50)

# 4. k-NN + PCA(10)
print("Training k-NN + PCA(10)...")
knn_pca = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    algorithm='ball_tree',  # Optimized for PCA features
    leaf_size=30,
    n_jobs=-1
)
knn_pca.fit(X_train_pca_opt, y_train)
evaluate_model(knn_pca, X_test_pca_opt, y_test, "k-NN", variant="pca10", class_labels=classes)

# 5. SVM + PCA(10)
print("Training SVM + PCA(10)...")
svm_pca = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    probability=True,
    cache_size=2000,
    shrinking=True,
    tol=1e-3,
    random_state=RANDOM_STATE
)
svm_pca.fit(X_train_pca_opt, y_train)
evaluate_model(svm_pca, X_test_pca_opt, y_test, "SVM", variant="pca10", class_labels=classes)

print("Distance-based models (PCA-10 features) completed!")
print("=" * 50)

Training Distance-Based Models (PCA-10 Features) - Optimized...
Training k-NN + PCA(10)...
Confusion matrix saved to confusion_matrices/confmat_k-NN_pca10.csv
k-NN (pca10):
  Accuracy: 0.9704
  Misclass Error: 0.0296
  Balanced Accuracy: 0.9702
  F1 (macro): 0.9706
  AUC (macro): 0.9986

Training SVM + PCA(10) - Optimized (should be fast)...
Confusion matrix saved to confusion_matrices/confmat_SVM_pca10.csv
SVM (pca10):
  Accuracy: 0.9888
  Misclass Error: 0.0112
  Balanced Accuracy: 0.9887
  F1 (macro): 0.9889
  AUC (macro): 0.9999

Distance-based models (PCA-10 features) completed!


In [23]:
# =============================================================================
# TREE-BASED MODELS (PCA Features) - 10 Components
# =============================================================================

print("Training Tree-Based Models (PCA-10 Features) - Optimized...")
print("=" * 50)

# System optimization for tree models
os.environ['OMP_NUM_THREADS'] = '8'
gc.collect()

# 6. Random Forest + PCA(10)
print("Training Random Forest + PCA(10)...")
rf_pca = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    max_features='sqrt',
    n_jobs=-1,
    random_state=RANDOM_STATE
)
rf_pca.fit(X_train_pca_opt, y_train)
evaluate_model(rf_pca, X_test_pca_opt, y_test, "Random Forest", variant="pca10", class_labels=classes)

# 7. GBDT + PCA(10)
print("Training GBDT + PCA(10)...")
gbdt_pca = GradientBoostingClassifier(
    n_estimators=100,       # Should be fast with only 10 features
    learning_rate=0.1,
    max_depth=3,
    subsample=0.8,
    random_state=RANDOM_STATE
)
gbdt_pca.fit(X_train_pca_opt, y_train)
evaluate_model(gbdt_pca, X_test_pca_opt, y_test, "GBDT", variant="pca10", class_labels=classes)

# Clean up optimized PCA arrays
del X_train_pca_opt, X_test_pca_opt
gc.collect()

print("Tree-based models (PCA-10 features) completed!")
print("All 14 model variants completed!")
print("=" * 50)

Training Tree-Based Models (PCA-10 Features) - Optimized...
Using optimized PCA data for tree models...
Training Random Forest + PCA(10)...
Confusion matrix saved to confusion_matrices/confmat_Random_Forest_pca10.csv
Random Forest (pca10):
  Accuracy: 0.9773
  Misclass Error: 0.0227
  Balanced Accuracy: 0.9739
  F1 (macro): 0.9741
  AUC (macro): 0.9997

Training GBDT + PCA(10)...


KeyboardInterrupt: 

In [25]:
# =============================================================================
# TASK 1.4: GLACIER ICE BINARY CLASSIFICATION (RAW FEATURES)
# =============================================================================

print("=" * 60)
print("TASK 1.4: GLACIER ICE vs ALL OTHER LAND TYPES (RAW FEATURES)")
print("=" * 60)

# Define the positive class for glacier detection
glacier_positive = "snow / ice"
_, original_classes = pd.factorize(df['land_type'])
correct_glacier_idx = list(original_classes).index(glacier_positive)

# Create binary target arrays using existing train/test split
y_train_binary = (y_train == correct_glacier_idx).astype(int)
y_test_binary = (y_test == correct_glacier_idx).astype(int)

# Check class distribution
glacier_train_count = np.sum(y_train_binary)
glacier_test_count = np.sum(y_test_binary)
total_train = len(y_train_binary)
total_test = len(y_test_binary)

# Create optimized raw features for binary classification
X_train_scaled_opt = X_train_scaled.astype(np.float32)
X_test_scaled_opt = X_test_scaled.astype(np.float32)

# Prepare unscaled data for tree models
X_train_tree_opt = X_train.values.astype(np.float32)
X_test_tree_opt = X_test.values.astype(np.float32)

# System optimizations
os.environ['OMP_NUM_THREADS'] = '8'
os.environ['MKL_NUM_THREADS'] = '8'
os.environ['OPENBLAS_NUM_THREADS'] = '8'
gc.collect()

print(f"\nTraining classifiers for binary glacier detection (RAW FEATURES)...")
print(f"Primary metric: F1 score (glacier = positive class)")
print("=" * 60)

# Store results for comparison
glacier_corrected_results = []

def evaluate_binary_classifier_corrected(model, X_test, y_test, model_name, use_tree_data=False):
    """
    Evaluate binary classifier focusing on F1 score for glacier detection.
    """
    from sklearn.metrics import precision_recall_fscore_support

    y_pred = model.predict(X_test)

    # F1 score with glacier as positive class
    precision, recall, f1, support = precision_recall_fscore_support(
        y_test, y_pred, pos_label=1, average='binary'
    )

    # Overall accuracy for context
    accuracy = accuracy_score(y_test, y_pred)

    # Store results
    result = {
        'Model': model_name,
        'Features': 'Raw',
        'F1_Glacier': f1,
        'Precision_Glacier': precision,
        'Recall_Glacier': recall,
        'Accuracy': accuracy
    }

    glacier_corrected_results.append(result)

    # Print results
    data_type = "Tree" if use_tree_data else "Scaled"
    print(f"{model_name} (Raw-{data_type}):")
    print(f"  F1 (glacier): {f1:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  Accuracy: {accuracy:.4f}")
    print()

    return result

# =============================================================================
# TRAIN ALL 7 CLASSIFIERS FOR GLACIER DETECTION (RAW FEATURES)
# =============================================================================

# 1. Linear Discriminant Analysis
print("1. Training LDA for glacier detection (Raw)...")
lda_binary = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')
lda_binary.fit(X_train_scaled_opt, y_train_binary)
evaluate_binary_classifier_corrected(lda_binary, X_test_scaled_opt, y_test_binary, "LDA")

# 2. Logistic Regression (with balanced class weights)
print("2. Training Logistic Regression for glacier detection (Raw-Corrected)...")
logreg_binary = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    class_weight='balanced',
    random_state=RANDOM_STATE
)
logreg_binary.fit(X_train_scaled_opt, y_train_binary)
evaluate_binary_classifier_corrected(logreg_binary, X_test_scaled_opt, y_test_binary, "Logistic Regression")

# 3. Quadratic Discriminant Analysis
print("3. Training QDA for glacier detection (Raw)...")
qda_binary = QuadraticDiscriminantAnalysis(reg_param=0.1)
qda_binary.fit(X_train_scaled_opt, y_train_binary)
evaluate_binary_classifier_corrected(qda_binary, X_test_scaled_opt, y_test_binary, "QDA")

# 4. k-Nearest Neighbors
print("4. Training k-NN for glacier detection (Raw)...")
knn_binary = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    algorithm='ball_tree',
    n_jobs=-1
)
knn_binary.fit(X_train_scaled_opt, y_train_binary)
evaluate_binary_classifier_corrected(knn_binary, X_test_scaled_opt, y_test_binary, "k-NN")

# 5. Support Vector Machine (with balanced class weights and optimizations)
print("5. Training SVM for glacier detection (Raw)...")
svm_binary = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    class_weight='balanced',
    cache_size=4000,
    shrinking=True,
    random_state=RANDOM_STATE
)
svm_binary.fit(X_train_scaled_opt, y_train_binary)
evaluate_binary_classifier_corrected(svm_binary, X_test_scaled_opt, y_test_binary, "SVM")

# 6. Random Forest (with balanced class weights)
print("6. Training Random Forest for glacier detection (Raw)...")
rf_binary = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    class_weight='balanced',
    n_jobs=-1,
    random_state=RANDOM_STATE
)
rf_binary.fit(X_train_tree_opt, y_train_binary)
evaluate_binary_classifier_corrected(rf_binary, X_test_tree_opt, y_test_binary, "Random Forest", use_tree_data=True)

# 7. Gradient Boosting Decision Trees
print("7. Training GBDT for glacier detection (Raw)...")
gbdt_binary = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=RANDOM_STATE
)
gbdt_binary.fit(X_train_tree_opt, y_train_binary)
evaluate_binary_classifier_corrected(gbdt_binary, X_test_tree_opt, y_test_binary, "GBDT", use_tree_data=True)

# =============================================================================
# RESULTS SUMMARY (RAW FEATURES - CORRECTED)
# =============================================================================
print("=" * 60)
print("GLACIER DETECTION RESULTS SUMMARY (RAW FEATURES - CORRECTED)")
print("=" * 60)

# Create results DataFrame
glacier_corrected_df = pd.DataFrame(glacier_corrected_results)
glacier_corrected_df = glacier_corrected_df.sort_values('F1_Glacier', ascending=False)

# Display results
print(f"Ranked by F1 Score (glacier = positive class, CORRECTED Raw features):")
print()
for idx, row in glacier_corrected_df.iterrows():
    print(f"{row['Model']:20} | F1: {row['F1_Glacier']:.4f} | Precision: {row['Precision_Glacier']:.4f} | Recall: {row['Recall_Glacier']:.4f}")

# Save results
os.makedirs("results", exist_ok=True)
glacier_corrected_df.to_csv("results/glacier_binary_corrected_results.csv", index=False)
print(f"\nResults saved to results/glacier_binary_corrected_results.csv")

# Identify top 3 performers (matching task requirements)
print(f"\nTop 3 classifiers for glacier detection (CORRECTED - should match friend's results):")
for i, (idx, row) in enumerate(glacier_corrected_df.head(3).iterrows(), 1):
    print(f"{i}. {row['Model']} (F1: {row['F1_Glacier']:.4f})")

print("=" * 60)

TASK 1.4: GLACIER ICE vs ALL OTHER LAND TYPES (RAW FEATURES)
Positive class (glacier): 'snow / ice'
Correct glacier class index (from factorize): 7

CORRECTED Binary Class Distribution:
Training set: 9,226 glacier / 163,257 other (5.3% glacier)
Test set: 2,307 glacier / 40,814 other (5.4% glacier)
Total glacier samples: 11,533 (should match ~11,533 from dataset)

Creating optimized raw features for glacier detection...
Scaled features shape: (172483, 220)
Tree features shape: (172483, 220)

Training classifiers for binary glacier detection (RAW FEATURES - CORRECTED)...
Primary metric: F1 score (glacier = positive class)
1. Training LDA for glacier detection (Raw-Corrected)...
LDA (Raw-Scaled):
  F1 (glacier): 0.9363
  Precision: 0.9452
  Recall: 0.9276
  Accuracy: 0.9933

2. Training Logistic Regression for glacier detection (Raw-Corrected)...
Logistic Regression (Raw-Scaled):
  F1 (glacier): 0.9899
  Precision: 0.9804
  Recall: 0.9996
  Accuracy: 0.9989

3. Training QDA for glacier de