In [1]:
import os
import pandas as pd
import geopandas as gpd
import numpy as np
from sklearn.cluster import DBSCAN
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# 1. Load relevant functions

# 1.1 Load the function to group the DataFrame into fire events

In [2]:
def group_into_fire_events(df):
    """
    This function takes in a DataFrame and returns a copy of the DataFrame with three additional columns:
    'date_cluster', 'regional_cluster', and 'event_id'.
    
    Args:
        df (pd.DataFrame): The input DataFrame
        
    Returns:
        df (pd.DataFrame): The output DataFrame with three additional columns: 'date_cluster', 'regional_cluster', and 'event_id'.
    """
    
    df = df.copy()
    df.rename(columns={'latitude_left': 'latitude', 'longitude_left': 'longitude'}, inplace=True)
    
    # Convert 'acq_date' to datetime
    df['date'] = pd.to_datetime(df['date'])
    
    df = df.sort_values(by='date')
    
    # Create 'date_cluster' column based on consecutive dates
    df['date_cluster'] = (df['date'].diff().dt.days > 1).cumsum()
    
    # Function to apply DBSCAN on each consecutive date cluster
    def apply_dbscan(group):
        coords = group[['latitude', 'longitude']].values 
        # Apply DBSCAN, hyperparameters are the same as Climada
        db = DBSCAN(eps=15/111.12, min_samples=1).fit(coords)
        group['regional_cluster'] = db.labels_
        return group
    
    # Apply DBSCAN on each date cluster
    df = df.groupby('date_cluster').apply(apply_dbscan)
    
    df.reset_index(drop=True, inplace=True)
    
    # Create a unique 'event_id' for each unique combination of 'date_cluster' and 'regional_cluster'
    df['event_id'] = df.groupby(['date_cluster', 'regional_cluster']).ngroup()
    
    df = df.sort_values(by='event_id').reset_index(drop=True)
    
    return df

# 1.2 Load the function to shuffle and split data into training and testing according to event_id

In [14]:
def split_event_ids(event_id_pairs, test_size, random_seed):
    """
    Splits event_id pairs into training and test sets based on the specified test size.

    Args:
        event_id_pairs (list of tuples): List of event_id and their corresponding row counts.
        test_size (float): The proportion of the test set.
        random_seed (int): The seed for the random number generator.

    Returns:
        train_event_ids (list): List of event_ids for the training set.
        test_event_ids (list): List of event_ids for the test set.
    """
    np.random.seed(random_seed)
    np.random.shuffle(event_id_pairs)
    
    test_event_ids = []
    current_test_rows = 0
    total_rows = sum(count for _, count in event_id_pairs)
    test_rows_target = int(total_rows * test_size)
    
    for event_id, count in event_id_pairs:
        current_test_rows += count
        test_event_ids.append(event_id)
        if current_test_rows >= test_rows_target:
            break
    
    train_event_ids = [event_id for event_id, _ in event_id_pairs if event_id not in test_event_ids]
    
    return train_event_ids, test_event_ids

def shuffle_and_split(df, test_size=0.1, random_seed=42):
    """
    This function shuffles the DataFrame with respect to 'event_id' and then splits the data
    into training and test sets with approximately 10% test size with respect to 'event_id'.
    
    Args:
        df (pd.DataFrame): The input DataFrame
        test_size (float): The proportion of the test set
        random_seed (int): The seed for the random number generator
        
    Returns:
        X_train (pd.DataFrame): Training features
        X_test (pd.DataFrame): Test features
        y_train (pd.Series): Training labels
        y_test (pd.Series): Test labels
    """
    # Shuffle the dataframe based on event_id
    event_counts = df['event_id'].value_counts().sort_index()
    event_id_pairs = list(zip(event_counts.index, event_counts.values))
    
    train_event_ids, test_event_ids = split_event_ids(event_id_pairs, test_size, random_seed)
    
    X_train = df[df['event_id'].isin(train_event_ids)].drop(columns=['ignited'])
    y_train = df[df['event_id'].isin(train_event_ids)]['ignited']
    X_test = df[df['event_id'].isin(test_event_ids)].drop(columns=['ignited'])
    y_test = df[df['event_id'].isin(test_event_ids)]['ignited']
    
    return X_train, X_test, y_train, y_test



# Example usage:
# folder = '../../climada_petals/data/wildfire/outputs/'
# year = 2013
# file_path = os.path.join(folder, str(year), f'ignited_eu_{year}_gdf')
#     
# # Load the DataFrame from the file
# df = gpd.read_file(file_path)
# X_train, X_test, y_train, y_test = shuffle_and_split(df)


KeyboardInterrupt



In [5]:
def custom_cv_split(X, y, n_splits=5, split_by_col='event_id', random_state=42):
    """
    This function creates cross-validation splits based on a specified column (e.g., 'event_id'),
    ensuring that all rows with the same value in the specified column are kept together in the same fold.

    Args:
        X (pd.DataFrame): Features
        y (pd.Series): Labels
        n_splits (int): Number of folds
        split_by_col (str): Column name to split by (e.g., 'event_id')
        random_state (int): Random seed for reproducibility

    Yields:
        train_indices, val_indices (np.ndarray): Indices for training and validation sets for each fold
    """
    # Ensure reproducibility
    np.random.seed(random_state)
    
    # Get unique event IDs and shuffle them
    unique_event_ids = X[split_by_col].unique()
    np.random.shuffle(unique_event_ids)
    
    # Calculate the fold sizes
    fold_sizes = np.full(n_splits, len(unique_event_ids) // n_splits, dtype=int)
    fold_sizes[:len(unique_event_ids) % n_splits] += 1
    
    current = 0
    for fold_size in fold_sizes:
        val_event_ids = unique_event_ids[current:current + fold_size]
        train_event_ids = np.setdiff1d(unique_event_ids, val_event_ids)
        
        train_indices = X[X[split_by_col].isin(train_event_ids)].index
        val_indices = X[X[split_by_col].isin(val_event_ids)].index
        
        yield train_indices, val_indices
        current += fold_size


In [31]:
# import pandas as pd
# import numpy as np
# 
# def custom_cv_split(X, y, n_splits=5, random_state=42):
#     unique_event_ids = X['event_id'].unique()
#     np.random.seed(random_state)
#     np.random.shuffle(unique_event_ids)
#     fold_sizes = np.full(n_splits, len(unique_event_ids) // n_splits, dtype=int)
#     fold_sizes[:len(unique_event_ids) % n_splits] += 1
#     current = 0
#     for fold_size in fold_sizes:
#         test_event_ids = unique_event_ids[current:current + fold_size]
#         train_event_ids = np.setdiff1d(unique_event_ids, test_event_ids)
#         train_indices = X[X['event_id'].isin(train_event_ids)].index
#         test_indices = X[X['event_id'].isin(test_event_ids)].index
#         yield train_indices, test_indices
#         current += fold_size
# 
# def test_custom_cv_split():
#     # Generate a sample dataframe
#     data = {
#         'event_id': [0, 0, 0, 1, 2, 2, 3, 3, 4, 5, 5, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8],
#         'feature1': range(22),
#         'feature2': range(22, 44),
#         'ignited': [1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1]
#     }
#     df = pd.DataFrame(data)
#     X = df.drop(columns=['ignited'])
#     y = df['ignited']
#     
#     # Test 1: Each split respects the event_id boundaries
#     for train_indices, test_indices in custom_cv_split(X, y, n_splits=5):
#         train_event_ids = X.loc[train_indices, 'event_id'].unique()
#         test_event_ids = X.loc[test_indices, 'event_id'].unique()
#         assert len(set(train_event_ids).intersection(set(test_event_ids))) == 0, "Overlap of event_ids between train and test sets"
#     
#     # Test 2: There is no overlap between training and validation sets for each split
#     for train_indices, test_indices in custom_cv_split(X, y, n_splits=5):
#         assert len(set(train_indices).intersection(set(test_indices))) == 0, "Overlap of indices between train and test sets"
#     
#     # Test 3: The number of splits is correct
#     splits = list(custom_cv_split(X, y, n_splits=5))
#     assert len(splits) == 5, "Number of splits is incorrect"
#     
#     # Test 4: The total number of rows in the splits matches the total number of rows in the input data
#     test_indices_combined = np.concatenate([test_indices for _, test_indices in splits])
#     assert len(test_indices_combined) == len(X), "Total number of rows in the splits does not match the total number of rows in the input data"
#     
#     # Test 5: Handle small number of splits
#     small_splits = list(custom_cv_split(X, y, n_splits=2))
#     assert len(small_splits) == 2, "Small number of splits test failed: Number of splits is incorrect"
#     
#     # Test 6: Correct splitting for small dataset
#     small_data = {
#         'event_id': [0, 0, 1, 1],
#         'feature1': [10, 20, 30, 40],
#         'feature2': [50, 60, 70, 80],
#         'ignited': [1, 0, 1, 0]
#     }
#     small_df = pd.DataFrame(small_data)
#     X_small = small_df.drop(columns=['ignited'])
#     y_small = small_df['ignited']
#     small_splits = list(custom_cv_split(X_small, y_small, n_splits=2))
#     for train_indices, test_indices in small_splits:
#         train_event_ids = X_small.loc[train_indices, 'event_id'].unique()
#         test_event_ids = X_small.loc[test_indices, 'event_id'].unique()
#         assert len(set(train_event_ids).intersection(set(test_event_ids))) == 0, "Small dataset test failed: Overlap of event_ids between train and test sets"
#     
#     print("All tests passed!")
# 
# # Run the tests
# test_custom_cv_split()


All tests passed!


# 1.3 Load the function to train and evaluate ML models

In [6]:
def train_and_evaluate_models(df, random_state=42):
    # Preprocess the dataframe
    df = pd.get_dummies(df, columns=['land_cover'], prefix='land_cover')
    df['date'] = pd.to_datetime(df['date'])
    df['month'] = df['date'].dt.month
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df = df.drop(columns=['month', 'date', 'distance_km', 'confidence', 'geometry'])
    df['fwi'].fillna(0, inplace=True)

    # Split the data
    X_train_val, X_test, y_train_val, y_test = shuffle_and_split(df, test_size=0.1, random_seed=random_state)
    X_train_val = X_train_val.drop(columns=['latitude', 'longitude', 'brightness', 'bright_t31'])
    X_test = X_test.drop(columns=['latitude', 'longitude', 'brightness', 'bright_t31'])

    # Initialize dictionaries to store the best models, hyperparameters, and scores
    classifier_names = ["LogisticRegression", "XGBClassifier"]
    best_models = {name: None for name in classifier_names}
    best_params = {name: None for name in classifier_names}
    best_scores = {name: 0 for name in classifier_names}

    # Define objective functions for Optuna
    def logisticregression_objective(trial):
        classifier_name = "LogisticRegression"
        logistic_c = trial.suggest_float('logistic_c', 1e-5, 1e5, log=True)
        classifier_obj = LogisticRegression(C=logistic_c, random_state=random_state)
        model_pipeline = make_pipeline(StandardScaler(), classifier_obj)
        scores = cross_val_score(model_pipeline, X_train_val, y_train_val, cv=list(custom_cv_split(X_train_val, y_train_val, n_splits=5, split_by_col='event_id', random_state=random_state)), n_jobs=-1, scoring='roc_auc')
        score = scores.mean()
        if score > best_scores[classifier_name]:
            best_scores[classifier_name] = score
            best_params[classifier_name] = trial.params
            best_models[classifier_name] = classifier_obj # classifier_obj remain untrained
        return score

    def xgboost_objective(trial):
        classifier_name = "XGBClassifier"
        xgb_params = {
            'n_estimators': trial.suggest_int('n_estimators', 200, 2000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
            'subsample': trial.suggest_float('subsample', 0.2, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
            'random_state': random_state
        }
        classifier_obj = XGBClassifier(**xgb_params)
        scores = cross_val_score(classifier_obj, X_train_val, y_train_val, cv=list(custom_cv_split(X_train_val, y_train_val, n_splits=5, split_by_col='event_id', random_state=random_state)), n_jobs=-1, scoring='roc_auc')
        score = scores.mean()
        if score > best_scores[classifier_name]:
            best_scores[classifier_name] = score
            best_params[classifier_name] = trial.params
            best_models[classifier_name] = classifier_obj # classifier_obj remain untrained
        return score

    # Optimize hyperparameters with Optuna
    study = optuna.create_study(sampler=optuna.samplers.TPESampler(), direction='maximize')
    study.optimize(logisticregression_objective, n_trials=20)
    study.optimize(xgboost_objective, n_trials=100)

    results = {}
    for name in classifier_names:
        if name == "LogisticRegression":
            best_models[name].set_params(max_iter=200)
            model_pipeline = make_pipeline(StandardScaler(), best_models[name])
        else:
            model_pipeline = best_models[name]

        model_pipeline.fit(X_train_val, y_train_val)
        train_accuracy = model_pipeline.score(X_train_val, y_train_val)
        test_accuracy = model_pipeline.score(X_test, y_test)
        y_pred = model_pipeline.predict(X_test)
        classification_rep = classification_report(y_test, y_pred)
        confusion_mat = confusion_matrix(y_test, y_pred)

        results[name] = {
            "train_accuracy": train_accuracy,
            "test_accuracy": test_accuracy,
            "classification_report": classification_rep,
            "confusion_matrix": confusion_mat
        }

        if name == "LogisticRegression":
            feature_importance = best_models[name].coef_[0]
        else:
            feature_importance = best_models[name].feature_importances_

        feature_importance_df = pd.DataFrame({'feature': X_train_val.columns, 'importance': feature_importance})
        feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)
        results[name]["feature_importance"] = feature_importance_df

    return results



# 2. Run ML model for each year

In [11]:
import os

folder = '../../climada_petals/data/wildfire/outputs/'

In [12]:
os.path.exists(folder)

True

In [13]:
years = np.arange(2013, 2014)

# Initialize an empty DataFrame to store the concatenated DataFrames
gdf_all_years = pd.DataFrame()

for year in years:
    '''Step1: Load gdf'''
    # construct the file path
    file_path = os.path.join(folder, str(year), f'ignited_eu_{year}_gdf')
    
    # Load the DataFrame from the file
    gdf = gpd.read_file(file_path)
    
    # Concatenate the loaded DataFrame with the initial DataFrame
    gdf_all_years = pd.concat([gdf_all_years, gdf])
    
    gdf_all_years = gpd.GeoDataFrame(gdf_all_years, geometry=gdf_all_years.geometry, crs=gdf_all_years.crs)
    
    '''Step2: Group gdf into fire events'''
    gdf_all_years = group_into_fire_events(gdf_all_years)
    
    '''Step3: Split data into training and testing sets and train and evaluate models'''
    results = train_and_evaluate_models(gdf_all_years)
    
    break

[I 2024-07-24 22:14:55,359] A new study created in memory with name: no-name-f9b40153-cda1-4872-bb56-5a4fea779556
[W 2024-07-24 22:14:56,539] Trial 0 failed with parameters: {'logistic_c': 27.965002487341778} because of the following error: IndexError('indices are out-of-bounds').
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "C:\Users\zhong\.conda\envs\climada_env\lib\site-packages\joblib\externals\loky\process_executor.py", line 463, in _process_worker
    r = call_item()
  File "C:\Users\zhong\.conda\envs\climada_env\lib\site-packages\joblib\externals\loky\process_executor.py", line 291, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "C:\Users\zhong\.conda\envs\climada_env\lib\site-packages\joblib\parallel.py", line 598, in __call__
    return [func(*args, **kwargs)
  File "C:\Users\zhong\.conda\envs\climada_env\lib\site-packages\joblib\parallel.py", line 598, in <listcomp>
    return [func(*args, **kwargs)


IndexError: indices are out-of-bounds

In [44]:
for model, metrics in results.items():
    print(f"Results for {model}:")
    print("Training Accuracy:", metrics['train_accuracy'])
    print("Test Accuracy:", metrics['test_accuracy'])
    print("Classification Report:\n", metrics['classification_report'])
    print("Confusion Matrix:\n", metrics['confusion_matrix'])
    print("Feature Importance:\n", metrics['feature_importance'])
    print('------------------------------------')

Results for LogisticRegression:
Training Accuracy: 0.7700218833679412
Test Accuracy: 0.7675706285786195
Classification Report:
               precision    recall  f1-score   support

       False       0.76      0.85      0.80      8629
        True       0.79      0.67      0.73      7264

    accuracy                           0.77     15893
   macro avg       0.77      0.76      0.76     15893
weighted avg       0.77      0.77      0.77     15893

Confusion Matrix:
 [[7297 1332]
 [2362 4902]]
Feature Importance:
            feature  importance
0              fwi    0.199789
4    land_cover_40    0.141254
5    land_cover_50    0.112411
9    land_cover_90    0.023947
19  land_cover_126    0.019786
2    land_cover_20    0.016416
18  land_cover_125    0.008038
12  land_cover_112    0.000000
3    land_cover_30   -0.000506
17  land_cover_124   -0.001221
15  land_cover_116   -0.015308
16  land_cover_121   -0.019716
10  land_cover_100   -0.021827
7    land_cover_70   -0.039546
1        elev