# 🚱 Water Shortage Prediction at Hi!ckathon 2024
  

## Data Preprocessing

### Import Libraries

In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from tqdm import tqdm

# Set pandas to display larger dataframes without truncation
pd.set_option('display.max_rows', None)     # To display all rows
pd.set_option('display.max_columns', None)  # To display all columns
pd.set_option('display.width', None)        # To prevent line wrapping

### Load Data

In [3]:
X_train = pd.read_csv('../data/X_train.csv')
y_train = pd.read_csv('../data/y_train.csv')

y_train = y_train['piezo_groundwater_level_category']

In [4]:
mapping = {
    'Very Low': 0,
    'Low': 1,
    'Average': 2,
    'High': 3,
    'Very High': 4
}
y_train = y_train.map(mapping)

In [5]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3, random_state=42)


In [6]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np
from tqdm import tqdm

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [200, 300],                # Number of trees
    'max_depth': [None, 10, 20],               # Maximum depth of trees
    'min_samples_split': [2, 5, 10],           # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]              # Minimum number of samples required at a leaf node
}

# Initialize the RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV with TimeSeriesSplit as the cross-validation strategy
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           scoring='accuracy', cv=3, n_jobs=5, verbose=2)

# Perform GridSearchCV within the training set
grid_search.fit(X_train, y_train)

# Print the best parameters found by GridSearchCV
print(f"Best Parameters: {grid_search.best_params_}")

# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Perform predictions on the test set (unseen data)
y_pred = best_model.predict(X_test)

# Calculate accuracy for the final model on the test set
accuracy = accuracy_score(y_test, y_pred)

# Print the test set accuracy
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Fitting 3 folds for each of 54 candidates, totalling 162 fits


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

In [9]:
# Get the feature importances from the best model
feature_importances = best_model.feature_importances_

# Get the feature names from the original dataset
feature_names = X_train.columns

# Create a DataFrame with the feature names and their importance
feature_importances_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})

# Sort the DataFrame in descending order of the feature importance
feature_importances_df = feature_importances_df.sort_values('importance', ascending=False)

# Display the top 10 most important features
feature_importances_df

Unnamed: 0,feature,importance
0,piezo_station_investigation_depth,0.034977
37,hydro_observation_result_elab,0.029992
43,distance_piezo_hydro,0.027533
1,piezo_station_altitude,0.027487
76,altitude_difference,0.026762
56,prelev_other_volume_sum,0.026705
2,piezo_station_longitude,0.026016
36,distance_piezo_meteo,0.025845
3,piezo_station_latitude,0.024395
57,insee_pop_commune,0.024208


In [6]:
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np
from tqdm import tqdm

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3, random_state=42, shuffle=False)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],           # Number of trees
    'max_depth': [None, 10, 20, 30],           # Maximum depth of trees
    'min_samples_split': [2, 5, 10],           # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]              # Minimum number of samples required at a leaf node
}

# Initialize TimeSeriesSplit for cross-validation 
tscv = TimeSeriesSplit(n_splits=3)

# Initialize the RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV with TimeSeriesSplit as the cross-validation strategy
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           scoring='accuracy', cv=tscv, n_jobs=5, verbose=2)

# Perform GridSearchCV within the training set
grid_search.fit(X_train, y_train)

# Print the best parameters found by GridSearchCV
print(f"Best Parameters: {grid_search.best_params_}")

# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Perform predictions on the test set (unseen data)
y_pred = best_model.predict(X_test)

# Calculate accuracy for the final model on the test set
accuracy = accuracy_score(y_test, y_pred)

# Print the test set accuracy
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  14.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  31.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  32.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  52.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.2min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=  54.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  17.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  36.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.9min
[CV] END m



[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  55.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=  35.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=  51.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.2min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=  16.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 2.9min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=  35.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.9min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.9min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_es

In [7]:
# Get the feature importances from the best model
feature_importances = best_model.feature_importances_

# Get the feature names from the original dataset
feature_names = X_train.columns

# Create a DataFrame with the feature names and their importance
feature_importances_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})

# Sort the DataFrame in descending order of the feature importance
feature_importances_df = feature_importances_df.sort_values('importance', ascending=False)

# Display the top 10 most important features
feature_importances_df

Unnamed: 0,feature,importance
0,piezo_station_investigation_depth,0.039851
37,hydro_observation_result_elab,0.039411
1,piezo_station_altitude,0.031775
42,distance_piezo_hydro,0.030894
55,prelev_other_volume_sum,0.030344
36,distance_piezo_meteo,0.029094
2,piezo_station_longitude,0.029022
3,piezo_station_latitude,0.027241
56,insee_pop_commune,0.026614
51,prelev_volume_2,0.021458


In [9]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import make_scorer, accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import numpy as np
from sklearn.model_selection import cross_validate

# Initialize models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=200, random_state=42),
    #'SVM': SVC(kernel='rbf', C=1, gamma='auto'),
    'XGBoost': xgb.XGBClassifier(n_estimators=200, random_state=42),
    'LightGBM': lgb.LGBMClassifier(n_estimators=200),
    'CatBoost': CatBoostClassifier(iterations=200, silent=True),
    'AdaBoost': AdaBoostClassifier(n_estimators=200)
}

# Cross-validation setup
scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro', 'roc_auc_ovr']

# Custom scorer for multiclass AUC-ROC
def roc_auc_ovr(y_true, y_pred):
    return roc_auc_score(y_true, y_pred, multi_class='ovr', average='macro', labels=np.unique(y_true))

# Dictionary to store results
results = {}

# Cross-validation loop
for model_name, model in models.items():
    print(f"Training {model_name}...") 
    
    # Perform cross-validation and evaluate multiple metrics
    cv_results = cross_validate(model, X_train, y_train, cv=3, scoring=scoring, n_jobs=3, verbose=1)
    
    # Extract mean and standard deviation for each metric
    accuracy_mean, accuracy_std = cv_results['test_accuracy'].mean(), cv_results['test_accuracy'].std()
    f1_mean, f1_std = cv_results['test_f1_macro'].mean(), cv_results['test_f1_macro'].std()
    precision_mean, precision_std = cv_results['test_precision_macro'].mean(), cv_results['test_precision_macro'].std()
    recall_mean, recall_std = cv_results['test_recall_macro'].mean(), cv_results['test_recall_macro'].std()
    roc_auc_mean, roc_auc_std = cv_results['test_roc_auc_ovr'].mean(), cv_results['test_roc_auc_ovr'].std()

    # Print results directly after each model's evaluation
    print(f"{model_name}:")
    print(f"  Accuracy: {accuracy_mean:.4f} ± {accuracy_std:.4f}")
    print(f"  F1 Score: {f1_mean:.4f} ± {f1_std:.4f}")
    print(f"  Precision: {precision_mean:.4f} ± {precision_std:.4f}")
    print(f"  Recall: {recall_mean:.4f} ± {recall_std:.4f}")
    print(f"  AUC-ROC: {roc_auc_mean:.4f} ± {roc_auc_std:.4f}")
    print()


Training Random Forest...


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}