# Necessary library installations

In [None]:
# Installation of AutoML SOTA library and dask
!pip install autogluon
!pip install dask

# Some simple Feature Engineering and Data Cleaning

In [None]:
import numpy as np
import pandas as pd
import os
from autogluon.tabular  import TabularDataset, TabularPredictor


# Load the training and test tables; low_memory=False makes pandas read each file in one pass
X_train, X_test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('path_to_x_train_file', 'path_to_x_test_file')
)

In [None]:
# Keep only the columns whose share of missing values in the training set is at most 50%
missing_rates = X_train.isna().mean()
columns_to_keep = missing_rates.index[missing_rates <= 0.5]

# The first kept column is skipped for both frames, and the last kept column is
# additionally skipped for the test frame.
# NOTE(review): presumably these are a row-index column and the target column — this
# relies on column positions, confirm against the CSV layout.
X_train = X_train[columns_to_keep[1:]]
X_test = X_test[columns_to_keep[1:-1]]

In [None]:
# Coerce every column whose name starts with "insee_" to a numeric dtype;
# values that cannot be parsed become NaN (errors='coerce')
insee_columns = X_train.filter(regex='^insee_').columns
for frame in (X_train, X_test):
    frame[insee_columns] = frame[insee_columns].apply(pd.to_numeric, errors='coerce')
print(X_train[insee_columns].dtypes)

insee_%_agri              float64
insee_pop_commune         float64
insee_med_living_level    float64
insee_%_ind               float64
insee_%_const             float64
dtype: object


In [None]:
# Columns whose name mentions "date" (case-insensitive), taken from the training frame
time_cols = [name for name in X_train.columns if 'date' in name.lower()]
# Object/category columns of the test frame, and the subset that is not a date column
non_numeric_cols = X_test.select_dtypes(include=['object', 'category']).columns
categorical_cols = [name for name in non_numeric_cols if name not in time_cols]

In [None]:
# Define some preprocessing functions
def date(df, column='piezo_measurement_date'):
    """Replace a date column with separate ``year``, ``month`` and ``day`` columns.

    The input frame is left untouched: the date column is parsed into a
    temporary Series and the result is built as a new DataFrame, so re-running
    the calling cell never sees a half-transformed frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str, default 'piezo_measurement_date'
        Name of the column to parse with ``pd.to_datetime`` and expand.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``column`` and with ``year``, ``month`` and
        ``day`` columns (appended at the end unless they already exist).

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    """
    timestamps = pd.to_datetime(df[column])
    # drop() returns a new frame and assign() works on that copy, so `df` is never modified.
    return df.drop(columns=[column]).assign(
        year=timestamps.dt.year,
        month=timestamps.dt.month,
        day=timestamps.dt.day,
    )

def engineer_features(df, windows=(7, 14)):
    """Add per-station rolling weather features and a rain/temperature interaction.

    For every window size ``w`` in ``windows`` two columns are added:

    * ``temp_avg_{w}d`` - rolling mean of ``meteo_temperature_avg``
    * ``rain_sum_{w}d`` - rolling sum of ``meteo_rain_height``

    Both are computed separately for each ``piezo_station_bss_code`` group with
    ``min_periods=1``, so the first rows of a station use the shorter history
    that is available. ``rain_temp_interaction`` is the element-wise product of
    ``meteo_rain_height`` and ``meteo_temperature_avg``.

    The windows count rows in the frame's existing order, not calendar days,
    and this function does not sort.
    NOTE(review): callers presumably pass rows already in chronological order
    within each station - confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``piezo_station_bss_code``, ``meteo_temperature_avg`` and
        ``meteo_rain_height`` columns. It is not modified.
    windows : iterable of int, default (7, 14)
        Rolling window sizes, in rows.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns added.
    """
    # The grouping does not depend on the window size, so build it once.
    by_station = df.groupby('piezo_station_bss_code')

    engineered = {}
    for window in windows:
        # reset_index(level=0, drop=True) removes the station level added by the groupby,
        # leaving the original row labels so the result aligns with `df` on assignment.
        engineered[f'temp_avg_{window}d'] = (
            by_station['meteo_temperature_avg']
            .rolling(window=window, min_periods=1)
            .mean()
            .reset_index(level=0, drop=True)
        )
        engineered[f'rain_sum_{window}d'] = (
            by_station['meteo_rain_height']
            .rolling(window=window, min_periods=1)
            .sum()
            .reset_index(level=0, drop=True)
        )
    engineered['rain_temp_interaction'] = df['meteo_rain_height'] * df['meteo_temperature_avg']

    # assign() returns a new frame, so the caller's `df` keeps its original columns.
    return df.assign(**engineered)

In [None]:
columns_to_drop = ['piezo_station_update_date', 'meteo_date', 'hydro_observation_date_elab']

# Same three steps for both frames: drop the unused date columns, expand the measurement
# date into year/month/day, then add the preliminary engineered features.
X_train = X_train.drop(columns=columns_to_drop).pipe(date).pipe(engineer_features)
X_test = X_test.drop(columns=columns_to_drop).pipe(date).pipe(engineer_features)

In [None]:
# Numeric columns come from the training frame, object/category columns from the test frame
numeric_cols = X_train.select_dtypes(include=['number']).columns
non_numeric_cols = X_test.select_dtypes(include=['object', 'category']).columns

# Non-numeric columns whose name does not mention "date" are treated as categorical
time_cols = [name for name in X_train.columns if 'date' in name.lower()]
categorical_cols = [name for name in non_numeric_cols if name not in time_cols]

In [None]:
# Impute missing values: column median for numeric columns, the literal string 'nan' for
# categorical ones.
# The result is assigned back explicitly. `df[col].fillna(..., inplace=True)` operates on an
# intermediate object, raises a FutureWarning and stops having any effect in pandas 3.0.
# NOTE(review): the test frame is filled with its own medians, not the training medians —
# confirm this is intended.
for col in numeric_cols:
    X_train[col] = X_train[col].fillna(X_train[col].median())
    X_test[col] = X_test[col].fillna(X_test[col].median())
for col in categorical_cols:
    X_train[col] = X_train[col].fillna('nan')
    X_test[col] = X_test[col].fillna('nan')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(X_train[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(X_test[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

# Model Training

In [None]:
# Hold out a reproducible random 20% of the rows for validation; the remaining rows
# (those whose index label was not sampled) form the training set
validation_df = X_train.sample(frac=0.2, random_state=42)
train_df = X_train.drop(index=validation_df.index)

In [None]:
# Define the stacking-ensemble predictor: a multiclass problem on the groundwater level
# category, with models ranked by weighted F1
predictor = TabularPredictor(
    problem_type='multiclass',
    eval_metric='f1_weighted',
    label='piezo_groundwater_level_category',
)

No path specified. Models will be saved in: "AutogluonModels/ag-20241201_100925"


In [None]:
# Train with the `best_quality` preset under a three-hour budget (time_limit is in seconds).
# Only the four listed model families are trained: the three boosting libraries share the
# same small budget (100 iterations/estimators, early stopping after 20 rounds, learning
# rate 0.1), while FASTAI is given an empty dict, i.e. no overrides.
predictor.fit(
    train_data=train_df,
    presets='best_quality',
    time_limit=3 * 3600,
    hyperparameters={
        'CAT': {'iterations': 100, 'early_stopping_rounds': 20, 'learning_rate': 0.1},
        'GBM': {'n_estimators': 100, 'early_stopping_rounds': 20, 'learning_rate': 0.1},
        'XGB': {'n_estimators': 100, 'early_stopping_rounds': 20, 'learning_rate': 0.1},
        'FASTAI': {},
    },
)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          12
Memory Avail:       61.51 GB / 83.48 GB (73.7%)
Disk Space Avail:   197.66 GB / 235.68 GB (83.9%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 2700s of 

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7836495d8f70>

In [None]:
# Score every trained model on the held-out validation split and report the extra weighted metrics
predictor.leaderboard(
    data=validation_df,
    extra_metrics=['f1_weighted', 'precision_weighted', 'recall_weighted'],
)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Unnamed: 0,model,score_test,f1_weighted,precision_weighted,recall_weighted,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBM_BAG_L2,0.909365,0.909365,0.909504,0.909272,0.893485,f1_weighted,247.544981,218.208316,5554.367122,23.776085,30.366595,515.35555,2,True,7
1,WeightedEnsemble_L3,0.908798,0.908798,0.908947,0.908699,0.89364,f1_weighted,249.895154,220.851622,5716.071968,0.095779,0.810403,69.062979,3,True,9
2,NeuralNetFastAI_BAG_L2,0.904928,0.904928,0.905125,0.904804,0.89136,f1_weighted,297.42819,239.140563,6210.697756,73.659294,51.298842,1171.686183,2,True,6
3,NeuralNetFastAI_BAG_L1,0.903243,0.903243,0.903351,0.903173,0.890431,f1_weighted,74.078047,49.505973,3265.74237,74.078047,49.505973,3265.74237,1,True,1
4,WeightedEnsemble_L2,0.902696,0.902696,0.902806,0.902627,0.890626,f1_weighted,221.776691,186.777494,4640.009723,0.059086,0.800165,41.088707,2,True,5
5,CatBoost_BAG_L2,0.900801,0.900801,0.901891,0.900548,0.886651,f1_weighted,226.023289,189.674624,5131.653439,2.254394,1.832903,92.641867,2,True,8
6,LightGBM_BAG_L1,0.761017,0.761017,0.761627,0.761493,0.759515,f1_weighted,83.643595,105.279311,648.088736,83.643595,105.279311,648.088736,1,True,2
7,CatBoost_BAG_L1,0.504602,0.504602,0.514647,0.502902,0.503686,f1_weighted,2.051291,1.864392,440.090556,2.051291,1.864392,440.090556,1,True,3
8,XGBoost_BAG_L1,0.354566,0.354566,0.360113,0.355598,0.347141,f1_weighted,63.995963,31.192045,685.089911,63.995963,31.192045,685.089911,1,True,4


In [None]:
# Permutation importance is computed on the held-out validation split rather than on the
# rows the models were fitted on: scores measured on training data are biased by overfitting.
predictor.feature_importance(validation_df)


These features in provided data are not utilized by the predictor and will be ignored: ['piezo_station_department_name', 'piezo_station_bss_id', 'piezo_bss_code', 'piezo_continuity_name', 'piezo_measure_nature_name', 'hydro_method_code', 'hydro_method_label']
Computing feature importance via permutation shuffling for 87 features using 5000 rows with 5 shuffle sets...
	2413.69s	= Expected runtime (482.74s per shuffle set)
	1060.84s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
piezo_station_bss_code,0.412016,0.010692,5.436743e-08,5,0.434030,0.390001
month,0.407270,0.008556,2.336284e-08,5,0.424887,0.389652
year,0.309584,0.007381,3.874980e-08,5,0.324782,0.294386
temp_avg_14d,0.070096,0.004046,1.326201e-06,5,0.078427,0.061766
rain_sum_14d,0.053010,0.002931,1.116272e-06,5,0.059045,0.046976
...,...,...,...,...,...,...
meteo_temperature_min,-0.000159,0.001248,6.049724e-01,5,0.002410,-0.002728
meteo_latitude,-0.000203,0.000768,7.069120e-01,5,0.001378,-0.001784
meteo_temperature_avg,-0.000240,0.000671,7.659617e-01,5,0.001141,-0.001622
insee_%_const,-0.000241,0.000171,9.829692e-01,5,0.000110,-0.000592


In [None]:
preds = predictor.predict(data=X_test)

# The frame is called `submission` rather than `id` so the built-in id() is not shadowed
# for the rest of the session.
submission = pd.read_csv('/content/y_test_submission_example_Hi5.csv')

# Assign by position and fail loudly on a row-count mismatch: an index-aligned assignment
# would silently leave NaN for any row label missing from the predictions.
if len(submission) != len(preds):
    raise ValueError(
        f"Submission template has {len(submission)} rows but {len(preds)} predictions were produced"
    )
submission['piezo_groundwater_level_category'] = preds.to_numpy()
submission.to_csv('all_data_best_q.csv', index=False)

# Saving model parameters

In [None]:
if os.path.exists('/content/AutogluonModels'):
    !tar -czvf autogluon_models.tar.gz /content/AutogluonModels
else:
    print("Path doesn't exist")

tar: Removing leading `/' from member names
/content/AutogluonModels/
/content/AutogluonModels/ag-20241201_095700/
/content/AutogluonModels/ag-20241201_095700/ds_sub_fit/
/content/AutogluonModels/ag-20241201_100925/
/content/AutogluonModels/ag-20241201_100925/version.txt
/content/AutogluonModels/ag-20241201_100925/learner.pkl
/content/AutogluonModels/ag-20241201_100925/predictor.pkl
/content/AutogluonModels/ag-20241201_100925/utils/
/content/AutogluonModels/ag-20241201_100925/utils/data/
/content/AutogluonModels/ag-20241201_100925/utils/data/y.pkl
/content/AutogluonModels/ag-20241201_100925/utils/data/X.pkl
/content/AutogluonModels/ag-20241201_100925/models/
/content/AutogluonModels/ag-20241201_100925/models/WeightedEnsemble_L3/
/content/AutogluonModels/ag-20241201_100925/models/WeightedEnsemble_L3/model.pkl
/content/AutogluonModels/ag-20241201_100925/models/WeightedEnsemble_L3/utils/
/content/AutogluonModels/ag-20241201_100925/models/WeightedEnsemble_L3/utils/oof.pkl
/content/Autogluo