<a href="https://colab.research.google.com/github/waddason/Hickathon5/blob/main/Dimitri.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from os.path import exists
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
import joblib  # or import pickle
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [5]:
def data_without_nan(df):
    # Identifier les variables quantitatives et qualitatives
    quantitative_vars = df.select_dtypes(include=['float64', 'int64']).columns
    qualitative_vars = df.select_dtypes(include=['object', 'category']).columns

    # Remplacer les NaN par la moyenne pour les variables quantitatives
    df[quantitative_vars] = df[quantitative_vars].apply(lambda col: col.fillna(col.mean()), axis=0)

    # Remplacer les NaN par la valeur la plus fréquente (mode) pour les variables qualitatives
    df[qualitative_vars] = df[qualitative_vars].apply(lambda col: col.fillna(col.mode()[0] if not col.mode().empty else 'Unknown'), axis=0)

    # Vérifier si des NaN restent dans le jeu de données
    remaining_nans = df.isna().sum().sum()
    print(f"Nombre total de NaN restants dans df : {remaining_nans}")

    return df

In [6]:
def prep_data(df):
    def fix_dates(df):
        df['piezo_measurement_date'] = df['piezo_measurement_date'].str.replace(r'CEST', '', regex=True).str.strip()
        df['piezo_measurement_date'] = pd.to_datetime(df['piezo_measurement_date'], errors='coerce')
        return df
    def fix_prelev(df):
        prelev_usage_labels = [
                    'EAU POTABLE',
                    'EAU TURBINEE (barrage)',
                    'CANAUX',
                    'INDUSTRIE et ACTIVITES ECONOMIQUES (hors irrigation, hors énergie)',
                    'IRRIGATION',
                    'ENERGIE',
                       ]
        df['prelev_TOTAL'] = df['prelev_volume_0'] + df['prelev_volume_1'] + df['prelev_volume_2'] + df['prelev_other_volume_sum']
        for usage in prelev_usage_labels:
            df[f'prelev_{usage}'] = (
                (df['prelev_volume_0'] * (df['prelev_usage_label_0'] == usage).astype(int)) +
                (df['prelev_volume_1'] * (df['prelev_usage_label_1'] == usage).astype(int)) +
                (df['prelev_volume_2'] * (df['prelev_usage_label_2'] == usage).astype(int))
            )
        df = df.drop(columns=['prelev_usage_label_0', 'prelev_usage_label_1', 'prelev_usage_label_2',
                                            'prelev_volume_0', 'prelev_volume_1', 'prelev_volume_2'])
        return df
    df = df.set_index('row_index')
    df = fix_dates(df)
    df = fix_prelev(df)
    df = df.loc[df['piezo_qualification'] == 'Correcte']
    df = df.drop(columns=['piezo_qualification'])
    df = df.loc[df['piezo_status'] == 'Donnée contrôlée niveau 2']
    df = df.drop(columns=['piezo_status'])
    df = df.loc[df['hydro_qualification_code'] == 20]
    df = df.drop(columns=['hydro_qualification_code'])
    df = df.reset_index(drop=True)
    y = df['piezo_groundwater_level_category']
    y = y.map({'Very Low' :-2, 'Low':-1, 'Average':0, 'High':1, 'Very High':2})
    x = df.drop(columns=['piezo_groundwater_level_category'])
    x.rename(columns={'piezo_measurement_date': 'date', 'piezo_station_bss_id': 'id'}, inplace=True)
    return x, y


In [7]:
file_names = ['X_train_Hi5.csv', 'X_test_Hi5.csv']
for file_name in file_names:
  if not exists(file_name):
    path_r = 'drive/MyDrive/Copy\ of\ ' + file_name
    path_w = './' + file_name
    !cp $path_r $path_w

In [8]:
columns_to_keep = ['row_index',
 'piezo_station_bss_id',
 'piezo_station_altitude',
 'piezo_station_longitude',
 'piezo_station_latitude',
 'piezo_measurement_date',
 'piezo_status',
 'piezo_qualification',
 'meteo_rain_height',
 'meteo_temperature_min',
 'meteo_temperature_max',
 'meteo_temperature_avg',
 'meteo_temperature_avg_threshold',
 'meteo_frost_duration',
 'meteo_amplitude_tn_tx',
 'meteo_temperature_avg_tntm',
 'meteo_evapotranspiration_grid',
 'hydro_observation_result_elab',
 'hydro_qualification_code',
 'distance_piezo_hydro',
 'distance_piezo_meteo',
 'prelev_volume_0',
 'prelev_usage_label_0',
 'prelev_volume_1',
 'prelev_usage_label_1',
 'prelev_volume_2',
 'prelev_usage_label_2',
 'prelev_other_volume_sum',
 'insee_pop_commune',
 'piezo_groundwater_level_category']




In [9]:
df = pd.read_csv('X_train_Hi5.csv', nrows=150000, usecols=columns_to_keep)
x, y = prep_data(df)


In [10]:
past = [10, 30, 90,]

In [11]:
pd.set_option('future.no_silent_downcasting', True)
id_list = x['id'].value_counts().index.tolist()
x_per_id = {}
i = 0
for id in id_list:
    i += 1
    if i % 250 == 1 or i == len(id_list):
        print(f'{i}/{len(id_list)}')
    df_idx = x.loc[x['id'] == id].copy()
    df_idx.reset_index(drop=False, inplace=True)
    df_idx.set_index('date', inplace=True)
    df_idx = df_idx[~df_idx.index.duplicated()]
    df_idx.sort_index(inplace=True)
    df_idx = pd.concat([df_idx, df_idx.shift(periods=past, freq='D')], axis=1)
    df_index = df_idx['index'].copy()
    for period in past:
        df_idx.drop(columns=[f'index_{period}'], inplace=True)
        df_idx.drop(columns=[f'id_{period}'], inplace=True)
    df_idx = df_idx.bfill()
    df_idx = df_idx.drop(columns=['index'])
    df_idx['index'] = df_index
    df_idx = df_idx.loc[df_idx['index'].notna()]
    x_per_id[id] = df_idx.copy()

1/1877
251/1877
501/1877
751/1877
1001/1877
1251/1877
1501/1877
1751/1877
1877/1877


In [12]:
new_x = pd.concat(x_per_id.values(), axis=0, ignore_index=True)
new_x['index'] = new_x['index'].astype(int)
new_x.set_index('index', inplace=True)
new_x = new_x.sort_index()
new_x = new_x.drop(columns=['id'])
new_y = y.loc[new_x.index]

In [13]:
x_train, x_valid, y_train, y_valid = train_test_split(new_x, new_y, test_size=0.2, random_state=42)

In [14]:
x_train = data_without_nan(x_train)
x_valid = data_without_nan(x_valid)

Nombre total de NaN restants dans df : 0
Nombre total de NaN restants dans df : 0


In [15]:
scaler = RobustScaler()
x_train_scaled = scaler.fit_transform(x_train)
joblib.dump(scaler, 'robust_scaler.pkl')
x_train = pd.DataFrame(x_train_scaled, columns=x_train.columns, index=x_train.index)
x_valid_scaled = scaler.transform(x_valid)
x_valid = pd.DataFrame(x_valid_scaled, columns=x_valid.columns, index=x_valid.index)

In [14]:
# Sanitize column names
x_train.columns = x_train.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
x_valid.columns = x_valid.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)


In [19]:
import cudf  # GPU-accelerated DataFrame library
import xgboost as xgb

# Convert pandas DataFrame to cuDF DataFrame
gpu_x_train = cudf.from_pandas(x_train)
gpu_x_valid = cudf.from_pandas(x_valid)

In [22]:
xgb = XGBRegressor(tree_method='hist', device='cuda', objective='reg:squarederror')
param_grid = {
    'n_estimators': [400, 500, 1000],
    'learning_rate': [0.3, 0.4, 0.9],
    'max_depth': [6, 7, 12],
}
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring='neg_mean_absolute_error', cv=5)
grid_search.fit(gpu_x_train, y_train)
train_score = grid_search.score(gpu_x_train, y_train)
valid_score = grid_search.score(gpu_x_valid, y_valid)

print("Best Parameters:", grid_search.best_params_)
print("Training Score:", train_score)
print("Validation Score:", valid_score)

Best Parameters: {'learning_rate': 0.3, 'max_depth': 12, 'n_estimators': 1000}
Training Score: -0.009126919314402737
Validation Score: -0.22804921708231726


In [32]:
pred_valid = grid_search.predict(gpu_x_valid)
pred_valid = pd.Series(pred_valid, index=y_valid.index)
pred_valid = pred_valid.apply(lambda x: round(x))
accuracy = accuracy_score(y_valid, pred_valid)
print("Accuracy:", accuracy)

Accuracy: 0.8659206910090302
