In [33]:
import math

import numpy as np
import pandas as pd

from xgboost import XGBClassifier

# Data loading and parsing

In [34]:
# Columns pulled from the test CSV, grouped by the data source they describe.
key_columns = ['row_index']
piezometry_columns = [
    'piezo_measurement_date',
    'piezo_station_longitude',
    'piezo_station_latitude',
    'piezo_qualification',
]
weather_columns = [
    'meteo_temperature_avg',
    'meteo_humidity_avg',
    'meteo_evapotranspiration_grid',
    'meteo_rain_height',
    'meteo_frost_duration',
]
hydro_columns = ['hydro_observation_result_elab']
insee_columns = [
    'insee_pop_commune',
    'insee_%_agri',
    'insee_%_const',
    'insee_med_living_level',
    'insee_%_ind',
]
withdrawal_columns = ['prelev_volume_0']

# NOTE(review): 'piezo_station_department_code' is not among the loaded
# columns, so its dtype entry looks unused here — confirm before removing it.
column_dtypes = {
    'piezo_station_department_code': str,
    'piezo_qualification': 'category',
}

# Read only the selected columns of the test set.
df = pd.read_csv(
    'X_test_Hi5.csv',
    dtype=column_dtypes,
    usecols=(
        key_columns
        + piezometry_columns
        + weather_columns
        + hydro_columns
        + insee_columns
        + withdrawal_columns
    ),
)

In [35]:
# Set the submission key aside and remove it from the feature frame;
# it is reattached to the predictions when the submission is built.
row_index = df['row_index']
del df['row_index']

In [36]:
# Turn the qualification labels into integer codes.
# NOTE(review): pd.factorize numbers labels in order of first appearance in
# this frame (missing values get -1), so the codes depend on the row order of
# the test file — confirm the model was trained with a compatible encoding.
qualification_codes = pd.factorize(df['piezo_qualification'])[0]
df['piezo_qualification'] = qualification_codes

# Parse the measurement date so it can serve as a datetime index later on.
measurement_dates = pd.to_datetime(df['piezo_measurement_date'])
df['piezo_measurement_date'] = measurement_dates

# NaN replacement

In [37]:
# The INSEE columns can contain French "N/A" text markers instead of numbers.
# Turn those markers into NaN and make every INSEE column float.
#
# Why not `Series.replace(marker, None)`: its meaning depends on the pandas
# version. Older releases (documented as changed in pandas 1.4) ignored an
# explicit `None` value and forward-filled the matched cell with the previous
# row's value, silently imputing data. Masking the marker cells is
# unambiguous on every version.
insee_na_markers = [
    "N/A - division par 0",
    "N/A - résultat non disponible",
]
for c in [
    "insee_%_ind",
    "insee_%_const",
    "insee_%_agri",
    "insee_med_living_level",
    "insee_pop_commune"
]:
    raw_values = df[c]
    # mask() writes NaN wherever the condition holds; columns that are
    # already numeric contain no marker and pass through unchanged.
    df_insee = raw_values.mask(raw_values.isin(insee_na_markers)).astype(float)
    df[c] = df_insee

In [38]:
# Fill gaps in two passes:
#   1. time-weighted interpolation along the measurement date,
#   2. column medians for whatever is still missing.
# NOTE(review): the interpolation runs over the whole frame without grouping;
# if the file mixes several stations, values are interpolated across them —
# confirm this is intended.
df = df.set_index('piezo_measurement_date')
df = df.interpolate(method='time')
df = df.fillna(df.median())

# Bring the date back as a regular column for the feature engineering below.
df = df.reset_index()

# Date conversion

In [39]:
# Build a cosine "day" feature from the measurement date, then drop the raw
# date column. `math` is imported in the notebook's import cell.
#
# NOTE(review): pd.factorize numbers the 'dd-mm' strings by order of first
# appearance in this frame, not by calendar position, so the phase of the
# cosine depends on the row order and date range of the file. Confirm that the
# training notebook produced the same encoding before switching to something
# like `.dt.dayofyear`; the saved model depends on it.
day_month = df['piezo_measurement_date'].dt.strftime('%d-%m')
df['day'] = pd.factorize(day_month)[0]

# Min-max scale the codes to [0, 1]. If the frame holds a single distinct
# day-month, the denominator is 0 and the feature becomes NaN.
df['day'] = (df['day'] - df['day'].min()) / (df['day'].max() - df['day'].min())

# Map the scaled code onto one cosine period.
df['day'] = np.cos(2 * math.pi * df['day'])
df.drop(['piezo_measurement_date'], axis=1, inplace=True)

# Normalization

In [40]:
# Standardize every column to zero mean and unit (sample) standard deviation.
# NOTE(review): the statistics come from this test frame itself; confirm this
# matches how the training features were scaled. A constant column has a
# standard deviation of 0 and ends up as NaN.
column_means = df.mean()
column_stds = df.std()
df = df.sub(column_means, axis='columns').div(column_stds, axis='columns')

# Number of feature columns after preprocessing.
nb_features = df.shape[1]

# Submission

In [41]:
# Restore the trained classifier from its JSON dump.
model_path = "xgb_model.json"
loaded_model = XGBClassifier()
loaded_model.load_model(model_path)

In [42]:
# Keep only the features stored in the booster, in the booster's own order.
booster = loaded_model.get_booster()
cols_when_model_builds = booster.feature_names
df = df.loc[:, cols_when_model_builds]

In [43]:
# Run the loaded classifier on the prepared feature frame. The resulting
# values are mapped from integer codes 0-4 to category names when the
# submission frame is built.
y_pred = loaded_model.predict(df)

In [44]:
# Integer class code -> groundwater level category name.
# Codes missing from this mapping come out as NaN in the submission.
level_labels = {
    0: 'Very Low',
    1: 'Low',
    2: 'Average',
    3: 'High',
    4: 'Very High',
}
predicted_levels = pd.Series(y_pred).map(level_labels)

# Pair each prediction with its row key. The two Series are aligned on their
# index labels, so this relies on `row_index` still carrying the positional
# index it had when the CSV was loaded.
pred_df = pd.DataFrame(data={
    'row_index': row_index,
    'piezo_groundwater_level_category': predicted_levels,
})

In [45]:
# Show the submission frame as the cell output for a visual sanity check.
pred_df

Unnamed: 0,row_index,piezo_groundwater_level_category
0,2331795,Very Low
1,2331796,Very Low
2,2331797,Low
3,2331798,Average
4,2331799,Low
...,...,...
611203,3610818,Very Low
611204,3610819,Very Low
611205,3610820,Very Low
611206,3610821,Average


In [46]:
# Write the submission file; index=False leaves the DataFrame index out of the
# CSV, so only the two submission columns are written.
pred_df.to_csv('y_test_submission_2.csv', index=False)