# Tristan notebook

In [None]:
import pandas as pd

In [None]:
# Location of the training set, relative to the notebook's working directory
X_train_path = "data/X_train_Hi5.csv"
# Load the whole training set into memory
X_train = pd.read_csv(filepath_or_buffer=X_train_path)

In [None]:
# Preview the loaded training data
X_train

In [None]:
# Number of distinct station ids in the training set
n_stations = X_train["piezo_station_bss_id"].nunique()
n_stations

In [None]:
# Distinct values taken by the target column
level_categories = X_train["piezo_groundwater_level_category"].unique()
level_categories

In [None]:
# Ordinal encoding of the target: category label -> integer level
target_cat = dict(
    [
        ('High', 4),
        ('Very High', 5),
        ('Very Low', 1),
        ('Low', 2),
        ('Average', 3),
    ]
)
# Reverse lookup: integer level -> category label
target_level = dict(zip(target_cat.values(), target_cat.keys()))
# Level 0 is the placeholder used for rows whose category is not in target_cat
target_level.update({0: 'NaN'})

## Station locations

In [None]:
# Columns that identify a station and locate it on the map
station_columns = ["piezo_station_bss_id", "piezo_station_latitude", "piezo_station_longitude"]

# Distinct (station, coordinates, level category) combinations
stations_ids = X_train[station_columns + ["piezo_groundwater_level_category"]].drop_duplicates()

# Distinct (station, coordinates) combinations, indexed by station id
stations_coords = (
    X_train[station_columns]
    .drop_duplicates()
    .set_index("piezo_station_bss_id")
)
stations_coords.head()

In [None]:
# Encode the category as its integer level; anything absent from target_cat
# (including missing values) becomes 0
stations_ids["level"] = (
    stations_ids["piezo_groundwater_level_category"]
    .map(target_cat)
    .fillna(0)
    .astype("int64")
)

In [None]:
%pip install cartopy


In [None]:
# cartopy to display maps
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt

In [None]:
def display_piezzo_stations(color_column=None):
    """Plot the piezometric stations on a map.

    The station positions are read from the notebook-level ``stations_coords``
    frame (columns ``piezo_station_longitude`` / ``piezo_station_latitude``)
    and drawn over a cartopy background restricted to longitudes [-5, 10] and
    latitudes [42, 52].

    Parameters
    ----------
    color_column : optional
        When given, it is forwarded as ``c`` to ``DataFrame.plot.scatter``
        together with the ``tab20`` colormap. Any values passed here are
        matched to the stations by position, so they must follow the row
        order of ``stations_coords``. When ``None`` (default) every station is
        drawn with the default color.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())
    ax.set_extent([-5, 10, 42, 52], crs=ccrs.PlateCarree())

    # Draw the background
    ax.add_feature(cfeature.LAND)
    ax.add_feature(cfeature.OCEAN)
    ax.add_feature(cfeature.COASTLINE)
    ax.add_feature(cfeature.BORDERS, linestyle=":")
    # ax.add_feature(cfeature.LAKES, alpha=0.5)
    ax.add_feature(cfeature.RIVERS)
    ax.gridlines(draw_labels=True)

    # Add the stations' positions; the coloring arguments are only passed
    # when a color specification was provided
    scatter_kwargs = dict(
        x="piezo_station_longitude",
        y="piezo_station_latitude",
        s=4,
        ax=ax,
        transform=ccrs.PlateCarree(),
    )
    if color_column is not None:
        scatter_kwargs.update(c=color_column, cmap='tab20')
    stations_coords.plot.scatter(**scatter_kwargs)

    ax.set_title("Piezzo stations locations")

    plt.show()

In [None]:
display_piezzo_stations()

## Covariance of stations

In [None]:
# Keep only the columns needed to build one time series per station.
# .copy() makes this an independent frame, so adding columns to it afterwards
# does not write into a slice of X_train (SettingWithCopyWarning).
df_by_station = X_train[
    ["piezo_station_bss_id", "piezo_measurement_date", "piezo_groundwater_level_category"]
].copy()
print(df_by_station.shape)
print(df_by_station.index)


In [None]:
# Create the level column: integer encoding of the category, 0 when the
# category is absent from target_cat (including missing values).
# .assign builds a new frame instead of writing a column into what may be a
# slice of X_train, which avoids SettingWithCopyWarning.
df_by_station = df_by_station.assign(
    level=(
        df_by_station["piezo_groundwater_level_category"]
        .map(target_cat)
        .fillna(0)
        .astype("int64")
    )
)
df_by_station.head()


In [None]:
# Pivot to one row per measurement date and one column per station.
# pivot_table aggregates repeated (date, station) pairs with its default
# aggregation; combinations with no measurement are filled with 0.
df_by_station = (
    df_by_station
    .pivot_table(
        values="level",
        index="piezo_measurement_date",
        columns="piezo_station_bss_id",
    )
    .fillna(0)
)
print(df_by_station.shape)
df_by_station.head()

In [None]:
# Plot the level over time for the first 10 station columns
fig, ax = plt.subplots(figsize=(20, 6))
first_stations = df_by_station.iloc[:, :10]
first_stations.plot(ax=ax)
ax.legend(title="station", fontsize="small")
ax.set_title("Ground Water level evolution over time in the train dataset (10 stations)")
ax.set_ylabel("level")
# Label the y axis with the category names instead of the integer codes 0..5
level_codes = list(range(6))
ax.set_yticks(level_codes, labels=[target_level[code] for code in level_codes])
plt.show()

In [None]:
# NumPy view of the pivoted table (rows: measurement dates, columns: stations)
df_by_station.to_numpy()

In [None]:
# Find clusters with DBSCAN
# NOTE(review): the rows of df_by_station are measurement dates, so this groups
# dates rather than stations — confirm that this is the intent.
from sklearn.cluster import DBSCAN
clustering = DBSCAN(eps=1e-11, min_samples=100).fit(df_by_station.to_numpy())
print(clustering.labels_)
# Highest cluster label found (-1 means every sample was labelled as noise)
clustering.labels_.max()




In [None]:
from sklearn.cluster import k_means
# Transpose so that each station is one sample described by its level series
stations_by_date = df_by_station.T
centroid, label, intertia = k_means(stations_by_date, 20, n_init=10, random_state=1)
# Highest cluster label assigned
label.max()



In [None]:
# The k-means labels follow the column order of df_by_station (station ids),
# whereas the map draws stations_coords row by row. Align the labels on the
# station id first so that each point gets the color of its own cluster.
station_cluster = pd.Series(label, index=df_by_station.columns).reindex(stations_coords.index)
display_piezzo_stations(station_cluster.to_numpy())

In [None]:
#from sklearn.model_selection import TimeSeriesSplit
#tss = TimeSeriesSplit(n_splits=5)  # Deflaut values are ok
for i, (train_index, test_index) in enumerate(tss.split(df_by_station)):