In [None]:
import numpy as np
import pylab as pl
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd

# Read the prepared feature tables and the processed training labels
# (relative paths, in the order: train features, test features, train labels).
csv_paths = (
    'data/X_train_ready.csv',
    'data/X_test_ready.csv',
    'data/y_train_processed.csv',
)
X_train, X_test, y_train = map(pd.read_csv, csv_paths)

In [None]:
# Report the dimensions of each loaded table.
for label, frame in (("X_train", X_train), ("X_test", X_test), ("y_train", y_train)):
    print(f"{label} shape: ", frame.shape)

In [None]:
# Keep only the labels whose row_index is present in X_train, and return them
# in X_train's row order so features and labels stay aligned row-for-row.
# A plain `isin` filter keeps y_train's own ordering, which is only correct when
# both files happen to be sorted identically; the later train_test_split pairs
# features and labels purely by position.
# An inner merge preserves the order of the left (X_train) keys.
y_columns = list(y_train.columns)
y_train = X_train[["row_index"]].merge(y_train, on="row_index", how="inner")[y_columns]


In [None]:
# Re-check the table dimensions after the labels were filtered.
shapes = {"X_train": X_train.shape, "X_test": X_test.shape, "y_train": y_train.shape}
for label, shape in shapes.items():
    print(f"{label} shape: ", shape)

In [None]:
# Which columns are present in one feature table but missing from the other?
train_columns = set(X_train.columns)
test_columns = set(X_test.columns)
print("Columns missing in X_train: ", test_columns.difference(train_columns))
print("Columns missing in X_test: ", train_columns.difference(test_columns))

In [None]:
# Remove the extra 'piezo_measurement_date.1' column from X_test.
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
X_test = X_test.drop(columns=['piezo_measurement_date.1'], errors='ignore')

In [None]:
feature_tables = (("X_train", X_train), ("X_test", X_test))

# Dimensions of both feature tables.
for label, frame in feature_tables:
    print(f"{label} shape: ", frame.shape)

# NaN check: total number of missing cells per table.
for label, frame in feature_tables:
    print(f"{label} NaN: ", frame.isnull().sum().sum())

In [None]:
# Reduced set of columns to keep in both feature tables.
columns_to_keep = [
    'piezo_station_bss_id',
    'piezo_station_altitude',
    'piezo_station_longitude',
    'piezo_station_latitude',
    'piezo_measurement_date',
    'hydro_delta_7d',
    'hydro_delta_30d',
    'hydro_delta_90d',
    'meteo_rain_height',
    'meteo_snow_height',
    'row_index',
]

# Apply the same column selection (and column order) to train and test.
X_train, X_test = (frame.loc[:, columns_to_keep] for frame in (X_train, X_test))

In [None]:
# Inspect the dtype of every retained training column.
column_types = X_train.dtypes
print(column_types)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Identifier / date columns that are not continuous features and must not be scaled.
non_feature_columns = ['piezo_measurement_date', 'row_index', 'piezo_station_bss_id']

# Prepare the classification target as integers.
y_labels = y_train['piezo_groundwater_level_category'].astype(int)

# Split into training and validation sets BEFORE fitting the scaler, so the
# validation rows do not leak into the scaling statistics. The same test_size
# and random_state are used, so the row assignment is unchanged.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X_train.drop(columns=non_feature_columns),
    y_labels,
    test_size=0.2,
    random_state=42,
)

# Standardise the continuous columns: fit on the training split only, then
# apply the fitted transformation to the validation and test features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test.drop(columns=non_feature_columns))


In [None]:
# Shapes of the feature matrices and label vectors after the split,
# in the order: X_train, X_val, X_test, y_train, y_val.
for split in (X_train, X_val, X_test, y_train, y_val):
    print(split.shape)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# The features were already standardised and split into training/validation
# sets by the previous preprocessing cell. Fitting a second StandardScaler on
# the already-scaled training split and calling train_test_split again would
# overwrite X_val/y_val (silently discarding the first validation set) and
# shrink the training set by another 20 % on every execution.
# This cell therefore only re-asserts the invariants the following cells rely
# on, which makes it safe to run any number of times.

# Make sure the training target is in integer format.
y_train = y_train.astype(int)

assert len(X_train) == len(y_train), "X_train and y_train must have the same number of rows"
assert len(X_val) == len(y_val), "X_val and y_val must have the same number of rows"
assert X_train.shape[1] == X_val.shape[1] == X_test.shape[1], "feature count differs between splits"

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Window length used when building sequences (for example, 30 days).
sequence_length = 30

# Min-max normalisation: fit on the training split, then reuse the fitted
# scaler for both the training and the validation features.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [None]:
# Convert both label containers to flat 1-D NumPy arrays.
y_train, y_val = (np.ravel(np.array(labels)) for labels in (y_train, y_val))

for label, labels in (("y_train", y_train), ("y_val", y_val)):
    print(f"{label} (après conversion): {labels.shape}")


In [None]:
def create_sequences(data, target, sequence_length):
    """Build fixed-length sliding windows and the label that follows each window.

    Window ``i`` is ``data[i : i + sequence_length]`` and its label is the
    element of ``target`` at *position* ``i + sequence_length``.

    Parameters
    ----------
    data : array-like, first axis indexes the rows
        Feature rows in the order in which windows should be built.
    target : array-like with the same number of rows as ``data``
        Labels aligned positionally with ``data``. pandas objects are accepted;
        they are converted to arrays so that lookup is always positional and
        never depends on the index labels.
    sequence_length : int
        Number of consecutive rows per window (must be >= 1).

    Returns
    -------
    (sequences, targets) : tuple of numpy.ndarray
        ``sequences`` has shape ``(len(data) - sequence_length, sequence_length, ...)``
        and ``targets`` has ``len(data) - sequence_length`` entries.

    Raises
    ------
    ValueError
        If ``sequence_length`` is smaller than 1, if ``data`` and ``target``
        differ in length, or if ``data`` has no more than ``sequence_length`` rows.
    """
    # Coerce to arrays: indexing a pandas Series with an integer is label-based,
    # which would return the wrong label (or raise) after a shuffle.
    data = np.asarray(data)
    target = np.asarray(target)

    if sequence_length < 1:
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}.")
    if len(data) != len(target):
        raise ValueError(
            f"data and target must have the same length, got {len(data)} and {len(target)}."
        )
    if len(data) <= sequence_length:
        raise ValueError(f"La longueur des données ({len(data)}) est insuffisante pour créer des séquences de longueur {sequence_length}.")

    window_count = len(data) - sequence_length
    sequences = np.stack(
        [data[start:start + sequence_length] for start in range(window_count)]
    )
    # The label of window i sits right after the window, i.e. at i + sequence_length.
    targets = target[sequence_length:].copy()
    return sequences, targets


In [None]:
# Sanity check: print the shapes of the scaled features and flattened labels.
inputs_to_check = (
    ("X_train_scaled", X_train_scaled),
    ("y_train", y_train),
    ("X_val_scaled", X_val_scaled),
    ("y_val", y_val),
)
for label, array in inputs_to_check:
    print(f"{label} shape: {array.shape}")

# Window length for the sequences.
sequence_length = 30

# Build the sliding-window sequences for the training and validation splits.
# NOTE(review): these rows come out of a train_test_split call that uses the
# default shuffling, so consecutive rows are presumably not consecutive days of
# one station — confirm that the windows are meaningful as time series.
X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train, sequence_length)
X_val_seq, y_val_seq = create_sequences(X_val_scaled, y_val, sequence_length)

# Check the dimensions of the generated sequences.
generated = (
    ("X_train_seq", X_train_seq),
    ("y_train_seq", y_train_seq),
    ("X_val_seq", X_val_seq),
    ("y_val_seq", y_val_seq),
)
for label, array in generated:
    print(f"{label} shape: {array.shape}")


In [None]:
# Local import: `mean_squared_error` is used at the end of this cell but is not
# imported in any of the visible cells, which would end the cell in a NameError.
from sklearn.metrics import mean_squared_error

# Define the LSTM model: two stacked LSTM layers, each followed by dropout,
# and a single-unit dense output.
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X_train_seq.shape[1], X_train_seq.shape[2]), return_sequences=True))
model.add(Dropout(0.2))  # Regularisation
model.add(LSTM(30, activation='relu', return_sequences=False))
model.add(Dropout(0.2))
# Single output unit trained as a regression.
# NOTE(review): the target is an integer category column; confirm that
# regressing on the category code with MSE is the intended formulation.
model.add(Dense(1))

# Compile the model.
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Train the model.
history = model.fit(
    X_train_seq, y_train_seq,
    validation_data=(X_val_seq, y_val_seq),
    epochs=20,  # Number of epochs (tune according to performance)
    batch_size=32,  # Batch size
    verbose=1
)

# Evaluate on the validation set.
val_predictions = model.predict(X_val_seq)
mse = mean_squared_error(y_val_seq, val_predictions)
print(f"Mean Squared Error on Validation Set: {mse:.4f}")


In [None]:
# Local imports: the metric helpers used below are not imported in any of the
# visible cells, which would make this cell fail with a NameError.
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Train the model.
# NOTE: `model` is the instance already fitted in the previous cell, so these
# epochs continue from its current weights, and `history` below only covers
# this second run.
history = model.fit(
    X_train_seq, y_train_seq,
    epochs=50,  # Number of epochs
    batch_size=32,  # Batch size
    validation_data=(X_val_seq, y_val_seq),  # Validation set
    verbose=1
)

# Evaluate the model on the validation sequences.
y_val_pred = model.predict(X_val_seq)
mse = mean_squared_error(y_val_seq, y_val_pred)
mae = mean_absolute_error(y_val_seq, y_val_pred)
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")

# Plot the training and validation loss per epoch.
plt.figure(figsize=(10, 6))
plt.plot(history.history['loss'], label='Loss (Entraînement)')
plt.plot(history.history['val_loss'], label='Loss (Validation)')
plt.title('Évolution de la perte au fil des époques')
plt.xlabel('Époques')
plt.ylabel('Loss')
plt.legend()
plt.grid()
plt.show()

# Plot the predictions against the true values for the first 200 validation samples.
plt.figure(figsize=(12, 6))
plt.plot(y_val_seq[:200], label='Vraies valeurs', linestyle='--', marker='o', alpha=0.7)
plt.plot(y_val_pred[:200], label='Prédictions', linewidth=2, alpha=0.7)
plt.title('Comparaison entre les vraies valeurs et les prédictions')
plt.xlabel('Échantillons')
plt.ylabel('Niveau de la nappe')
plt.legend()
plt.grid()
plt.show()


In [None]:
# NOTE(review): `model` was already built (ending in a Dense(1) layer), compiled
# and trained in the cells above. Appending another LSTM after that Dense layer
# looks like a leftover scratch line: an LSTM expects 3-D
# (batch, timesteps, features) input, so this call presumably fails on
# Run All — confirm and remove.
model.add(LSTM(50, activation='relu', input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])))


# Submission

In [None]:
# NOTE(review): `xgb_model` and `dtest` are not defined in the visible cells of
# this notebook — presumably they come from another notebook or session;
# confirm before running on a fresh kernel.
# Predict on the test matrix and shift the predicted classes so they start from 1.
y_submission = xgb_model.predict(dtest) + 1
print(y_submission[:20])


In [None]:
# Translate the numeric predictions through the `target_level` mapping.
# NOTE(review): `target_level` is not defined in the visible cells — confirm
# where it comes from.
predicted_codes = pd.Series(y_submission)
y_target = predicted_codes.map(target_level)

print(y_target.head(10))

In [None]:
# List the columns of the test table used for the submission.
# NOTE(review): `X_test_filtered` is not defined in the visible cells — confirm its origin.
available_columns = X_test_filtered.columns
print(available_columns)


In [None]:
# Assemble the submission table positionally from plain arrays.
# Building it as `pd.DataFrame([series_a, series_b]).T` aligns the two Series on
# their index labels (producing NaNs when the indexes differ) and the transpose
# degrades both columns to object dtype. Using arrays pairs row i of the test
# table with prediction i, keeps the dtypes, and raises if the lengths differ.
submission = pd.DataFrame(
    {
        "row_index": np.asarray(X_test_filtered["row_index"]),
        "piezo_groundwater_level_category": np.asarray(y_target),
    }
)
print(submission.head())

In [None]:
# Write the submission table to disk without the DataFrame index column.
output_path = "submission_g06_02_xgb.csv"
submission.to_csv(output_path, index=False)