## 1.Data Preparation

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw training and test tables. low_memory=False makes pandas read
# each file in one pass instead of chunk by chunk, which avoids mixed-type
# column inference.
x_train, x_test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('../data/X_train_Hi5.csv', '../data/X_test_Hi5.csv')
)

In [None]:
# Measurement dates of the training rows, parsed to datetimes.
x_train_time = pd.to_datetime(x_train['piezo_measurement_date'])

# Ordinal encoding of the target labels. Any value that is not one of these
# five labels (including a missing value) is encoded as 0.
target_cat = {
    'Very Low': 1,
    'Low': 2,
    'Average': 3,
    'High': 4,
    'Very High': 5,
}
target = x_train['piezo_groundwater_level_category'].map(
    lambda label: target_cat.get(label, 0)
)

In [None]:
# The label has been extracted into `target`; remove it from the feature table
# so that train and test share the same columns.
x_train = x_train.drop(columns=['piezo_groundwater_level_category'])

In [None]:
# Stack the training rows on top of the test rows so that both go through the
# same feature engineering. Row labels are not renumbered here, so labels from
# the two parts may repeat in x_all.
x_all = pd.concat((x_train, x_test))

In [None]:
# Department code: the codes '2A' and '2B' are both rewritten to '20' so the
# whole column can be stored as int16.
department = x_all['piezo_station_department_code'].replace(['2A', '2B'], '20')
x_all['piezo_station_department_code'] = department.astype('int16')

# Commune code: the letters 'A' and 'B' are replaced by '0', missing codes
# become '00000', and the result is stored as int32.
commune = x_all['piezo_station_commune_code_insee'].str.replace('A', '0').str.replace('B', '0')
x_all['piezo_station_commune_code_insee'] = commune.fillna('00000').astype('int32')

# INSEE indicators: each column carries a textual "not available" marker that
# is replaced by 0 before the column is cast to float32.
insee_placeholders = {
    'insee_%_agri': 'N/A - division par 0',
    'insee_%_ind': 'N/A - division par 0',
    'insee_%_const': 'N/A - division par 0',
    'insee_med_living_level': 'N/A - résultat non disponible',
}
for column, placeholder in insee_placeholders.items():
    x_all[column] = x_all[column].replace(placeholder, 0).astype('float32')

In [None]:
# Pick the text (object-dtype) columns that have at most 10 distinct non-null
# values and one-hot encode them.
object_columns = x_all.select_dtypes(include=['object']).columns
is_low_cardinality = x_all[object_columns].nunique() <= 10
categorical_columns = list(is_low_cardinality[is_low_cardinality].index)

encoded_data = pd.get_dummies(x_all[categorical_columns], columns=categorical_columns)

In [None]:
# Integer features derived from identifier-like text columns.
# NOTE(review): every feature is cast to int16; a column with more than 32767
# distinct values would not fit -- confirm the cardinalities.
added_data = pd.DataFrame()

bdlisa_codes = x_all['piezo_station_bdlisa_codes']

# Characters 2..4 of the raw string, read as an integer ('000' when missing).
added_data['piezo_station_bdlisa_codes_TOP3'] = (
    bdlisa_codes.fillna("['000']").map(lambda text: text[2:5]).astype('int16')
)

# Everything from character 6 onwards, integer-coded in order of first appearance.
bdlisa_tail = bdlisa_codes.fillna("['0']").map(lambda text: text[6:])
added_data['piezo_station_bdlisa_codes_LAST4'] = pd.factorize(bdlisa_tail)[0].astype('int16')

# Last '/'-separated segment of the BSS code, integer-coded.
bss_suffix = x_all['piezo_station_bss_code'].map(lambda code: code.split('/')[-1])
added_data['piezo_station_bss_code_LASTslash'] = pd.factorize(bss_suffix)[0].astype('int16')

# Columns that are integer-coded as a whole: new feature name -> source column.
factorized_sources = {
    'piezo_station_bss_id': 'piezo_station_bss_code',
    'hydro_station_code': 'hydro_station_code',
    'prelev_structure_code_0': 'prelev_structure_code_0',
    'prelev_structure_code_1': 'prelev_structure_code_1',
    'prelev_structure_code_2': 'prelev_structure_code_2',
}
for feature, source in factorized_sources.items():
    added_data[feature] = pd.factorize(x_all[source])[0].astype('int16')

In [None]:
# Work on a private copy of the numeric columns only.
numeric_columns = x_all.select_dtypes(include=['number']).columns
numeric_data = x_all[numeric_columns].copy()

# Store the float columns as float32 to reduce memory use.
float_columns = numeric_data.select_dtypes(include=['float']).columns
for name in float_columns:
    numeric_data[name] = numeric_data[name].astype('float32')

## 2.Data Preprocessing

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Stage 1: min-max scaling of every numeric column. The scaler is fitted on
# numeric_data, i.e. on the combined train + test rows.
scaler = MinMaxScaler()
scaler.fit(numeric_data)
normalized_data = pd.DataFrame(
    scaler.transform(numeric_data),
    columns=numeric_data.columns,
)

# Stage 2: standardisation of the min-max scaled values.
standard_scaler = StandardScaler()
standard_scaler.fit(normalized_data)
standardized_data = pd.DataFrame(
    standard_scaler.transform(normalized_data),
    columns=numeric_data.columns,
)

In [None]:
# Release the large intermediates that are no longer needed.
# encoded_data and added_data are deliberately NOT deleted here: they are
# still concatenated into final_numeric_data in a later cell, and deleting
# them at this point makes that cell fail with a NameError.
del x_train, x_test, numeric_data, normalized_data

In [None]:
# The scalers return float64; store the standardised features as float32
# to reduce memory use.
float_columns = standardized_data.select_dtypes(include=['float']).columns
for name in float_columns:
    standardized_data[name] = standardized_data[name].astype('float32')

In [None]:
# Join the three feature groups side by side. Each part is renumbered 0..n-1
# first so the rows are matched by position rather than by their old labels.
final_numeric_data = pd.concat(
    [
        frame.reset_index(drop=True)
        for frame in (standardized_data, encoded_data, added_data)
    ],
    axis=1,
)

In [None]:
# del standardized_data, encoded_data

In [None]:
# Calendar features for every row (training rows followed by test rows).
# The dates are taken from x_all rather than from x_train_time: x_train_time
# only covers the training rows, so assigning it would leave year/month/day
# as NaN for all test rows (and they would later be median-filled).
# The index is renumbered 0..n-1 so the values line up with final_numeric_data.
all_time = pd.to_datetime(x_all['piezo_measurement_date']).reset_index(drop=True)
final_numeric_data['year'] = all_time.dt.year
final_numeric_data['month'] = all_time.dt.month
final_numeric_data['day'] = all_time.dt.day

In [None]:
# Replace every missing value with the median of its column.
column_medians = final_numeric_data.median()
filled_numeric_data = final_numeric_data.fillna(column_medians)

## 3.Data Segmentation

### Validation

In [None]:
# tag = (final_numeric_data['month'].between(6, 9)) & (final_numeric_data['year'] == 2021)

# X_train = final_numeric_data[~tag]
# X_test = final_numeric_data[tag]

# y_train = target[~tag]
# y_test = target[tag]

### Train Test Split

In [None]:
# Split the combined feature table back into its training and test parts.
# The training rows come first and there is exactly one target value per
# training row, so the boundary is derived from len(target) instead of
# hard-coding the row counts of one particular version of the CSV files.
n_train = len(target)
X_train = filled_numeric_data.iloc[:n_train]
X_test = filled_numeric_data.iloc[n_train:]
y_train = target

## 4.Baseline Models

In [None]:
from sklearn.naive_bayes import GaussianNB
# Imported here because this cell runs before the later cell that imports them.
from sklearn.metrics import confusion_matrix, classification_report
import time

# Baseline 1: Gaussian Naive Bayes with default settings.
nb_model = GaussianNB()

# Time both fitting and prediction, as the message printed below states.
start_time = time.time()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
end_time = time.time()

# y_test only exists when the hold-out validation split is enabled; with the
# plain train/test split there are no labels for X_test to evaluate against.
if 'y_test' in globals():
    cm_nb = confusion_matrix(y_test, y_pred_nb)
    print("Confusion Matrix (Naive Bayes):")
    print(cm_nb)

    print("\nClassification Report (Naive Bayes):")
    print(classification_report(y_test, y_pred_nb))
else:
    print("y_test is not defined; skipping evaluation (Naive Bayes).")

print(f"Training and prediction time (Naive Bayes): {end_time - start_time:.2f} seconds")

In [None]:
from sklearn.neural_network import MLPClassifier
# Imported here because this cell runs before the later cell that imports them.
from sklearn.metrics import confusion_matrix, classification_report
import time

# Baseline 2: multi-layer perceptron with L2 penalty alpha=1 and a fixed seed.
mlp_model = MLPClassifier(alpha=1, max_iter=1000, random_state=42)

# Time both fitting and prediction, as the message printed below states.
start_time = time.time()
mlp_model.fit(X_train, y_train)
y_pred_mlp = mlp_model.predict(X_test)
end_time = time.time()

# y_test only exists when the hold-out validation split is enabled; with the
# plain train/test split there are no labels for X_test to evaluate against.
if 'y_test' in globals():
    cm_mlp = confusion_matrix(y_test, y_pred_mlp)
    print("Confusion Matrix (MLP Classifier):")
    print(cm_mlp)

    print("\nClassification Report (MLP Classifier):")
    print(classification_report(y_test, y_pred_mlp))
else:
    print("y_test is not defined; skipping evaluation (MLP Classifier).")

print(f"Training and prediction time (MLP Classifier): {end_time - start_time:.2f} seconds")

In [None]:
from sklearn.ensemble import RandomForestClassifier
import time

# Baseline 3: Random Forest with default hyper-parameters and a fixed seed.
rf_model = RandomForestClassifier(random_state=9522)

# Time both fitting and prediction, as the message printed below states.
start_time = time.time()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
end_time = time.time()

print(f"Training and prediction time (Random Forest): {end_time - start_time:.2f} seconds")

In [None]:
# Invert the label encoding: integer code -> category name.
reverse_target_cat = dict(zip(target_cat.values(), target_cat.keys()))

# Translate the Random Forest predictions back to category names.
decode_label = np.vectorize(reverse_target_cat.get)
y_pred_rf_mapped = decode_label(y_pred_rf)

# Only the predicted category is written; the 'row_index' column is disabled.
# 'row_index': x_test['row_index'],
result = pd.DataFrame({'piezo_groundwater_level_category': y_pred_rf_mapped})

result.to_csv('predictions_rfN1.csv', index=False)

## 5.Performance Attempts

In [None]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import time
from sklearn.impute import SimpleImputer

# Feature-reduction experiment: drop mostly-missing columns, drop
# near-constant columns, mean-impute what is left, then train a Random Forest.

print("Step 1: Removing high-missing-rate features...")

# Keep the columns whose share of missing values in X_train is below 80 %.
# NOTE(review): if X_train was taken from the median-filled table, only
# columns that are entirely missing can still exceed the threshold.
missing_threshold = 0.8
missing_ratios = X_train.isnull().mean()
low_missing_features = missing_ratios[missing_ratios < missing_threshold].index
X_train_reduced = X_train[low_missing_features]
X_test_reduced = X_test[low_missing_features]

print(f"Reduced features from {X_train.shape[1]} to {X_train_reduced.shape[1]} after high-missing-rate filtering.")

print("Step 2: Removing low-variance features...")

# The selector is fitted on the training rows only and then applied to both sets.
variance_threshold = 0.01
selector = VarianceThreshold(threshold=variance_threshold)
X_train_reduced = selector.fit_transform(X_train_reduced)
X_test_reduced = selector.transform(X_test_reduced)

print(f"Reduced features from {len(low_missing_features)} to {X_train_reduced.shape[1]} after low-variance filtering.")

# Fill any remaining gaps with the per-column mean learned on the training rows.
imputer = SimpleImputer(strategy='mean')
X_train_reduced = imputer.fit_transform(X_train_reduced)
X_test_reduced = imputer.transform(X_test_reduced)

print("Step 3: Training the Random Forest model...")

rf_model = RandomForestClassifier(n_jobs=-1, random_state=42)

start_time = time.time()
rf_model.fit(X_train_reduced, y_train)

print("Step 4: Predicting and evaluating the model...")

y_pred_rf = rf_model.predict(X_test_reduced)
# Stop the timer after prediction so the reported duration matches the
# "Training and prediction" message printed at the end.
end_time = time.time()

# y_test only exists when the hold-out validation split is enabled; with the
# plain train/test split there are no labels for X_test to evaluate against.
if 'y_test' in globals():
    cm_rf = confusion_matrix(y_test, y_pred_rf)
    print("Confusion Matrix (Random Forest):")
    print(cm_rf)

    print("\nClassification Report (Random Forest):")
    print(classification_report(y_test, y_pred_rf))
else:
    print("y_test is not defined; skipping evaluation (Random Forest).")

print(f"Training and prediction completed in {end_time - start_time:.2f} seconds.")