## 1. Data Preparation

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the training and test feature tables. low_memory=False asks pandas to
# infer each column's dtype from the whole file rather than chunk by chunk.
x_train = pd.read_csv('../data/X_train_Hi5.csv', low_memory=False)
# The test file gets its own name: binding it to x_train would discard the
# training rows and leave x_test (used by the concat cell) undefined.
x_test = pd.read_csv('../data/X_test_Hi5.csv', low_memory=False)

In [None]:
# Display the (rows, columns) shape of the loaded frame as the cell output.
x_train.shape

In [None]:
# Keep the measurement dates aside as datetimes; year/month/day features are
# derived from them in a later cell.
x_train_time = pd.to_datetime(x_train['piezo_measurement_date'])

# Integer code for each groundwater-level label, from 1 ('Very Low') to 5 ('Very High').
target_cat = {'Very Low': 1, 'Low': 2, 'Average': 3, 'High': 4, 'Very High': 5}

# Any value that is not a key of target_cat is encoded as 0.
target = x_train["piezo_groundwater_level_category"].apply(
    lambda label: target_cat.get(label, 0)
)

In [None]:
# Remove the label column from the features now that `target` holds its encoded copy.
x_train = x_train.drop(columns=['piezo_groundwater_level_category'])

In [None]:
# Stack the training rows on top of the test rows (row-wise concatenation).
x_all = pd.concat([x_train, x_test])

In [None]:
# The alphanumeric department codes 2A/2B are folded into '20' so the column
# can be stored as int16.
x_train['piezo_station_department_code'] = (
    x_train['piezo_station_department_code']
    .replace(['2A', '2B'], '20')
    .astype('int16')
)

# Same treatment for the commune codes: every 'A' or 'B' becomes '0' before
# the cast to int32.
x_train['piezo_station_commune_code_insee'] = (
    x_train['piezo_station_commune_code_insee']
    .str.replace('A', '0')
    .str.replace('B', '0')
    .astype('int32')
)

In [None]:
# Each INSEE column carries a textual "not available" marker; replace it with
# 0 and store the column as float32.
insee_placeholders = {
    'insee_%_agri': 'N/A - division par 0',
    'insee_%_ind': 'N/A - division par 0',
    'insee_%_const': 'N/A - division par 0',
    'insee_med_living_level': 'N/A - résultat non disponible',
}
for insee_column, placeholder in insee_placeholders.items():
    x_train[insee_column] = x_train[insee_column].replace(placeholder, 0).astype('float32')

In [None]:
# !pip install category_encoders

In [None]:
from category_encoders import TargetEncoder

# Text columns with at most 10 distinct values are treated as categorical.
object_columns = x_train.select_dtypes(include=['object']).columns
categorical_columns = [col for col in object_columns if x_train[col].nunique() <= 10]

# Replace each category by a statistic of the target.
# NOTE(review): the encoder is fitted on every row, including the rows a later
# cell holds out as the test split — confirm this leakage is acceptable.
encoder = TargetEncoder()
encoded_data = encoder.fit_transform(x_train[categorical_columns], target)

In [None]:
def _int16_codes(values):
    """Return the pandas factorize codes of *values* cast to int16."""
    return pd.factorize(values)[0].astype('int16')


bdlisa_codes = x_train['piezo_station_bdlisa_codes']
bss_codes = x_train['piezo_station_bss_code']

# Hand-made integer features derived from the station identifier strings.
added_data = pd.DataFrame()
# Characters 2..4 of the BDLISA string, parsed as an integer (missing -> 0).
added_data['piezo_station_bdlisa_codes_TOP3'] = (
    bdlisa_codes.fillna("['000']").apply(lambda x: x[2:5]).astype('int16')
)
# Everything from character 6 onwards, label-encoded.
added_data['piezo_station_bdlisa_codes_LAST4'] = _int16_codes(
    bdlisa_codes.fillna("['0']").apply(lambda x: x[6:])
)
# Last '/'-separated segment of the BSS code, label-encoded.
added_data['piezo_station_bss_code_LASTslash'] = _int16_codes(
    bss_codes.apply(lambda x: x.split('/')[-1])
)
# The full BSS code, label-encoded.
added_data['piezo_station_bss_id'] = _int16_codes(bss_codes)
# Remaining identifier columns are label-encoded under their own names.
for id_column in (
    'hydro_station_code',
    'prelev_structure_code_0',
    'prelev_structure_code_1',
    'prelev_structure_code_2',
):
    added_data[id_column] = _int16_codes(x_train[id_column])

In [None]:
# Isolate the numeric columns; their names are reused to label the scaled frames.
numeric_data = x_train.select_dtypes(include=['number'])
numeric_columns = numeric_data.columns

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# First pass: min-max scaling of every numeric column.
scaler = MinMaxScaler()
min_max_values = scaler.fit_transform(numeric_data)
normalized_data = pd.DataFrame(min_max_values, columns=numeric_columns)

# Second pass: standardise the min-max output.
# NOTE(review): standardising after min-max scaling should give the same
# result as standardising the raw values (up to rounding), so the first pass
# may be redundant — confirm before removing.
standard_scaler = StandardScaler()
standardized_values = standard_scaler.fit_transform(normalized_data)
standardized_data = pd.DataFrame(standardized_values, columns=numeric_columns)

In [None]:
# Drop the names of the raw and intermediate frames that are no longer needed;
# presumably done to reduce memory use before the frames are concatenated.
del x_train, numeric_data, normalized_data

In [None]:
# Store every float column as float32.
float_columns = standardized_data.select_dtypes(include=['float']).columns
standardized_data = standardized_data.astype({name: 'float32' for name in float_columns})

In [None]:
# Place the scaled numeric, target-encoded and hand-made feature blocks side by side.
feature_blocks = (standardized_data, encoded_data, added_data)
final_numeric_data = pd.concat(feature_blocks, axis=1)

In [None]:
# These two frames now live inside final_numeric_data; drop the standalone names.
del standardized_data, encoded_data

In [None]:
# Expose the calendar components of the measurement date as plain columns.
for date_part in ('year', 'month', 'day'):
    final_numeric_data[date_part] = getattr(x_train_time.dt, date_part)

In [None]:
# Median-imputed copy of the feature table.
# NOTE(review): the split cell that follows indexes final_numeric_data, not
# this filled copy, and no later cell visible here reads filled_numeric_data —
# confirm which frame was meant to be used.
column_medians = final_numeric_data.median()
filled_numeric_data = final_numeric_data.fillna(column_medians)

In [None]:
# Hold out the measurements taken in months 6-9 of 2021 as the test period.
in_months_6_to_9 = final_numeric_data['month'].between(6, 9)
in_2021 = final_numeric_data['year'] == 2021
tag = in_months_6_to_9 & in_2021

# Rows outside the held-out period train the models; rows inside it evaluate them.
X_train, X_test = final_numeric_data[~tag], final_numeric_data[tag]
y_train, y_test = target[~tag], target[tag]

In [None]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import time
from sklearn.impute import SimpleImputer

# Step 1: drop features with a high share of missing values
print("Step 1: Removing high-missing-rate features...")

missing_threshold = 0.5  # keep only columns with less than 50% missing values
missing_ratios = X_train.isna().mean()  # fraction of missing entries per column
low_missing_features = missing_ratios.index[missing_ratios < missing_threshold]
X_train_reduced = X_train.loc[:, low_missing_features]
X_test_reduced = X_test.loc[:, low_missing_features]

print(f"Reduced features from {X_train.shape[1]} to {X_train_reduced.shape[1]} after high-missing-rate filtering.")

# Step 2: drop near-constant features
print("Step 2: Removing low-variance features...")

variance_threshold = 0.01  # minimum variance a column needs to be kept
selector = VarianceThreshold(threshold=variance_threshold)
# Thresholds are learned on the training split and re-applied to the test split.
X_train_reduced = selector.fit_transform(X_train_reduced)
X_test_reduced = selector.transform(X_test_reduced)

print(f"Reduced features from {len(low_missing_features)} to {X_train_reduced.shape[1]} after low-variance filtering.")

# Fill the remaining gaps with the training-split column means.
imputer = SimpleImputer(strategy='mean')
X_train_reduced = imputer.fit_transform(X_train_reduced)
X_test_reduced = imputer.transform(X_test_reduced)

# Step 3: train the random forest
print("Step 3: Training the Random Forest model...")

rf_model = RandomForestClassifier(n_jobs=-1, random_state=42)

# Only the fit call is timed.
start_time = time.time()
rf_model.fit(X_train_reduced, y_train)
end_time = time.time()

# Step 4: predict and evaluate
print("Step 4: Predicting and evaluating the model...")

y_pred_rf = rf_model.predict(X_test_reduced)

cm_rf = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix (Random Forest):", cm_rf, sep="\n")

print("\nClassification Report (Random Forest):", classification_report(y_test, y_pred_rf), sep="\n")

# Report the elapsed time measured around the fit call.
print(f"Training and prediction completed in {end_time - start_time:.2f} seconds.")


In [None]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import time

# Variant of the previous experiment: variance filtering only, with no
# missing-rate filter and no imputation.
# NOTE(review): X_train is not imputed in this cell — confirm the installed
# RandomForestClassifier accepts NaN inputs.
print("Step 1: Removing low-variance features...")

selector = VarianceThreshold(threshold=0.01)
X_train_reduced = selector.fit_transform(X_train)
X_test_reduced = selector.transform(X_test)

print(f"Reduced features from {X_train.shape[1]} to {X_train_reduced.shape[1]} after low-variance filtering.")

print("Step 2: Training the Random Forest model...")

rf_model = RandomForestClassifier(n_jobs=-1, random_state=42)

# Only the fit call is timed.
start_time = time.time()
rf_model.fit(X_train_reduced, y_train)
end_time = time.time()

print("Step 3: Predicting and evaluating the model...")

y_pred_rf = rf_model.predict(X_test_reduced)

cm_rf = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix (Random Forest):", cm_rf, sep="\n")

print("\nClassification Report (Random Forest):", classification_report(y_test, y_pred_rf), sep="\n")

print(f"Training and prediction completed in {end_time - start_time:.2f} seconds.")

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Re-print the metrics for the predictions and timings left by the previous cell.
cm_rf = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix (Random Forest with selected features):", cm_rf, sep="\n")

print(
    "\nClassification Report (Random Forest with selected features):",
    classification_report(y_test, y_pred_rf),
    sep="\n",
)

print(f"Training and prediction time (Random Forest): {end_time - start_time:.2f} seconds")


In [None]:
# Random forest on the full, unfiltered feature table.
rf_model = RandomForestClassifier(random_state=42)

# Only the fit call is timed.
start_time = time.time()
rf_model.fit(X_train, y_train)
end_time = time.time()

y_pred_rf = rf_model.predict(X_test)

cm_rf = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix (Random Forest with selected features):", cm_rf, sep="\n")

print(
    "\nClassification Report (Random Forest with selected features):",
    classification_report(y_test, y_pred_rf),
    sep="\n",
)

print(f"Training and prediction time (Random Forest): {end_time - start_time:.2f} seconds")

In [None]:
from sklearn.feature_selection import RFE

# Recursive feature elimination down to 50 features, ranked by the forest.
rfe = RFE(estimator=rf_model, n_features_to_select=50)
rfe.fit(X_train, y_train)

# support_ is a boolean mask over the columns. X_train / X_test are pandas
# DataFrames, so the mask must go through .loc — NumPy-style `df[:, mask]`
# indexing raises on a DataFrame. Using .loc also keeps the column labels.
selected_features = rfe.support_
X_train_selected = X_train.loc[:, selected_features]
X_test_selected = X_test.loc[:, selected_features]

In [None]:
# Retrain the forest on the RFE-selected columns only.
rf_model = RandomForestClassifier(random_state=42)

# Only the fit call is timed.
start_time = time.time()
rf_model.fit(X_train_selected, y_train)
end_time = time.time()

y_pred_rf = rf_model.predict(X_test_selected)

cm_rf = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix (Random Forest with selected features):", cm_rf, sep="\n")

print(
    "\nClassification Report (Random Forest with selected features):",
    classification_report(y_test, y_pred_rf),
    sep="\n",
)

print(f"Training and prediction time (Random Forest): {end_time - start_time:.2f} seconds")

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import time

# Random forest with all CPU cores enabled (n_jobs=-1).
rf_model = RandomForestClassifier(n_jobs=-1, random_state=42)

# RFE keeps the 50 highest-ranked features.
rfe = RFE(estimator=rf_model, n_features_to_select=50)

# Feature selection. The elimination rounds run one after another; the
# parallelism comes from the forest fitted inside each round.
start_time = time.time()
rfe.fit(X_train, y_train)
end_time = time.time()

# support_ is a boolean mask (True = feature kept). X_train / X_test are
# DataFrames, so select columns with .loc; `df[:, mask]` raises on a DataFrame.
selected_features = rfe.support_
X_train_selected = X_train.loc[:, selected_features]
X_test_selected = X_test.loc[:, selected_features]

print(f"Feature selection time: {end_time - start_time:.2f} seconds")

# Retrain on the selected columns and evaluate; this timer covers fit + predict.
start_time = time.time()
rf_model.fit(X_train_selected, y_train)
y_pred_rf = rf_model.predict(X_test_selected)
end_time = time.time()

# Report the evaluation metrics.
cm_rf = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix (Random Forest with selected features):")
print(cm_rf)

print("\nClassification Report (Random Forest with selected features):")
print(classification_report(y_test, y_pred_rf))

print(f"Training and prediction time (Random Forest): {end_time - start_time:.2f} seconds")

In [None]:
# Fit the variance filter, then select the surviving columns through pandas so
# the result keeps its column labels. fit_transform would return a bare NumPy
# array, which has no .columns attribute.
selector = VarianceThreshold(threshold=0.01)
selector.fit(X_train)
X_train_reduced = X_train.loc[:, selector.get_support()]
# X_test_reduced = X_test.loc[:, selector.get_support()]
set(X_train_reduced.columns)

In [None]:
# Display the names of the selected features as a set; this requires
# X_train_selected to be a DataFrame (a NumPy array has no .columns).
set(X_train_selected.columns)

In [None]:
from sklearn.linear_model import LogisticRegression

# Use logistic regression as the base estimator for RFE and keep 100 features.
rfe = RFE(estimator=LogisticRegression(max_iter=500), n_features_to_select=100)
rfe.fit(X_train, y_train)
# X_train / X_test are DataFrames: apply the boolean support mask with .loc,
# because `df[:, mask]` is NumPy-only syntax and raises on a DataFrame.
X_train_selected = X_train.loc[:, rfe.support_]
X_test_selected = X_test.loc[:, rfe.support_]

In [None]:
from cuml.ensemble import RandomForestClassifier as cumlRF
from sklearn.metrics import classification_report, confusion_matrix

# GPU-accelerated random forest (cuML) on the selected features.
gpu_rf_model = cumlRF(n_estimators=100, max_depth=10, random_state=42)

# Only the fit call is timed.
start_time = time.time()
gpu_rf_model.fit(X_train_selected, y_train)
end_time = time.time()

# Predict on the held-out rows.
y_pred_rf = gpu_rf_model.predict(X_test_selected)

# Evaluation metrics.
cm_rf = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix (GPU Random Forest with selected features):", cm_rf, sep="\n")
print(
    "\nClassification Report (GPU Random Forest with selected features):",
    classification_report(y_test, y_pred_rf),
    sep="\n",
)

print(f"Training and prediction time (GPU Random Forest): {end_time - start_time:.2f} seconds")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.feature_selection import SelectFromModel

# Fit a reference forest and let SelectFromModel keep the features whose
# importance passes its default threshold.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
selector = SelectFromModel(rf, prefit=True)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Hyper-parameter grid explored with 5-fold cross-validation.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2'],
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
# Tune on the *selected* training features so the best estimator expects the
# same columns it is later asked to predict on.
grid_search.fit(X_train_selected, y_train)

best_rf_model = grid_search.best_estimator_
# Predict on the held-out rows; these predictions are compared with y_test in
# the next cell, so they must come from the test split, not the training one.
y_pred_rf = best_rf_model.predict(X_test_selected)

In [None]:
from sklearn.metrics import classification_report, roc_auc_score
print(classification_report(y_test, y_pred_rf))
# The target has more than two classes, so ROC-AUC needs the full probability
# matrix and an explicit multiclass strategy; passing only column 1 is the
# binary form and raises on multiclass labels. `labels` ties each probability
# column to its class.
test_proba = best_rf_model.predict_proba(X_test_selected)
print(
    "ROC-AUC Score:",
    roc_auc_score(y_test, test_proba, multi_class='ovr', labels=best_rf_model.classes_),
)

## 2. Baseline Models

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Reload the cached feature table, labels and measurement dates; the first
# CSV column of each file is used as the index.
final_numeric_data, target, x_train_time = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('cate10num.csv', 'target.csv', 'train_time.csv')
)

In [None]:
# Replace the loaded frame by its date column parsed as datetimes, so the
# .dt accessor can be used in the following cells.
x_train_time = pd.to_datetime(x_train_time['piezo_measurement_date'])

In [None]:
# Store every float column as float32.
float_columns = final_numeric_data.select_dtypes(include=['float']).columns
final_numeric_data = final_numeric_data.astype({name: 'float32' for name in float_columns})

In [None]:
# Expose the calendar components of the measurement date as plain columns.
for date_part in ('year', 'month', 'day'):
    final_numeric_data[date_part] = getattr(x_train_time.dt, date_part)

In [None]:
# Replace every remaining missing value with 0 so the baseline models below
# receive a table without NaN.
final_numeric_data = final_numeric_data.fillna(0)

In [None]:
# Hold out the measurements taken in months 6-9 of 2021 as the test period.
in_months_6_to_9 = final_numeric_data['month'].between(6, 9)
in_2021 = final_numeric_data['year'] == 2021
tag = in_months_6_to_9 & in_2021

# Rows outside the held-out period train the models; rows inside it evaluate them.
X_train, X_test = final_numeric_data[~tag], final_numeric_data[tag]
y_train, y_test = target[~tag], target[tag]

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
from sklearn.naive_bayes import GaussianNB
import time

# Baseline 1: Gaussian naive Bayes with default settings.
nb_model = GaussianNB()

# Only the fit call is timed.
start_time = time.time()
nb_model.fit(X_train, y_train)
end_time = time.time()

y_pred_nb = nb_model.predict(X_test)

cm_nb = confusion_matrix(y_test, y_pred_nb)
print("Confusion Matrix (Naive Bayes):", cm_nb, sep="\n")

print("\nClassification Report (Naive Bayes):", classification_report(y_test, y_pred_nb), sep="\n")

print(f"Training and prediction time (Naive Bayes): {end_time - start_time:.2f} seconds")

In [None]:
from sklearn.naive_bayes import GaussianNB
import time

# Second run of the Gaussian naive Bayes baseline (same steps as the cell above).
nb_model = GaussianNB()

# Only the fit call is timed.
start_time = time.time()
nb_model.fit(X_train, y_train)
end_time = time.time()

y_pred_nb = nb_model.predict(X_test)

cm_nb = confusion_matrix(y_test, y_pred_nb)
print("Confusion Matrix (Naive Bayes):", cm_nb, sep="\n")

print("\nClassification Report (Naive Bayes):", classification_report(y_test, y_pred_nb), sep="\n")

print(f"Training and prediction time (Naive Bayes): {end_time - start_time:.2f} seconds")

In [None]:
from sklearn.ensemble import RandomForestClassifier
import time

# Baseline 2: a deliberately small forest (10 trees, depth 5, one candidate
# feature per split).
rf_model = RandomForestClassifier(
    n_estimators=10,
    max_depth=5,
    max_features=1,
    random_state=42,
)

# Only the fit call is timed.
start_time = time.time()
rf_model.fit(X_train, y_train)
end_time = time.time()

y_pred_rf = rf_model.predict(X_test)

cm_rf = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix (Random Forest):", cm_rf, sep="\n")

print("\nClassification Report (Random Forest):", classification_report(y_test, y_pred_rf), sep="\n")

print(f"Training and prediction time (Random Forest): {end_time - start_time:.2f} seconds")

In [None]:
from sklearn.neural_network import MLPClassifier
import time

# Baseline 3: scikit-learn multilayer perceptron with L2 penalty alpha=1.
mlp_model = MLPClassifier(alpha=1, max_iter=1000, random_state=42)

# Only the fit call is timed.
start_time = time.time()
mlp_model.fit(X_train, y_train)
end_time = time.time()

y_pred_mlp = mlp_model.predict(X_test)

cm_mlp = confusion_matrix(y_test, y_pred_mlp)
print("Confusion Matrix (MLP Classifier):", cm_mlp, sep="\n")

print("\nClassification Report (MLP Classifier):", classification_report(y_test, y_pred_mlp), sep="\n")

print(f"Training and prediction time (MLP Classifier): {end_time - start_time:.2f} seconds")

## 3. Raise Performance

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# One-hot encode the labels for the softmax output layer. Labels are shifted
# by one so that label 1 lands in column 0.
# NOTE(review): this assumes the labels are exactly 1..num_classes — confirm
# that no row carries the fallback label 0.
num_classes = np.unique(target).size
y_one_hot = to_categorical(target - 1, num_classes)

# Re-use the hold-out mask; from here on y_train / y_test are one-hot matrices.
y_train, y_test = y_one_hot[~tag], y_one_hot[tag]

In [None]:
# Two hidden ReLU layers (128 and 64 units), each followed by 30% dropout.
hidden_layers = [
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
]
# Softmax head with one unit per class.
output_layer = Dense(num_classes, activation='softmax')
model = Sequential(hidden_layers + [output_layer])

# Categorical cross-entropy matches the one-hot encoded targets.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

# Map the labels to consecutive integers starting at 0.
encoder = LabelEncoder()
target_encoded = encoder.fit_transform(target)

# 'balanced' weights are inversely proportional to class frequency.
encoded_classes = np.unique(target_encoded)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=encoded_classes,
    y=target_encoded,
)

# Keras expects a {class_index: weight} mapping.
class_weights_dict = dict(enumerate(class_weights))
print("Class Weights:", class_weights_dict)

In [None]:
# Train for 50 epochs with class weighting; Keras sets aside 20% of the
# training rows for validation.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=50,
    validation_split=0.2,
    class_weight=class_weights_dict,
    verbose=2,
)

In [None]:
# Continue training the same model on the first 1000 training rows, validating
# against the held-out split.
small_X_train, small_y_train = X_train[:1000], y_train[:1000]

history = model.fit(
    x=small_X_train,
    y=small_y_train,
    batch_size=32,
    epochs=50,
    validation_data=(X_test, y_test),
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a default forest only to rank the features by importance.
# NOTE(review): if y_train is still the one-hot matrix built for the Keras
# model, the forest is fitted as a multi-output problem — confirm that is
# intended.
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

importance_columns = {
    'feature': X_train.columns,
    'importance': rf.feature_importances_,
}
feature_importances = pd.DataFrame(importance_columns).sort_values(by='importance', ascending=False)

# Show the ten most important features.
print(feature_importances.head(10))

In [None]:
# Loss and accuracy of the Keras model on the held-out split.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Collapse the class probabilities and the one-hot targets to class indices.
y_pred = model.predict(X_test)
y_test_classes = np.argmax(y_test, axis=1)
y_pred_classes = np.argmax(y_pred, axis=1)

cm = confusion_matrix(y_test_classes, y_pred_classes)
print("Confusion Matrix:", cm, sep="\n")

print("\nClassification Report:", classification_report(y_test_classes, y_pred_classes), sep="\n")

In [None]:
# from sklearn.naive_bayes import GaussianNB
# import time

# nb_model = GaussianNB()

# start_time = time.time()
# nb_model.fit(X_train, y_train)
# end_time = time.time()

# y_pred_nb = nb_model.predict(X_test)

# cm_nb = confusion_matrix(y_test, y_pred_nb)
# print("Confusion Matrix (Naive Bayes):")
# print(cm_nb)

# print("\nClassification Report (Naive Bayes):")
# print(classification_report(y_test, y_pred_nb))

# print(f"Training and prediction time (Naive Bayes): {end_time - start_time:.2f} seconds")

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# import time

# rf_model = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=42)

# start_time = time.time()
# rf_model.fit(X_train, y_train)
# end_time = time.time()

# y_pred_rf = rf_model.predict(X_test)

# cm_rf = confusion_matrix(y_test, y_pred_rf)
# print("Confusion Matrix (Random Forest):")
# print(cm_rf)

# print("\nClassification Report (Random Forest):")
# print(classification_report(y_test, y_pred_rf))

# print(f"Training and prediction time (Random Forest): {end_time - start_time:.2f} seconds")

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# y_train / y_test were rebound to one-hot matrices for the Keras model, and
# scikit-learn's confusion_matrix rejects that multilabel format. Rebuild 1-D
# label vectors from the raw target with the same hold-out mask that produced
# X_train / X_test.
y_train_labels = np.ravel(target[~tag])
y_test_labels = np.ravel(target[tag])

# k-nearest neighbours with k=5.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train_labels)

y_pred = knn.predict(X_test)

cm_kn = confusion_matrix(y_test_labels, y_pred)
print("Confusion Matrix (KNN):")
print(cm_kn)

print("\nClassification Report:")
print(classification_report(y_test_labels, y_pred))

In [None]:
from sklearn.svm import SVC
import time

# SVC needs a 1-D label vector, but y_train / y_test were rebound to one-hot
# matrices for the Keras model. Rebuild the labels from the raw target with
# the same hold-out mask that produced X_train / X_test.
y_train_labels = np.ravel(target[~tag])
y_test_labels = np.ravel(target[tag])

# Support vector classifier with gamma=2, C=1 (default kernel).
svc_model = SVC(gamma=2, C=1, random_state=42)

# Only the fit call is timed.
start_time = time.time()
svc_model.fit(X_train, y_train_labels)
end_time = time.time()

y_pred_svc = svc_model.predict(X_test)

cm_svc = confusion_matrix(y_test_labels, y_pred_svc)
print("Confusion Matrix (SVC):")
print(cm_svc)

print("\nClassification Report (SVC):")
print(classification_report(y_test_labels, y_pred_svc))

print(f"Training and prediction time (SVC): {end_time - start_time:.2f} seconds")

In [None]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
import time

# y_train_split / y_test_split are not defined by any earlier cell, so define
# them here as 1-D label vectors taken from the raw target with the same
# hold-out mask that produced X_train / X_test.
y_train_split = np.ravel(target[~tag])
y_test_split = np.ravel(target[tag])

# Gaussian process classifier with a scaled RBF kernel.
# NOTE(review): Gaussian process training is known to scale poorly with the
# number of rows — consider subsampling X_train if this cell does not finish.
gpc_model = GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42)

# Only the fit call is timed.
start_time = time.time()
gpc_model.fit(X_train, y_train_split)
end_time = time.time()

y_pred_gpc = gpc_model.predict(X_test)

cm_gpc = confusion_matrix(y_test_split, y_pred_gpc)
print("Confusion Matrix (Gaussian Process):")
print(cm_gpc)

print("\nClassification Report (Gaussian Process):")
print(classification_report(y_test_split, y_pred_gpc))

print(f"Training and prediction time (Gaussian Process): {end_time - start_time:.2f} seconds")

In [None]:
from sklearn.ensemble import AdaBoostClassifier
import time

# AdaBoost needs a 1-D label vector, but y_train / y_test were rebound to
# one-hot matrices for the Keras model. Rebuild the labels from the raw target
# with the same hold-out mask that produced X_train / X_test.
y_train_labels = np.ravel(target[~tag])
y_test_labels = np.ravel(target[tag])

# AdaBoost with the discrete SAMME boosting algorithm.
adaboost_model = AdaBoostClassifier(algorithm="SAMME", random_state=42)

# Only the fit call is timed.
start_time = time.time()
adaboost_model.fit(X_train, y_train_labels)
end_time = time.time()

y_pred_adaboost = adaboost_model.predict(X_test)

cm_adaboost = confusion_matrix(y_test_labels, y_pred_adaboost)
print("Confusion Matrix (AdaBoost):")
print(cm_adaboost)

print("\nClassification Report (AdaBoost):")
print(classification_report(y_test_labels, y_pred_adaboost))

print(f"Training and prediction time (AdaBoost): {end_time - start_time:.2f} seconds")

In [None]:
# from sklearn.neural_network import MLPClassifier
# import time

# mlp_model = MLPClassifier(alpha=1, max_iter=1000, random_state=42)

# start_time = time.time()
# mlp_model.fit(X_train, y_train)
# end_time = time.time()

# y_pred_mlp = mlp_model.predict(X_test)

# cm_mlp = confusion_matrix(y_test, y_pred_mlp)
# print("Confusion Matrix (MLP Classifier):")
# print(cm_mlp)

# print("\nClassification Report (MLP Classifier):")
# print(classification_report(y_test, y_pred_mlp))

# print(f"Training and prediction time (MLP Classifier): {end_time - start_time:.2f} seconds")