## 1. Data Preparation

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw training table from CSV.
x_train = pd.read_csv(filepath_or_buffer='../data/X_train_Hi5.csv')

In [None]:
# Integer code assigned to each groundwater-level label.
target_cat = {
    'High': 4,
    'Very High': 5,
    'Very Low': 1,
    'Low': 2,
    'Average': 3,
}

# Translate the label column to its code; any value that is not a key of the
# mapping (including missing values) becomes 0.
target = x_train['piezo_groundwater_level_category'].map(
    lambda label: target_cat.get(label, 0)
)

In [None]:
# Persist the encoded labels; the modelling section reads this file back.
target.to_csv(path_or_buf='target.csv')

In [None]:
# Low-cardinality text columns: object dtype with at most 10 distinct values.
categorical_columns = [
    name
    for name, distinct in x_train.select_dtypes(include=['object']).nunique().items()
    if distinct <= 10
]

# Replace each selected column by the integer codes from pd.factorize
# (element [0] of its return value is the code array).
encoded_data = pd.DataFrame(
    {name: pd.factorize(x_train[name])[0] for name in categorical_columns}
)

In [None]:
# Keep only the numeric columns and remember their names, which are reused
# as column labels after scaling.
numeric_data = x_train.select_dtypes(include=['number'])
numeric_columns = numeric_data.columns

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Min-max scale the numeric columns, re-wrap the resulting array under the
# original column names, and replace any remaining missing values with 0.
scaler = MinMaxScaler()
normalized_data = pd.DataFrame(
    scaler.fit(numeric_data).transform(numeric_data),
    columns=numeric_columns,
).fillna(0)

In [None]:
def standardize_in_chunks(df, chunk_size):
    """Standardize every column of *df* using whole-frame statistics.

    Column means and standard deviations (``DataFrame.std`` defaults) are
    computed once over the full frame; the subtraction and division are then
    applied ``chunk_size`` rows at a time and the pieces are concatenated.

    Args:
        df: DataFrame whose columns are numeric.
        chunk_size: Positive number of rows processed per slice.

    Returns:
        A DataFrame with the same index and columns as *df*. Columns whose
        standard deviation is exactly 0 (constant columns) come out as 0
        instead of NaN. An empty *df* yields an empty copy.

    Raises:
        ValueError: If *chunk_size* is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Nothing to scale; pd.concat would raise on an empty list of chunks.
    if len(df) == 0:
        return df.copy()

    means = df.mean()
    # A constant column has std 0 and would turn into NaN (0 / 0) after the
    # division. Dividing by 1 instead maps such columns to 0.
    stds = df.std().replace(0, 1)

    return pd.concat(
        [
            (df.iloc[start:start + chunk_size] - means) / stds
            for start in range(0, len(df), chunk_size)
        ]
    )

# Rows handled per slice; the same value is reused when the numeric and
# encoded frames are combined further down.
chunk_size = 10_000
standardized_data = standardize_in_chunks(df=normalized_data, chunk_size=chunk_size)

# Commented-out alternative based on scikit-learn's StandardScaler:
# standard_scaler = StandardScaler()
# standardized_data = pd.DataFrame(standard_scaler.fit_transform(normalized_data), columns=numeric_columns)

In [None]:
# Unbind the frames that are not used again below (presumably to let their
# memory be reclaimed).
del x_train
del numeric_data
del normalized_data

In [None]:
# Store every factorized column as 8-bit integers; each column has at most
# 10 distinct values, so its codes fit in int8.
encoded_data = encoded_data.astype('int8')

In [None]:
# Join the standardized numeric columns and the encoded categorical columns
# side by side, chunk_size rows at a time.
#
# The per-chunk frames are collected in a list and concatenated once at the
# end. Growing a DataFrame with pd.concat inside the loop copies all rows
# gathered so far on every iteration (quadratic in the row count), and it
# starts from an empty DataFrame, which newer pandas versions warn about
# when it takes part in a concat.
combined_chunks = [
    pd.concat(
        [
            standardized_data.iloc[start:start + chunk_size],
            encoded_data.iloc[start:start + chunk_size],
        ],
        axis=1,
    )
    for start in range(0, len(standardized_data), chunk_size)
]

# With no rows at all there is nothing to concatenate; fall back to the empty
# frame the loop-based version would have produced.
final_numeric_data = (
    pd.concat(combined_chunks, axis=0) if combined_chunks else pd.DataFrame()
)

# Do not keep a second reference to every chunk alive.
del combined_chunks

In [None]:
# Both inputs have been merged into final_numeric_data; unbind them.
del standardized_data
del encoded_data

In [None]:
# Persist the combined feature table; the modelling section reads this file back.
final_numeric_data.to_csv(path_or_buf='cate10num.csv')

## 2. Baseline Models

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Reload the feature table and the labels from CSV, using the first column
# of each file as the index.
final_numeric_data = pd.read_csv(filepath_or_buffer='cate10num.csv', index_col=0)
target = pd.read_csv(filepath_or_buffer='target.csv', index_col=0)

In [None]:
# Downcast the float columns to float32 in a single astype call.
final_numeric_data = final_numeric_data.astype(
    dict.fromkeys(
        final_numeric_data.select_dtypes(include=['float']).columns,
        'float32',
    )
)

In [None]:
# Remove the label column from the feature table (presumably it was encoded
# together with the other low-cardinality text columns) so that it is not
# used as a feature.
final_numeric_data = final_numeric_data.drop('piezo_groundwater_level_category', axis=1)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# Replace any remaining missing feature values with 0.
final_numeric_data = final_numeric_data.fillna(0)

# Ordered 80/20 split: the first 80% of the rows train the models, the rest
# is held out for evaluation (rows are not shuffled).
split_idx = int(len(final_numeric_data) * 0.8)

X_train = final_numeric_data.iloc[:split_idx]
X_test = final_numeric_data.iloc[split_idx:]

# target is read back from CSV as a one-column DataFrame. Select that column
# so y is a 1-D Series: scikit-learn estimators expect y of shape
# (n_samples,) and warn on every fit when handed a column vector.
y_train = target.iloc[:split_idx, 0]
y_test = target.iloc[split_idx:, 0]

In [None]:
from sklearn.naive_bayes import GaussianNB
import time

# Gaussian Naive Bayes baseline.
nb_model = GaussianNB()

# Time fit and predict together so the reported figure matches the
# "Training and prediction time" message printed below (the timer used to
# stop before predict). perf_counter is the clock intended for measuring
# elapsed intervals.
start_time = time.perf_counter()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
end_time = time.perf_counter()

cm_nb = confusion_matrix(y_test, y_pred_nb)
print("Confusion Matrix (Naive Bayes):")
print(cm_nb)

print("\nClassification Report (Naive Bayes):")
print(classification_report(y_test, y_pred_nb))

print(f"Training and prediction time (Naive Bayes): {end_time - start_time:.2f} seconds")

In [None]:
from sklearn.ensemble import RandomForestClassifier
import time

# Small random-forest baseline: 10 trees of depth <= 5, one candidate feature
# per split, fixed seed for reproducibility.
rf_model = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=42)

# Time fit and predict together so the reported figure matches the
# "Training and prediction time" message printed below (the timer used to
# stop before predict). perf_counter is the clock intended for measuring
# elapsed intervals.
start_time = time.perf_counter()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
end_time = time.perf_counter()

cm_rf = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix (Random Forest):")
print(cm_rf)

print("\nClassification Report (Random Forest):")
print(classification_report(y_test, y_pred_rf))

print(f"Training and prediction time (Random Forest): {end_time - start_time:.2f} seconds")

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours baseline with k = 5; fit() returns the estimator
# itself, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

y_pred = knn.predict(X_test)
cm_kn = confusion_matrix(y_test, y_pred)

# Each heading and its payload are printed on separate lines.
print("Confusion Matrix (KNN):", cm_kn, sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
from sklearn.svm import SVC
import time

# Support-vector classifier baseline with gamma=2 and C=1.
svc_model = SVC(gamma=2, C=1, random_state=42)

# Time fit and predict together so the reported figure matches the
# "Training and prediction time" message printed below (the timer used to
# stop before predict). perf_counter is the clock intended for measuring
# elapsed intervals.
start_time = time.perf_counter()
svc_model.fit(X_train, y_train)
y_pred_svc = svc_model.predict(X_test)
end_time = time.perf_counter()

cm_svc = confusion_matrix(y_test, y_pred_svc)
print("Confusion Matrix (SVC):")
print(cm_svc)

print("\nClassification Report (SVC):")
print(classification_report(y_test, y_pred_svc))

print(f"Training and prediction time (SVC): {end_time - start_time:.2f} seconds")

In [None]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
import time

# Gaussian-process classifier baseline with a scaled RBF kernel.
gpc_model = GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42)

# Fit and evaluate on y_train / y_test, the labels produced by the split cell
# and used by every other model. This cell previously referenced
# y_train_split / y_test_split, which are not defined in this notebook.
#
# Time fit and predict together so the reported figure matches the
# "Training and prediction time" message printed below (the timer used to
# stop before predict).
start_time = time.perf_counter()
gpc_model.fit(X_train, y_train)
y_pred_gpc = gpc_model.predict(X_test)
end_time = time.perf_counter()

cm_gpc = confusion_matrix(y_test, y_pred_gpc)
print("Confusion Matrix (Gaussian Process):")
print(cm_gpc)

print("\nClassification Report (Gaussian Process):")
print(classification_report(y_test, y_pred_gpc))

print(f"Training and prediction time (Gaussian Process): {end_time - start_time:.2f} seconds")

In [None]:
from sklearn.ensemble import AdaBoostClassifier
import time

# AdaBoost baseline using the SAMME boosting algorithm.
# NOTE(review): the `algorithm` argument may be deprecated in recent
# scikit-learn releases — confirm against the installed version.
adaboost_model = AdaBoostClassifier(algorithm="SAMME", random_state=42)

# Time fit and predict together so the reported figure matches the
# "Training and prediction time" message printed below (the timer used to
# stop before predict). perf_counter is the clock intended for measuring
# elapsed intervals.
start_time = time.perf_counter()
adaboost_model.fit(X_train, y_train)
y_pred_adaboost = adaboost_model.predict(X_test)
end_time = time.perf_counter()

cm_adaboost = confusion_matrix(y_test, y_pred_adaboost)
print("Confusion Matrix (AdaBoost):")
print(cm_adaboost)

print("\nClassification Report (AdaBoost):")
print(classification_report(y_test, y_pred_adaboost))

print(f"Training and prediction time (AdaBoost): {end_time - start_time:.2f} seconds")

In [None]:
from sklearn.neural_network import MLPClassifier
import time

# Multi-layer perceptron baseline with alpha=1 and up to 1000 training
# iterations.
mlp_model = MLPClassifier(alpha=1, max_iter=1000, random_state=42)

# Time fit and predict together so the reported figure matches the
# "Training and prediction time" message printed below (the timer used to
# stop before predict). perf_counter is the clock intended for measuring
# elapsed intervals.
start_time = time.perf_counter()
mlp_model.fit(X_train, y_train)
y_pred_mlp = mlp_model.predict(X_test)
end_time = time.perf_counter()

cm_mlp = confusion_matrix(y_test, y_pred_mlp)
print("Confusion Matrix (MLP Classifier):")
print(cm_mlp)

print("\nClassification Report (MLP Classifier):")
print(classification_report(y_test, y_pred_mlp))

print(f"Training and prediction time (MLP Classifier): {end_time - start_time:.2f} seconds")