# Training

In [None]:
import pandas as pd

# Read the prepared feature table and the processed target table, then show
# their shapes as a quick sanity check.
X, y = map(pd.read_csv, ("data/X_train_ready.csv", "data/y_train_processed.csv"))
print(X.shape, y.shape)

In [None]:
# Shapes first, then the column index of each table (features, then targets).
print(X.shape, y.shape)
for frame in (X, y):
    print(frame.columns)

In [None]:
# Restrict y to the rows whose "row_index" also appears in X.
present_in_X = y["row_index"].isin(X["row_index"])
y = y.loc[present_in_X]

print(X.shape, y.shape)

In [None]:
# Summarise y (column dtypes and non-null counts) after the row filter above.
y.info()

# Data split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is
# reproducible. The label column is "piezo_groundwater_level_category".
y_target = y["piezo_groundwater_level_category"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_target,
    test_size=0.2,
    random_state=42,
)

### Base classifiers

In [None]:
from sklearn.linear_model import RidgeClassifier

# Baseline: ridge classifier trained on the float64/int64 columns of X only.
numerical_cols = X.select_dtypes(include=["float64", "int64"]).columns
clf = RidgeClassifier().fit(X_train[numerical_cols], y_train)
y_pred = clf.predict(X_test[numerical_cols])

# Accuracy, per-class report and confusion matrix on the held-out split.
for metric in (accuracy_score, classification_report, confusion_matrix):
    print(metric(y_test, y_pred))


## Try a non-linear model (SVC with the default RBF kernel)

In [None]:
from sklearn.svm import SVC

# Support vector classifier with C=10 (all other settings left at their
# defaults), trained on the same numerical columns as the ridge baseline.
svc = SVC(C=10).fit(X_train[numerical_cols], y_train)
y_pred = svc.predict(X_test[numerical_cols])

# Accuracy, per-class report and confusion matrix on the held-out split.
for metric in (accuracy_score, classification_report, confusion_matrix):
    print(metric(y_test, y_pred))


In [None]:
# Encode the categorical columns
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeClassifier

# Column groups, chosen by dtype: float64/int64 are treated as numerical,
# object columns as categorical.
numerical_cols = X.select_dtypes(include=["float64", "int64"]).columns
categorical_cols = X.select_dtypes(include=["object"]).columns

# Numerical columns: fill missing values with a constant (no fill_value is
# given, so the imputer's default is used).
numerical_transformer = SimpleImputer(strategy="constant")

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Apply each transformer to its own column group.
column_transformers = [
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Preprocessing followed by a ridge classifier, fitted on the training split.
model = RidgeClassifier()
clf = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Accuracy, per-class report and confusion matrix on the held-out split.
for metric in (accuracy_score, classification_report, confusion_matrix):
    print(metric(y_test, y_pred))

In [None]:
def fit_and_score(estimator, X_train, X_test, y_train, y_test):
    """Fit ``estimator`` on the training data and score it on both splits.

    The test data is also passed to ``fit`` as the single entry of
    ``eval_set``, so the estimator's ``fit`` must accept that keyword. Note
    that the same data is then used for the reported test score.

    Returns:
        A tuple ``(estimator, train_score, test_score)`` where the scores come
        from ``estimator.score`` on the train and test data respectively.
    """
    held_out = (X_test, y_test)
    estimator.fit(X_train, y_train, eval_set=[held_out])

    # Score the training split first, then the held-out split.
    scores = [
        estimator.score(features, labels)
        for features, labels in ((X_train, y_train), held_out)
    ]
    return estimator, scores[0], scores[1]


from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold, cross_validate
import xgboost as xgb

# 5-fold stratified cross-validation of an XGBoost classifier with early
# stopping. Each fold's held-out part is used both as the early-stopping
# eval_set and for the reported test score (see fit_and_score).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=94)

clf = xgb.XGBClassifier(
    tree_method="hist", early_stopping_rounds=3, enable_categorical=True
)

# enable_categorical only understands pandas "category" columns, so convert
# the object columns once, up front, so every fold shares the same categories.
object_cols = X.select_dtypes(include=["object"]).columns
X_cv = X.astype({col: "category" for col in object_cols})

# XGBoost expects class labels 0..n_classes-1; factorize with sort=True maps
# the sorted distinct labels onto that range.
y_cv = pd.factorize(y["piezo_groundwater_level_category"], sort=True)[0]

results = {}

# cv.split yields positional indices, so rows are selected with .iloc / array
# indexing. Fold-local names are used so the hold-out split created earlier
# (X_train, X_test, y_train, y_test) is not overwritten.
for train_idx, test_idx in cv.split(X_cv, y_cv):
    est, train_score, test_score = fit_and_score(
        clone(clf),
        X_cv.iloc[train_idx],
        X_cv.iloc[test_idx],
        y_cv[train_idx],
        y_cv[test_idx],
    )
    results[est] = (train_score, test_score)

In [None]:
from xgboost import XGBClassifier
import time

# Adjust target classes to start from 0 (XGBoost expects labels 0..n_classes-1)
y_train_adjusted = y_train - 1
y_test_adjusted = y_test - 1

# Time the fit of a default XGBoost classifier on the numerical columns only.
start_time = time.time()
xgb = XGBClassifier(enable_categorical=True)
xgb.fit(X_train[numerical_cols], y_train_adjusted)
end_time = time.time()
print("Training Time: ", end_time - start_time)

y_pred = xgb.predict(X_test[numerical_cols])

# Predictions are in the 0-based label space, so they must be compared with
# the adjusted test labels; comparing with y_test shifts every row by one.
# (The variable name cm_kn is kept in case other cells refer to it.)
cm_kn = confusion_matrix(y_test_adjusted, y_pred)
print("Confusion Matrix (XGBoost):")
print(cm_kn)

print("\nClassification Report:")
print(classification_report(y_test_adjusted, y_pred))

In [None]:
# XGBoost on top of the shared preprocessing (imputation + one-hot encoding),
# with early stopping.
#
# Early stopping needs a validation set, so part of the training split is held
# out for it; the test split stays untouched until the final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train_adjusted, test_size=0.2, random_state=1
)

xgb = XGBClassifier(
    enable_categorical=True,
    tree_method="hist",
    early_stopping_rounds=3,  # stop once the validation metric stalls for 3 rounds
    random_state=1,
    reg_alpha=0.1,
    reg_lambda=0.1,
    max_depth=3,
    n_estimators=1000,  # upper bound; early stopping normally ends training sooner
)
xgb_encode = Pipeline(steps=[("preprocessor", preprocessor), ("model", xgb)])

# eval_set is handed to the model as-is (the pipeline does not transform it),
# so the validation features are encoded here. The pipeline refits this same
# preprocessor object on X_fit below, so both encodings agree.
X_val_encoded = preprocessor.fit(X_fit).transform(X_val)

xgb_encode.fit(
    X_fit,
    y_fit,
    model__eval_set=[(X_val_encoded, y_val)],
    model__verbose=False,  # do not print the metric for every boosting round
)
y_pred = xgb_encode.predict(X_test)

print(accuracy_score(y_test_adjusted, y_pred))
print(classification_report(y_test_adjusted, y_pred))
print(confusion_matrix(y_test_adjusted, y_pred))