In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import xgboost as xgb

class MLUtilityXGBoost:
    """Compare per-class F1 of XGBoost trained on original vs. watermarked data.

    A classifier is first trained and evaluated on a train/test split of the
    original dataset. If a watermarked dataset is configured, the classifier
    is then refit on the training split of the watermarked data and evaluated
    on the *same* original test split, so both score lists are comparable.
    The target column is ``'Cover_Type'``; every other column is a feature.
    """

    def __init__(self, dataset_path, watermarked_data_path=None, test_size=0.3, random_state=42):
        """Store the configuration and build the (unfitted) classifier.

        Args:
            dataset_path: Path of the original dataset (read as CSV unless it
                has a ``.npy`` extension).
            watermarked_data_path: Optional path of the watermarked dataset.
                When falsy, only the original data is evaluated.
            test_size: Passed to ``train_test_split`` as ``test_size``.
            random_state: Passed to ``train_test_split`` as ``random_state``.
        """
        self.dataset_path = dataset_path
        self.watermarked_data_path = watermarked_data_path
        self.test_size = test_size
        self.random_state = random_state
        self.model = xgb.XGBClassifier(n_estimators=30, max_depth=10, n_jobs=4)

    def load_data(self, file_path):
        """Load a dataset from ``file_path``.

        Files with a ``.npy`` extension (matched case-insensitively) are
        expected to hold a pickled dict; its ``'watermarked_data'`` entry is
        returned. Any other file is parsed with ``pandas.read_csv``.

        Args:
            file_path: Path of the file to read.

        Returns:
            The ``'watermarked_data'`` entry for ``.npy`` files, otherwise the
            DataFrame produced by ``pandas.read_csv``.

        Raises:
            KeyError: If a ``.npy`` file has no ``'watermarked_data'`` entry.
        """
        _, file_extension = os.path.splitext(file_path)
        if file_extension.lower() == '.npy':
            # NOTE(security): allow_pickle=True unpickles the file content,
            # which can execute arbitrary code. Only load files you trust.
            loaded_results = np.load(file_path, allow_pickle=True).item()
            return loaded_results['watermarked_data']
        return pd.read_csv(file_path)

    def preprocess_data(self, data, label_encoder=None):
        """Split ``data`` into features and integer-encoded labels.

        Args:
            data: Table containing a ``'Cover_Type'`` column.
            label_encoder: Optional already-fitted ``LabelEncoder``. When
                given, it is used as-is so that several datasets share one
                label mapping (its ``transform`` rejects labels it was not
                fitted on). When ``None``, a new encoder is fitted on
                ``data`` itself.

        Returns:
            ``(X, y)``: ``data`` without ``'Cover_Type'``, and the encoded
            ``'Cover_Type'`` values.
        """
        X = data.drop(columns=['Cover_Type'])
        labels = data['Cover_Type']
        if label_encoder is None:
            label_encoder = LabelEncoder().fit(labels)
        y = label_encoder.transform(labels)
        return X, y

    def split_data(self, X, y):
        """Return ``train_test_split(X, y)`` using the configured size and seed.

        Returns:
            ``X_train, X_test, y_train, y_test``.
        """
        return train_test_split(X, y, test_size=self.test_size, random_state=self.random_state)

    def train_model(self, X_train, y_train):
        """Fit ``self.model`` on the given training data."""
        self.model.fit(X_train, y_train)

    def predict_and_evaluate(self, X_test, y_test):
        """Predict ``X_test`` with ``self.model`` and score against ``y_test``.

        Returns:
            The result of ``f1_score(..., average=None)``, i.e. one F1 value
            per class rather than a single averaged number.
        """
        y_pred = self.model.predict(X_test)
        return f1_score(y_test, y_pred, average=None)

    @staticmethod
    def _print_scores(header, scores):
        """Print ``header`` followed by one ``Category i`` line per score."""
        print(header)
        for i, score in enumerate(scores):
            print(f"Category {i}: F1-score = {score:.4f}")

    def compute_f1(self):
        """Train, evaluate and print per-class F1 for the configured datasets.

        The label encoder is fitted once on the original data and reused for
        the watermarked data. Fitting a separate encoder per dataset would
        let the same class receive different integer codes whenever the two
        datasets do not contain exactly the same set of classes, which would
        silently corrupt the scores computed on the original test split.
        """
        origin_data = self.load_data(self.dataset_path)
        label_encoder = LabelEncoder().fit(origin_data['Cover_Type'])
        X_origin, y_origin = self.preprocess_data(origin_data, label_encoder)
        X_train, X_test, y_train, y_test = self.split_data(X_origin, y_origin)

        self.train_model(X_train, y_train)
        f1_origin = self.predict_and_evaluate(X_test, y_test)
        self._print_scores("F1-scores of the original data:", f1_origin)

        if self.watermarked_data_path:
            watermarked_data = self.load_data(self.watermarked_data_path)
            X_watermarked, y_watermarked = self.preprocess_data(watermarked_data, label_encoder)
            # Only the training part of the watermarked data is used; the
            # model is scored on the original test split kept from above.
            # NOTE(review): this assumes the watermarked rows line up with the
            # original ones so the two splits select the same rows — confirm,
            # otherwise original test rows may leak into this training set.
            X_train, _, y_train, _ = self.split_data(X_watermarked, y_watermarked)

            self.train_model(X_train, y_train)
            f1_watermarked = self.predict_and_evaluate(X_test, y_test)
            self._print_scores("F1-scores of the watermarked data:", f1_watermarked)


if __name__ == "__main__":
    # Script entry point: evaluate the covertype subset and its watermarked copy.
    dataset = "covertype"
    dataset_path = "datasets/covtype_with_key.subset.data"
    watermarked_data_path = "datasets/watermark/covertype/covertype-10000.npy"

    # test_size and random_state are left at the constructor defaults.
    measure_ml_util = MLUtilityXGBoost(
        dataset_path=dataset_path,
        watermarked_data_path=watermarked_data_path,
    )
    measure_ml_util.compute_f1()