# In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

class MLUtilityLinearRegression:
    """Compare linear-regression test error on original vs. watermarked data.

    A ``LinearRegression`` model is fitted on a train split of the original
    dataset and scored (mean squared error) on the held-out split. When a
    watermarked dataset is supplied, the model is refitted on a train split
    of that dataset and scored on the *same* original held-out split.
    """

    def __init__(self, dataset_path, watermarked_data_path=None, test_size=0.3, random_state=42):
        """Store the configuration and create the (unfitted) model.

        Args:
            dataset_path: Path to the original dataset file.
            watermarked_data_path: Optional path to the watermarked dataset;
                when falsy, only the original data is evaluated.
            test_size: Forwarded to ``train_test_split`` as ``test_size``.
            random_state: Forwarded to ``train_test_split`` as ``random_state``.
        """
        self.dataset_path = dataset_path
        self.watermarked_data_path = watermarked_data_path
        self.test_size = test_size
        self.random_state = random_state
        self.model = LinearRegression()

    def load_data(self, file_path):
        """Load a dataset from a ``.csv`` or ``.npy`` file.

        Args:
            file_path: Path to the file; the format is chosen from its
                extension (matched case-insensitively).

        Returns:
            For ``.csv``: the ``DataFrame`` produced by ``pd.read_csv``.
            For ``.npy``: the ``'watermarked_data'`` entry of the dict
            stored in the file.

        Raises:
            ValueError: If the extension is neither ``.csv`` nor ``.npy``.
        """
        _, file_extension = os.path.splitext(file_path)
        file_extension = file_extension.lower()

        if file_extension == '.csv':
            return pd.read_csv(file_path)

        if file_extension == '.npy':
            # NOTE(security): allow_pickle=True unpickles the file content,
            # which can execute arbitrary code. Only load trusted files.
            loaded_results = np.load(file_path, allow_pickle=True).item()
            # NOTE(review): preprocess_data() treats this value as a pandas
            # DataFrame -- confirm that is what the .npy files contain.
            return loaded_results['watermarked_data']

        # Previously an unknown extension fell through and crashed with an
        # UnboundLocalError on `data`; fail with a clear message instead.
        raise ValueError(
            f"Unsupported file extension {file_extension!r} for {file_path!r}; "
            "expected '.csv' or '.npy'."
        )

    def preprocess_data(self, data, target_column='MEDV'):
        """Drop incomplete rows and split the frame into features and target.

        Rows with a missing value in *any* column (features or target) are
        removed, so ``X`` and ``y`` stay aligned and neither contains NaN.

        Args:
            data: DataFrame that contains ``target_column``.
            target_column: Name of the column to predict.

        Returns:
            Tuple ``(X, y)``: the feature DataFrame (all columns except the
            target) and the target Series, sharing the same index.
        """
        # Drop on the full frame first: dropping on X alone would leave NaN
        # targets in y, which the regression and the metric cannot handle.
        complete_rows = data.dropna()
        X = complete_rows.drop(columns=[target_column])
        y = complete_rows[target_column]
        return X, y

    def split_data(self, X, y):
        """Split features and target into train and test parts.

        Returns:
            ``X_train, X_test, y_train, y_test`` as returned by
            ``train_test_split`` with the configured ``test_size`` and
            ``random_state``.
        """
        return train_test_split(X, y, test_size=self.test_size, random_state=self.random_state)

    def train_model(self, X_train, y_train):
        """Fit ``self.model`` on the given training data (refits in place)."""
        self.model.fit(X_train, y_train)

    def predict_and_evaluate(self, X_test, y_test):
        """Return the mean squared error of ``self.model`` on the test data."""
        y_pred = self.model.predict(X_test)
        return mean_squared_error(y_test, y_pred)

    def compute_mse(self):
        """Train and evaluate on the original (and optionally watermarked) data.

        Prints each MSE and also returns them so callers can use the values.

        Returns:
            Dict with key ``'original'`` (MSE of the model trained on the
            original data) and key ``'watermarked'`` (MSE of the model
            trained on the watermarked data, or ``None`` when no watermarked
            path was configured). Both are measured on the original test split.
        """
        origin_data = self.load_data(self.dataset_path)
        X_origin, y_origin = self.preprocess_data(origin_data)
        X_train, X_test, y_train, y_test = self.split_data(X_origin, y_origin)

        self.train_model(X_train, y_train)
        mse_origin = self.predict_and_evaluate(X_test, y_test)
        print(f"MSE of the original data: {mse_origin:.4f}")

        mse_watermarked = None
        if self.watermarked_data_path:
            watermarked_data = self.load_data(self.watermarked_data_path)
            X_watermarked, y_watermarked = self.preprocess_data(watermarked_data)
            # Only the watermarked *train* split is used; evaluation stays on
            # the original test split so both MSE values are comparable.
            X_train, _, y_train, _ = self.split_data(X_watermarked, y_watermarked)

            self.train_model(X_train, y_train)
            mse_watermarked = self.predict_and_evaluate(X_test, y_test)
            print(f"MSE of the watermarked data: {mse_watermarked:.4f}")

        return {'original': mse_origin, 'watermarked': mse_watermarked}

def main():
    """Run the MSE comparison for the housing dataset and its watermarked copy."""
    dataset = "housing"
    dataset_path = "datasets/HousingData.csv"
    # Derive the watermarked file name from the dataset name (the original
    # f-string had no placeholder and left `dataset` unused).
    watermarked_data_path = f"datasets/{dataset}-10000.npy"

    measure_ml_util = MLUtilityLinearRegression(dataset_path, watermarked_data_path)
    measure_ml_util.compute_mse()


if __name__ == "__main__":
    main()
