# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

class PreProcessor:
    """Load, clean, split and encode tabular data held in pandas DataFrames.

    The frame most recently read by ``load_data`` is kept on ``self.data`` and
    is what ``split_data`` operates on. The cleaning and encoding methods work
    on the frame passed to them and return a new frame.
    """

    def __init__(self):
        # Set by load_data(); split_data() refuses to run while this is None.
        self.data = None

    def load_data(self, file_path: str, file_type: str = 'csv'):
        """Read a file into a DataFrame, store it on ``self.data`` and return it.

        Args:
            file_path: Path handed to the pandas reader.
            file_type: ``'csv'`` (default) or ``'excel'``.

        Returns:
            The loaded DataFrame (the same object stored on ``self.data``).

        Raises:
            ValueError: If ``file_type`` is neither ``'csv'`` nor ``'excel'``.
        """
        if file_type == 'csv':
            self.data = pd.read_csv(file_path)
        elif file_type == 'excel':
            self.data = pd.read_excel(file_path)
        else:
            raise ValueError("Unsupported file type. Supported types are 'csv' and 'excel'.")

        return self.data

    def clean_data(self, df: pd.DataFrame):
        """Drop duplicate rows and fill missing values column by column.

        Numeric columns are filled with the column mean; object columns are
        filled with the column's first mode. The input frame is not modified.

        Args:
            df: Frame to clean.

        Returns:
            A new DataFrame with duplicates removed and missing values filled.
        """
        # Explicit copy: the column assignments below must never write through
        # to (or warn about) the caller's frame.
        df = df.drop_duplicates().copy()

        for column in df.select_dtypes(include=['number']).columns:
            df[column] = df[column].fillna(df[column].mean())

        for column in df.select_dtypes(include=['object']).columns:
            modes = df[column].mode()
            # mode() is empty when the column holds no non-missing values;
            # there is nothing to fill with, so the column is left as is.
            if not modes.empty:
                df[column] = df[column].fillna(modes.iloc[0])

        return df

    def prepare_pipleline(self, df: pd.DataFrame):
        """Drop duplicates and impute missing values with a ColumnTransformer.

        Same goal as ``clean_data`` but using scikit-learn imputers: mean for
        numeric columns, most frequent value for object columns. Columns of
        any other dtype, and columns with no observed values at all, are
        returned untouched. The input frame is not modified.

        Column labels are expected to be strings, since ColumnTransformer
        treats integer selectors as positions rather than labels.
        NOTE(review): missing values are assumed to be NaN (as produced by the
        pandas readers); confirm behaviour if object columns can hold ``None``.

        Args:
            df: Frame to impute.

        Returns:
            A new DataFrame with the original column order and dtypes.
        """
        # Imported here so the method is self-contained; the module header
        # only imports train_test_split from scikit-learn.
        from sklearn.compose import ColumnTransformer
        from sklearn.impute import SimpleImputer

        df = df.drop_duplicates().copy()

        # An imputer cannot learn a fill value from a column without any
        # observed value, so such columns are left out of the transformer.
        numeric_cols = [
            col for col in df.select_dtypes(include=['number']).columns
            if df[col].notna().any()
        ]
        categorical_cols = [
            col for col in df.select_dtypes(include=['object']).columns
            if df[col].notna().any()
        ]
        imputed_cols = numeric_cols + categorical_cols
        if not imputed_cols:
            return df

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', SimpleImputer(strategy='mean'), numeric_cols),
                ('cat', SimpleImputer(strategy='most_frequent'), categorical_cols),
            ]
        )

        # The transformer returns a plain array whose columns follow the
        # transformer order (numeric first, then categorical).
        imputed = pd.DataFrame(
            preprocessor.fit_transform(df),
            columns=imputed_cols,
            index=df.index,
        )
        # Write values back column by column so the original column order and
        # dtypes survive the round trip through the array.
        for col in imputed_cols:
            df[col] = imputed[col].astype(df[col].dtype)

        return df

    def split_data(self, target: str, test_size: float = 0.2, random_state: int = 42):
        """Split ``self.data`` into train and test features and targets.

        Args:
            target: Name of the target column in ``self.data``.
            test_size: Forwarded to ``train_test_split``.
            random_state: Forwarded to ``train_test_split``.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``.

        Raises:
            ValueError: If no data has been loaded yet.
        """
        if self.data is None:
            raise ValueError("Data not loaded. Please load the data first using load_data method.")

        X = self.data.drop(columns=[target])
        y = self.data[target]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

        return X_train, X_test, y_train, y_test

    def transform_data(self):
        """Placeholder for any further transformations; currently does nothing."""
        pass

    def encode_categorical(self, df: pd.DataFrame):
        """One-hot encode categorical columns, dropping the first level of each.

        Args:
            df: Frame to encode.

        Returns:
            The result of ``pd.get_dummies(df, drop_first=True)``.
        """
        df = pd.get_dummies(df, drop_first=True)
        return df





