# In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import unittest
from io import StringIO
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

class PreProcessor:
    """Load tabular data into a pandas DataFrame and prepare it for modelling.

    The loaded frame is kept on ``self.data``. The cleaning, imputing and
    encoding helpers take a DataFrame argument and return a new frame without
    modifying the one passed in; ``split_data`` works on ``self.data``.
    """

    def __init__(self, file_path: "str | None" = None, file_type: str = 'csv',
                 target=None, model_type=None):
        """Create a pre-processor and optionally load a data file.

        Args:
            file_path: Path (or file-like object) handed to ``load_data``.
                When ``None`` nothing is loaded and ``self.data`` stays
                ``None`` until ``load_data`` is called or ``data`` is assigned.
            file_type: ``'csv'`` or ``'excel'``; forwarded to ``load_data``.
            target: Stored on ``self.target``; not used by this class itself.
            model_type: Stored on ``self.model_type``; not used by this class
                itself.

        Raises:
            ValueError: If ``file_path`` is given with an unsupported
                ``file_type``.
        """
        self.file_path = file_path
        self.target = target
        self.model_type = model_type
        self.data = None
        if file_path is not None:
            self.load_data(file_path, file_type)

    def load_data(self, file_path: str, file_type: str = 'csv'):
        """Read a file into a DataFrame, store it on ``self.data`` and return it.

        Args:
            file_path: Location of the file; passed straight to
                ``pd.read_csv`` / ``pd.read_excel``.
            file_type: ``'csv'`` or ``'excel'``.

        Returns:
            The loaded DataFrame (the same object as ``self.data``).

        Raises:
            ValueError: If ``file_type`` is neither ``'csv'`` nor ``'excel'``.
        """
        if file_type == 'csv':
            self.data = pd.read_csv(file_path)
        elif file_type == 'excel':
            self.data = pd.read_excel(file_path)
        else:
            raise ValueError("Unsupported file type. Supported types are 'csv' and 'excel'.")

        return self.data

    def clean_data(self, df: pd.DataFrame):
        """Drop duplicate rows and fill missing values.

        Numeric columns are filled with the column mean, object columns with
        the column's most frequent value.

        Args:
            df: Frame to clean; it is not modified.

        Returns:
            A new, cleaned DataFrame.
        """
        # Work on an explicit copy so the column assignments below never write
        # into (or warn about) a view of the caller's frame.
        df = df.drop_duplicates().copy()

        for column in df.select_dtypes(include=['number']).columns:
            df[column] = df[column].fillna(df[column].mean())

        for column in df.select_dtypes(include=['object']).columns:
            modes = df[column].mode()
            # mode() is empty when every value is missing; there is nothing to
            # fill with, so such a column is left as it is.
            if not modes.empty:
                df[column] = df[column].fillna(modes.iloc[0])

        return df

    def prepare_pipeline(self, df: pd.DataFrame):
        """Drop duplicate rows and impute missing values with scikit-learn.

        Does the same job as ``clean_data`` but through ``SimpleImputer``
        instances combined in a ``ColumnTransformer``: mean for numeric
        columns, most frequent value for object columns.

        Args:
            df: Frame to process; it is not modified.
                NOTE(review): column names are passed to ``ColumnTransformer``
                as labels, which assumes string column names - confirm for
                frames with integer labels.

        Returns:
            A new DataFrame with duplicates removed and missing values filled.
        """
        df = df.drop_duplicates().copy()

        def needs_imputing(dtypes):
            # Only columns with some, but not all, values missing are imputed:
            # complete columns need no work, and an all-missing column gives
            # the imputer nothing to learn a fill value from.
            candidates = df.select_dtypes(include=dtypes).columns
            return [
                column for column in candidates
                if df[column].isna().any() and df[column].notna().any()
            ]

        numeric_columns = needs_imputing(['number'])
        categorical_columns = needs_imputing(['object'])

        transformers = []
        if numeric_columns:
            transformers.append(
                ('num', SimpleImputer(strategy='mean'), numeric_columns))
        if categorical_columns:
            transformers.append(
                ('cat', SimpleImputer(strategy='most_frequent'), categorical_columns))
        if not transformers:
            return df

        preprocessor = ColumnTransformer(transformers=transformers)
        imputed = pd.DataFrame(preprocessor.fit_transform(df), index=df.index)

        # The transformer output lists columns in transformer order (numeric
        # first, then categorical), so map them back by position.
        for position, column in enumerate(numeric_columns + categorical_columns):
            values = imputed.iloc[:, position]
            if column in numeric_columns:
                # Mixed numeric/object output comes back as objects.
                values = pd.to_numeric(values)
            df[column] = values

        return df

    # Original (misspelled) name, kept so existing callers continue to work.
    prepare_pipleline = prepare_pipeline

    def split_data(self, target: str, test_size: float = 0.2, random_state: int = 42):
        """Split ``self.data`` into train and test feature/target sets.

        Args:
            target: Name of the target column.
            test_size: Forwarded to ``train_test_split``.
            random_state: Forwarded to ``train_test_split``.

        Returns:
            ``(X_train, X_test, y_train, y_test)``.

        Raises:
            ValueError: If no data has been loaded yet.
        """
        if self.data is None:
            raise ValueError("Data not loaded. Please load the data first using load_data method.")

        X = self.data.drop(columns=[target])
        y = self.data[target]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state)

        return X_train, X_test, y_train, y_test

    def transform_data(self):
        """Placeholder for further transformations; currently does nothing."""
        pass

    def encode_categorical(self, df: pd.DataFrame):
        """One-hot encode categorical columns, dropping each first level.

        Args:
            df: Frame to encode; it is not modified.

        Returns:
            The result of ``pd.get_dummies(df, drop_first=True)``.
        """
        return pd.get_dummies(df, drop_first=True)


# The tests are defined and run only when this code is executed directly
# (script or notebook cell), so importing PreProcessor stays side-effect free.
if __name__ == '__main__':

    class TestPreProcessor(unittest.TestCase):
        """Unit tests for PreProcessor."""

        # Four rows: one missing value in column C and one exact duplicate row.
        CSV_TEXT = "A,B,C,D\n1,2,3,4\n5,6,,8\n9,10,11,12\n1,2,3,4\n"

        def setUp(self):
            # A fresh StringIO per read: a buffer can only be consumed once.
            self.df = pd.read_csv(StringIO(self.CSV_TEXT))
            self.processor = PreProcessor()

        def test_load_data(self):
            processor = PreProcessor()
            data = processor.load_data(StringIO(self.CSV_TEXT))
            self.assertEqual(len(data), 4)
            self.assertEqual(list(data.columns), ['A', 'B', 'C', 'D'])

        def test_clean_data(self):
            cleaned_df = self.processor.clean_data(self.df)
            self.assertEqual(cleaned_df.isnull().sum().sum(), 0)
            self.assertEqual(len(cleaned_df), 3)  # One duplicate row should be dropped

        def test_prepare_pipeline(self):
            df = self.processor.prepare_pipeline(self.df)
            self.assertEqual(len(df), 3)  # One duplicate row should be dropped
            self.assertEqual(df.isnull().sum().sum(), 0)

        def test_split_data(self):
            # Split the cleaned frame so the expected total is the 3 unique rows.
            self.processor.data = self.processor.clean_data(self.df)
            X_train, X_test, y_train, y_test = self.processor.split_data(target='D')
            self.assertEqual(len(X_train) + len(X_test), 3)
            self.assertEqual(len(y_train) + len(y_test), 3)

        def test_encode_categorical(self):
            df = pd.DataFrame({
                'A': ['a', 'b', 'a'],
                'B': [1, 2, 3]
            })
            encoded_df = self.processor.encode_categorical(df)
            self.assertEqual(list(encoded_df.columns), ['B', 'A_b'])

    # argv=[''] keeps unittest from parsing the host process's command line;
    # exit=False prevents it from calling sys.exit() when the run finishes.
    unittest.main(argv=[''], exit=False)







# KeyboardInterrupt: