In [18]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek
from sklearn.ensemble import IsolationForest


import warnings
# NOTE(review): called with only the "ignore" action and no category/module
# filter, so every warning emitted after this cell runs is suppressed.
warnings.filterwarnings("ignore")

In [19]:
# Pipeline configuration

# Seed and hold-out fraction passed to train_test_split.
RANDOM_STATE = 1
TEST_SIZE = 0.25

# Column separated from the features and used as the target.
target_column = 'Class'

# Input: location of the raw CSV that gets loaded.
dataset_path = 'datasets'
feature_store_file_path = os.path.join(dataset_path, 'creditcard.csv')

# Output: folder and file paths the transformed train/test CSVs are written to.
artifact_folder = 'artifacts'
transformed_train_file_path: str = os.path.join(artifact_folder, 'train.csv')
transformed_test_file_path: str = os.path.join(artifact_folder, 'test.csv')

In [20]:
# Load a CSV file into a DataFrame
def get_data(file_path: str) -> pd.DataFrame:
    """Read the CSV file at ``file_path`` and return its contents.

    Args:
        file_path: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    loaded_frame = pd.read_csv(file_path)
    return loaded_frame

In [21]:
# Function to get the anomaly scores using Isolation Forest
def get_anomaly_scores(X_train, X_test, contamination=0.00172, random_state=1):
    """Fit an Isolation Forest on the training features and score both splits.

    The forest is fitted on ``X_train`` only; ``X_test`` is scored with the
    already-fitted model and never used for fitting.

    Args:
        X_train: Training features used to fit the forest and then scored.
        X_test: Test features, scored only.
        contamination: Forwarded to ``IsolationForest(contamination=...)``.
            The default of 0.00172 keeps the value that used to be hard-coded
            here (presumably the outlier share of the dataset this notebook
            loads; TODO confirm).
        random_state: Seed forwarded to ``IsolationForest``. The default of 1
            keeps the value that used to be hard-coded here.

    Returns:
        A tuple ``(anomaly_scores_train, anomaly_scores_test)`` holding the
        ``decision_function`` output for each split. Per the scikit-learn
        documentation, lower values are more abnormal and negative values
        correspond to outliers.
    """
    isolation_forest = IsolationForest(
        contamination=contamination,
        random_state=random_state,
    )
    isolation_forest.fit(X_train)

    anomaly_scores_train = isolation_forest.decision_function(X_train)
    anomaly_scores_test = isolation_forest.decision_function(X_test)

    return anomaly_scores_train, anomaly_scores_test

In [22]:
# Build the preprocessing ColumnTransformer
def get_data_transformer_object():
    """Create the (unfitted) preprocessing transformer for the feature frame.

    Two column groups are handled:

    * ``V1`` .. ``V28`` (registered as ``'pca_features'``): median imputation only.
    * ``Time``, ``Amount``, ``anomaly_score`` (registered as ``'other_features'``):
      median imputation followed by standard scaling.

    Returns:
        A ``ColumnTransformer`` combining both groups. Per the scikit-learn
        documentation its output columns follow the order of the transformer
        list, i.e. ``V1`` .. ``V28`` first, then ``Time``, ``Amount``,
        ``anomaly_score``, which is not the order of the input frame.
    """
    impute_only_columns = [f'V{index}' for index in range(1, 29)]
    impute_and_scale_columns = ['Time', 'Amount', 'anomaly_score']

    impute_only = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])
    impute_and_scale = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    return ColumnTransformer(
        transformers=[
            ('pca_features', impute_only, impute_only_columns),
            ('other_features', impute_and_scale, impute_and_scale_columns),
        ]
    )

In [25]:
# Function to initiate data transformation
def initiate_data_transformation():
    """Run the full preprocessing pipeline and write train/test CSVs.

    Steps:
        1. Load the raw CSV from ``feature_store_file_path``.
        2. Split off ``target_column`` and make a stratified train/test split.
        3. Add an ``anomaly_score`` feature from ``get_anomaly_scores``.
        4. Fit the preprocessor on the training split and apply it to both.
        5. Resample the training split with ``SMOTETomek``.
        6. Write both splits (features + target) into ``artifact_folder``.

    Returns:
        None. The results are written to ``transformed_train_file_path`` and
        ``transformed_test_file_path``.

    Raises:
        ValueError: If the preprocessor output does not have one column per
            configured feature, so the columns cannot be labelled safely.
    """
    # Load data
    df = get_data(feature_store_file_path)

    # Split data into features and target variable
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    # Get anomaly scores for both splits (the model is fitted on train only)
    anomaly_scores_train, anomaly_scores_test = get_anomaly_scores(X_train, X_test)

    # assign() builds new frames instead of writing a column into the frames
    # handed back by train_test_split.
    X_train = X_train.assign(anomaly_score=anomaly_scores_train)
    X_test = X_test.assign(anomaly_score=anomaly_scores_test)

    # Fit the preprocessor on the training data only, then apply it to both splits
    preprocessor = get_data_transformer_object()
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)

    # The ColumnTransformer emits columns in the order of its transformer list,
    # not in the order of X_train.columns, so the labels must be derived from
    # the transformer specification to match the data underneath them.
    feature_columns = [
        column
        for _name, _transformer, columns in preprocessor.transformers
        for column in columns
    ]
    if X_train_transformed.shape[1] != len(feature_columns):
        raise ValueError(
            "Preprocessor produced {} columns but {} feature names were expected".format(
                X_train_transformed.shape[1], len(feature_columns)
            )
        )

    # Handle class imbalance using SMOTE + Tomek links (training split only)
    smote = SMOTETomek(random_state=RANDOM_STATE, sampling_strategy=0.2)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_transformed, y_train)

    # Concatenate the transformed features and target variable.
    # The targets are attached as plain arrays: y_test still carries the row
    # labels of the original frame, and assigning it as a Series would align on
    # that index against the new 0..n-1 index instead of assigning by position.
    train_df = pd.DataFrame(X_train_resampled, columns=feature_columns)
    train_df[target_column] = np.asarray(y_train_resampled)
    test_df = pd.DataFrame(X_test_transformed, columns=feature_columns)
    test_df[target_column] = np.asarray(y_test)

    # Save the transformed data
    os.makedirs(artifact_folder, exist_ok=True)
    train_df.to_csv(transformed_train_file_path, index=False)
    test_df.to_csv(transformed_test_file_path, index=False)

In [26]:
# Run the whole transformation; the function returns nothing and writes the
# train/test CSVs to disk.
initiate_data_transformation()