In [5]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

In [6]:
# Read the raw transactions CSV into the notebook's working DataFrame.
df = pd.read_csv(filepath_or_buffer="../data/raw/data.csv")


In [7]:
# Quick structural overview of the raw frame.
# NOTE: a notebook cell only renders the value of its *last* expression, so
# the result of df.head() below is computed and then discarded.
df.head()
# info() prints its column/dtype/non-null report directly to stdout.
df.info()
# Last expression in the cell: this summary table is what gets rendered.
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   TransactionId         95662 non-null  object 
 1   BatchId               95662 non-null  object 
 2   AccountId             95662 non-null  object 
 3   SubscriptionId        95662 non-null  object 
 4   CustomerId            95662 non-null  object 
 5   CurrencyCode          95662 non-null  object 
 6   CountryCode           95662 non-null  int64  
 7   ProviderId            95662 non-null  object 
 8   ProductId             95662 non-null  object 
 9   ProductCategory       95662 non-null  object 
 10  ChannelId             95662 non-null  object 
 11  Amount                95662 non-null  float64
 12  Value                 95662 non-null  int64  
 13  TransactionStartTime  95662 non-null  object 
 14  PricingStrategy       95662 non-null  int64  
 15  FraudResult        

Unnamed: 0,CountryCode,Amount,Value,PricingStrategy,FraudResult
count,95662.0,95662.0,95662.0,95662.0,95662.0
mean,256.0,6717.846,9900.584,2.255974,0.002018
std,0.0,123306.8,123122.1,0.732924,0.044872
min,256.0,-1000000.0,2.0,0.0,0.0
25%,256.0,-50.0,275.0,2.0,0.0
50%,256.0,1000.0,1000.0,2.0,0.0
75%,256.0,2800.0,5000.0,2.0,0.0
max,256.0,9880000.0,9880000.0,4.0,1.0


In [8]:
# Plain-text dump of the frame's dimensions, column index and first rows,
# one item per line.
print(df.shape, df.columns, df.head(), sep="\n")

(95662, 16)
Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult'],
      dtype='object')
         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId CurrencyCode  CountryCode    ProviderId     ProductId  \
0  CustomerId_4406          UGX          256  ProviderId_6  ProductId_10   
1  CustomerId_4406          UGX          256  ProviderId_4   ProductId

In [9]:
class DateTimeFeatureExtractor(BaseEstimator, TransformerMixin):
    """Replace a timestamp column with hour/day/month/year features.

    Parameters
    ----------
    datetime_col : str, default 'TransactionStartTime'
        Name of the column that holds the timestamps to decompose.
    """

    def __init__(self, datetime_col='TransactionStartTime'):
        self.datetime_col = datetime_col

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    # `transform` has to be defined on the class: TransformerMixin.fit_transform
    # (and therefore Pipeline.fit_transform) looks it up as `self.transform`.
    def transform(self, X):
        """Return a new frame with calendar features instead of the timestamp.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the column named by ``self.datetime_col`` with values
            that ``pd.to_datetime`` can parse.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` without ``self.datetime_col`` and with the columns
            ``TransactionHour``, ``TransactionDay``, ``TransactionMonth`` and
            ``TransactionYear`` added. ``X`` itself is not modified.

        Raises
        ------
        KeyError
            If ``self.datetime_col`` is not a column of ``X``.
        """
        timestamps = pd.to_datetime(X[self.datetime_col])
        return X.drop(columns=[self.datetime_col]).assign(
            TransactionHour=timestamps.dt.hour,
            TransactionDay=timestamps.dt.day,
            TransactionMonth=timestamps.dt.month,
            TransactionYear=timestamps.dt.year,
        )

In [10]:
def transform(self, X):
    """Swap the timestamp column of ``X`` for hour/day/month/year columns.

    NOTE(review): although it takes ``self``, this ``def`` sits at module
    level in its own cell, so it is a plain function rather than a method.

    ``self`` only needs a ``datetime_col`` attribute naming the timestamp
    column. The input frame is left untouched; a new frame is returned with
    ``TransactionHour``, ``TransactionDay``, ``TransactionMonth`` and
    ``TransactionYear`` appended and the timestamp column removed.
    """
    stamps = pd.to_datetime(X[self.datetime_col])
    calendar_parts = {
        'TransactionHour': stamps.dt.hour,
        'TransactionDay': stamps.dt.day,
        'TransactionMonth': stamps.dt.month,
        'TransactionYear': stamps.dt.year,
    }
    return X.drop(columns=[self.datetime_col]).assign(**calendar_parts)

In [11]:
class AggregateFeatureGenerator(BaseEstimator, TransformerMixin):
    """Attach per-customer summary statistics of ``Amount`` to every row."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    # `transform` has to be defined on the class: TransformerMixin.fit_transform
    # (and therefore Pipeline.fit_transform) looks it up as `self.transform`.
    def transform(self, X):
        """Return ``X`` with customer-level aggregates of ``Amount`` merged in.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns ``CustomerId`` and ``Amount``.

        Returns
        -------
        pandas.DataFrame
            New frame with the original columns plus
            ``TotalTransactionAmount`` (sum), ``AverageTransactionAmount``
            (mean), ``TransactionCount`` (count) and ``StdTransactionAmount``
            (std) computed over each customer's rows. ``X`` is not modified.

        Raises
        ------
        KeyError
            If ``CustomerId`` or ``Amount`` is missing from ``X``.
        """
        customer_agg = (
            X.groupby('CustomerId')['Amount']
            .agg(
                TotalTransactionAmount='sum',
                AverageTransactionAmount='mean',
                TransactionCount='count',
                StdTransactionAmount='std',
            )
            .reset_index()
        )
        # A left join keeps every input row, in input order. The result of a
        # column-on-column merge carries a fresh default integer index.
        return X.merge(customer_agg, on='CustomerId', how='left')

In [12]:
def transform(self, X):
    """Merge per-customer ``Amount`` statistics onto every row of ``X``.

    NOTE(review): although it takes ``self``, this ``def`` sits at module
    level in its own cell, so it is a plain function rather than a method;
    ``self`` is never read.

    For each ``CustomerId`` the sum, mean, count and standard deviation of
    ``Amount`` are computed and left-joined back as
    ``TotalTransactionAmount``, ``AverageTransactionAmount``,
    ``TransactionCount`` and ``StdTransactionAmount``. The input frame is not
    modified.
    """
    per_customer = (
        X.groupby('CustomerId')['Amount']
        .agg(
            TotalTransactionAmount='sum',
            AverageTransactionAmount='mean',
            TransactionCount='count',
            StdTransactionAmount='std',
        )
        .reset_index()
    )
    return pd.merge(X, per_customer, how='left', on='CustomerId')

In [13]:
def build_pipeline(df, target_col='FraudResult'):
    """Engineer features from raw transactions and fit the preprocessor.

    Steps, in order:
      1. drop the transaction/batch/account/subscription/provider/product/
         channel identifier columns;
      2. expand the timestamp via ``DateTimeFeatureExtractor``;
      3. add per-customer aggregates via ``AggregateFeatureGenerator``;
      4. split off ``target_col``;
      5. impute + scale numeric columns and impute + one-hot encode the
         remaining columns with a ``ColumnTransformer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw transactions. Not modified.
    target_col : str, default 'FraudResult'
        Name of the label column.

    Returns
    -------
    X : array-like
        Output of ``ColumnTransformer.fit_transform`` on the engineered
        features.
    y : pandas.Series
        The ``target_col`` values, row-aligned with ``X``.
    pipeline : sklearn.compose.ColumnTransformer
        The fitted preprocessor (numeric + categorical branches).

    Raises
    ------
    KeyError
        If a column this function drops or reads is missing from ``df``.
    """
    # drop() returns a new frame, so the caller's df is never mutated.
    df = df.drop(columns=['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId',
                          'ProviderId', 'ProductId', 'ChannelId'])

    datetime_pipe = Pipeline([
        ('datetime_features', DateTimeFeatureExtractor())
    ])

    aggregate_pipe = Pipeline([
        ('aggregate_features', AggregateFeatureGenerator())
    ])

    df = datetime_pipe.fit_transform(df)
    df = aggregate_pipe.fit_transform(df)

    # Split only after feature engineering so y stays row-aligned with X even
    # if a step re-indexes the frame.
    y = df[target_col]
    # CustomerId was needed as the grouping key for the aggregates; as a raw
    # identifier it is not kept as a model feature.
    X = df.drop(columns=[target_col, 'CustomerId'])

    # 'number' rather than an explicit int64/float64 list: the calendar
    # features produced by the `.dt` accessor may be int32.
    numerical_cols = X.select_dtypes(include='number').columns.tolist()
    # Everything that is not numeric goes through the categorical branch.
    categorical_cols = [col for col in X.columns if col not in numerical_cols]

    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    pipeline = ColumnTransformer([
        ('num', numeric_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols)
    ])

    X = pipeline.fit_transform(X)
    return X, y, pipeline

In [14]:


def load_csv_data(file_path: str = "../data/raw/data.csv") -> pd.DataFrame:
    """Load raw CSV data from the given file path.

    Parameters
    ----------
    file_path : str, default "../data/raw/data.csv"
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents, as produced by ``pd.read_csv``.
    """
    raw_frame = pd.read_csv(file_path)
    return raw_frame


In [15]:
# Column that holds the prediction target.
target_col = "FraudResult"

In [16]:
# Label column of the raw frame.
target_col = 'FraudResult'

# Split the frame: `y` is the label series, `X` is every other column.
y = df.loc[:, target_col]
X = df.drop(target_col, axis=1)


In [17]:

# Partition the feature columns by dtype.
# NOTE(review): on the raw frame `categorical_cols` also picks up the object-
# typed identifier and timestamp columns -- confirm that is intended before
# one-hot encoding them.
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(X.select_dtypes(include=['object', 'category']).columns)

# Numeric branch: fill gaps with the column median, then standardise.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode into a dense array; levels unseen during fit are ignored.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

In [18]:
# Route each column group through its own sub-pipeline: numeric columns to
# `numeric_pipeline`, categorical columns to `categorical_pipeline`.
full_pipeline = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols),
    ]
)


In [19]:
def run_feature_engineering(input_path, output_pipeline_path):
    """Run feature engineering on a CSV file and persist the pipeline.

    Parameters
    ----------
    input_path : str
        CSV file to read with ``pd.read_csv``.
    output_pipeline_path : str
        Directory (created if missing) in which ``feature_pipeline.pkl`` is
        written.

    Returns
    -------
    tuple
        The first two values returned by ``build_pipeline`` (features, target).
    """
    raw = pd.read_csv(input_path)
    features, target, fitted_pipeline = build_pipeline(raw)

    # Ensure the destination directory exists before serialising into it.
    os.makedirs(output_pipeline_path, exist_ok=True)
    artifact_file = os.path.join(output_pipeline_path, 'feature_pipeline.pkl')
    joblib.dump(fitted_pipeline, artifact_file)

    return features, target

In [20]:
def limit_cardinality(df, col, top_n=10):
    """Collapse all but the ``top_n`` most frequent levels of ``col`` to 'Other'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. The column is overwritten in place on this object.
    col : str
        Column whose levels are reduced.
    top_n : int, default 10
        Number of most frequent levels to keep unchanged.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    level_counts = df[col].value_counts()
    kept_levels = level_counts.nlargest(top_n).index

    def _keep_or_other(level):
        # Levels outside the kept set are pooled under a single label.
        if level in kept_levels:
            return level
        return 'Other'

    df[col] = df[col].apply(_keep_or_other)
    return df


In [21]:
# Report the feature-matrix shape and how many distinct levels each
# categorical column has.
print("Shape before transform:", X.shape)
print("Number of categorical columns:", len(categorical_cols))
print("Unique levels in each categorical column:")
for col, level_count in X[categorical_cols].nunique().items():
    print(col, level_count)


Shape before transform: (95662, 15)
Number of categorical columns: 11
Unique levels in each categorical column:
TransactionId 95662
BatchId 94809
AccountId 3633
SubscriptionId 3627
CustomerId 3742
CurrencyCode 1
ProviderId 6
ProductId 23
ProductCategory 9
ChannelId 4
TransactionStartTime 94556
