# Feature Engineering

In [34]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import scorecardpy as sc

In [35]:
# Load your dataset (replace 'your_file.csv' with your actual file path)
df = pd.read_csv('C:/Users/Administrator/Documents/kifiya/Week_6/cleaned_data.csv')

In [36]:
print(df.columns)

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult'],
      dtype='object')


## 1. Create Aggregate Features

In [37]:
# Total Transaction Amount per customer
df['TotalTransactionAmount'] = df.groupby('CustomerId')['Amount'].transform('sum')

# Average Transaction Amount per customer
df['AverageTransactionAmount'] = df.groupby('CustomerId')['Amount'].transform('mean')

# Transaction Count per customer
df['TransactionCount'] = df.groupby('CustomerId')['TransactionId'].transform('count')

# Standard Deviation of Transaction Amounts per customer
df['TransactionAmountStd'] = df.groupby('CustomerId')['Amount'].transform('std').fillna(0)

## 2. Extract Time-Based Features

In [38]:
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'], errors='coerce')

df['TransactionHour'] = df['TransactionStartTime'].dt.hour
df['TransactionDay'] = df['TransactionStartTime'].dt.day
df['TransactionMonth'] = df['TransactionStartTime'].dt.month
df['TransactionYear'] = df['TransactionStartTime'].dt.year

## 3. Encode Categorical Variables using WOE

In [39]:
# Assuming 'FraudResult' is the target variable and 'ProductCategory', 'ProviderId', 'ChannelId' are the features
features = ['ProductCategory', 'ProviderId', 'ChannelId']

# Calculate the WOE and IV for each feature
bins = sc.woebin(df, y='FraudResult', x=features)

# Apply the WOE transformation to the dataset
df_woe = sc.woebin_ply(df, bins)

[INFO] creating woe binning ...


  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols 

[INFO] converting into woe values ...


In [40]:
print(df_woe.columns)

Index(['TransactionDay', 'TransactionHour', 'BatchId', 'TransactionId',
       'Value', 'TransactionYear', 'ProductId', 'CurrencyCode',
       'TotalTransactionAmount', 'TransactionCount', 'Amount',
       'SubscriptionId', 'TransactionMonth', 'PricingStrategy', 'AccountId',
       'FraudResult', 'AverageTransactionAmount', 'TransactionStartTime',
       'CountryCode', 'TransactionAmountStd', 'CustomerId', 'ProviderId_woe',
       'ProductCategory_woe', 'ChannelId_woe'],
      dtype='object')


## 4. Handle Missing Values

In [41]:
# Fill missing numerical columns with median
for col in ['Amount', 'Value', 'TotalTransactionAmount', 'AverageTransactionAmount', 'TransactionAmountStd']:
    df_woe[col].fillna(df_woe[col].median(), inplace=True)

# Handle missing values for categorical WOE columns
for col in ['ProductCategory_woe', 'CurrencyCode', 'ProviderId_woe', 'ChannelId_woe']:
    df_woe[col].fillna(df_woe[col].mode()[0], inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_woe[col].fillna(df_woe[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_woe[col].fillna(df_woe[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we ar

## 5. Normalize/Standardize Numerical Features

In [42]:
# Standardize (mean=0, std=1) the numerical features
scaler = StandardScaler()
df_woe[['Amount', 'Value', 'TotalTransactionAmount', 'AverageTransactionAmount', 'TransactionAmountStd']] = scaler.fit_transform(
    df_woe[['Amount', 'Value', 'TotalTransactionAmount', 'AverageTransactionAmount', 'TransactionAmountStd']]
)

In [43]:
# Final check of the dataset
display(df_woe.head())

Unnamed: 0,TransactionDay,TransactionHour,BatchId,TransactionId,Value,TransactionYear,ProductId,CurrencyCode,TotalTransactionAmount,TransactionCount,...,AccountId,FraudResult,AverageTransactionAmount,TransactionStartTime,CountryCode,TransactionAmountStd,CustomerId,ProviderId_woe,ProductCategory_woe,ChannelId_woe
0,15,2,BatchId_36123,TransactionId_76871,-0.072291,2018,ProductId_10,UGX,0.170118,119,...,AccountId_3957,0,-0.067623,2018-11-15 02:18:49+00:00,256,-0.167016,CustomerId_4406,-2.906446,-1.690824,0.484515
1,15,2,BatchId_15642,TransactionId_73770,-0.080251,2018,ProductId_6,UGX,0.170118,119,...,AccountId_4841,0,-0.067623,2018-11-15 02:19:08+00:00,256,-0.167016,CustomerId_4406,-2.906446,0.607033,-2.736867
2,15,2,BatchId_53941,TransactionId_26203,-0.076352,2018,ProductId_1,UGX,0.165122,2,...,AccountId_4229,0,-0.072568,2018-11-15 02:44:21+00:00,256,-0.201209,CustomerId_4683,-2.906446,-1.690824,0.484515
3,15,3,BatchId_102363,TransactionId_380,0.096648,2018,ProductId_21,UGX,0.175567,38,...,AccountId_648,0,-0.008155,2018-11-15 03:32:55+00:00,256,-0.008243,CustomerId_988,1.939442,0.607033,0.484515
4,15,3,BatchId_38780,TransactionId_28195,-0.075183,2018,ProductId_6,UGX,0.175567,38,...,AccountId_4841,0,-0.008155,2018-11-15 03:34:21+00:00,256,-0.008243,CustomerId_988,-2.906446,0.607033,-2.736867


In [44]:
# Save the processed dataset to CSV 
df_woe.to_csv('C:/Users/Administrator/Documents/kifiya/Week_6/processed_data.csv', index=False)