In [1]:
# Cell 1: Setup and Imports

import pandas as pd
import numpy as np
import sys
import os
from IPython.display import display # For better display of dataframes

# Add the 'src' directory to the Python path
# This allows us to import modules directly from 'src'
current_dir = os.getcwd()
# Assuming the notebook is in 'your_project/notebooks/'
# and src is in 'your_project/src/'
project_root = os.path.abspath(os.path.join(current_dir, '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)
print(f"Added '{project_root}' to sys.path.")

# Import the feature engineering pipeline and its components
try:
    from src.feature_engineering import (
        create_feature_engineering_pipeline,
        FeatureExtractor,
        RFMCalculator,
        AggregateFeatures,
        MissingValueHandler,
        CustomEncoder,
        FeatureScaler
    )
    print("Successfully imported feature_engineering.py components.")
except ImportError as e:
    print(f"Error importing feature_engineering.py: {e}")
    print("Please ensure 'src/feature_engineering.py' exists and is correctly defined.")
    print("Also, check that the project root (containing 'src') is added to sys.path.")

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

print("\nSetup complete.")

Added 'D:\10academy\week5\Credit_Risk_Probability_Model' to sys.path.
Error importing feature_engineering.py: No module named 'sklearn'
Please ensure 'src/feature_engineering.py' exists and is correctly defined.
Also, check that the project root (containing 'src') is added to sys.path.

Setup complete.


In [2]:
# Cell 2: Load Data

# --- Configuration ---
DATA_PATH = '../data/raw/data.csv' # Path to your raw data file

# --- Load Data ---
print("--- Loading Raw Data ---")
try:
    df_raw = pd.read_csv(DATA_PATH)
    print(f"Data loaded successfully from '{DATA_PATH}'")
    print(f"Raw data shape: {df_raw.shape}")
    print("\nRaw data head:")
    display(df_raw.head())
    print("\nRaw data info:")
    df_raw.info()
except FileNotFoundError:
    print(f"Error: {DATA_PATH} not found.")
    print("Please ensure 'data.csv' is located in the '../data/raw/' directory relative to this notebook.")
    # Create a dummy dataframe for testing if the file is truly missing
    print("\n--- Creating a dummy DataFrame for testing. ---")
    df_raw = pd.DataFrame({
        'TransactionId': range(10),
        'BatchId': range(100,110),
        'AccountId': [1,1,2,2,3,3,1,4,4,5],
        'SubscriptionId': range(200,210),
        'CustomerId': [10,10,20,20,30,30,10,40,40,50],
        'CurrencyCode': ['KES']*10,
        'CountryCode': [254]*10,
        'ProviderId': [1,2,1,3,2,1,2,3,1,2],
        'ProductId': ['P1','P2','P1','P3','P2','P1','P3','P1','P2','P3'],
        'ProductCategory': ['CatA','CatB','CatA','CatC','CatB','CatA','CatC','CatA','CatB','CatC'],
        'ChannelId': ['App','Web','App','POS','Web','App','POS','Web','App','POS'],
        'Amount': [100.5, -50.0, 200.0, 75.2, -30.0, 150.0, 80.0, 120.0, -20.0, 90.0],
        'Value': [100.5, 50.0, 200.0, 75.2, 30.0, 150.0, 80.0, 120.0, 20.0, 90.0],
        'TransactionStartTime': pd.to_datetime([
            '2024-01-01 10:00:00', '2024-01-01 11:30:00', '2024-01-02 09:00:00',
            '2024-01-02 14:00:00', '2024-01-03 16:00:00', '2024-01-03 10:00:00',
            '2024-01-04 08:00:00', '2024-01-04 12:00:00', '2024-01-05 09:00:00',
            '2024-01-05 13:00:00'
        ]),
        'PricingStrategy': [1]*10,
        'FraudResult': [0,0,0,0,1,0,0,0,0,0] # Example fraud result
    })
    print(f"Dummy data shape: {df_raw.shape}")
    display(df_raw.head())

# Create a working copy for individual tests
df_working_copy = df_raw.copy()

--- Loading Raw Data ---
Data loaded successfully from '../data/raw/data.csv'
Raw data shape: (95662, 16)

Raw data head:


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0



Raw data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   TransactionId         95662 non-null  object 
 1   BatchId               95662 non-null  object 
 2   AccountId             95662 non-null  object 
 3   SubscriptionId        95662 non-null  object 
 4   CustomerId            95662 non-null  object 
 5   CurrencyCode          95662 non-null  object 
 6   CountryCode           95662 non-null  int64  
 7   ProviderId            95662 non-null  object 
 8   ProductId             95662 non-null  object 
 9   ProductCategory       95662 non-null  object 
 10  ChannelId             95662 non-null  object 
 11  Amount                95662 non-null  float64
 12  Value                 95662 non-null  int64  
 13  TransactionStartTime  95662 non-null  object 
 14  PricingStrategy       95662 non-null  int64  
 15  Fra

In [4]:
# Cell 3: Test FeatureExtractor

print("\n--- Testing FeatureExtractor ---")
extractor = FeatureExtractor()
df_extracted = extractor.fit_transform(df_working_copy.copy()) # Use a fresh copy for this test
print("After FeatureExtractor (first 5 rows):")
display(df_extracted.head())
print(f"New time-based columns: {[col for col in df_extracted.columns if 'Transaction' in col and col != 'TransactionId']}")
print(f"Original 'TransactionStartTime' column removed: {'TransactionStartTime' not in df_extracted.columns}")


--- Testing FeatureExtractor ---


NameError: name 'FeatureExtractor' is not defined

In [None]:
# Cell 4: Test RFMCalculator

print("\n--- Testing RFMCalculator ---")
rfm_calc = RFMCalculator()
df_rfm = rfm_calc.fit_transform(df_working_copy.copy()) # Use a fresh copy for this test (needs original timestamp)
print("After RFMCalculator (first 5 rows):")
display(df_rfm.head())
print(f"RFM columns added: {[col for col in ['Recency', 'Frequency', 'Monetary'] if col in df_rfm.columns]}")
print("\nRFM features summary statistics:")
display(df_rfm[['Recency', 'Frequency', 'Monetary']].describe())

In [None]:
# Cell 5: Test AggregateFeatures

print("\n--- Testing AggregateFeatures ---")
# Ensure 'Amount' is numeric before aggregation
df_working_copy['Amount'] = pd.to_numeric(df_working_copy['Amount'], errors='coerce')
aggregator = AggregateFeatures()
df_aggregated = aggregator.fit_transform(df_working_copy.copy()) # Use a fresh copy for this test
print("After AggregateFeatures (first 5 rows):")
display(df_aggregated.head())
print(f"Aggregate columns added: {[col for col in df_aggregated.columns if 'TransactionAmount' in col or 'TransactionCount' in col]}")
print("\nAggregate features summary statistics:")
display(df_aggregated[['TotalTransactionAmount', 'AverageTransactionAmount', 'TransactionCount']].describe())

In [None]:
# Cell 6: Test MissingValueHandler

print("\n--- Testing MissingValueHandler ---")
# Create a copy and introduce some NaNs for this specific test
df_with_nans = df_working_copy.copy()
# Introduce NaN in 'Amount' and 'Value' for a couple of rows
df_with_nans.loc[[1, 5], ['Amount', 'Value']] = np.nan
# Introduce NaN in a categorical column for testing
df_with_nans.loc[[2, 6], 'ProductCategory'] = np.nan
df_with_nans.loc[[3], 'ChannelId'] = np.nan

print("Data before MissingValueHandler (showing NaNs counts):")
print(df_with_nans[['Amount', 'Value', 'ProductCategory', 'ChannelId']].isnull().sum())
print("\nBefore imputation - Sample rows with NaNs:")
display(df_with_nans[df_with_nans['Amount'].isnull() | df_with_nans['ProductCategory'].isnull()].head())

imputer = MissingValueHandler(strategy='mean')
df_imputed = imputer.fit_transform(df_with_nans.copy()) # Use a fresh copy for this test

print("\nData after MissingValueHandler (showing NaNs counts):")
print(df_imputed[['Amount', 'Value', 'ProductCategory', 'ChannelId']].isnull().sum())
print("\nAfter imputation - Sample rows (should be filled):")
display(df_imputed[df_imputed['TransactionId'].isin([df_with_nans.loc[1, 'TransactionId'], df_with_nans.loc[2, 'TransactionId']])])

print("\nValue counts for ProductCategory after imputation (should show no NaNs, most frequent used):")
print(df_imputed['ProductCategory'].value_counts(dropna=False))

In [None]:
# Cell 7: Test CustomEncoder (One-Hot Encoding)

print("\n--- Testing CustomEncoder (One-Hot Encoding) ---")
# Start with a dataframe that has no NaNs (e.g., df_imputed from previous step or a fresh copy)
df_for_encoding = df_imputed.copy() if 'df_imputed' in locals() else df_working_copy.copy()

# Select some categorical columns for encoding
categorical_cols_onehot = ['ProductCategory', 'ChannelId', 'CurrencyCode']
# Ensure these columns exist and are of object type before encoding
cols_to_encode_existing = [col for col in categorical_cols_onehot if col in df_for_encoding.columns and df_for_encoding[col].dtype == 'object']

encoder_onehot = CustomEncoder(method='onehot', columns=cols_to_encode_existing)
df_encoded_onehot = encoder_onehot.fit_transform(df_for_encoding.copy()) # Use a fresh copy for this test

print(f"Original categorical columns being encoded: {cols_to_encode_existing}")
print(f"Shape before encoding: {df_for_encoding.shape}")
print(f"Shape after One-Hot Encoding: {df_encoded_onehot.shape}")
print("\nFirst 5 rows with new one-hot encoded columns:")
display(df_encoded_onehot.head())

print("\nPresence of original and new columns:")
for col in cols_to_encode_existing:
    print(f"Is original '{col}' column present? {col in df_encoded_onehot.columns}")
    # Check for some of the new one-hot encoded columns
    if col in df_encoded_onehot.columns: # If original column still exists (shouldn't for proper one-hot)
        pass
    else: # If original column was dropped, check for new dummy variables
        example_dummy_cols = [c for c in df_encoded_onehot.columns if c.startswith(f"{col}_")][:3]
        if example_dummy_cols:
            print(f"Example new dummy columns for '{col}': {example_dummy_cols}")
        else:
            print(f"No new dummy columns found for '{col}'. Check if it was encoded.")

In [None]:
# Cell 8: Test CustomEncoder (Label Encoding)

print("\n--- Testing CustomEncoder (Label Encoding) ---")
# Use a fresh copy of the raw data for a clear demonstration of label encoding
df_for_label_encoding = df_raw.copy()
df_for_label_encoding['ProductCategory'] = df_for_label_encoding['ProductCategory'].fillna('Unknown') # Handle NaNs for label encoding

categorical_cols_label = ['ProductId', 'ProductCategory'] # Example columns for label encoding

encoder_label = CustomEncoder(method='label', columns=categorical_cols_label)
df_encoded_label = encoder_label.fit_transform(df_for_label_encoding.copy()) # Use a fresh copy for this test

print(f"Original categorical columns being encoded: {categorical_cols_label}")
print("\nFirst 5 rows with label encoded columns:")
display(df_encoded_label[categorical_cols_label].head())

print("\nUnique encoded values for 'ProductId':")
print(df_encoded_label['ProductId'].unique())
print("\nUnique encoded values for 'ProductCategory':")
print(df_encoded_label['ProductCategory'].unique())
print(f"Original 'ProductId' dtype: {df_for_label_encoding['ProductId'].dtype}, Encoded 'ProductId' dtype: {df_encoded_label['ProductId'].dtype}")

In [None]:
# Cell 9: Test FeatureScaler (StandardScaler)

print("\n--- Testing FeatureScaler (StandardScaler) ---")
# For this test, let's combine some features from previous steps to ensure we have variety
# Start with a dataframe that has numerical columns (e.g., after extraction, RFM, aggregation, imputation)
df_for_scaling = df_encoded_onehot.copy() # Using the one-hot encoded and imputed data

# Ensure numerical columns are present and identified correctly
numerical_cols_to_scale = df_for_scaling.select_dtypes(include=np.number).columns.tolist()
# Exclude identifier columns that shouldn't be scaled (if they are numeric)
ids_to_exclude = ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CountryCode', 'PricingStrategy', 'FraudResult', 'ProviderId']
numerical_cols_to_scale = [col for col in numerical_cols_to_scale if col not in ids_to_exclude]

print(f"Numerical columns selected for scaling: {numerical_cols_to_scale}")

if not numerical_cols_to_scale:
    print("No suitable numerical columns found for scaling after previous steps. Skipping scaler test.")
else:
    scaler = FeatureScaler(method='standard', columns=numerical_cols_to_scale)
    df_scaled = scaler.fit_transform(df_for_scaling.copy()) # Use a fresh copy for this test

    print("\nAfter FeatureScaler (first 5 rows of scaled numerical columns):")
    display(df_scaled[numerical_cols_to_scale].head())

    print("\nMean and Standard Deviation of scaled columns (should be ~0 and ~1):")
    display(df_scaled[numerical_cols_to_scale].describe().loc[['mean', 'std']])

In [None]:
# Cell 10: Test Full Pipeline

print("\n--- Testing Full Feature Engineering Pipeline ---")

# Initialize the full pipeline
# You can customize imputation strategy and encoding method here
full_pipeline = create_feature_engineering_pipeline(
    numerical_imputation_strategy='mean',
    categorical_encoding_method='onehot'
)

# Apply the full pipeline to the raw data
print(f"Shape of raw data: {df_raw.shape}")
df_processed_full = full_pipeline.fit_transform(df_raw.copy()) # Use a fresh copy of the raw data

print("\n--- Full Processed Data Overview ---")
print(f"Shape of fully processed data: {df_processed_full.shape}")
print("\nFirst 5 rows of fully processed data:")
display(df_processed_full.head())
print("\nInformation about fully processed columns:")
df_processed_full.info()

print("\n--- Final Check for Missing Values in Fully Processed Data ---")
final_missing_values = df_processed_full.isnull().sum()
final_missing_values = final_missing_values[final_missing_values > 0]
print(final_missing_values)
if final_missing_values.empty:
    print("No missing values found in the final processed data.")
else:
    print("Warning: Missing values still present after full pipeline. Investigate columns listed above.")

print("\n--- Feature Engineering Pipeline Testing Completed ---")