In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load raw data
df_loan = pd.read_csv('../data/raw/loan_applications.csv')
df_transaction = pd.read_csv('../data/raw/transactions.csv')

In [3]:
# Preprocessing: Convert dates and extract time info
df_transaction['transaction_date'] = pd.to_datetime(df_transaction['transaction_date'])
df_transaction['hour'] = df_transaction['transaction_date'].dt.hour
df_transaction['weekday'] = df_transaction['transaction_date'].dt.dayofweek

# 1. Transaction-Derived Features (per transaction)

In [4]:
# Flag high-value transactions (above 95th percentile)
df_transaction['is_high_amount'] = df_transaction['transaction_amount'] > df_transaction['transaction_amount'].quantile(0.95)

# Flag transactions done at night
df_transaction['is_night_transaction'] = df_transaction['hour'].between(0, 6)

# Flag transactions done on weekends
df_transaction['is_weekend'] = df_transaction['weekday'].isin([5, 6])

In [5]:
# Risky device detection (example: devices with >10% fraud rate)
device_fraud_rate = df_transaction.groupby('device_used')['fraud_flag'].mean()
risky_devices = device_fraud_rate[device_fraud_rate > 0.1].index
df_transaction['is_device_risky'] = df_transaction['device_used'].isin(risky_devices)

In [6]:
# Compute average risk per customer from device usage
device_risk_by_customer = df_transaction.groupby('customer_id')['is_device_risky'].mean().reset_index(name='device_risk_ratio')

# 2. Customer-Aggregated Features (from transaction history)

In [7]:
# Feature: customer-level aggregates
agg_by_customer = df_transaction.groupby('customer_id').agg(
    transaction_count=('transaction_amount', 'count'),
    total_transaction_amount=('transaction_amount', 'sum'),
    avg_transaction_amount=('transaction_amount', 'mean'),
    max_transaction_amount=('transaction_amount', 'max'),
    fraud_count=('fraud_flag', 'sum'),
    fraud_rate=('fraud_flag', 'mean'),
    num_international_txn=('is_international_transaction', 'sum'),
    unique_devices_used=('device_used', 'nunique')
).reset_index()

In [8]:
# Merge aggregated features into loan dataset
df = pd.merge(df_loan, agg_by_customer, on='customer_id', how='left')
df = pd.merge(df, device_risk_by_customer, on='customer_id', how='left')

# 3. Loan-Derived Features

In [9]:
# Ratio of loan amount to income
df['loan_to_income_ratio'] = df['loan_amount_requested'] / df['monthly_income']

# Estimated monthly installment of the new loan
estimated_emi = df['loan_amount_requested'] / df['loan_tenure_months']

# DEBT BURDEN SCORE: (existing EMIs + estimated new EMI) / income
df['debt_burden_score'] = (df['existing_emis_monthly'] + estimated_emi) / df['monthly_income']

# Flag if the applicant is young (< 25 years)
df['is_young_borrower'] = df['applicant_age'] < 25

# Flag for applicants with many dependents (3 or more)
df['is_high_dependents'] = df['number_of_dependents'] >= 3

# Interest rate binning
df['interest_rate_bin'] = pd.cut(df['interest_rate_offered'], bins=[0, 7, 12, 20], labels=['low', 'medium', 'high'])

# 4. Cross-Features

In [10]:
# Risky loan type + high fraud rate from transaction history
df['loan_type_x_fraud_rate'] = df['loan_type'] + "_" + pd.cut(df['fraud_rate'], bins=[0, 0.01, 0.05, 1], labels=['low', 'medium', 'high']).astype(str)

# Interaction between interest rate and loan amount
df['loan_amount_x_interest'] = df['loan_amount_requested'] * df['interest_rate_offered']

# Debt burden x fraud rate (behavioral + financial stress)
df['fraud_weighted_debt_burden'] = df['debt_burden_score'] * df['fraud_rate']

# Device risk + employment status (suspicious profile)
df['device_risk_x_employment'] = df['employment_status'] + "_" + pd.cut(df['device_risk_ratio'], bins=3, labels=['low', 'medium', 'high']).astype(str)

# Young borrower + high loan to income (possible profile manipulation?)
df['young_high_ratio'] = (df['is_young_borrower']) & (df['loan_to_income_ratio'] > 1)


# Save the processed data into csv

In [11]:
df.to_csv('../data/processed/loan_transactions_features.csv', index=False)
print("✅ Yassine feature engineering complete. Data saved to processed folder.")

✅ Yassine feature engineering complete. Data saved to processed folder.


In [13]:
display(df)
print(df.columns)

Unnamed: 0,application_id,customer_id,application_date,loan_type,loan_amount_requested,loan_tenure_months,interest_rate_offered,purpose_of_loan,employment_status,monthly_income,...,loan_to_income_ratio,debt_burden_score,is_young_borrower,is_high_dependents,interest_rate_bin,loan_type_x_fraud_rate,loan_amount_x_interest,fraud_weighted_debt_burden,device_risk_x_employment,young_high_ratio
0,c8bf0bea-70e6-4870-9125-41b8210c527f,CUST109427,2023-04-09,Business Loan,604000.0,12,11.66,Medical Emergency,Retired,34700.0,...,17.406340,1.482229,False,True,medium,Business Loan_nan,7042640.0,0.0,Retired_medium,False
1,91224cec-3544-4bc7-ac15-a9792da54c02,CUST106146,2023-09-23,Car Loan,100000.0,240,13.62,Education,Unemployed,51600.0,...,1.937984,0.008075,False,True,high,Car Loan_nan,1362000.0,0.0,Unemployed_medium,False
2,4efcd02d-4a03-4ab7-9bd1-0ff430493d0c,CUST100674,2023-05-22,Education Loan,431000.0,60,11.40,Medical Emergency,Self-Employed,14800.0,...,29.121622,0.796171,False,True,medium,Education Loan_nan,4913400.0,0.0,Self-Employed_medium,False
3,a61337d4-ba04-4a68-b492-2cb8266e6ed7,CUST106466,2024-07-09,Car Loan,324000.0,120,10.36,Debt Consolidation,Self-Employed,28800.0,...,11.250000,0.232639,False,True,medium,Car Loan_nan,3356640.0,0.0,Self-Employed_medium,False
4,a8d1639e-170b-41b2-826a-55c7dae38d16,CUST112319,2023-11-20,Personal Loan,100000.0,36,14.14,Business Expansion,Salaried,43900.0,...,2.277904,0.088332,False,False,high,Personal Loan_nan,1414000.0,0.0,Salaried_medium,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,1d4b1017-e8a1-4a7b-94d1-63e90b05d665,CUST114840,2024-12-29,Education Loan,381000.0,24,12.05,Vehicle Purchase,Student,49200.0,...,7.743902,0.371443,False,True,high,Education Loan_nan,4591050.0,0.0,Student_medium,False
49996,2984426c-6d0b-437c-9a13-84bf3244c6d4,CUST103055,2022-06-17,Education Loan,736000.0,120,11.03,Medical Emergency,Student,28600.0,...,25.734266,0.354312,False,False,medium,Education Loan_nan,8118080.0,0.0,Student_medium,False
49997,cdbc7ad8-0f23-46c4-aaca-619053ad5605,CUST119886,2024-12-14,Car Loan,436000.0,12,10.98,Wedding,Student,42000.0,...,10.380952,0.879365,False,True,medium,Car Loan_nan,4787280.0,0.0,Student_medium,False
49998,46a8f901-8005-4ae7-923e-f9d0dc8e0ae8,CUST109088,2022-12-16,Business Loan,827000.0,24,16.18,Vehicle Purchase,Business Owner,46700.0,...,17.708779,0.911313,False,True,high,Business Loan_nan,13380860.0,,Business Owner_nan,False


Index(['application_id', 'customer_id', 'application_date', 'loan_type',
       'loan_amount_requested', 'loan_tenure_months', 'interest_rate_offered',
       'purpose_of_loan', 'employment_status', 'monthly_income', 'cibil_score',
       'existing_emis_monthly', 'debt_to_income_ratio',
       'property_ownership_status', 'residential_address', 'applicant_age',
       'gender', 'number_of_dependents', 'loan_status', 'fraud_flag',
       'fraud_type', 'transaction_count', 'total_transaction_amount',
       'avg_transaction_amount', 'max_transaction_amount', 'fraud_count',
       'fraud_rate', 'num_international_txn', 'unique_devices_used',
       'device_risk_ratio', 'loan_to_income_ratio', 'debt_burden_score',
       'is_young_borrower', 'is_high_dependents', 'interest_rate_bin',
       'loan_type_x_fraud_rate', 'loan_amount_x_interest',
       'fraud_weighted_debt_burden', 'device_risk_x_employment',
       'young_high_ratio'],
      dtype='object')
