#### This notebook demonstrates how to calculate RFM (Recency, Frequency, Monetary) metrics for each customer, cluster customers based on their RFM values, assign a high-risk label, and merge this information back into the main dataset.
#### The following cells will walk through each step of this process.


In [1]:
import sys
from pathlib import Path

# Make the helper modules under ../src importable from this notebook.
# The path is resolved to an absolute string once, so the entry keeps pointing
# at the same directory even if the working directory changes later, and it is
# only appended when missing so re-running this cell does not stack duplicates.
SRC_DIR = str(Path('../src').resolve())
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

#### In the next cell, we load the cleaned transaction data from a CSV file into a pandas DataFrame for further processing.


In [2]:
import pandas as pd

# Location of the cleaned transactions, relative to the notebook's working directory.
clean_data_path = '../data/processed/clean_data.csv'
df = pd.read_csv(clean_data_path)

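#### The next cell imports the `calculate_rfm` function and applies it to the transaction DataFrame to compute RFM metrics for each customer, using `CustomerId` as the customer key, `TransactionStartTime` as the date column, and `Amount` as the monetary column.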


In [3]:
from rfm_target_engineering import calculate_rfm

# Name the three source columns once, then hand them to calculate_rfm by keyword.
rfm_columns = {
    'customer_id_col': 'CustomerId',
    'date_col': 'TransactionStartTime',
    'amount_col': 'Amount',
}
rfm = calculate_rfm(df, **rfm_columns)

#### The next cell imports the function for clustering customers based on their RFM values and applies it to the RFM DataFrame, assigning each customer to a cluster.


In [4]:
from rfm_target_engineering import cluster_rfm

# Keep the two tunables together: 3 clusters and a fixed random_state of 42
# (presumably so cluster assignments repeat between runs — confirm in cluster_rfm).
clustering_options = dict(n_clusters=3, random_state=42)
rfm = cluster_rfm(rfm, **clustering_options)

#### The next cell imports the function to assign a high-risk label to each customer based on their RFM cluster and applies it to the RFM DataFrame.


In [5]:
from rfm_target_engineering import assign_high_risk_label

# Rebind rfm to the helper's result. Later cells read an 'is_high_risk' column
# after rfm is merged into df, so this step presumably adds that column
# — confirm against assign_high_risk_label.
rfm = assign_high_risk_label(rfm)

#### The next cell imports the function to merge the high-risk labels from the RFM DataFrame back into the original cleaned transaction DataFrame, so that each transaction record is associated with its customer's risk label.


In [6]:
from rfm_target_engineering import merge_high_risk

# Attach the risk label from rfm to the transaction rows, keyed by
# customer_id_col='CustomerId'.
# This cell rebinds df to its own output, so a second run would feed the
# already-labelled frame back in. Dropping any existing 'is_high_risk' column
# first (a no-op on the first run) keeps the cell safe to re-run.
# NOTE(review): assumes merge_high_risk adds a single 'is_high_risk' column,
# which is the name the feature/target split below reads — confirm in src.
df = merge_high_risk(
    df.drop(columns=['is_high_risk'], errors='ignore'),
    rfm,
    customer_id_col='CustomerId',
)

#### The next cell splits the DataFrame into features (X) and the target variable (y), where 'is_high_risk' is the target indicating whether a transaction is associated with a high-risk customer.


In [7]:
# Separate the label from everything else: y is the 'is_high_risk' column,
# X is df with only that column removed.
target_col = 'is_high_risk'
y = df[target_col]
X = df.drop(columns=[target_col])

#### The next cell prints the column names of the DataFrame to help verify the data structure after merging the high-risk labels.


In [8]:
# One print call, newline-separated: the heading first, then the column list.
# df is the frame rebound by the merge step, not the raw CSV read.
print("Columns in clean_data.csv:", df.columns.tolist(), sep="\n")


Columns in clean_data.csv:
['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CountryCode', 'Amount', 'Value', 'TransactionStartTime', 'PricingStrategy', 'FraudResult', 'total_transaction_amount', 'avg_transaction_amount', 'transaction_count', 'std_transaction_amount', 'transaction_hour', 'transaction_day', 'transaction_month', 'transaction_year', 'CurrencyCode_UGX', 'ProviderId_ProviderId_1', 'ProviderId_ProviderId_2', 'ProviderId_ProviderId_3', 'ProviderId_ProviderId_4', 'ProviderId_ProviderId_5', 'ProviderId_ProviderId_6', 'ProductId_ProductId_1', 'ProductId_ProductId_10', 'ProductId_ProductId_11', 'ProductId_ProductId_12', 'ProductId_ProductId_13', 'ProductId_ProductId_14', 'ProductId_ProductId_15', 'ProductId_ProductId_16', 'ProductId_ProductId_19', 'ProductId_ProductId_2', 'ProductId_ProductId_20', 'ProductId_ProductId_21', 'ProductId_ProductId_22', 'ProductId_ProductId_23', 'ProductId_ProductId_24', 'ProductId_ProductId_27', 'ProductId_ProductId_3', 'Prod