In [1]:
!pip install pandas



In [2]:
import pandas as pd

# Load the already-existing file
final_data = pd.read_csv('/content/final_data.csv')

# Preview it
final_data.head()


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,...,Recency_in_person,Transaction_Hour,Transaction_Day,Transaction_Month,Transaction_Year,Recency,Frequency,Monetary,Cluster,is_high_risk
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,...,0,2,15,11,2018,0,118,203827,2,1
1,TransactionId_118063,BatchId_118460,AccountId_2442,SubscriptionId_1980,CustomerId_2858,UGX,256,ProviderId_5,ProductId_3,airtime,...,5,3,15,11,2018,5,28,129300,2,1
2,TransactionId_51905,BatchId_93774,AccountId_272,SubscriptionId_4731,CustomerId_598,UGX,256,ProviderId_6,ProductId_10,airtime,...,88,4,15,11,2018,88,4,8340,0,0
3,TransactionId_130161,BatchId_82409,AccountId_710,SubscriptionId_920,CustomerId_1053,UGX,256,ProviderId_1,ProductId_15,financial_services,...,89,4,15,11,2018,89,5,5812,0,0
4,TransactionId_51800,BatchId_112288,AccountId_2634,SubscriptionId_3511,CustomerId_3052,UGX,256,ProviderId_6,ProductId_3,airtime,...,61,4,15,11,2018,61,12,11080,0,0


In [3]:
!pip install mlflow scikit-learn pandas




In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import mlflow
import mlflow.sklearn

# Set MLflow tracking URI (local SQLite DB in Colab)
mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment("CreditRisk_RFM_Experiment")


2025/06/29 10:06:15 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/06/29 10:06:15 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


<Experiment: artifact_location='/content/mlruns/1', creation_time=1751190154767, experiment_id='1', last_update_time=1751190154767, lifecycle_stage='active', name='CreditRisk_RFM_Experiment', tags={}>

In [5]:



# Step 1: Data Cleaning
# Check for missing values
missing_values = final_data.isnull().sum()
print("Missing Values:\n", missing_values)

# Dropping rows with missing values in critical columns
final_data = final_data.dropna(subset=['CustomerId', 'Recency', 'Frequency', 'Monetary', 'is_high_risk'])



Missing Values:
 TransactionId                     0
BatchId                           0
AccountId                         0
SubscriptionId                    0
CustomerId                        0
CurrencyCode                      0
CountryCode                       0
ProviderId                        0
ProductId                         0
ProductCategory                   0
ChannelId                         0
Amount                            0
Value                             0
TransactionStartTime              0
PricingStrategy                   0
FraudResult                       0
Net_Total_Transaction_Amount      0
Gross_Transaction_Amount          0
Average_Transaction_Amount        0
Transaction_Count                 0
Std_Transaction_Amount          712
Last_Transaction_Date             0
Recency_in_person                 0
Transaction_Hour                  0
Transaction_Day                   0
Transaction_Month                 0
Transaction_Year                  0
Recency    

In [6]:
# Step 2: Feature Selection
features = final_data[['Recency', 'Frequency', 'Monetary', 'Transaction_Hour', 'ChannelId', 'ProviderId','PricingStrategy','FraudResult', 'ProductCategory', 'is_high_risk']]



In [7]:
# Step 3: Encoding Categorical Variables
# Using One-Hot Encoding for categorical features
categorical_features = ['ProductCategory','ChannelId', 'ProviderId','PricingStrategy']
# Remove 'FraudResult' from features as it leaks outcome information
features = pd.get_dummies(features, columns=categorical_features, drop_first=True)




In [8]:
# Step 4: Splitting the Dataset
X = features.drop('is_high_risk', axis=1)  # Features
y = features['is_high_risk']  # Target variable

# Splitting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)



In [9]:
# Step 5: Scaling Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display the prepared datasets
print("Training Features Shape:", X_train_scaled.shape)
print("Testing Features Shape:", X_test_scaled.shape)
print("Training Target Shape:", y_train.shape)
print("Testing Target Shape:", y_test.shape)

# Now X_train_scaled and y_train are ready for model training
# X_test_scaled and y_test are ready for model evaluation

Training Features Shape: (76520, 24)
Testing Features Shape: (19130, 24)
Training Target Shape: (76520,)
Testing Target Shape: (19130,)


Feature and Target Shapes

| Dataset   | Shape          | Meaning                                           |
| --------- | -------------- | ------------------------------------------------- |
| `X_train` | `(76,520, 24)` | 76,520 rows (examples) with 24 features each      |
| `X_test`  | `(19,130, 24)` | 19,130 unseen samples with same 24 features       |
| `y_train` | `(76,520,)`    | Corresponding labels (target values) for training |
| `y_test`  | `(19,130,)`    | Corresponding labels for evaluation/testing       |




In [10]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "DecisionTree": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier(),
    "GradientBoosting": GradientBoostingClassifier()
}

param_grids = {
    "LogisticRegression": {'C': [0.1, 1.0, 10.0]},
    "DecisionTree": {'max_depth': [3, 5, 10]},
    "RandomForest": {'n_estimators': [100], 'max_depth': [5, 10]},
    "GradientBoosting": {'n_estimators': [100], 'learning_rate': [0.1]}
}

def evaluate(y_true, y_pred, y_proba):
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "roc_auc": roc_auc_score(y_true, y_proba)
    }


In [11]:
results = []  # Store all evaluation results

best_f1 = 0
best_model = None
best_model_name = ""

for name, model in models.items():
    print(f"Training {name}...")
    grid = GridSearchCV(model, param_grids[name], cv=5, scoring='f1', n_jobs=-1)
    grid.fit(X_train, y_train)

    y_pred = grid.predict(X_test)
    y_proba = grid.predict_proba(X_test)[:, 1]

    metrics = evaluate(y_test, y_pred, y_proba)
    print(f"{name} metrics: {metrics}")

    # Store metrics
    model_result = {"model": name}
    model_result.update(metrics)
    results.append(model_result)

    # Log to MLflow
    with mlflow.start_run(run_name=name):
        mlflow.log_params(grid.best_params_)
        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(grid.best_estimator_, artifact_path="model")

        # Check if best model
        if metrics["f1_score"] > best_f1:
            best_f1 = metrics["f1_score"]
            best_model = grid.best_estimator_
            best_model_name = name

print(f"\nBest model: {best_model_name} with F1 score: {best_f1:.4f}")



Training LogisticRegression...
LogisticRegression metrics: {'accuracy': 0.9763721902770518, 'precision': 0.987056627255756, 'recall': 0.9848503663231094, 'f1_score': 0.9859522625559424, 'roc_auc': np.float64(0.9968376256638395)}




Training DecisionTree...




DecisionTree metrics: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0, 'roc_auc': np.float64(1.0)}




Training RandomForest...




RandomForest metrics: {'accuracy': 0.9997386304234187, 'precision': 0.9998137455764574, 'recall': 0.9998758226747796, 'f1_score': 0.9998447831620775, 'roc_auc': np.float64(0.999999856276244)}




Training GradientBoosting...




GradientBoosting metrics: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0, 'roc_auc': np.float64(1.0)}





Best model: DecisionTree with F1 score: 1.0000


In [12]:
import pandas as pd
import plotly.express as px

# Convert results list to DataFrame (if not already)
results_df = pd.DataFrame(results)

# Melt the DataFrame to long format
melted = results_df.melt(id_vars='model', var_name='metric', value_name='score')

# Create an interactive grouped bar chart
fig = px.bar(
    melted,
    x='model',
    y='score',
    color='metric',
    barmode='group',
    text_auto='.2f',
    title="🔍 Model Evaluation Metrics Comparison (Interactive)",
    labels={'score': 'Evaluation Score', 'model': 'Model Type'}
)

# Update layout for better appearance
fig.update_layout(
    width=950,
    height=550,
    legend_title="Metric",
    xaxis_title="Model",
    yaxis_title="Score",
    title_x=0.5,
    bargap=0.25,
    template='plotly_white'
)

fig.show()


In [13]:
from sklearn.metrics import f1_score

y_train_pred = best_model.predict(X_train)
f1_train = f1_score(y_train, y_train_pred)

y_test_pred = best_model.predict(X_test)
f1_test = f1_score(y_test, y_test_pred)

print(f"Train F1: {f1_train:.4f} | Test F1: {f1_test:.4f}")


Train F1: 1.0000 | Test F1: 1.0000


In [14]:
import joblib
joblib.dump(best_model, "best_model.pkl")


['best_model.pkl']

In [15]:
loaded_model = joblib.load("best_model.pkl")

# Predict risk probability for first 5 test samples
sample = X_test.iloc[:5]
probs = loaded_model.predict_proba(sample)[:, 1]
print("Predicted risk probabilities:", probs)


Predicted risk probabilities: [1. 1. 1. 1. 1.]
