In this project I'm learning how to create a reproducable ML model pipeline using MLflow to track experiments including logging hyperparameters and model versions. This allowed me to explore the effect of different feature engineering techniques and ways to deal with class imbalance in the dataset

In [91]:
import pandas as pd
import mlflow
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, make_scorer, recall_score
from sklearn.decomposition import PCA
from imblearn.over_sampling import ADASYN, SMOTE
import warnings
warnings.filterwarnings("ignore")

In [92]:
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("AnomalyDetection")

2026/01/26 19:37:26 INFO mlflow.tracking.fluent: Experiment with name 'AnomalyDetection' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/5', creation_time=1769449046886, experiment_id='5', last_update_time=1769449046886, lifecycle_stage='active', name='AnomalyDetection', tags={}>

In [93]:
df = pd.read_csv("FraudDetectionTransactionsDataset.csv")
df.head()

Unnamed: 0,Transaction_ID,User_ID,Transaction_Amount,Transaction_Type,Timestamp,Account_Balance,Device_Type,Location,Merchant_Category,IP_Address_Flag,...,Daily_Transaction_Count,Avg_Transaction_Amount_7d,Failed_Transaction_Count_7d,Card_Type,Card_Age,Transaction_Distance,Authentication_Method,Risk_Score,Is_Weekend,Fraud_Label
0,TXN_33553,USER_1834,39.79,POS,2023-08-14 19:30:00,93213.17,Laptop,Sydney,Travel,0,...,7,437.63,3,Amex,65,883.17,Biometric,0.8494,0,0
1,TXN_9427,USER_7875,1.19,Bank Transfer,2023-06-07 04:01:00,75725.25,Mobile,New York,Clothing,0,...,13,478.76,4,Mastercard,186,2203.36,Password,0.0959,0,1
2,TXN_199,USER_2734,28.96,Online,2023-06-20 15:25:00,1588.96,Tablet,Mumbai,Restaurants,0,...,14,50.01,4,Visa,226,1909.29,Biometric,0.84,0,1
3,TXN_12447,USER_2617,254.32,ATM Withdrawal,2023-12-07 00:31:00,76807.2,Tablet,New York,Clothing,0,...,8,182.48,4,Visa,76,1311.86,OTP,0.7935,0,1
4,TXN_39489,USER_2014,31.28,POS,2023-11-11 23:44:00,92354.66,Mobile,Mumbai,Electronics,0,...,14,328.69,4,Mastercard,140,966.98,Password,0.3819,1,1


In [94]:
df.columns

Index(['Transaction_ID', 'User_ID', 'Transaction_Amount', 'Transaction_Type',
       'Timestamp', 'Account_Balance', 'Device_Type', 'Location',
       'Merchant_Category', 'IP_Address_Flag', 'Previous_Fraudulent_Activity',
       'Daily_Transaction_Count', 'Avg_Transaction_Amount_7d',
       'Failed_Transaction_Count_7d', 'Card_Type', 'Card_Age',
       'Transaction_Distance', 'Authentication_Method', 'Risk_Score',
       'Is_Weekend', 'Fraud_Label'],
      dtype='object')

In [95]:
df.columns = [x.lower() for x in df.columns]

In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   transaction_id                50000 non-null  object 
 1   user_id                       50000 non-null  object 
 2   transaction_amount            50000 non-null  float64
 3   transaction_type              50000 non-null  object 
 4   timestamp                     50000 non-null  object 
 5   account_balance               50000 non-null  float64
 6   device_type                   50000 non-null  object 
 7   location                      50000 non-null  object 
 8   merchant_category             50000 non-null  object 
 9   ip_address_flag               50000 non-null  int64  
 10  previous_fraudulent_activity  50000 non-null  int64  
 11  daily_transaction_count       50000 non-null  int64  
 12  avg_transaction_amount_7d     50000 non-null  float64
 13  f

In [97]:
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [98]:
df = df.drop(['transaction_id', 'user_id'], axis=1)

In [99]:
df['fraud_label'].value_counts()

fraud_label
0    33933
1    16067
Name: count, dtype: int64

## Run 1

In [100]:
X = df.drop(['fraud_label'], axis=1)
y = df[['fraud_label']]
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=55)
X_test, X_valid, y_test, y_valid = train_test_split(X_temp, y_temp, test_size=0.4, random_state=55)

In [101]:
categories = X.select_dtypes(include=['object']).columns.tolist()
numerical = X.select_dtypes(include=['number']).columns.tolist()
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform='pandas')

In [102]:
transformer = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categories),
    ("num", 'passthrough', numerical)
])

In [103]:
params = {
    'C': 1.0,
    'class_weight': None,
    'max_iter': 100,
    'penalty': 'l2'
}

In [104]:
pipe = Pipeline([
    ('preprocess', transformer),
    ('model', LogisticRegression(**params))
])

In [105]:
pipe.fit(X_train, y_train)
pipe.score(X_train, y_train)
y_pred = pipe.predict(X_valid)
report = classification_report(y_pred, y_valid, output_dict=True)

In [106]:
with mlflow.start_run(run_name='LR'):
    mlflow.log_params(params)
    mlflow.log_metrics({
        'accuracy': report['accuracy'],
        'recall_0': report['0']['recall'],
        'recall_1': report['1']['recall']
    })
   
    mlflow.sklearn.log_model(pipe.get_params()['model'], name='LogisticRegression')

üèÉ View run LR at: http://127.0.0.1:5000/#/experiments/5/runs/5f8ad0ffac5742a9bab02685dac1ff29
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/5


## Run 2

In [107]:
transformer = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categories),
    ("num", 'passthrough', numerical),
    ("std", StandardScaler(), numerical),
    ("pca", PCA(), numerical)
])

In [108]:
pipe = Pipeline([
    ('preprocess', transformer),
    ('model', LogisticRegression(**params))
])

In [109]:
pipe.fit(X_train, y_train)
pipe.score(X_train, y_train)
y_pred = pipe.predict(X_valid)
report = classification_report(y_pred, y_valid, output_dict=True)

In [110]:
with mlflow.start_run(run_name='LR-OHE'):
    mlflow.log_params(params)
    mlflow.log_metrics({
        'accuracy': report['accuracy'],
        'recall_0': report['0']['recall'],
        'recall_1': report['1']['recall']
    })
   
    mlflow.sklearn.log_model(pipe.get_params()['model'], name='OHE-LogisticRegression')

üèÉ View run LR-OHE at: http://127.0.0.1:5000/#/experiments/5/runs/2d030ff4a5a848589568ad70b547fa5a
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/5


## Run 3

In [111]:
from imblearn.pipeline import Pipeline

pipe = Pipeline([
    ('preprocess', transformer),
    ('smote', SMOTE(sampling_strategy='minority')),
    ('model', LogisticRegression(**params))
])

In [112]:
pipe.fit(X_train, y_train)
pipe.score(X_train, y_train)
y_pred = pipe.predict(X_valid)
report = classification_report(y_pred, y_valid, output_dict=True)

In [113]:
with mlflow.start_run(run_name='LR-SMOTE'):
    mlflow.log_params(params)
    mlflow.log_metrics({
        'accuracy': report['accuracy'],
        'recall_0': report['0']['recall'],
        'recall_1': report['1']['recall']
    })
   
    mlflow.sklearn.log_model(pipe.get_params()['model'], name='SMOTE-LogisticRegression')

üèÉ View run LR-SMOTE at: http://127.0.0.1:5000/#/experiments/5/runs/b503058d24d8443ca3b70628506eaefd
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/5


## Run 4

In [114]:
transformer = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categories),
    ("num", 'passthrough', numerical),
    ("std", StandardScaler(), numerical),
    ("pca", PCA(), numerical)
])

In [115]:
from imblearn.pipeline import Pipeline
pipe = Pipeline([
    ('preprocess', transformer),
    ('adasyn', ADASYN(sampling_strategy='minority')),
    ('model', LogisticRegression(**params))
])

In [116]:
pipe.fit(X_train, y_train)
pipe.score(X_train, y_train)
y_pred = pipe.predict(X_valid)
report = classification_report(y_pred, y_valid, output_dict=True)

In [117]:
with mlflow.start_run(run_name='LR-ADASYN'):
    mlflow.log_params(params)
    mlflow.log_metrics({
        'accuracy': report['accuracy'],
        'recall_0': report['0']['recall'],
        'recall_1': report['1']['recall']
    })
   
    mlflow.sklearn.log_model(pipe.get_params()['model'], name='ADASYN-LogisticRegression')

üèÉ View run LR-ADASYN at: http://127.0.0.1:5000/#/experiments/5/runs/2f0aa0a96c514754b9cfed0bea137014
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/5


**After comparing instances, I find that the model from experiment 2 performs better than the others regarding the recall for** `class 1` **, so I'll register it to MLflow model registry to fine-tune it further**

### Registering a model

In [136]:
model_name = 'OHE-LogisticRegression'
run_id = '2d030ff4a5a848589568ad70b547fa5a'
model_uri = f"runs:/{run_id}/{model_name}"

client.create_model_version(
    name="ohe-LogisticRegression",
    source=model_uri,
    run_id=run_id
)

2026/01/26 20:19:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ohe-LogisticRegression, version 4


<ModelVersion: aliases=[], creation_timestamp=1769451557264, current_stage='None', deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description='', last_updated_timestamp=1769451557264, metrics=None, model_id=None, name='ohe-LogisticRegression', params=None, run_id='2d030ff4a5a848589568ad70b547fa5a', run_link='', source='runs:/2d030ff4a5a848589568ad70b547fa5a/OHE-LogisticRegression', status='READY', status_message=None, tags={}, user_id='', version='4'>

## Run 5

In [120]:
transformer = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categories),
    ("num", 'passthrough', numerical),
    ("std", StandardScaler(), numerical),
    ("pca", PCA(), numerical)
])

In [121]:
params = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'max_iter': [100, 500, 700, 1000],
    'solver': ['lbfgs','newton-cg','liblinear','sag','saga'],
}

In [122]:
recall_1 = make_scorer(recall_score, pos_label=1)

In [123]:
search = RandomizedSearchCV(
    LogisticRegression(), 
    param_distributions=params,
    scoring=recall_1,
    cv=5,
    verbose=1
)

In [124]:
pipe = Pipeline([
    ('preprocess', transformer),
    ('random-search', search)
])

In [125]:
pipe.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [126]:
y_pred = pipe.predict(X_valid)
report = classification_report(y_pred, y_valid, output_dict=True)
print(classification_report(y_pred, y_valid))

              precision    recall  f1-score   support

           0       0.92      0.87      0.89      5777
           1       0.70      0.80      0.74      2223

    accuracy                           0.85      8000
   macro avg       0.81      0.83      0.82      8000
weighted avg       0.86      0.85      0.85      8000



In [134]:
with mlflow.start_run(run_name='CV-OHE-LR'):
    mlflow.log_params(search.best_params_)
    mlflow.log_metrics({
        'accuracy': report['accuracy'],
        'recall_0': report['0']['recall'],
        'recall_1': report['1']['recall']
    })

    mlflow.sklearn.log_model(
        search.best_estimator_, 
        name='ohe-LogisticRegression', 
    )

Registered model 'ohe-LogisticRegression' already exists. Creating a new version of this model...
2026/01/26 20:07:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ohe-LogisticRegression, version 2
Created version '2' of model 'ohe-LogisticRegression'.


üèÉ View run CV-OHE-LR at: http://127.0.0.1:5000/#/experiments/5/runs/b775da2c61444fb5a9ac6a6832b31deb
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/5


In [135]:
run_id = 'b775da2c61444fb5a9ac6a6832b31deb'
model_name = 'ohe-LogisticRegression'

model_uri = f'runs:/{run_id}/{model_name}'

mlflow.register_model(model_uri=model_uri, name=model_name)

Registered model 'ohe-LogisticRegression' already exists. Creating a new version of this model...
2026/01/26 20:08:43 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ohe-LogisticRegression, version 3
Created version '3' of model 'ohe-LogisticRegression'.


<ModelVersion: aliases=[], creation_timestamp=1769450922955, current_stage='None', deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description='', last_updated_timestamp=1769450922955, metrics=None, model_id=None, name='ohe-LogisticRegression', params=None, run_id='b775da2c61444fb5a9ac6a6832b31deb', run_link='', source='models:/m-c2ab94744bb34a668d2591db510ad01d', status='READY', status_message=None, tags={}, user_id='', version='3'>

### Loading a specific version of a registered model

In [None]:
model_name = 'ohe-LogisticRegression'
run_id = 'b775da2c61444fb5a9ac6a6832b31deb'
model = mlflow.sklearn.load_model(f'runs:/{run_id}/{model_name}')

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [139]:
model

### Testing the best Logistic Regression model on the test set

In [141]:
pipe = Pipeline([
    ('preprocess', transformer),
    ('model', model)
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.92      0.85      0.89      8741
           1       0.67      0.80      0.73      3259

    accuracy                           0.84     12000
   macro avg       0.80      0.83      0.81     12000
weighted avg       0.85      0.84      0.84     12000

