In [None]:
import os

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from mlflow.exceptions import MlflowException
from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Configure MLflow from environment variables, falling back to the defaults below
MLFLOW_TRACKING_URI = os.environ.get(
    'MLFLOW_TRACKING_URI',
    'http://mlflow-service.mlflow.svc.cluster.local:5000',
)
EXPERIMENT_NAME = os.environ.get('MLFLOW_EXPERIMENT', 'fraud-detection-demo')

# Point the client at the tracking server, then set the active experiment
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

print(f'✅ MLflow Tracking URI: {MLFLOW_TRACKING_URI}')
print(f'✅ Experiment: {EXPERIMENT_NAME}')

## 1. Generate synthetic data (10,000 transactions, 2% fraud)

In [None]:
def generate_fraud_dataset(n_samples: int = 10000, n_fraud: int = 200, seed: int = 42) -> pd.DataFrame:
    """Build a synthetic, imbalanced fraud-detection dataset.

    Normal rows draw 28 features from a standard normal distribution and an
    amount from Gamma(shape=2, scale=50). Fraud rows draw the same features
    shifted by +2 and an amount from Gamma(shape=3, scale=100). Normal rows
    come first in the returned frame, followed by the fraud rows.

    Args:
        n_samples: Total number of rows to generate.
        n_fraud: Number of rows labelled as fraud (``Class == 1``).
        seed: Seed passed to ``np.random.seed``. Note that this reseeds
            NumPy's global legacy random state.

    Returns:
        DataFrame with columns ``V1``..``V28``, ``Amount`` and ``Class``
        (0.0 for normal, 1.0 for fraud).

    Raises:
        ValueError: If ``n_fraud`` is negative or greater than ``n_samples``.
    """
    if not 0 <= n_fraud <= n_samples:
        raise ValueError(
            f'n_fraud must be between 0 and n_samples ({n_samples}), got {n_fraud}'
        )

    n_features = 28
    n_normal = n_samples - n_fraud

    np.random.seed(seed)
    # The draw order below (normal features, fraud features, normal amounts,
    # fraud amounts) is what makes the output reproducible for a given seed.
    normal = np.random.randn(n_normal, n_features)
    fraud = np.random.randn(n_fraud, n_features) + 2
    normal_amounts = np.random.gamma(2, 50, n_normal)
    fraud_amounts = np.random.gamma(3, 100, n_fraud)

    # Stack normal rows above fraud rows, then append the amount as a last column.
    features = np.vstack([normal, fraud])
    amounts = np.hstack([normal_amounts, fraud_amounts]).reshape(-1, 1)
    features = np.hstack([features, amounts])
    labels = np.hstack([np.zeros(n_normal), np.ones(n_fraud)])

    columns = [f'V{i}' for i in range(1, n_features + 1)] + ['Amount']
    df = pd.DataFrame(features, columns=columns)
    df['Class'] = labels
    return df

# Build the dataset and report its class balance.
df = generate_fraud_dataset()
# Count rows per label outside the f-string: the column must be looked up by
# its string name 'Class', and this avoids nesting quotes inside the f-string.
fraud_count = int((df['Class'] == 1).sum())
normal_count = int((df['Class'] == 0).sum())
print(f'Total: {len(df)} | Fraud: {fraud_count} | Normal: {normal_count}')

## 2. Preprocessing and train/test split

In [None]:
# Separate the feature columns from the 'Class' label
X = df.drop(columns='Class')
y = df['Class']

# 80/20 split, stratified on the label so both parts keep the same class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardize 'Amount' only: statistics are fitted on the training part and
# reused unchanged for the test part
scaler = StandardScaler()
X_train['Amount'] = scaler.fit_transform(X_train[['Amount']])
X_test['Amount'] = scaler.transform(X_test[['Amount']])

# No resampling is applied here; the "balanced" names simply alias the training split
X_train_balanced, y_train_balanced = X_train, y_train
print(f'Train: {len(X_train)} | Test: {len(X_test)}')

## 3. Train RandomForest with class_weight='balanced'

In [None]:
# Hyperparameters for the random forest; class_weight='balanced' is passed so
# the estimator handles the class imbalance itself
rf_hyperparams = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=4,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
# fit() returns the estimator itself, so construction and training can be chained
model = RandomForestClassifier(**rf_hyperparams).fit(X_train_balanced, y_train_balanced)
print('Model trained!')

## 4. Evaluation and metrics

In [None]:
# Hard class predictions, plus the predicted probability taken from column 1
# of predict_proba
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Scores computed from hard predictions, in the order they are reported
label_scorers = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1_score': f1_score,
}
metrics = {name: scorer(y_test, y_pred) for name, scorer in label_scorers.items()}
# ROC AUC is computed from the predicted probabilities instead of the labels
metrics['roc_auc'] = roc_auc_score(y_test, y_pred_proba)

# Flatten the confusion matrix into its four cells
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

print(metrics)
print(f'Confusion matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}')

## 5. MLflow Tracking and Model Registry

In [None]:
# Log parameters, metrics and the trained model to a new MLflow run, then
# register the logged model as a new version in the Model Registry.
with mlflow.start_run(run_name='fraud-rf-demo') as run:
    # Read hyperparameters from the estimator itself so the logged values
    # cannot drift from what was actually trained.
    rf_params = model.get_params()
    # Share of rows labelled as fraud (Class == 1), as a percentage. Computed
    # outside the f-string; the column is looked up by its string name.
    fraud_rate_pct = (df['Class'] == 1).mean() * 100

    mlflow.log_params({
        'model_type': 'RandomForest',
        'n_estimators': rf_params['n_estimators'],
        'max_depth': rf_params['max_depth'],
        'min_samples_split': rf_params['min_samples_split'],
        'min_samples_leaf': rf_params['min_samples_leaf'],
        'balancing_method': 'class_weight_balanced',
        'test_size': 0.2,
        'n_samples': len(df),
        'fraud_rate': f'{fraud_rate_pct:.2f}%'
    })
    mlflow.log_metrics({
        **metrics,
        # Cast the confusion-matrix cells to plain ints before logging
        'true_negatives': int(tn),
        'false_positives': int(fp),
        'false_negatives': int(fn),
        'true_positives': int(tp)
    })

    # Infer the model signature from the training features and the model's
    # predictions on them, and store it with the logged model.
    signature = infer_signature(X_train_balanced, model.predict(X_train_balanced))
    mlflow.sklearn.log_model(model, 'model', signature=signature)
    mlflow.log_dict({'features': X_train.columns.tolist()}, 'features.json')

    run_id = run.info.run_id
    print(f'Run ID: {run_id}')

    # Model Registry
    client = MlflowClient()
    model_name = 'fraud-detection-demo'
    try:
        client.get_registered_model(model_name)
    except MlflowException:
        # Lookup failed - presumably because the model is not registered yet -
        # so create it. Catching only MlflowException lets KeyboardInterrupt
        # and unrelated errors propagate instead of being swallowed.
        client.create_registered_model(model_name, description='Demo fraud detection model')

    version = client.create_model_version(
        name=model_name,
        source=f'runs:/{run_id}/model',
        run_id=run_id
    )
    print(f'Model version: {version.version}')