In [7]:
# log_to_arize.py
import os
import uuid

import joblib
import mlflow
import pandas as pd
from arize.pandas.logger import Client
from arize.utils.types import Schema, ModelTypes, Environments

# === CONFIG ===
# Module-level settings for the MLflow source model and the Arize destination.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
RUN_ID = "2f080b7d8ddb4b1f9e7434336e4cad97"              # ← Replace with your actual run_id
PARQUET_PATH = "data/fraud_data.parquet"               # ← Path to your dataset

# SECURITY: credentials should not live in source control. The environment
# variables ARIZE_SPACE_KEY / ARIZE_API_KEY take precedence when set; the
# literal fallbacks only keep existing runs working and should be rotated and
# removed once the environment variables are in place.
space_key = os.environ.get("ARIZE_SPACE_KEY", "U3BhY2U6MjM3MTI6RThBTQ==")
api_key = os.environ.get(
    "ARIZE_API_KEY",
    "ak-8c93aa68-e105-4c23-b977-4ffb437fe7a5-rZPuli0UaGIrRAJ3x-OkK1sg_l5e5mFT",
)  # ← Replace with your Arize API key

MODEL_ID = "fraud_detection_model"
MODEL_VERSION = "v1"

# === 1. Load MLflow Model ===
# Build the "runs:/<run_id>/model" URI for the configured run and load the
# artifact through MLflow's generic pyfunc interface.
print("📦 Loading model from MLflow...")
model_uri = "runs:/{}/model".format(RUN_ID)
model = mlflow.pyfunc.load_model(model_uri=model_uri)

# === 2. Load and Prepare Data ===
# Draw a reproducible sample from the dataset and shape it into the exact
# feature matrix (same columns, same order) recorded at training time.
print("📊 Loading and preparing data...")
df = pd.read_parquet(PARQUET_PATH)
# Cap the sample size so datasets with fewer than 1000 rows do not raise.
df = df.sample(n=min(1000, len(df)), random_state=42)
y = df["Class"]

# ✅ Load the saved feature names from training
# NOTE: joblib.load unpickles the file; only load artifacts you trust.
feature_names = joblib.load("artifacts/feature_names.pkl")

# ✅ One-hot encode the input. The encoded frame must be kept: get_dummies
# returns a new DataFrame and does not modify its argument.
X = pd.get_dummies(df.drop(columns=["Class"]))

# Align to the training layout in one step: unexpected columns are dropped,
# missing ones are added filled with 0, and the order follows feature_names.
X = X.reindex(columns=list(feature_names), fill_value=0)

# Explicit check instead of `assert`, which is stripped under `python -O`.
if list(X.columns) != list(feature_names):
    raise ValueError("Mismatch in input features")

# NOTE(review): the traceback recorded with this cell reports the columns from
# feature_names.pkl as "unseen at fit time" — presumably the artifact does not
# belong to the model stored under RUN_ID; verify both come from the same run.

# === 3. Make Predictions ===
# Score the feature matrix, then attach the model output, the ground-truth
# label and a fresh unique id per row directly onto X.
print("🧠 Running model predictions...")
X["prediction"] = model.predict(X)
X["actual"] = y.values
X["prediction_id"] = list(map(str, (uuid.uuid4() for _ in X.index)))

# ✅ Define log_df for Arize: an independent copy with a clean 0..n-1 index
log_df = X.copy().reset_index(drop=True)

# === 4. Arize Client Setup ===
# Authenticate against Arize with the credentials from the CONFIG section.
client = Client(space_key=space_key, api_key=api_key)

# Tell Arize which columns of the logged frame hold the prediction id, the
# predicted label and the actual label.
schema = Schema(
    prediction_id_column_name="prediction_id",
    prediction_label_column_name="prediction",
    actual_label_column_name="actual",
)

# === 5. Log to Arize ===
# Send the first 100 scored rows as production records.
print("🚀 Logging to Arize...")
response = client.log(
    # Use the CONFIG constants so the destination model is defined in exactly
    # one place (previously divergent literals were passed here).
    model_id=MODEL_ID,
    model_version=MODEL_VERSION,
    model_type=ModelTypes.BINARY_CLASSIFICATION,
    environment=Environments.PRODUCTION,
    dataframe=log_df.head(100),
    schema=schema,
)

# Only report success when the request actually succeeded; a non-200 status
# used to be printed next to a success mark.
if response.status_code == 200:
    print(f"✅ Arize log status: {response.status_code}")
else:
    print(f"❌ Arize log failed with status: {response.status_code}")
print(response.text)


📦 Loading model from MLflow...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

📊 Loading and preparing data...
🧠 Running model predictions...


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- EventTime_2025-07-23T07:23:34.015046
- V1
- V10
- V2
- V21
- ...
