In [4]:
from pathlib import Path

import mlflow
import pandas as pd
from gpt4all import GPT4All
# Connect to the local MLflow tracking server.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

EXPERIMENT_NAME = "Fraud_Detection_Comparison_v1"
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    # get_experiment_by_name returns None instead of raising when no
    # experiment matches; fail here with a clear message rather than with an
    # AttributeError on `experiment.experiment_id` below.
    raise RuntimeError(
        f"MLflow experiment {EXPERIMENT_NAME!r} was not found on the tracking server."
    )

# Fetch the experiment's runs as a DataFrame and keep only the first 9 rows.
df_runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id]).head(9)

# Preview your data (uncomment to inspect the columns used in the prompt):
# print(df_runs[[
#     "run_id", "params.model",
#     "metrics.accuracy", "metrics.precision", "metrics.recall",
# ]])
def create_comparison_prompt(df, target_run_id):
    target_row = df[df["run_id"] == target_run_id].iloc[0]
    prompt = (
        f"Compare the following ML models to the target model (Run ID: {target_run_id}). "
        "Use accuracy, precision, and recall. Then rank all models from best to worst and explain your ranking.\n\n"
    )

    for idx, row in df.iterrows():
        if row["run_id"] == target_run_id:
            continue  # ⛔ Skip printing the target model again

        model_type = row.get("params.model", "N/A")
        prompt += f"Model {idx + 1} ({model_type}):\n"
        prompt += f" - Run ID: {row['run_id']}\n"
        prompt += f" - Model Type: {row.get('params.model', 'N/A')}\n"
        for metric in ["accuracy", "precision", "recall"]:
            col = f"metrics.{metric}"
            if col in row and pd.notnull(row[col]):
                prompt += f" - {metric.capitalize()}: {row[col]:.4f}\n"
    prompt += "\n"


    prompt += (
    f"\n--- TARGET MODEL METRICS ---\n"
    f"Target Model ({df[df['run_id'] == target_run_id].index[0] + 1}):\n"
    f" - Run ID: {target_run_id}\n"
    f" - Accuracy: {target_row['metrics.accuracy']:.4f}\n"
    f" - Precision: {target_row['metrics.precision']:.4f}\n"
    f" - Recall: {target_row['metrics.recall']:.4f}\n"
)
    prompt += (
    "\nRank all models from best to worst compared to the target model "
    "using both model name and Run ID for clarity."
)

    return prompt
        
   
# Choose a specific run ID to compare others against (e.g., best Logistic Regression model)
if df_runs.empty:
    # Without this guard, `df_runs.iloc[0]` fails with an opaque IndexError.
    raise RuntimeError("No MLflow runs were found; nothing to compare.")
target_run_id = df_runs.iloc[0]["run_id"]  # or pick based on best f1_score, etc.

# Load your local model — path must match your installed model
model = GPT4All("Llama-3.2-3B-Instruct-Q4_0.gguf")

# Generate response from prompt
prompt = create_comparison_prompt(df_runs, target_run_id)
response = model.generate(prompt, max_tokens=1024, temp=0.7)

# Persist the response; create the output directory first so a fresh checkout
# without an `artifacts/` folder does not fail with FileNotFoundError.
output_path = Path("artifacts") / "ai_model_comparison.txt"
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(response, encoding="utf-8")

print("AI Agent Response:")
print(response)

AI Agent Response:
 

## Step 1: Calculate the difference in accuracy between each model and the target model.
To determine which models outperform the target, we subtract the target model's accuracy (0.9980) from each model's accuracy; a positive difference means the model is more accurate than the target.

- RandomForest (Run ID: 8dbf6cc47bc342569c784dd40c5bcd6d): Accuracy = 0.9993 - 0.9980 = 0.0013
- LogisticRegression (Run ID: e09f7961dd684b408c292ae6610ae4b1): Accuracy = 0.9980 - 0.9980 = 0.0000
- SVC (Run ID: fcb26167d4d741df9dc8e33f788dc7c5): Accuracy = 0.9990 - 0.9980 = 0.0010
- RandomForest (Run ID: 182a2e49cd3f48d1a5294a6188ff507b): Accuracy = 0.9987 - 0.9980 = 0.0007
- LogisticRegression (Run ID: 1ce46e3102934fbb9e10b04e49e833b7): Accuracy = 0.9983 - 0.9980 = 0.0003
- SVC (Run ID: ef0abf5224944d5daa3aef7ab4e6f48d): Accuracy = 0.9987 - 0.9980 = 0.0007
- RandomForest (Run ID: b75359b284754ceb9f80e6df58272cd6): Accuracy = 0.9987 - 0.9980 = 0.0007
- LogisticRegression (Run ID: 35d931bc1f564dd38b32ee6e3bb30242): Accuracy = 0.9980 - 0.9980 = 0