In [1]:
import mlflow
import pandas as pd
from gpt4all import GPT4All
# Connect to local MLflow server
mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment = mlflow.get_experiment_by_name("Fraud_Detection_Comparison_v2")

# Get all runs from the experiment
df_runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

"""# Preview your data
print(df_runs[[
    "run_id", "params.model", 
    "metrics.accuracy", "metrics.precision", "metrics.recall",
]])"""
def create_comparison_prompt(df, target_run_id):
    target_row = df[df["run_id"] == target_run_id].iloc[0]
    prompt = (
        f"Compare the following ML models to the target model (Run ID: {target_run_id}). "
        "Use accuracy, precision, and recall. Then rank all models from best to worst and explain your ranking.\n\n"
    )

    for idx, row in df.iterrows():
        if row["run_id"] == target_run_id:
            continue  # ⛔ Skip printing the target model again

        model_type = row.get("params.model", "N/A")
        prompt += f"Model {idx + 1} ({model_type}):\n"
        prompt += f" - Run ID: {row['run_id']}\n"
        prompt += f" - Model Type: {row.get('params.model', 'N/A')}\n"
        for metric in ["accuracy", "precision", "recall"]:
            col = f"metrics.{metric}"
            if col in row and pd.notnull(row[col]):
                prompt += f" - {metric.capitalize()}: {row[col]:.4f}\n"
    prompt += "\n"


    prompt += (
    f"\n--- TARGET MODEL METRICS ---\n"
    f"Target Model ({df[df['run_id'] == target_run_id].index[0] + 1}):\n"
    f" - Run ID: {target_run_id}\n"
    f" - Accuracy: {target_row['metrics.accuracy']:.4f}\n"
    f" - Precision: {target_row['metrics.precision']:.4f}\n"
    f" - Recall: {target_row['metrics.recall']:.4f}\n"
)
    prompt += (
    "\nRank all models from best to worst compared to the target model "
    "using both model name and Run ID for clarity."
)

    return prompt
        
   
# Choose a specific run ID to compare others against (e.g., best Logistic Regression model)
target_run_id = df_runs.iloc[0]["run_id"]  # or pick based on best f1_score, etc.

# Load your local model — path must match your installed model
model = GPT4All("Llama-3.2-3B-Instruct-Q4_0.gguf")

# Generate response from prompt
prompt = create_comparison_prompt(df_runs, target_run_id)
response = model.generate(prompt, max_tokens=2048, temp=0.7)

with open("artifacts/ai_model_comparison.txt", "w", encoding="utf-8") as f:
    f.write(response)

print("AI Agent Response:")
print(response)

AI Agent Response:
 Here is how you can format your answer:

The top-ranked model is Model 3 (LogisticRegression) with a score of [insert value]. The second-best ranked model is Model 2 (RandomForest) with a score of [insert value].

... 

Here are the results:


| Model Name | Run ID | Accuracy | Precision | Recall |
| --- | --- | --- | --- | --- |
| RandomForest | f5b3659f07ea4d84910d0b9aa91c3455 | 1.0000 | 0.0000 | 0.0000 |
| LogisticRegression | ba43822afefd46dfa1d14a211968677e | 1.0000 | 0.0000 | 0.0000 |

The bottom-ranked model is Model 2 (RandomForest) with a score of -∞.


Note: Since the target model's accuracy, precision, and recall are all 0, it serves as an upper bound for comparison.

Since both models have identical metrics to the target model, they can't be ranked based on their performance alone. However, since we need to rank them from best to worst, I will use a different metric - F1 score.


F1 Score is calculated using the following formula: 
2 * (Precision * Recal