In [1]:
import sys
from pathlib import Path

# Put the parent of the current working directory at the front of sys.path.
# NOTE(review): presumably the notebook is launched from a sub-folder of the
# repository so that the parent is the project root containing `src/` — confirm.
PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))

# Imported after the sys.path change above so that `src` can be located.
from src.llm.response_generator import generate_multiple_responses

prompt = "Explain what overfitting is in machine learning."

# Request five answers to the same prompt; later cells index into and embed
# this collection.
responses = generate_multiple_responses(prompt, n=5)

# One banner + body per answer. A single print with a newline separator emits
# exactly the same text as printing the banner and the body separately.
for i, response_text in enumerate(responses):
    print(f"\n--- Response {i+1} ---\n", response_text, sep="\n")


--- Response 1 ---

Overfitting is a common issue that occurs when a machine learning model learns the training data too well, to the point where it performs poorly on unseen data (such as validation or test sets). In other words, the model becomes excessively complex and starts fitting not just the underlying pattern in the training data, but also the noise or outliers.

Overfitting can occur when a model is too complex relative to the amount of training data available, or when it's trained for too long. This results in a model that fits the training data extremely well, but struggles to generalize to new, unseen data.

Some signs of overfitting include:
1. High accuracy on the training set but poor performance on the validation set or test set.
2. The model performs well on easy examples from the training set, but poorly on harder examples that the model has not seen before.
3. A high variance in the model's predictions, meaning that the model's predictions fluctuate a lot depending

In [2]:
from src.embeddings.embedding_engine import EmbeddingEngine

# A single engine instance is created here and reused by later cells.
engine = EmbeddingEngine()

# Pipeline: responses -> embeddings -> pairwise similarity -> one score.
# Each intermediate is kept under its own name so it can be inspected.
embeddings = engine.encode(responses)
similarity_matrix = engine.compute_similarity_matrix(embeddings)
consistency_score = engine.compute_consistency_score(similarity_matrix)

print("Consistency Score:", consistency_score)



Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Consistency Score: 0.89655083


In [3]:
from src.trust.self_critique import generate_self_critique

# Only the first sampled response is critiqued; bind it once so the same
# object is both critiqued and displayed.
original_answer = responses[0]
critique = generate_self_critique(original_answer)

# Show the answer and its critique one after the other for side-by-side reading.
print("Original Answer:\n", original_answer)
print("\nSelf-Critique:\n", critique)

Original Answer:
 Overfitting is a common issue that occurs when a machine learning model learns the training data too well, to the point where it performs poorly on unseen data (such as validation or test sets). In other words, the model becomes excessively complex and starts fitting not just the underlying pattern in the training data, but also the noise or outliers.

Overfitting can occur when a model is too complex relative to the amount of training data available, or when it's trained for too long. This results in a model that fits the training data extremely well, but struggles to generalize to new, unseen data.

Some signs of overfitting include:
1. High accuracy on the training set but poor performance on the validation set or test set.
2. The model performs well on easy examples from the training set, but poorly on harder examples that the model has not seen before.
3. A high variance in the model's predictions, meaning that the model's predictions fluctuate a lot depending on

In [4]:
from src.trust.self_critique import SemanticVulnerabilityScorer

# Score the critique text generated for the first response.
# NOTE(review): how the score is derived is internal to
# SemanticVulnerabilityScorer and not visible from this notebook.
scorer = SemanticVulnerabilityScorer()
semantic_self_score = scorer.compute_score(critique)

print("Semantic Self-Critique Score:", semantic_self_score)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Semantic Self-Critique Score: 0.2064296156167984


In [7]:
from src.trust.trust_signals import TrustSignals

# Gather the two signals computed in earlier cells. How they are combined is
# decided inside TrustSignals.compute_final_score (not visible here).
signal_inputs = {
    "consistency_score": consistency_score,
    "vulnerability_score": semantic_self_score,
}
signals = TrustSignals(**signal_inputs)

new_trust = signals.compute_final_score()

print("Updated Trust Score:", new_trust)

Updated Trust Score: 0.8583459


In [8]:
from src.llm.prompt_variants import generate_prompt_variants

# Re-run the whole trust pipeline once per rephrasing of the baseline prompt
# and collect one final trust score per variant.
variants = generate_prompt_variants(prompt)

# Build the scorer once, outside the loop. Constructing it per iteration
# reloaded the underlying model weights for every variant (one load report per
# variant in the cell output) even though the constructor takes no
# variant-specific arguments.
# NOTE(review): assumes compute_score keeps no state between calls — confirm.
scorer = SemanticVulnerabilityScorer()

variant_trust_scores = []

for v in variants:
    # Loop-local results use a `variant_` prefix so the baseline `responses`,
    # `embeddings`, `critique` and `signals` from earlier cells are not
    # overwritten; those cells can then be re-run without picking up the last
    # variant's data.
    variant_responses = generate_multiple_responses(v, n=5)

    # Consistency of the sampled answers for this variant.
    variant_embeddings = engine.encode(variant_responses)
    sim_matrix = engine.compute_similarity_matrix(variant_embeddings)
    consistency = engine.compute_consistency_score(sim_matrix)

    # As in the baseline cells, only the first response is critiqued.
    variant_critique = generate_self_critique(variant_responses[0])
    vulnerability = scorer.compute_score(variant_critique)

    variant_signals = TrustSignals(
        consistency_score=consistency,
        vulnerability_score=vulnerability
    )

    trust = variant_signals.compute_final_score()

    variant_trust_scores.append(trust)

print("Trust across variants:", variant_trust_scores)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Trust across variants: [np.float32(0.8211827), np.float32(0.84009165), np.float32(0.8125754)]


In [9]:
import numpy as np

# Convert the per-variant scores to an array once and summarise it.
trust_array = np.asarray(variant_trust_scores)

mean_trust = trust_array.mean()
# Default ddof=0, i.e. the population standard deviation of the collected scores.
std_trust = trust_array.std()

print("Mean Trust:", mean_trust)
print("Trust Std Dev:", std_trust)

Mean Trust: 0.8246166
Trust Std Dev: 0.011492882
