<a href="https://colab.research.google.com/github/wesslen/llm-experiments/blob/main/notebooks/uncertainty/gnll/llama_3_2_3b_instruct.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
from typing import List, Dict, Tuple, Optional, Union
import numpy as np
from scipy.stats import entropy
from tqdm import tqdm
import logging
from dataclasses import dataclass
import torch.nn.functional as F

@dataclass
class UncertaintyEstimate:
    """Container for the uncertainty metrics computed for a single prompt.

    Only ``g_nll`` is required; every other metric defaults to ``None`` so an
    instance can be built when just the G-NLL score has been computed.
    Field order is part of the positional constructor signature.
    """
    # Greedy Negative Log-Likelihood (the only mandatory metric).
    g_nll: float
    # Predictive entropy; None when not computed.
    predictive_entropy: Optional[float] = None
    # Semantic entropy; None when not computed.
    semantic_entropy: Optional[float] = None
    # Length-normalized predictive entropy; None when not computed.
    length_normalized_pe: Optional[float] = None
    # Length-normalized semantic entropy; None when not computed.
    length_normalized_se: Optional[float] = None
    # Discrete semantic entropy; None when not computed.
    discrete_se: Optional[float] = None

class LLMUncertaintyAnalyzer:
    """Estimate how uncertain a causal LLM is about its answer to a prompt.

    The analyzer wraps a generation model plus an encoder used to compare
    sampled answers, and exposes G-NLL, predictive-entropy and
    semantic-entropy style measures.

    All token log-probabilities are taken from the scores returned by
    ``generate`` (i.e. after any logits processors / temperature), and are
    always the log-probability of the token that was actually emitted.
    """

    def __init__(
        self,
        model_name: str,
        nli_model_name: str = "microsoft/deberta-v3-large",
        device: str = "cuda" if torch.cuda.is_available() else "cpu",
        use_fp16: bool = True
    ):
        """
        Initialize uncertainty analyzer with models for generation and semantic similarity.

        Args:
            model_name: HuggingFace model identifier for the main LLM
            nli_model_name: Model whose embeddings are used for semantic similarity scoring
            device: Computing device ("cuda" or "cpu")
            use_fp16: Whether to use FP16 precision (only honoured on "cuda")

        Raises:
            Exception: whatever ``from_pretrained`` raises is printed and re-raised.
        """
        self.device = device
        self.dtype = torch.float16 if use_fp16 and device == "cuda" else torch.float32

        print("Initializing tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        print("Initializing main model...")
        device_map = "auto" if device == "cuda" else None
        try:
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=self.dtype,
                device_map=device_map
            )
            # With device_map="auto" the weights are already placed (and may be
            # split/offloaded); moving the model again is redundant and can
            # fail on dispatched models, so only move it when we placed nothing.
            if device_map is None:
                self.model = self.model.to(device)
        except Exception as e:
            print(f"Error loading main model: {e}")
            raise

        # Set up padding and attention mask
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.model.config.pad_token_id = self.tokenizer.eos_token_id
        print("Initializing NLI model and tokenizer...")
        try:
            self.nli_model = AutoModel.from_pretrained(nli_model_name).to(device)
            self.nli_tokenizer = AutoTokenizer.from_pretrained(nli_model_name)
        except Exception as e:
            print(f"Error loading NLI model: {e}")
            raise

        print("Initialization complete.")

    def _generate_with_log_probs(
        self,
        prompt: str,
        max_length: int,
        **generate_kwargs
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Generate one continuation and score the tokens that were emitted.

        Args:
            prompt: Input text prompt
            max_length: Maximum total length (prompt + continuation) passed to ``generate``
            **generate_kwargs: Extra decoding options (``do_sample``, ``temperature``, ``num_beams``...)

        Returns:
            Tuple of (generated token ids without the prompt,
            per-token log-probabilities of the emitted tokens as float32).
        """
        input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
        prompt_length = input_ids.shape[-1]

        with torch.no_grad():
            outputs = self.model.generate(
                input_ids,
                # A single unpadded prompt: every position is a real token.
                attention_mask=torch.ones_like(input_ids),
                max_length=max_length,
                return_dict_in_generate=True,
                output_scores=True,
                pad_token_id=self.tokenizer.pad_token_id,
                **generate_kwargs
            )

        generated_ids = outputs.sequences[0, prompt_length:]
        if not outputs.scores:
            # Nothing was generated (e.g. the prompt already fills max_length).
            return generated_ids, torch.zeros(0)

        if generate_kwargs.get("num_beams", 1) > 1:
            # Beam search returns one score row per *beam*; the library helper
            # follows beam_indices to recover the scores of the returned
            # sequence (positions past its end are zeroed).
            transition_scores = self.model.compute_transition_scores(
                outputs.sequences,
                outputs.scores,
                outputs.beam_indices,
                normalize_logits=False
            )
            return generated_ids, transition_scores[0].float()

        # Greedy / sampling: one score row per step for the single sequence.
        # Upcast before log-softmax so fp16 models do not lose precision.
        step_scores = torch.stack(outputs.scores, dim=0)[:, 0, :].float()
        log_probs = F.log_softmax(step_scores, dim=-1)
        chosen = generated_ids[:step_scores.shape[0]].to(log_probs.device).unsqueeze(-1)
        token_log_probs = log_probs.gather(-1, chosen).squeeze(-1)
        return generated_ids, token_log_probs

    @staticmethod
    def _cluster_entropy(log_likelihoods: np.ndarray, clusters: List[List[int]]) -> float:
        """Entropy over clusters whose mass is the summed likelihood of their members."""
        cluster_log_mass = np.array(
            [np.logaddexp.reduce(log_likelihoods[members]) for members in clusters]
        )
        # Shift by the max for numerical stability; scipy's entropy()
        # normalises the weights to sum to one.
        weights = np.exp(cluster_log_mass - cluster_log_mass.max())
        return float(entropy(weights))

    def calculate_g_nll(
        self,
        prompt: str,
        max_length: int = 100,
        num_beams: int = 1  # 1 for greedy decoding
    ) -> float:
        """
        Calculate G-NLL (Greedy Negative Log-Likelihood) uncertainty measure.
        Uses greedy or beam search decoding to approximate the most likely sequence.

        Sampling is disabled explicitly so a model whose generation config
        enables sampling by default is still decoded deterministically.

        Args:
            prompt: Input text prompt
            max_length: Maximum generation length (prompt tokens included)
            num_beams: Number of beams for beam search (1 for greedy)

        Returns:
            G-NLL uncertainty score (higher means more uncertain); 0.0 if no
            token was generated.
        """
        _, token_log_probs = self._generate_with_log_probs(
            prompt,
            max_length,
            num_beams=num_beams,
            do_sample=False
        )
        # Negative log-likelihood of the decoded sequence.
        return -float(token_log_probs.sum().item())

    def calculate_semantic_similarity(
        self,
        text1: str,
        text2: str
    ) -> float:
        """Calculate semantic similarity between two texts using the NLI model.

        NOTE: this is the cosine similarity of mean-pooled last hidden states
        of ``self.nli_model``; no entailment classification is performed.

        Returns:
            Cosine similarity of the two text embeddings.
        """
        # Encode each text separately
        encoding1 = self.nli_tokenizer(
            text1,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(self.device)

        encoding2 = self.nli_tokenizer(
            text2,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(self.device)

        with torch.no_grad():
            # Get embeddings for each text
            outputs1 = self.nli_model(**encoding1)
            outputs2 = self.nli_model(**encoding2)

            # Mean pool the last hidden states (each text is a batch of one,
            # so there are no padding positions to mask out)
            embedding1 = outputs1.last_hidden_state.mean(dim=1)
            embedding2 = outputs2.last_hidden_state.mean(dim=1)

            # Calculate cosine similarity
            similarity = F.cosine_similarity(embedding1, embedding2).item()

        return similarity

    def calculate_predictive_entropy(
        self,
        prompt: str,
        num_samples: int = 10,
        max_length: int = 100,
        temperature: float = 0.8
    ) -> Tuple[float, float]:
        """
        Calculate Predictive Entropy (PE) and Length-normalized PE.

        PE is the Monte-Carlo estimate ``-mean(log p(sample))`` over sampled
        continuations; the length-normalized variant averages the per-token
        mean log-probability instead of the sequence total.

        Args:
            prompt: Input text prompt
            num_samples: Number of sampled continuations (must be >= 1)
            max_length: Maximum generation length (prompt tokens included)
            temperature: Sampling temperature

        Returns:
            Tuple of (PE, Length-normalized PE)

        Raises:
            ValueError: if ``num_samples`` is smaller than 1.
        """
        if num_samples < 1:
            raise ValueError("num_samples must be at least 1")

        sequence_log_probs = []
        mean_token_log_probs = []

        for _ in range(num_samples):
            _, token_log_probs = self._generate_with_log_probs(
                prompt,
                max_length,
                do_sample=True,
                temperature=temperature
            )
            total = token_log_probs.sum().item()
            sequence_log_probs.append(total)
            # Guard against an empty continuation to avoid dividing by zero.
            mean_token_log_probs.append(total / max(token_log_probs.numel(), 1))

        pe = -float(np.mean(sequence_log_probs))
        ln_pe = -float(np.mean(mean_token_log_probs))

        return pe, ln_pe

    def calculate_semantic_entropy(
        self,
        prompt: str,
        num_samples: int = 10,
        max_length: int = 100,
        temperature: float = 0.8,
        similarity_threshold: float = 0.8
    ) -> Tuple[float, float, float]:
        """
        Calculate Semantic Entropy (SE), Length-normalized SE, and Discrete SE.

        Sampled continuations (prompt excluded) are greedily grouped into
        clusters by embedding similarity, then three entropies over the
        clusters are computed:

        * SE: cluster mass is the summed sequence likelihood of its members.
        * Length-normalized SE: same, using per-token mean log-likelihoods.
        * Discrete SE: cluster mass is the fraction of samples in the cluster.

        Args:
            prompt: Input text prompt
            num_samples: Number of sampled continuations (must be >= 1)
            max_length: Maximum generation length (prompt tokens included)
            temperature: Sampling temperature
            similarity_threshold: Minimum similarity for joining a cluster

        Returns:
            Tuple of (SE, Length-normalized SE, Discrete SE)

        Raises:
            ValueError: if ``num_samples`` is smaller than 1.
        """
        if num_samples < 1:
            raise ValueError("num_samples must be at least 1")

        # Generate samples, keeping only the continuation: the shared prompt
        # would otherwise dominate the embeddings and merge every sample.
        samples = []
        sequence_log_liks = []
        mean_token_log_liks = []
        for _ in range(num_samples):
            generated_ids, token_log_probs = self._generate_with_log_probs(
                prompt,
                max_length,
                do_sample=True,
                temperature=temperature
            )
            samples.append(self.tokenizer.decode(generated_ids, skip_special_tokens=True))
            total = token_log_probs.sum().item()
            sequence_log_liks.append(total)
            mean_token_log_liks.append(total / max(token_log_probs.numel(), 1))

        # Calculate semantic similarity matrix (symmetric, diagonal unused)
        sim_matrix = np.zeros((num_samples, num_samples))
        for i in range(num_samples):
            for j in range(i + 1, num_samples):
                sim = self.calculate_semantic_similarity(samples[i], samples[j])
                sim_matrix[i, j] = sim_matrix[j, i] = sim

        # Greedy clustering: each unassigned sample seeds a cluster and pulls
        # in every still-unassigned sample similar enough to the seed.
        clusters = []
        used = set()
        for i in range(num_samples):
            if i in used:
                continue
            members = [i]
            used.add(i)
            for j in range(num_samples):
                if j not in used and sim_matrix[i, j] >= similarity_threshold:
                    members.append(j)
                    used.add(j)
            clusters.append(members)

        se = self._cluster_entropy(np.array(sequence_log_liks), clusters)
        ln_se = self._cluster_entropy(np.array(mean_token_log_liks), clusters)
        d_se = float(entropy([len(members) / num_samples for members in clusters]))

        return se, ln_se, d_se

    def estimate_uncertainty(
        self,
        prompt: str,
        max_length: int = 100,
        num_samples: int = 10,
        temperature: float = 0.8,
        calculate_all: bool = True
    ) -> UncertaintyEstimate:
        """
        Calculate all uncertainty measures for a given prompt.

        Args:
            prompt: Input text prompt
            max_length: Maximum generation length
            num_samples: Number of samples for entropy-based measures
            temperature: Sampling temperature
            calculate_all: Whether to calculate all measures or just G-NLL

        Returns:
            UncertaintyEstimate object containing all calculated metrics
        """
        # Always calculate G-NLL as our primary measure
        g_nll = self.calculate_g_nll(prompt, max_length)

        if not calculate_all:
            return UncertaintyEstimate(g_nll=g_nll)

        # Calculate additional measures if requested; the two entropy families
        # draw independent sets of samples.
        pe, ln_pe = self.calculate_predictive_entropy(
            prompt, num_samples, max_length, temperature
        )

        se, ln_se, d_se = self.calculate_semantic_entropy(
            prompt, num_samples, max_length, temperature
        )

        return UncertaintyEstimate(
            g_nll=g_nll,
            predictive_entropy=pe,
            length_normalized_pe=ln_pe,
            semantic_entropy=se,
            length_normalized_se=ln_se,
            discrete_se=d_se
        )

def evaluate_uncertainty(
    analyzer: LLMUncertaintyAnalyzer,
    prompts: List[str],
    ground_truth: List[str],
    f1_threshold: float = 0.5
) -> Dict[str, float]:
    """
    Evaluate uncertainty estimation performance using AUROC.

    An answer is counted as correct when the sentence-level BLEU score of the
    greedy continuation (prompt excluded) against the ground truth reaches
    ``f1_threshold`` (BLEU is used as a simplified stand-in for F1).
    AUROC is computed with *incorrect* answers as the positive class, so a
    useful uncertainty measure scores above 0.5.

    Args:
        analyzer: Initialized LLMUncertaintyAnalyzer
        prompts: List of input prompts
        ground_truth: List of ground truth answers
        f1_threshold: Threshold for the score to consider an answer correct

    Returns:
        Dictionary of AUROC scores for each uncertainty measure; a measure
        maps to NaN when the evaluated answers are all correct or all
        incorrect (AUROC is undefined for a single class).

    Raises:
        ValueError: if ``prompts`` and ``ground_truth`` differ in length.
    """
    from sklearn.metrics import roc_auc_score
    from nltk.translate.bleu_score import sentence_bleu

    if len(prompts) != len(ground_truth):
        raise ValueError("prompts and ground_truth must have the same length")

    results = []
    correctness = []

    for prompt, truth in tqdm(zip(prompts, ground_truth), total=len(prompts)):
        # Get uncertainty estimates
        results.append(analyzer.estimate_uncertainty(prompt, calculate_all=True))

        # Generate answer for correctness evaluation
        input_ids = analyzer.tokenizer.encode(prompt, return_tensors="pt").to(analyzer.device)
        with torch.no_grad():
            output_ids = analyzer.model.generate(
                input_ids,
                attention_mask=torch.ones_like(input_ids),
                max_length=100,
                num_beams=1,
                # Force greedy decoding even if the model's generation config
                # enables sampling by default.
                do_sample=False,
                pad_token_id=analyzer.tokenizer.pad_token_id
            )
        # Score only the continuation; the echoed prompt is not part of the answer.
        answer = analyzer.tokenizer.decode(
            output_ids[0, input_ids.shape[-1]:], skip_special_tokens=True
        )

        # Calculate F1-like score using BLEU (simplified for example)
        score = sentence_bleu([truth.split()], answer.split())
        correctness.append(score >= f1_threshold)

    # Uncertainty should be high when the answer is wrong, so "incorrect" is
    # the positive label.
    incorrect = [not is_correct for is_correct in correctness]

    auroc_scores = {}
    for field in UncertaintyEstimate.__dataclass_fields__:
        # Keep label and value paired so a missing value never shifts the
        # labels of the remaining examples.
        pairs = [
            (label, getattr(result, field))
            for label, result in zip(incorrect, results)
            if getattr(result, field) is not None
        ]
        if not pairs:
            continue

        labels = [label for label, _ in pairs]
        values = [value for _, value in pairs]
        if len(set(labels)) < 2:
            auroc_scores[field] = float("nan")
        else:
            auroc_scores[field] = roc_auc_score(labels, values)

    return auroc_scores

def test_uncertainty_estimation(
    model_name: str = "gpt2",  # Using smaller model by default
    nli_model_name: str = "microsoft/deberta-v3-small",
    test_prompt: str = "What is the capital of France?",
    use_fp16: bool = True,
    max_length: int = 50,
    num_samples: int = 5
):
    """
    Smoke-test the analyzer on a single prompt and print every metric.

    Any exception is reported on stdout and then re-raised.

    Args:
        model_name: Name of the main LLM model
        nli_model_name: Name of the NLI model for semantic similarity
        test_prompt: Prompt to test
        use_fp16: Whether to use half precision
        max_length: Maximum generation length
        num_samples: Number of samples for entropy calculation
    """
    print(f"Initializing with model: {model_name}")
    print(f"Using NLI model: {nli_model_name}")

    try:
        analyzer = LLMUncertaintyAnalyzer(
            model_name=model_name,
            nli_model_name=nli_model_name,
            use_fp16=use_fp16
        )

        print("\nTesting uncertainty estimation...")
        print(f"Prompt: {test_prompt}")

        uncertainty = analyzer.estimate_uncertainty(
            prompt=test_prompt,
            max_length=max_length,
            num_samples=num_samples,
            calculate_all=True
        )

        print("\nResults:")
        print(f"G-NLL: {uncertainty.g_nll:.4f}")

        # Optional metrics, listed in the order they are reported.
        optional_metrics = (
            ("Predictive Entropy", uncertainty.predictive_entropy),
            ("Semantic Entropy", uncertainty.semantic_entropy),
            ("Length-normalized PE", uncertainty.length_normalized_pe),
            ("Length-normalized SE", uncertainty.length_normalized_se),
            ("Discrete SE", uncertainty.discrete_se),
        )
        for label, value in optional_metrics:
            if value is None:
                continue
            print(f"{label}: {value:.4f}")

    except Exception as e:
        print(f"\nError during testing: {str(e)}")
        raise


## Prompt Categories:

1.   Factual: Simple, verifiable facts
2.   Opinion: Subjective questions
3.   Creative: Open-ended creative tasks
4.   Ambiguous: Philosophical/abstract questions
5.   Technical: Complex technical explanations
6.   Arithmetic: Mathematical calculations

## Expected Patterns:

- G-NLL should be lowest for factual and arithmetic questions
- Semantic Entropy should be highest for creative and opinion prompts
- Predictive Entropy might be high for ambiguous questions
- Length-normalized measures should help compare across different response lengths

## Expected Insights:

### Factual questions should show:

- Lower G-NLL (more confident)
- Lower Semantic Entropy (consistent meanings)
- Lower Predictive Entropy (consistent outputs)


### Creative/Opinion questions should show:

- Higher G-NLL (less confident)
- Higher Semantic Entropy (varied meanings)
- Higher Predictive Entropy (varied outputs)


### Technical questions might show:

- Moderate G-NLL
- Low Semantic Entropy
- High Predictive Entropy (many ways to explain)

In [None]:
from huggingface_hub import login
from google.colab import userdata

# Read the Hugging Face access token stored as the Colab secret named 'huggingface'.
hf_token = userdata.get('huggingface')

# Authenticate this session with the Hugging Face Hub using that token.
# NOTE(review): presumably required because the model loaded later in this
# notebook needs an authenticated download — confirm.
login(token=hf_token)

In [None]:
import pandas as pd
import numpy as np
from typing import List, Dict
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

class UncertaintyExperiment:
    """Run an uncertainty analyzer over categorized prompts and summarize the results."""

    # Result columns that hold uncertainty scores, in reporting/plot order.
    MEASURES = ['g_nll', 'predictive_entropy', 'semantic_entropy',
                'length_normalized_pe', 'length_normalized_se', 'discrete_se']

    def __init__(self, analyzer):
        """Store the analyzer; it must provide ``estimate_uncertainty``."""
        self.analyzer = analyzer

    def run_experiment(
        self,
        prompts: Dict[str, List[str]],
        max_length: int = 50,
        num_samples: int = 10,
        temperature: float = 0.8
    ):
        """
        Run uncertainty analysis on different types of prompts.

        Args:
            prompts: Dictionary mapping prompt categories to lists of prompts
            max_length: Maximum generation length
            num_samples: Number of samples for the entropy-based measures
            temperature: Sampling temperature for the entropy-based measures

        Returns:
            DataFrame with one row per prompt and the columns ``category``,
            ``prompt`` and one column per entry of ``MEASURES`` (the columns
            are present even when ``prompts`` is empty).
        """
        results = []

        for category, prompt_list in prompts.items():
            print(f"\nProcessing {category} prompts...")
            for prompt in tqdm(prompt_list):
                uncertainty = self.analyzer.estimate_uncertainty(
                    prompt=prompt,
                    max_length=max_length,
                    num_samples=num_samples,
                    temperature=temperature,
                    calculate_all=True
                )

                row = {'category': category, 'prompt': prompt}
                for measure in self.MEASURES:
                    row[measure] = getattr(uncertainty, measure)
                results.append(row)

        return pd.DataFrame(results, columns=['category', 'prompt'] + self.MEASURES)

    def visualize_results(self, df: pd.DataFrame, save_path: Optional[str] = None):
        """Create visualizations for uncertainty measures across categories.

        Draws one box plot per measure on a 2x3 grid.

        Args:
            df: Results frame as produced by ``run_experiment``
            save_path: If given, the figure is written there and closed;
                otherwise it is shown interactively.
        """
        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        fig.suptitle('Uncertainty Measures Across Different Prompt Types', fontsize=16)

        for ax, measure in zip(axes.flat, self.MEASURES):
            sns.boxplot(data=df, x='category', y=measure, ax=ax)
            ax.set_title(measure)
            # Rotate via tick_params: re-setting the labels with
            # set_xticklabels() triggers matplotlib's fixed-formatter warning.
            ax.tick_params(axis='x', labelrotation=45)

        plt.tight_layout()

        if save_path:
            fig.savefig(save_path)
            # Close this specific figure so repeated calls do not accumulate figures.
            plt.close(fig)
        else:
            plt.show()

    def analyze_correlations(self, df: pd.DataFrame):
        """
        Analyze correlations between different uncertainty measures and provide detailed statistics.

        Args:
            df: Results frame containing ``category`` and every ``MEASURES`` column

        Returns:
            Dict with ``correlation_matrix`` (pairwise correlations of the
            measures) and ``statistics_by_category`` (per category: mean,
            std, min, max and median of each measure).
        """
        measures = list(self.MEASURES)

        # Calculate correlation matrix
        corr_matrix = df[measures].corr()

        # Calculate descriptive statistics for each measure by category
        stats_by_category = {}
        for category in df['category'].unique():
            category_data = df[df['category'] == category][measures]
            stats_by_category[category] = {
                'mean': category_data.mean(),
                'std': category_data.std(),
                'min': category_data.min(),
                'max': category_data.max(),
                'median': category_data.median()
            }

        return {
            'correlation_matrix': corr_matrix,
            'statistics_by_category': stats_by_category
        }

def run_comprehensive_experiment():
    """Run the full prompt-category experiment and report per-category statistics.

    Builds the analyzer, scores five prompts in each of six categories, saves
    the box-plot figure to "uncertainty_results.png", prints category means
    and standard deviations, and returns ``(results_df, correlations)``.
    """
    # Prompts grouped by category; insertion order is the processing order.
    test_prompts = dict(
        factual=[
            "What is the capital of France?",
            "What year did World War II end?",
            "What is the chemical symbol for gold?",
            "Who wrote Romeo and Juliet?",
            "What is the speed of light in meters per second?",
        ],
        opinion=[
            "What is the best way to learn a new language?",
            "Why is democracy important?",
            "What makes a good leader?",
            "Is artificial intelligence beneficial for society?",
            "What is the most important invention in history?",
        ],
        creative=[
            "Write a short story about a magical forest",
            "Describe a futuristic city",
            "Create a new superhero character",
            "Write a poem about sunrise",
            "Invent a new sport and describe its rules",
        ],
        ambiguous=[
            "Why do we dream?",
            "What is consciousness?",
            "How does time work?",
            "What is the meaning of life?",
            "What happens after death?",
        ],
        technical=[
            "Explain how a quantum computer works",
            "Describe the process of photosynthesis",
            "How does blockchain technology function?",
            "Explain the theory of relativity",
            "How does the human immune system work?",
        ],
        arithmetic=[
            "What is 2345 * 789?",
            "Calculate the square root of 169",
            "What is 15% of 430?",
            "Solve for x: 3x + 7 = 22",
            "Convert 157 kilometers to miles",
        ],
    )

    # Set up the model wrapper and the experiment driver.
    experiment = UncertaintyExperiment(
        LLMUncertaintyAnalyzer(
            model_name="meta-llama/Llama-3.2-3B-Instruct",
            nli_model_name="microsoft/deberta-v3-small"
        )
    )

    print("Running uncertainty experiment...")
    results_df = experiment.run_experiment(test_prompts)

    print("\nGenerating visualizations...")
    experiment.visualize_results(results_df, "uncertainty_results.png")

    print("\nAnalyzing correlations between measures...")
    correlations = experiment.analyze_correlations(results_df)

    # Aggregate only the numeric score columns, grouped once by category.
    numeric_columns = ['g_nll', 'predictive_entropy', 'semantic_entropy',
                      'length_normalized_pe', 'length_normalized_se', 'discrete_se']
    per_category = results_df.groupby('category')[numeric_columns]

    print("\nSummary Statistics by Category:")
    print(per_category.mean())

    print("\nStandard Deviations by Category:")
    print(per_category.std())

    return results_df, correlations

if __name__ == "__main__":
    # Script entry point: run the experiment, then print a per-measure
    # breakdown and the strongly correlated measure pairs.
    results_df, correlations = run_comprehensive_experiment()

    print("\nDetailed Analysis:")
    print("=================")

    measures = ['g_nll', 'predictive_entropy', 'semantic_entropy',
               'length_normalized_pe', 'length_normalized_se', 'discrete_se']

    by_category = results_df.groupby('category')
    for measure in measures:
        print(f"\n{measure} Analysis:")
        # Underline as wide as the "<measure> Analysis" heading text.
        print("-" * (len(measure) + 9))

        stats = by_category[measure].agg(['mean', 'std', 'min', 'max'])

        # Categories with the highest and lowest mean score.
        category_means = stats['mean']
        most_uncertain = category_means.idxmax()
        least_uncertain = category_means.idxmin()
        max_uncertainty = stats.loc[most_uncertain, 'mean']
        min_uncertainty = stats.loc[least_uncertain, 'mean']

        print(f"Most uncertain category: {most_uncertain}")
        print(f"  Mean: {max_uncertainty:.4f}")
        print(f"  Std Dev: {stats.loc[most_uncertain, 'std']:.4f}")

        print(f"\nLeast uncertain category: {least_uncertain}")
        print(f"  Mean: {min_uncertainty:.4f}")
        print(f"  Std Dev: {stats.loc[least_uncertain, 'std']:.4f}")

        # Ratio of the extremes; infinite when the smallest mean is zero.
        if min_uncertainty != 0:
            ratio = max_uncertainty / min_uncertainty
        else:
            ratio = float('inf')
        print(f"\nUncertainty ratio (most/least): {ratio:.2f}")

    print("\nCorrelation Analysis:")
    print("===================")

    # Upper-triangle pairs whose absolute correlation exceeds 0.5.
    corr_matrix = correlations['correlation_matrix']
    index_pairs = [
        (i, j)
        for i in range(len(measures))
        for j in range(i + 1, len(measures))
    ]
    high_correlations = [
        (measures[i], measures[j], corr_matrix.iloc[i, j])
        for i, j in index_pairs
        if abs(corr_matrix.iloc[i, j]) > 0.5
    ]

    if high_correlations:
        print("\nStrong correlations found between:")
        ranked = sorted(high_correlations, key=lambda item: abs(item[2]), reverse=True)
        for measure1, measure2, corr in ranked:
            print(f"{measure1} and {measure2}: {corr:.3f}")

Example result:

```

Analyzing correlations between measures...

Summary Statistics by Category:
               g_nll  predictive_entropy  semantic_entropy  \
category                                                     
ambiguous   7.460395           10.947515          0.000000   
arithmetic  5.238763            7.340768          0.127806   
creative    8.621908           11.288671          0.000000   
factual     5.116576            9.395902          0.000000   
opinion     5.921459            9.509346          0.000000   
technical   3.741592            6.405319          0.065017   

            length_normalized_pe  length_normalized_se  discrete_se  
category                                                             
ambiguous               0.250374              0.000000     0.000000  
arithmetic              0.183778              0.042602     0.133333  
creative                0.262687              0.000000     0.000000  
factual                 0.235652              0.000000     0.000000  
opinion                 0.227618              0.000000     0.000000  
technical               0.151334              0.032508     0.100000  

Standard Deviations by Category:
               g_nll  predictive_entropy  semantic_entropy  \
category                                                     
ambiguous   3.438067            2.197581          0.000000   
arithmetic  3.164331            3.265902          0.285784   
creative    1.990587            2.570404          0.000000   
factual     3.054828            2.344336          0.000000   
opinion     2.475576            2.777571          0.000000   
technical   2.069559            2.807816          0.145382   

            length_normalized_pe  length_normalized_se  discrete_se  
category                                                             
ambiguous               0.051525              0.000000     0.000000  
arithmetic              0.081125              0.095261     0.298142  
creative                0.055048              0.000000     0.000000  
factual                 0.053355              0.000000     0.000000  
opinion                 0.060014              0.000000     0.000000  
technical               0.064493              0.072691     0.223607  

Detailed Analysis:
=================

g_nll Analysis:
--------------
Most uncertain category: creative
  Mean: 8.6219
  Std Dev: 1.9906

Least uncertain category: technical
  Mean: 3.7416
  Std Dev: 2.0696

Uncertainty ratio (most/least): 2.30

predictive_entropy Analysis:
---------------------------
Most uncertain category: creative
  Mean: 11.2887
  Std Dev: 2.5704

Least uncertain category: technical
  Mean: 6.4053
  Std Dev: 2.8078

Uncertainty ratio (most/least): 1.76

semantic_entropy Analysis:
-------------------------
Most uncertain category: arithmetic
  Mean: 0.1278
  Std Dev: 0.2858

Least uncertain category: ambiguous
  Mean: 0.0000
  Std Dev: 0.0000

Uncertainty ratio (most/least): inf

length_normalized_pe Analysis:
-----------------------------
Most uncertain category: creative
  Mean: 0.2627
  Std Dev: 0.0550

Least uncertain category: technical
  Mean: 0.1513
  Std Dev: 0.0645

Uncertainty ratio (most/least): 1.74

length_normalized_se Analysis:
-----------------------------
Most uncertain category: arithmetic
  Mean: 0.0426
  Std Dev: 0.0953

Least uncertain category: ambiguous
  Mean: 0.0000
  Std Dev: 0.0000

Uncertainty ratio (most/least): inf

discrete_se Analysis:
--------------------
Most uncertain category: arithmetic
  Mean: 0.1333
  Std Dev: 0.2981

Least uncertain category: ambiguous
  Mean: 0.0000
  Std Dev: 0.0000

Uncertainty ratio (most/least): inf

Correlation Analysis:
===================

Strong correlations found between:
length_normalized_se and discrete_se: 1.000
predictive_entropy and length_normalized_pe: 0.986
semantic_entropy and discrete_se: 0.984
semantic_entropy and length_normalized_se: 0.983
g_nll and predictive_entropy: 0.722
g_nll and length_normalized_pe: 0.705
```