# TextEvolve Evaluate

This notebook contains a demonstration implementation of the TextEvolve Evaluate service, which is inspired by ChatEval ([arXiv:2308.07201](https://arxiv.org/abs/2308.07201)).

In [3]:
from scipy.stats import truncnorm
import numpy as np
from typing import List, Tuple

class TextEvolveEvaluator:
    """
    Demonstration evaluator that scores candidate responses over several debate rounds.

    In every round each agent produces a (candidates x components) score matrix.
    The per-round scores are collected in a 4-D tensor, a coefficient of variation
    (CV) is computed after each round for early stopping, and the tensor is finally
    reduced to one weighted score per candidate.

    Note: the agent scores are *simulated* by ``llm``; no language model is called.
    """

    def __init__(self, a: List[str], w: np.ndarray, r: int, c: float, l: int):
        """
        Initialize the evaluator with the given configuration settings.

        Args:
            a (List[str]): List of debater agents.
            w (np.ndarray): Weight vector for score components.
            r (int): Maximum number of debate rounds.
            c (float): Convergence threshold for early stopping (compared against the round CV).
            l (int): Debate history parameter (currently a placeholder; stored but not used).
        """
        self.a = a
        self.w = w
        self.r = r
        self.c = c
        self.l = l  # Placeholder for the debate history length

    @staticmethod
    def llm(
        x: str,
        y: List[str],
        xi: List[str],
        m_i: List[str],
        j: int,
        k: int,
        round_num: int,
        *,
        initial_cv: float = 0.2,
        cv_decrease_percent: float = 10.0,
    ) -> np.ndarray:
        """
        Simulate the LLM scoring process for an agent using a truncated normal distribution.
        This simulation shrinks the coefficient of variation (CV) of the sampling
        distribution each round to mimic agents reaching consensus over time.

        Note: this is a simulation stand-in and not part of the core algorithm.
        ``x``, ``y``, ``xi`` and ``m_i`` are accepted for interface parity with a
        real scorer but are not read here.

        Args:
            x (str): Input context in natural language (unused by the simulation).
            y (List[str]): List of candidate responses (unused by the simulation).
            xi (List[str]): Debate history up to the current round (unused by the simulation).
            m_i (List[str]): Memories specific to the current agent (unused by the simulation).
            j (int): Number of candidate responses.
            k (int): Number of score components.
            round_num (int): The current round number (0-based).
            initial_cv (float): Initial coefficient of variation for the first round.
                Keyword-only; must be positive.
            cv_decrease_percent (float): Percentage by which CV decreases each round.
                Keyword-only; must be in the range [0, 100).

        Returns:
            np.ndarray: A j x k matrix with scores between 0.0 and 10.0.

        Raises:
            ValueError: If ``initial_cv`` or ``cv_decrease_percent`` would produce a
                non-positive standard deviation.
        """
        # A non-positive standard deviation would make the standardized bounds
        # below undefined (division by zero) or inverted.
        if initial_cv <= 0.0:
            raise ValueError("initial_cv must be positive")
        if not 0.0 <= cv_decrease_percent < 100.0:
            raise ValueError("cv_decrease_percent must be in the range [0, 100)")

        # Generate mean scores for each candidate response and score component.
        # The mean is randomly chosen between 4.5 and 5.5 to center the scores
        # around the middle of the 0-10 range.
        mean_score = np.random.uniform(4.5, 5.5, (j, k))  # Shape: (j, k)

        # CV decays geometrically with the round number to simulate consensus.
        current_cv = initial_cv * (1 - cv_decrease_percent / 100.0) ** round_num

        # Standard deviation is a proportion of the mean score, based on the current CV.
        std_dev = mean_score * current_cv  # Shape: (j, k)

        # Valid score range.
        lower, upper = 0.0, 10.0

        # truncnorm takes its clipping bounds in standardized units, i.e.
        # expressed as (bound - loc) / scale. With array-valued parameters and
        # no explicit size, rvs() returns one draw per (j, k) entry.
        scores = truncnorm(
            (lower - mean_score) / std_dev,  # Lower bound in standardized units
            (upper - mean_score) / std_dev,  # Upper bound in standardized units
            loc=mean_score,  # Mean of the distribution
            scale=std_dev,  # Standard deviation of the distribution
        ).rvs()

        return scores

    @staticmethod
    def compute_cv(matrix: np.ndarray) -> float:
        """
        Compute the coefficient of variation (CV) for a given score matrix.

        The CV is the standard deviation divided by the mean, taken over every
        element of ``matrix`` (the array is treated as flat).

        Args:
            matrix (np.ndarray): An array of scores; ``evaluate`` passes the
                (i x j x k) scores of a single round.

        Returns:
            float: The coefficient of variation of the scores. If the mean is
                zero the result is ``nan`` or ``inf`` as produced by numpy.
        """
        return float(np.std(matrix) / np.mean(matrix))

    def evaluate(self, x: str, y: List[str]) -> np.ndarray:
        """
        Evaluate the candidate responses using the TextEvolve evaluation function.

        Runs up to ``self.r`` rounds. After every round the CV of that round's
        scores is printed together with the scores themselves; if the CV is at or
        below ``self.c`` the debate stops early.

        Args:
            x (str): Input context in natural language.
            y (List[str]): List of candidate responses.

        Returns:
            np.ndarray: The scoring tensor S with dimensions (rounds x i x j x k),
                where ``rounds`` is ``self.r`` or fewer if early stopping triggered.
        """
        j = len(y)  # Number of candidate responses
        k = len(self.w)  # Number of score components
        i = len(self.a)  # Number of agents
        S = np.zeros((self.r, i, j, k))  # Initialize scoring tensor

        for round_num in range(self.r):
            for agent_num in range(i):
                # Placeholders: in a real implementation the debate history (xi)
                # and the agent memories (m_i) would be derived from the actual
                # debate history and the x / y values.
                xi = []
                m_i = []

                S[round_num, agent_num] = self.llm(x, y, xi, m_i, j, k, round_num)

            # After each round, compute CV and check for early stopping
            round_cv = self.compute_cv(S[round_num])
            print(f"Round {round_num + 1} CV: {round_cv:.4f}")
            print(f"Scores after Round {round_num + 1}:\n{S[round_num]}")
            print("-" * 50)

            if round_cv <= self.c:
                print(f"Early stopping triggered at Round {round_num + 1} (CV <= {self.c})")
                # Drop the still-zero rows of the rounds that were never run so
                # they do not dilute the averages computed later.
                S = S[:round_num + 1]
                break

        return S

    def compute_normalized_scores(self, S: np.ndarray) -> np.ndarray:
        """
        Compute the normalized scores for each response candidate.

        Each score is multiplied by its component weight, the weighted scores are
        summed over rounds, agents and components, and the total is divided by
        (rounds * agents). The result is therefore the average weighted component
        sum per candidate; it is not rescaled by the sum of the weights.

        Args:
            S (np.ndarray): The scoring tensor with dimensions (r x i x j x k).

        Returns:
            np.ndarray: Normalized score vector (length j), one entry per candidate response.
        """
        # self.w has length k and broadcasts along the last axis of S.
        weighted_total = np.sum(S * self.w, axis=(0, 1, 3))
        # S.shape[:-2] is (rounds, agents); their product is the number of score matrices.
        return weighted_total / np.prod(S.shape[:-2])

    @staticmethod
    def compute_softmax_scores(s_norm: np.ndarray) -> np.ndarray:
        """
        Compute the softmax scores for each response candidate.

        Args:
            s_norm (np.ndarray): Normalized score vector.

        Returns:
            np.ndarray: Softmax score vector representing probabilities.
        """
        e_x = np.exp(s_norm - np.max(s_norm))  # Subtract max for numerical stability
        return e_x / e_x.sum(axis=0)

    def select_best_candidate(self, S: np.ndarray, y: List[str], probabilistic: bool = False) -> str:
        """
        Select the best response candidate based on normalized or softmax scores.

        The scores used for the decision and the selected candidate are printed.

        Args:
            S (np.ndarray): The scoring tensor with dimensions (r x i x j x k).
            y (List[str]): List of candidate responses.
            probabilistic (bool): If True, samples a candidate with probabilities given
                by the softmax scores; otherwise, picks the highest normalized score.

        Returns:
            str: The selected response candidate.
        """
        # Compute normalized scores
        s_norm = self.compute_normalized_scores(S)

        if probabilistic:
            # Compute softmax scores
            s_phi = self.compute_softmax_scores(s_norm)
            print(f"Softmax Scores:\n{s_phi}")
            # Sample one candidate index using the softmax scores as probabilities
            best_index = int(np.random.choice(np.arange(len(y)), p=s_phi))
        else:
            print(f"Normalized Scores:\n{s_norm}")
            # Pick the candidate with the highest normalized score
            best_index = int(np.argmax(s_norm))

        print(f"Best candidate: {y[best_index]}")
        return y[best_index]

## Run 1: All Debate Rounds

In [11]:
# Run 1 setup: three debater agents and six score components, with the
# second component weighted double.
a = ["Critic", "Supporter", "Neutral Observer"]
w = np.array([1.0, 2.0, 1.0, 1.0, 1.0, 1.0])

# r: debate rounds, c: CV threshold for early stopping (kept low here, with the
# intent that all rounds run), l: debate-history placeholder.
r, c, l = 2, 0.05, 3

evaluator = TextEvolveEvaluator(a=a, w=w, r=r, c=c, l=l)

# Question under evaluation and the candidate answers to be scored.
x = "What is the capital of France?"
y = ["Paris", "Lyon", "Marseille", "Bordeaux"]

# Run the simulated debate, then pick a winner twice: first deterministically
# from the normalized scores, then by sampling from the softmax scores.
S = evaluator.evaluate(x, y)
best_norm = evaluator.select_best_candidate(S, y)
best_phi = evaluator.select_best_candidate(S, y, probabilistic=True)

Round 1 CV: 0.1910
Scores after Round 1:
[[[5.56551277 6.47592468 5.20177472 5.54886533 5.34811008 5.77335109]
  [3.79044705 6.43762504 4.24772305 5.24323777 5.04434372 3.11961379]
  [5.27975442 5.070902   5.86902891 5.68094046 4.07851425 5.32363434]
  [5.16997582 5.13113557 3.16900574 3.35050658 5.33962937 5.25523733]]

 [[4.25917555 5.42478793 4.53769133 5.05544246 4.76753478 6.54091629]
  [4.61246683 4.84761556 5.2237192  7.47980295 4.5297462  4.5268935 ]
  [4.67085727 6.22199493 5.42572746 4.22478607 5.18227605 4.81687637]
  [4.11884819 5.00492179 4.43466791 4.99063672 6.22687015 6.93851698]]

 [[4.05193675 3.99576715 4.78230353 3.76518525 5.35236348 5.62661895]
  [4.55960648 6.66019104 5.41190266 4.76137714 2.53464633 5.29412747]
  [4.96274897 5.50484233 4.75056531 4.60678359 6.0066978  3.44586018]
  [3.98700427 7.31215325 4.29068005 3.81483393 4.83808499 4.04373613]]]
--------------------------------------------------
Round 2 CV: 0.1915
Scores after Round 2:
[[[4.66574882 6.56463

## Run 2: Early Stopping

In [15]:
# Run 2 setup: same agents and component weights as the first run.
a = ["Critic", "Supporter", "Neutral Observer"]
w = np.array([1.0, 2.0, 1.0, 1.0, 1.0, 1.0])

# r: debate rounds, c: CV threshold for early stopping (raised to 0.2 here so
# that a round's CV can fall at or below it), l: debate-history placeholder.
r, c, l = 2, 0.2, 3

evaluator = TextEvolveEvaluator(a=a, w=w, r=r, c=c, l=l)

# Question under evaluation and the candidate answers to be scored.
x = "What is the capital of France?"
y = ["Paris", "Lyon", "Marseille", "Bordeaux"]

# Run the simulated debate, then pick a winner twice: first deterministically
# from the normalized scores, then by sampling from the softmax scores.
S = evaluator.evaluate(x, y)
best_norm = evaluator.select_best_candidate(S, y)
best_phi = evaluator.select_best_candidate(S, y, probabilistic=True)

Round 1 CV: 0.2061
Scores after Round 1:
[[[4.40003482 5.76765998 4.41084156 4.31928368 5.83284199 4.74998494]
  [4.62425354 5.36633232 4.95296737 3.87591941 5.96994978 6.66678477]
  [4.20660256 4.92685516 4.3238116  4.69895344 6.06101375 2.71496257]
  [7.02305673 4.58487347 7.37838062 6.67655097 4.52597735 4.46062579]]

 [[3.95553449 3.7668584  6.0774995  3.44993817 5.47137199 4.28423879]
  [4.73745441 5.5172704  4.8420824  7.85598858 4.14153904 5.40155101]
  [4.71989264 7.0849561  7.09259026 6.47248176 4.98430326 5.93845128]
  [5.49111395 5.87566403 4.00080663 4.59489754 4.90853868 6.30478466]]

 [[4.19929269 6.29161438 7.37663719 5.70719008 4.7922499  5.32176347]
  [6.04777018 5.89642721 3.48786916 5.47774224 4.04378048 6.5586131 ]
  [5.99654923 4.51092812 5.48205854 6.01867858 4.7433706  4.98619432]
  [5.67946141 6.2573098  3.91824824 6.47105409 4.7091243  3.35459124]]]
--------------------------------------------------
Round 2 CV: 0.1935
Scores after Round 2:
[[[4.13786967 7.91553