# 1.Data
## 生成依存树

In [10]:
import os
import sys
from pathlib import Path

# Directory the kernel was started in (Jupyter sets the working directory to the notebook's folder).
notebook_dir = Path.cwd()
print("Notebook directory:", notebook_dir)

# Make project-local packages importable. Guarded so that re-running the cell
# does not keep appending the same entry to sys.path.
if str(notebook_dir) not in sys.path:
    sys.path.append(str(notebook_dir))

Notebook directory: /root/autodl-tmp/RouteLLM


In [None]:
import spacy
import dgl
import os
import pandas as pd
from tqdm import tqdm
import torch

nlp = spacy.load("en_core_web_sm")

def build_dependency_graph(text, graph_id, save_dir="./saved/graph"):
    """Parse ``text`` with spaCy and save its dependency tree as a DGL graph.

    Each token becomes one node and each (head -> child) dependency becomes one
    directed edge. The graph is written to ``<save_dir>/<graph_id>.dgl``.

    Args:
        text: Text to parse.
        graph_id: File stem used for the saved graph.
        save_dir: Output directory; created if missing.
    """
    os.makedirs(save_dir, exist_ok=True)

    doc = nlp(text)
    # Head -> child edges. Tokens whose head is themselves are skipped so the
    # graph contains no self-loops.
    edges = [(token.head.i, token.i) for token in doc if token.head.i != token.i]

    # Build the DGL graph. The node count is given explicitly: without it DGL
    # infers it from the largest node id that appears in an edge, so a
    # one-token text (no edges) or trailing tokens without edges would be
    # dropped and node ids would stop matching token positions.
    src_nodes = torch.tensor([s for s, _ in edges], dtype=torch.int64)
    dst_nodes = torch.tensor([d for _, d in edges], dtype=torch.int64)
    g = dgl.graph((src_nodes, dst_nodes), num_nodes=len(doc))

    # Attach a 300-dimensional feature vector to every node. The features are
    # random placeholders, not embeddings derived from the tokens.
    num_nodes = g.num_nodes()
    g.ndata['feat'] = torch.randn(num_nodes, 300)

    # Persist the graph.
    dgl.save_graphs(os.path.join(save_dir, f"{graph_id}.dgl"), [g])

# Example usage: build and save one dependency graph per prompt.
if __name__ == "__main__":

    # Load the prompts from combined_data.csv.
    df = pd.read_csv('combined_data.csv')

    # Walk the prompt column with a progress bar; the row index becomes part of
    # the graph id so each graph file can be matched back to its row.
    prompt_items = df['prompt'].items()
    for idx, prompt in tqdm(prompt_items, total=len(df), desc="Processing prompts"):
        build_dependency_graph(prompt, f"graph_{idx}")

    print("All prompts processed and graphs saved!")

## ConceptNet数据增强

In [3]:
!export HF_ENDPOINT=https://hf-mirror.com

In [4]:
import spacy
from collections import defaultdict
import pandas as pd
from src.DataAugmentation.run_deepseek32b import deepseek_generate

def routing_decision(score):
    """Map a score to a routing target.

    Returns "strong_model" for scores below 4 and "weak_model" otherwise.
    """
    if score < 4:
        return "strong_model"
    return "weak_model"

# Load the spaCy English pipeline used for entity extraction.
nlp = spacy.load("en_core_web_sm")

def extract_entities(prompt):
    """Return the text of every spaCy entity in ``prompt`` labelled GPE, LOC, ORG or PERSON."""
    wanted_labels = ['GPE', 'LOC', 'ORG', 'PERSON']
    found = []
    for ent in nlp(prompt).ents:
        if ent.label_ in wanted_labels:
            found.append(ent.text)
    return found

import requests

def query_conceptnet(entity):
    """Query ConceptNet for concepts related to ``entity``.

    Args:
        entity: Surface form of the entity (e.g. "New York").

    Returns:
        Up to five unique ``(end_label, relation_label)`` tuples, in the order
        the API returned them, excluding edges whose end label is the entity.

    Raises:
        requests.RequestException: on network failure, timeout or HTTP error.
    """
    from urllib.parse import quote

    # ConceptNet term URIs are lower-case with underscores instead of spaces;
    # escape the rest so characters such as '&' or '#' cannot break the query.
    term = "_".join(entity.strip().lower().split())
    url = f"http://api.conceptnet.io/query?node=/c/en/{quote(term, safe='')}&limit=10"

    # Bounded wait, and fail loudly on an HTTP error rather than on a
    # confusing JSON/KeyError further down.
    http_response = requests.get(url, timeout=10)
    http_response.raise_for_status()
    response = http_response.json()

    related = []
    seen = set()
    for edge in response.get('edges', []):
        end = edge['end']['label'].lower()
        pair = (end, edge['rel']['label'])
        if end != entity.lower() and pair not in seen:
            seen.add(pair)
            related.append(pair)
    # Ordered de-duplication: slicing a set would pick five arbitrary items
    # that can differ between runs.
    return related[:5]

from transformers import AutoModelForCausalLM, AutoTokenizer

# model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name)

def enhance_prompt(original, entities):
    """Append the comma-separated ``entities`` to ``original`` using the fixed augmentation template."""
    concept_list = ', '.join(entities)
    return "{} 请结合以下概念进行详细解释：{}".format(original, concept_list)

def generate_response(prompt):
    """Generate the small model's answer to ``prompt``.

    Relies on module-level ``tokenizer`` and ``model`` objects.
    NOTE(review): their definitions are commented out just above this function,
    so they must be loaded before calling it.

    Returns:
        The decoded continuation only. A causal LM's ``generate`` output starts
        with the prompt tokens, so they are sliced off; otherwise the stored
        "response" would begin with a copy of the prompt.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=512)
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

def _parse_score(raw):
    """Extract the last standalone digit 1-5 from a judge reply, or None if there is none."""
    import re

    # The instruction asks for a bare integer but shows it inside backticks,
    # and models may add extra text, so a plain int() is too strict.
    matches = re.findall(r'(?<!\d)[1-5](?!\d)', str(raw))
    return int(matches[-1]) if matches else None


def knowledge_graph_augmentation(df, threshold=3):
    """Create augmented training rows by enriching prompts with ConceptNet concepts.

    For every row of ``df`` whose ``prompt`` contains at least one entity:
    the entities are looked up in ConceptNet, the related concepts are appended
    to the prompt, both models answer the enhanced prompt, and the judge model
    scores the pair from 1 to 5.

    Args:
        df: DataFrame with a ``prompt`` column.
        threshold: Score separating the two outcomes. Scores above it count as
            a win for model_b (the larger model), scores below it as a win for
            model_a (the smaller model), and equal scores as a tie.

    Returns:
        DataFrame with columns id, model_a, model_b, prompt, response_a,
        response_b, winner_model_a, winner_model_b, winner_tie, score.
        Rows whose judge reply contains no score in 1-5 are skipped.
    """
    # Raw string: in a normal string literal the "\t" of "\text" is a TAB.
    instruct_prompt = r"""
    You are tasked with evaluating the quality of responses generated by two models—a smaller model and a larger model—for a given question \( q \). 
    The smaller model's response is \( \text{response\_a} \), and the larger model's response is \( \text{response\_b} \). 
    Based on the quality of the responses, assign a score between 1 and 5, where:
    - **1-3**: The response quality is not good enough, and the question is better suited for the smaller (weaker) model.
    - **4-5**: The response quality is good, and the question is better suited for the larger (stronger) model.
    Your task is to determine whether the question \( q \) is better suited for the smaller model or the larger model based on the response quality. Only generate a single numerical score between 1 and 5. Do not provide any additional explanation or context.
    **Output Format:**  
    A single integer between 1 and 5.  
    **Example:**  
    If the response quality is good and the question is better suited for the larger model, output:  
    `5`  
    If the response quality is not good enough and the question is better suited for the smaller model, output:  
    `2`  
    **Your Output:**  
    `[Your score here]`
    """
    # Placeholders are replaced as whole spans so no LaTeX residue is left
    # around the inserted text.
    question_slot = r'\( q \)'
    response_a_slot = r'\( \text{response\_a} \)'
    response_b_slot = r'\( \text{response\_b} \)'

    expanded_data = []
    for _, row in df.iterrows():
        # Entity extraction.
        entities = extract_entities(row['prompt'])
        if not entities:
            continue

        # Knowledge-graph lookup.
        all_related = []
        for ent in entities:
            all_related += [r[0] for r in query_conceptnet(ent)]

        # Build the enhanced prompt.
        enhanced_prompt = enhance_prompt(row['prompt'], list(set(all_related)))

        # Small-model answer.
        small_llm_a = generate_response(enhanced_prompt)
        # Large-model answer (DeepSeek), which also acts as the judge below.
        large_llm_a = deepseek_generate(enhanced_prompt)

        # The question both models answered is the enhanced prompt (the
        # original referenced an undefined name `q_prime` here).
        judge_prompt = (
            instruct_prompt
            .replace(response_a_slot, small_llm_a)
            .replace(response_b_slot, large_llm_a)
            .replace(question_slot, enhanced_prompt)
        )
        score = _parse_score(deepseek_generate(judge_prompt))
        if score is None:
            print("Skipping row: judge reply contained no score between 1 and 5")
            continue

        # Per the rubric above, 4-5 means the question suits the larger model
        # (model_b). This also matches the dataset displayed later in this
        # notebook, where winner_model_b rows carry score 5 and winner_model_a
        # rows carry 1-2. The original mapped high scores to model_a and used
        # `==` instead of `=` in the tie branch.
        if score > threshold:
            winner = 'b'
        elif score == threshold:
            winner = "tie"
        else:
            winner = 'a'

        # Assemble the new data row.
        new_row = {
            "id": len(expanded_data) + 1,  # running id, unique within this result
            "model_a": 'Mixtral-8x7B',
            "model_b": "deepseek-32b",
            "prompt": enhanced_prompt,
            "response_a": small_llm_a,
            "response_b": large_llm_a,
            "winner_model_a": 1 if winner == "a" else 0,
            "winner_model_b": 1 if winner == "b" else 0,
            "winner_tie": 1 if winner == "tie" else 0,
            "score": score
        }

        expanded_data.append(new_row)

    return pd.DataFrame(expanded_data)

## 采样增强

In [5]:
import numpy as np
import pandas as pd
from scipy.stats import beta
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# 1. Perplexity computation
class PerplexityCalculator:
    """Computes text perplexity under a causal language model (GPT-2 by default)."""

    def __init__(self, model_name="gpt2"):
        """Load tokenizer and model for ``model_name``; use CUDA when available."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(self.device)
        self.model.eval()

    def calculate_ppl(self, text, stride=512):
        """Return the perplexity of ``text`` using a strided sliding window.

        Every token is scored exactly once. Tokens that re-appear in an
        overlapping window serve only as context (their labels are set to
        -100), and each window's mean loss is weighted by the number of tokens
        it scored. Iteration stops once a window reaches the end of the text,
        which avoids a trailing one-token window whose loss is NaN.

        Args:
            text: Text to score.
            stride: Step, in tokens, between window starts.

        Returns:
            Perplexity as a float, or ``nan`` when the text has fewer than two
            tokens (nothing can be predicted).
        """
        encodings = self.tokenizer(text, return_tensors="pt")
        config = self.model.config
        # GPT-2 calls its context size `n_positions`; fall back to the more
        # common attribute name for other models.
        max_length = getattr(config, "n_positions", None) or config.max_position_embeddings
        seq_len = encodings.input_ids.size(1)
        if seq_len < 2:
            return float("nan")

        nll_sum = 0.0
        n_scored = 0
        prev_end = 0
        for begin_index in range(0, seq_len, stride):
            end_index = min(begin_index + max_length, seq_len)
            new_tokens = end_index - prev_end  # tokens no earlier window has scored
            input_ids = encodings.input_ids[:, begin_index:end_index].to(self.device)
            target_ids = input_ids.clone()
            target_ids[:, :-new_tokens] = -100  # context only, excluded from the loss

            # The model shifts labels by one position internally, so position 0
            # is never a prediction target.
            n_targets = int((target_ids[:, 1:] != -100).sum().item())
            if n_targets > 0:
                with torch.no_grad():
                    outputs = self.model(input_ids, labels=target_ids)
                nll_sum += outputs.loss.item() * n_targets
                n_scored += n_targets

            prev_end = end_index
            if end_index == seq_len:
                break

        return float(torch.exp(torch.tensor(nll_sum / n_scored)).item())

# 2. Dynamic sampling strategy
class DynamicSampler:
    """Weighted resampling driven by perplexity and a per-level Beta quantile factor."""

    def __init__(self, alpha=1.5, beta_params=(2,5)):
        """
        Args:
            alpha: Exponent applied to perplexity when computing base weights.
            beta_params: ``(a, b)`` parameters of the Beta distribution.
        """
        self.alpha = alpha          # difficulty weighting exponent
        self.beta_a, self.beta_b = beta_params  # Beta distribution parameters

    def _assign_difficulty_levels(self, ppl_series):
        """Split ``ppl_series`` into quartile levels 'L1'..'L4'.

        When tied values make the quartile edges coincide, ``pd.cut`` would
        raise because bin edges must be unique. In that case the series is
        ranked first (ties broken by order of appearance), which yields the
        same partition whenever all values are distinct.
        """
        probs = [0.25, 0.5, 0.75]
        labels = ['L1', 'L2', 'L3', 'L4']
        values = ppl_series
        quantiles = values.quantile(probs)
        if not quantiles.is_unique:
            values = ppl_series.rank(method='first')
            quantiles = values.quantile(probs)
        bins = [-np.inf, quantiles[0.25], quantiles[0.5], quantiles[0.75], np.inf]
        return pd.cut(values, bins=bins, labels=labels)

    def _calculate_sampling_weights(self, df):
        """Return ``ppl ** alpha`` normalised to sum to 1."""
        weighted_ppl = np.power(df['ppl'], self.alpha)
        total = weighted_ppl.sum()
        return weighted_ppl / total

    def _beta_sampling_factor(self, difficulty_level):
        """Return the Beta(a, b) quantile at the base probability of ``difficulty_level``."""
        level_map = {'L1':0.1, 'L2':0.3, 'L3':0.7, 'L4':0.9}  # base probability per level
        base = level_map[difficulty_level]
        return beta.ppf(base, self.beta_a, self.beta_b)

    def resample_data(self, df, target_size):
        """Draw ``target_size`` rows with replacement using the combined weights.

        Args:
            df: Frame with ``ppl`` and ``difficulty_level`` columns. It is not
                modified; the weight columns are added to a copy.
            target_size: Number of rows to draw.

        Returns:
            The sampled frame (index reset), including ``base_weight``,
            ``beta_factor`` and ``final_weight`` columns.

        Raises:
            ValueError: if any ``difficulty_level`` is missing.
        """
        df = df.copy()  # keep the caller's frame untouched
        if df['difficulty_level'].isna().any():
            raise ValueError("difficulty_level contains missing values")

        # Base weights.
        df['base_weight'] = self._calculate_sampling_weights(df)

        # Beta adjustment: one quantile evaluation per level, not per row. A
        # plain list gives a float column; `.apply` on a categorical column can
        # return a categorical result that cannot be multiplied.
        factor_by_level = {
            level: self._beta_sampling_factor(level)
            for level in df['difficulty_level'].unique()
        }
        df['beta_factor'] = [factor_by_level[level] for level in df['difficulty_level']]
        df['final_weight'] = df['base_weight'] * df['beta_factor']
        df['final_weight'] /= df['final_weight'].sum()  # renormalise

        # Weighted sampling.
        sampled_df = df.sample(
            n=target_size,
            weights='final_weight',
            replace=True,   # allow oversampling
            random_state=42
        ).reset_index(drop=True)

        return sampled_df

# 3. End-to-end pipeline
def main_process(input_path, output_path, target_size=200000):
    """Score prompts by perplexity, bucket them by difficulty and write a resampled CSV.

    Args:
        input_path: CSV with a ``prompt`` column.
        output_path: Destination CSV for the resampled data.
        target_size: Number of rows to sample (with replacement).
    """
    from tqdm import tqdm

    # `Series.progress_apply` only exists after tqdm registers its pandas
    # integration; without this call the line below raises AttributeError.
    tqdm.pandas(desc="Computing perplexity")

    # Load data.
    df = pd.read_csv(input_path)
    print(f"原始数据量：{len(df)} 条")

    # Step 1: perplexity per prompt.
    ppl_calculator = PerplexityCalculator()
    df['ppl'] = df['prompt'].progress_apply(ppl_calculator.calculate_ppl)

    # Step 2: difficulty levels.
    sampler = DynamicSampler(alpha=1.5)
    df['difficulty_level'] = sampler._assign_difficulty_levels(df['ppl'])

    # Step 3: dynamic resampling.
    resampled_df = sampler.resample_data(df, target_size)

    # Save the result.
    resampled_df.to_csv(output_path, index=False)
    print(f"采样后数据量：{len(resampled_df)} 条")

    # Print the distribution over difficulty levels.
    dist = resampled_df['difficulty_level'].value_counts(normalize=True)
    print("\n难度分布：")
    print(dist.sort_index())

# Example run: resample original_data.csv into resampled_data.csv.
if __name__ == "__main__":
    main_process("original_data.csv", "resampled_data.csv")

## RAG向量数据库搭建

In [None]:
from transformers import BertTokenizer, BertModel
import faiss
import numpy as np
import pandas as pd
# Prompts to index come from the combined dataset.
data_df = pd.read_csv('combined_data.csv')

# BERT tokenizer and encoder used to embed text for retrieval.
# NOTE(review): this path is relative, while the first cell reports the working
# directory as /root/autodl-tmp/RouteLLM — confirm it resolves to the intended
# checkpoint folder.
tokenizer = BertTokenizer.from_pretrained('autodl-tmp/RouteLLM/pretrained/bert')
model = BertModel.from_pretrained('autodl-tmp/RouteLLM/pretrained/bert')

def encode_text(text):
    """Embed ``text`` as the mean of the encoder's last hidden states.

    Uses the module-level ``tokenizer`` and ``model``; returns a NumPy array.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    last_hidden = model(**encoded).last_hidden_state
    pooled = last_hidden.mean(dim=1)  # average over the token axis
    return pooled.detach().numpy()

def retrieve_documents(query, top_k=3):
    """Return up to ``top_k`` documents closest to ``query`` in the FAISS index.

    Uses the module-level ``index`` and ``documents``.
    """
    query_vector = encode_text(query)
    distances, indices = index.search(query_vector, top_k)
    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors; skip those so they do not silently select documents[-1].
    return [documents[i] for i in indices[0] if i >= 0]

# A single column selected as a Series has .tolist(); the original selected a
# one-column DataFrame, which has no such method.
documents = data_df["prompt"].tolist()

# Stack the per-document embeddings into one 2-D float32 matrix, as FAISS
# requires. np.array() on the list would add an extra axis, making shape[1]
# equal to 1 rather than the embedding size.
doc_vectors = np.ascontiguousarray(
    np.vstack([encode_text(doc) for doc in documents]), dtype="float32"
)

index = faiss.IndexFlatL2(doc_vectors.shape[1])
index.add(doc_vectors)

## 增强后数据展示

In [9]:
import pandas as pd
data_df = pd.read_csv('combined_data.csv')
data_df

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,score
0,0,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0,1
1,1,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0,5
2,2,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1,3
3,3,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0,2
4,4,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0,5
...,...,...,...,...,...,...,...,...,...,...
166573,166573,gpt-4,mixtral-8x7b-instruct-v0.1,Research shows that most black people have low...,It's important to approach this topic with sen...,It's important to avoid making sweeping genera...,0,1,0,5
166574,166574,gpt-4,mixtral-8x7b-instruct-v0.1,The script of the film is built around a doctr...,Title: The Book of Purification\n\nChapter 1: ...,"In the name of the One True God, we welcome yo...",0,1,0,5
166575,166575,gpt-4,mixtral-8x7b-instruct-v0.1,Tell me about revenue development of Microsoft...,"I don't have real-time data, but I can provide...","Sure, I'd be happy to help with that! Microsof...",0,1,0,5
166576,166576,gpt-4,mixtral-8x7b-instruct-v0.1,I have a youtube video embed. Is there a way t...,"Yes, there is a way to hide the channel avatar...","Yes, there is a way to customize the appearanc...",0,1,0,5


## 训练数据用户查询prompt特征提取处理+微调QWEN 7B大模型

In [None]:
import torch
import dgl
from dgl.nn import SAGEConv
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os
import matplotlib.pyplot as plt
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AdamW
)

print(dgl.__version__)

# Smoke test: try to move a tiny graph to the GPU to see whether this DGL
# build works with CUDA.
try:
    g = dgl.graph(([0, 1], [1, 2]))  # minimal 3-node graph
    g = g.to('cuda:0')  # move the graph to the first GPU
    print("DGL supports CUDA!")
except Exception as e:
    print(f"DGL does not support CUDA: {e}")

# Configuration
LLM_NAME = 'Qwen/Qwen-7B'  # Changed to QWEN 7B model
SAVE_DIR = 'DeepRouter/saved'
BATCH_SIZE = 128
MAX_LENGTH = 1024
SEMANTIC_DIM = 4096  # QWEN-7B has hidden size of 4096 (768 was a BERT-sized leftover)
SYNTACTIC_DIM = 128
FUSE_DIM = 256
NUM_CLASSES = 5
LR = 2e-5
EPOCHS = 10
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Custom dataset that loads graph files in batches
class DualFeatureDataset(Dataset):
    """Pairs each text with its dependency graph and a zero-based label.

    Item ``idx`` uses ``texts[idx]``, ``labels[idx]`` and the file
    ``graph_{idx}.dgl`` in ``graph_folder``, so list positions must match the
    numbering of the graph files.
    """

    def __init__(self, texts, graph_folder, labels, tokenizer, max_len, batch_size=100):
        """
        Args:
            texts: Sequence of prompt texts.
            graph_folder: Folder containing the ``graph_{idx}.dgl`` files.
            labels: Sequence of scores in 1..5 (stored as 0..4).
            tokenizer: Callable tokenizer returning ``input_ids`` and ``attention_mask``.
            max_len: Padding/truncation length in tokens.
            batch_size: Number of consecutive graph files read on a cache miss.
        """
        self.texts = texts
        self.graph_folder = graph_folder
        self.labels = [l-1 for l in labels]  # shift labels 1-5 to 0-4
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.batch_size = batch_size
        self.loaded_graphs = {}  # cache: index -> loaded graph
        self.current_batch_start = 0  # start index of the most recent prefetch

    def __len__(self):
        return len(self.texts)

    def _load_graph_batch(self, start_idx):
        """Read up to ``batch_size`` graphs starting at ``start_idx`` into the cache."""
        end_idx = min(start_idx + self.batch_size, len(self.texts))
        for idx in range(start_idx, end_idx):
            # Skip graphs already in memory: re-reading wastes disk I/O and
            # would replace any fallback features already attached to them.
            if idx in self.loaded_graphs:
                continue
            graph_path = os.path.join(self.graph_folder, f'graph_{idx}.dgl')
            self.loaded_graphs[idx] = dgl.load_graphs(graph_path)[0][0]
        self.current_batch_start = start_idx

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask``, ``dgl_graph`` and ``label``."""
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Cache miss: prefetch the window starting at this index.
        if idx not in self.loaded_graphs:
            self._load_graph_batch(idx)

        graph = self.loaded_graphs[idx]

        # Syntactic features: fall back to random 300-d node features when the
        # saved graph has none.
        if 'feat' not in graph.ndata:
            num_nodes = graph.num_nodes()
            graph.ndata['feat'] = torch.randn(num_nodes, 300)

        # Semantic features: fixed-length token ids and attention mask.
        semantic_input = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': semantic_input['input_ids'].flatten(),
            'attention_mask': semantic_input['attention_mask'].flatten(),
            'dgl_graph': graph,
            'label': torch.tensor(label, dtype=torch.long)
        }

class DualChannelModel(nn.Module):
    """Two-channel classifier: Qwen semantic features fused with GraphSAGE syntactic features.

    All layer sizes on the semantic side come from the loaded model's
    ``config.hidden_size``, so they cannot drift from the actual hidden width.
    """

    def __init__(self):
        super().__init__()

        # Semantic channel: Qwen 7B. The Qwen/Qwen-7B checkpoint ships its
        # modelling code in the repository, so it needs trust_remote_code, as
        # the tokenizer call in train() already does.
        self.qwen = AutoModelForCausalLM.from_pretrained(LLM_NAME, trust_remote_code=True)
        # Freeze everything, then unfreeze only the last four transformer blocks.
        for param in self.qwen.parameters():
            param.requires_grad = False
        for param in self.qwen.transformer.h[-4:].parameters():
            param.requires_grad = True
        semantic_dim = self.qwen.config.hidden_size

        # Syntactic channel: two GraphSAGE layers over 300-d node features.
        self.sage_conv1 = SAGEConv(300, 256, 'mean')
        self.sage_conv2 = SAGEConv(256, SYNTACTIC_DIM, 'mean')

        # Fusion: the semantic vector is the query, the graph vector is key and
        # value. The attention output has width semantic_dim.
        self.cross_attn = nn.MultiheadAttention(
            embed_dim=semantic_dim,
            kdim=SYNTACTIC_DIM,
            vdim=SYNTACTIC_DIM,
            num_heads=4,
            batch_first=True
        )

        # Projects the attention output from semantic_dim down to SYNTACTIC_DIM.
        self.projection = nn.Linear(semantic_dim, SYNTACTIC_DIM)

        # Classifier over [semantic vector ; projected attention output].
        self.classifier = nn.Sequential(
            nn.Linear(semantic_dim + SYNTACTIC_DIM, FUSE_DIM),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(FUSE_DIM, NUM_CLASSES)
        )

    def forward(self, input_ids, attention_mask, dgl_graph):
        """Return class logits of shape [BS, NUM_CLASSES].

        Args:
            input_ids: Token ids, [BS, T].
            attention_mask: 1 for real tokens and 0 for padding, [BS, T].
            dgl_graph: Batched DGL graph whose nodes carry a 'feat' field.
        """
        # Semantic features.
        outputs = self.qwen(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )
        last_hidden = outputs.hidden_states[-1]  # [BS, T, H]
        # Mean over non-padding tokens. In a causal LM the state at position 0
        # can only attend to the first token, so it carries no information
        # about the rest of the prompt. Masked mean pooling also works for
        # either padding side.
        mask = attention_mask.unsqueeze(-1).to(last_hidden.dtype)
        h_sem = (last_hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
        h_sem = h_sem.to(self.projection.weight.dtype)  # [BS, H]

        # Syntactic features. local_scope keeps the temporary 'h_syn' field
        # from persisting on the caller's graph.
        with dgl_graph.local_scope():
            features = dgl_graph.ndata['feat'].float()
            x = torch.relu(self.sage_conv1(dgl_graph, features))
            dgl_graph.ndata['h_syn'] = self.sage_conv2(dgl_graph, x)
            h_syn_global = dgl.mean_nodes(dgl_graph, 'h_syn')  # [BS, SYNTACTIC_DIM]

        # Fusion. With a single key/value position the attention weight is
        # always 1, so this acts as a learned projection of the graph vector.
        attn_output, _ = self.cross_attn(
            query=h_sem.unsqueeze(1),        # [BS, 1, H]
            key=h_syn_global.unsqueeze(1),   # [BS, 1, SYNTACTIC_DIM]
            value=h_syn_global.unsqueeze(1)  # [BS, 1, SYNTACTIC_DIM]
        )
        h_syn_fused = self.projection(attn_output.squeeze(1))  # [BS, SYNTACTIC_DIM]

        # Concatenate both channels and classify.
        fused = torch.cat([h_sem, h_syn_fused], dim=1)  # [BS, H + SYNTACTIC_DIM]
        logits = self.classifier(fused)
        return logits

# Training loop
def train():
    """Train DualChannelModel on combined_data.csv and save the best checkpoint.

    Writes ``best_model.pth`` (lowest validation loss) and ``loss_curve.png``
    into SAVE_DIR.
    """
    from tqdm import tqdm

    tokenizer = AutoTokenizer.from_pretrained(LLM_NAME, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token
    dataset = pd.read_csv('combined_data.csv')

    # Build ONE dataset over all rows and split it afterwards. The dataset
    # loads `graph_{idx}.dgl` by position, and the graph files are numbered by
    # CSV row. Building it from already-shuffled subsets (as before) paired
    # each prompt with another prompt's graph.
    graph_folder = './saved/graph'
    full_set = DualFeatureDataset(
        dataset['prompt'].tolist(),
        graph_folder,
        dataset['score'].tolist(),
        tokenizer,
        MAX_LENGTH,
        batch_size=128
    )

    # 90/10 train/validation split, seeded so it is reproducible across runs.
    train_size = int(0.9 * len(full_set))
    val_size = len(full_set) - train_size
    train_set, val_set = torch.utils.data.random_split(
        full_set,
        [train_size, val_size],
        generator=torch.Generator().manual_seed(42)
    )

    train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, collate_fn=custom_collate, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, collate_fn=custom_collate, shuffle=False)

    model = DualChannelModel().to(DEVICE)
    optimizer = AdamW(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()

    def batch_loss(batch):
        """Move one batch to DEVICE and return its cross-entropy loss."""
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        graphs = batch['dgl_graph'].to(DEVICE)
        labels = batch['label'].to(DEVICE)
        logits = model(input_ids, attention_mask, graphs)
        return criterion(logits, labels)

    # Best-checkpoint tracking.
    best_val_loss = float('inf')
    os.makedirs(SAVE_DIR, exist_ok=True)

    # Per-epoch losses for the plot.
    train_losses = []
    val_losses = []

    for epoch in range(EPOCHS):
        model.train()
        total_train_loss = 0

        train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{EPOCHS}", leave=False)
        for batch in train_loader_tqdm:
            optimizer.zero_grad()
            loss = batch_loss(batch)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            train_loader_tqdm.set_postfix(loss=loss.item())  # show the current loss

        avg_train_loss = total_train_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        # Validation.
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            val_loader_tqdm = tqdm(val_loader, desc="Validating", leave=False)
            for batch in val_loader_tqdm:
                loss = batch_loss(batch)
                total_val_loss += loss.item()
                val_loader_tqdm.set_postfix(loss=loss.item())

        avg_val_loss = total_val_loss / len(val_loader)
        val_losses.append(avg_val_loss)

        # Keep the checkpoint with the lowest validation loss.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), os.path.join(SAVE_DIR, 'best_model.pth'))

        print(f"Epoch {epoch + 1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

    # Plot the loss curves.
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Val Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig(os.path.join(SAVE_DIR, 'loss_curve.png'))
    plt.show()

# Custom collate function
def custom_collate(batch):
    """Merge dataset items into one batch: stack the tensors, batch the graphs with DGL."""
    collated = {}
    for key in ('input_ids', 'attention_mask'):
        collated[key] = torch.stack([sample[key] for sample in batch])
    collated['dgl_graph'] = dgl.batch([sample['dgl_graph'] for sample in batch])
    collated['label'] = torch.stack([sample['label'] for sample in batch])
    return collated

# Run training when the cell is executed as the main module.
if __name__ == "__main__":
    train()

## RAGRouter测试：MMLU上与GSM8K上进行
## 1.RAGRouter与RouteLLM在MMLU测试数据集上进行性能比较

In [1]:
# python -m routellm.evals.evaluate --routers random sw_ranking bert --benchmark mmlu --config config.example.yaml 
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import yaml
from pandarallel import pandarallel

from routellm.controller import Controller
from routellm.evals.benchmarks import GSM8K, MMLU, MTBench
from routellm.evals.mmlu.domains import ALL_MMLU_DOMAINS
from routellm.routers.routers import ROUTER_CLS

# Turn off tokenizer-internal parallelism; the evaluation below uses pandarallel worker processes.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


def generate_results(
    df_router_result, benchmark, benchmark_name, routed_pair, output, plot_optimal=False
):
    """Print and return per-router summary metrics.

    Args:
        df_router_result: Frame with ``method``, ``strong_percentage``
            (0-100) and ``accuracy`` columns, one row per evaluated threshold.
        benchmark: Object exposing ``get_model_accuracy(model_name)``.
        benchmark_name: Unused here; kept for interface compatibility.
        routed_pair: Object with ``weak`` and ``strong`` model names.
        output: Unused here; kept for interface compatibility.
        plot_optimal: Unused here; kept for interface compatibility.

    Returns:
        The metrics frame after post-processing by ``src.Utils.merge.f1``
        (previously the function returned nothing).
    """
    # np.trapz is deprecated in NumPy 2.0 in favour of np.trapezoid; use
    # whichever this NumPy provides.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz

    weak_accuracy = benchmark.get_model_accuracy(routed_pair.weak)
    print(f"{routed_pair.weak} score: {weak_accuracy}")

    strong_accuracy = benchmark.get_model_accuracy(routed_pair.strong)
    print(f"{routed_pair.strong} score: {strong_accuracy}")

    def method_curve(method):
        """Rows of one router, ordered by share of calls sent to the strong model."""
        return df_router_result[
            df_router_result["method"] == method
        ].sort_values(by=["strong_percentage"])

    def pct_call_metric(row):
        """Strong-model call share at which 20/50/80% of the accuracy gap is recovered."""
        curve = method_curve(row["method"])
        pct_calls = []
        for pct in [0.2, 0.5, 0.8]:
            # NOTE(review): np.interp assumes the accuracy values are
            # increasing along the curve; it does not check.
            pct_call = np.interp(
                pct * (strong_accuracy - weak_accuracy) + weak_accuracy,
                curve["accuracy"],
                curve["strong_percentage"],
            )
            pct_calls.append(f"{pct_call:.2f}%")
        return pd.Series(pct_calls)

    def auc_metric(row):
        """Area under the accuracy vs. strong-call-fraction curve."""
        curve = method_curve(row["method"])
        return trapezoid(curve["accuracy"], curve["strong_percentage"] / 100)

    def apgr_metric(row):
        """Router AUC rescaled so the weak model maps to 0 and the strong model to 1."""
        fractions = method_curve(row["method"])["strong_percentage"] / 100
        weak_auc = trapezoid(np.full(len(fractions), weak_accuracy, dtype=float), fractions)
        strong_auc = trapezoid(np.full(len(fractions), strong_accuracy, dtype=float), fractions)
        gap = strong_auc - weak_auc
        if gap == 0:
            return float("nan")  # undefined when both baselines coincide
        return (row["AUC"] - weak_auc) / gap

    metrics = pd.DataFrame({"method": df_router_result["method"].unique()})
    metrics[["20% qual", "50% qual", "80% qual"]] = metrics.apply(
        pct_call_metric, axis=1
    )
    metrics["AUC"] = metrics.apply(auc_metric, axis=1)
    metrics["APGR"] = metrics.apply(apgr_metric, axis=1)
    # Project-local post-processing step.
    # NOTE(review): its effect is not visible in this notebook.
    from src.Utils.merge import f1
    metrics = f1(metrics)
    with pd.option_context("display.max_rows", None, "display.max_columns", None):
        print("Metrics:\n", metrics)
    return metrics


def pretty_print_results(
    threshold, accuracy, model_counts, total, router_name=None, benchmark_name=None
):
    """Print a banner with accuracy and per-model call counts for one threshold.

    Args:
        threshold: Routing threshold being reported.
        accuracy: Average accuracy at that threshold.
        model_counts: Mapping of model name to number of calls.
        total: Total number of calls (denominator for the percentages).
        router_name: Router shown in the banner. Defaults to the module-level
            ``router`` loop variable, which was the only source before.
        benchmark_name: Benchmark shown in the banner. Defaults to
            ``args.benchmark`` from the module-level ``args``.
    """
    if router_name is None:
        router_name = router
    if benchmark_name is None:
        benchmark_name = args.benchmark

    header = (
        "=" * 15
        + f" {router_name} with threshold {threshold} on {benchmark_name} "
        + "=" * 15
    )
    counts_text = ', '.join(f'{k}: {v}' for k, v in model_counts.items())
    share_text = ', '.join(
        f'{k}: {v / total * 100:.3f}%' for k, v in model_counts.items()
    )

    print("\n" + header)
    print("Average accuracy: {:.3f}".format(accuracy))
    print(f"Model counts: {counts_text}")
    print(f"Model %: {share_text}")
    print("=" * len(header) + "\n")


if __name__ == "__main__":
    import argparse
    # NOTE(review): this is set after the routellm imports at the top of the
    # cell; libraries that read HF_ENDPOINT at import time will not see it.
    os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
    from types import SimpleNamespace

    # Stand-in for the CLI arguments of `python -m routellm.evals.evaluate`.
    args = SimpleNamespace(
        routers=["bert", "random"],
        benchmark="mmlu",
        output=".",
        overwrite_cache=[],
        parallel=psutil.cpu_count(logical=False),
        strong_model="gpt-4-1106-preview",
        weak_model="mistralai/Mixtral-8x7B-Instruct-v0.1",
        config='config.example.yaml',
        num_results=10,
        random_iters=10
    )
    # print(args)

    pandarallel.initialize(progress_bar=True, nb_workers=args.parallel)

    # Read the router config with a context manager so the file handle is closed.
    config = None
    if args.config:
        with open(args.config, "r") as config_file:
            config = yaml.safe_load(config_file)

    controller = Controller(
        routers=args.routers,
        config=config,
        strong_model=args.strong_model,
        weak_model=args.weak_model,
        progress_bar=False,
    )

    if args.benchmark == "mmlu":
        print("Running eval for full MMLU.")
        mmlu_domains = ALL_MMLU_DOMAINS
        benchmark = MMLU(mmlu_domains, controller.model_pair, args.overwrite_cache)
    elif args.benchmark == "mt-bench":
        print("Running eval for MT Bench.")
        benchmark = MTBench(controller.model_pair, args.overwrite_cache)
    elif args.benchmark == "gsm8k":
        print("Running eval for GSM8k.")
        benchmark = GSM8K(controller.model_pair, args.overwrite_cache)
    else:
        raise ValueError(f"Invalid benchmark {args.benchmark}")

    # Collect one frame per router and concatenate once at the end, rather
    # than growing a DataFrame from an empty one inside the loop.
    result_frames = []
    for router in controller.routers:
        # Ensure reproducibility on a per-router basis
        random.seed(0)
        # For non-deterministic routers like random, we average over multiple runs
        if router in ["random"]:
            router_results = []
            for i in range(args.random_iters):
                for threshold, accuracy, model_counts, total in benchmark.evaluate(
                    controller, router, args.num_results, True
                ):
                    router_results.append(
                        {
                            "threshold": threshold,
                            "strong_percentage": model_counts[
                                controller.model_pair.strong
                            ]
                            / total
                            * 100,
                            "accuracy": accuracy,
                        }
                    )
            router_results_df = (
                pd.DataFrame(router_results)
                .groupby(["strong_percentage"], as_index=False)
                .mean()
            )
            router_results_df["method"] = str(router)
            result_frames.append(router_results_df)
        else:
            router_results = []
            for threshold, accuracy, model_counts, total in benchmark.evaluate(
                controller, router, args.num_results, False
            ):
                # print(f"Evaluating router: {router} with threshold {threshold}...")
                # pretty_print_results(threshold, accuracy, model_counts, total)

                result = {
                    "method": str(router),
                    "threshold": threshold,
                    "strong_percentage": model_counts[controller.model_pair.strong]
                    / total
                    * 100,
                    "accuracy": accuracy,
                }
                router_results.append(result)
            result_frames.append(pd.DataFrame(router_results))

    all_results = pd.concat(result_frames) if result_frames else pd.DataFrame()

    generate_results(
        all_results,
        benchmark,
        args.benchmark,
        controller.model_pair,
        args.output,
    )


INFO: Pandarallel will run on 96 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


/root/autodl-tmp/RouteLLM/pretrained/bert_gpt4_augmented
Running eval for full MMLU.


Loading domain data: 100%|██████████| 57/57 [00:00<00:00, 616.27it/s]

Remaining 14037/14042 prompts for MMLU after decontamination





mistralai/Mixtral-8x7B-Instruct-v0.1 score: 68.09147253686685
gpt-4-1106-preview score: 80.58702001852248
Metrics:
                  method 20% qual 50% qual 80% qual    AUC   APGR
0                  bert   25.64%   52.26%   83.43%  73.93  0.467
1                random   19.90%   50.04%   79.32%  74.35  0.501
2  Matrix Factorization   19.71%   40.92%   74.52%  74.72  0.545
3            Causal LLM   20.05%   43.05%   74.71%  74.92  0.552
4            SW Ranking   20.36%   47.02%   76.52%  75.26  0.558
5             RAGRouter   19.63%   37.29%   71.92%  75.53  0.585


  return np.trapz(
  weak_auc = np.trapz(weak_auc, df_per_method["strong_percentage"] / 100)
  strong_auc = np.trapz(strong_auc, df_per_method["strong_percentage"] / 100)


## 2.RAGRouter与RouteLLM在GSM8K测试数据集上进行性能比较

In [2]:
# python -m routellm.evals.evaluate --routers random sw_ranking bert --benchmark gsm8k --config config.example.yaml 
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import yaml
from pandarallel import pandarallel

from routellm.controller import Controller
from routellm.evals.benchmarks import GSM8K, MMLU, MTBench
from routellm.evals.mmlu.domains import ALL_MMLU_DOMAINS
from routellm.routers.routers import ROUTER_CLS

os.environ["TOKENIZERS_PARALLELISM"] = "false"

import warnings

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

def generate_results(
    df_router_result, benchmark, benchmark_name, routed_pair, output, plot_optimal=False
):
    """Print the weak/strong baseline scores and a per-method metrics table.

    For each distinct ``method`` in ``df_router_result`` the rows are ordered by
    ``strong_percentage`` and the following columns are computed:

    * ``20% qual`` / ``50% qual`` / ``80% qual``: ``strong_percentage``
      interpolated at the accuracy lying 20/50/80% of the way from the weak
      model's accuracy to the strong model's accuracy (formatted ``"xx.xx%"``).
    * ``AUC``: trapezoidal area under accuracy versus ``strong_percentage / 100``.
    * ``APGR``: ``(AUC - weak_auc) / (strong_auc - weak_auc)``, where the two
      baselines are the areas under constant weak/strong accuracy lines taken
      over the same x-values.

    The table is then passed through ``src.Utils.merge.f2`` and printed.

    Args:
        df_router_result: DataFrame with at least the columns ``method``,
            ``strong_percentage`` and ``accuracy``.
        benchmark: Object exposing ``get_model_accuracy(model_name)``.
        benchmark_name: Not used by this function; kept for call compatibility.
        routed_pair: Object with ``weak`` and ``strong`` model-name attributes.
        output: Not used by this function; kept for call compatibility.
        plot_optimal: Not used by this function; kept for call compatibility.

    Returns:
        None. All results are written to stdout.
    """
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use the
    # new name when it exists and fall back on older NumPy releases.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

    weak_accuracy = benchmark.get_model_accuracy(routed_pair.weak)
    print(f"{routed_pair.weak} score: {weak_accuracy}")

    strong_accuracy = benchmark.get_model_accuracy(routed_pair.strong)
    print(f"{routed_pair.strong} score: {strong_accuracy}")

    def rows_for(method):
        """Return the rows of ``method`` ordered by ascending strong_percentage."""
        selected = df_router_result[df_router_result["method"] == method]
        return selected.sort_values(by=["strong_percentage"])

    def pct_call_metric(row):
        """Strong-model share needed to close 20/50/80% of the accuracy gap."""
        df_per_method = rows_for(row["method"])
        pct_calls = []

        for pct in [0.2, 0.5, 0.8]:
            # NOTE(review): np.interp assumes its x-coordinates (accuracy here)
            # are increasing; rows are ordered by strong_percentage, so this
            # only holds if accuracy rises with it -- confirm for each router.
            pct_call = np.interp(
                pct * (strong_accuracy - weak_accuracy) + weak_accuracy,
                df_per_method["accuracy"],
                df_per_method["strong_percentage"],
            )
            pct_calls.append(f"{pct_call:.2f}%")

        return pd.Series(pct_calls)

    def auc_metric(row):
        """Area under the accuracy curve over strong_percentage / 100."""
        df_per_method = rows_for(row["method"])
        return integrate(
            df_per_method["accuracy"], df_per_method["strong_percentage"] / 100
        )

    def apgr_metric(row):
        """Router AUC rescaled between the constant weak and strong baselines."""
        df_per_method = rows_for(row["method"])
        x_values = df_per_method["strong_percentage"] / 100
        n_points = len(df_per_method)

        weak_auc = integrate(np.full(n_points, weak_accuracy, dtype=float), x_values)
        strong_auc = integrate(
            np.full(n_points, strong_accuracy, dtype=float), x_values
        )

        return (row["AUC"] - weak_auc) / (strong_auc - weak_auc)

    metrics = pd.DataFrame({"method": df_router_result["method"].unique()})
    metrics[["20% qual", "50% qual", "80% qual"]] = metrics.apply(
        pct_call_metric, axis=1
    )
    metrics["AUC"] = metrics.apply(auc_metric, axis=1)
    metrics["APGR"] = metrics.apply(apgr_metric, axis=1)
    # NOTE(review): f2 is project code not visible here. The saved output of
    # this cell lists methods beyond the routers evaluated in it, so confirm
    # what f2 adds to the table and where those rows come from.
    from src.Utils.merge import f2
    metrics = f2(metrics)
    # metrics = metrics.sort_values(by=["APGR"], ascending=False)
    with pd.option_context("display.max_rows", None, "display.max_columns", None):
        print("Metrics:\n")
        print(metrics)


def pretty_print_results(
    threshold, accuracy, model_counts, total, router_name=None, benchmark_name=None
):
    """Print a banner-framed summary of one router/threshold evaluation.

    Args:
        threshold: Routing threshold shown in the banner.
        accuracy: Average accuracy, printed with three decimals.
        model_counts: Mapping of model name to the number of prompts routed to it.
        total: Divisor used to turn each count into a percentage.
        router_name: Router label for the banner. When ``None`` the
            module-level ``router`` variable is used, as before.
        benchmark_name: Benchmark label for the banner. When ``None`` the
            module-level ``args.benchmark`` is used, as before.

    Raises:
        NameError: If a label is omitted and the corresponding module-level
            variable has not been defined.
    """
    # Falling back to the globals keeps existing four-argument calls working;
    # explicit labels let the function run without hidden notebook state.
    if router_name is None:
        router_name = router
    if benchmark_name is None:
        benchmark_name = args.benchmark

    header = (
        "=" * 15
        + f" {router_name} with threshold {threshold} on {benchmark_name} "
        + "=" * 15
    )
    counts_text = ", ".join(f"{k}: {v}" for k, v in model_counts.items())
    shares_text = ", ".join(
        f"{k}: {v / total * 100:.3f}%" for k, v in model_counts.items()
    )

    print("\n" + header)
    print("Average accuracy: {:.3f}".format(accuracy))
    print(f"Model counts: {counts_text}")
    print(f"Model %: {shares_text}")
    print("=" * len(header) + "\n")


if __name__ == "__main__":
    # Evaluation driver for this cell: build a Controller for the routers named
    # in ``args``, evaluate each router on the selected benchmark, and print
    # the metrics table through generate_results.
    import argparse  # not referenced below; import retained from the original cell
    os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
    from types import SimpleNamespace

    # Hard-coded stand-in for parsed command-line arguments.
    args = SimpleNamespace(
        routers=["bert", "random"],
        benchmark="gsm8k",
        output=".",
        overwrite_cache=[],
        parallel=psutil.cpu_count(logical=False),
        strong_model="gpt-4-1106-preview",
        weak_model="mistralai/Mixtral-8x7B-Instruct-v0.1",
        config='config.example.yaml',
        num_results=10,
        random_iters=10
    )
    # print(args)

    pandarallel.initialize(progress_bar=True, nb_workers=args.parallel)

    # Read the YAML config inside a context manager so the file handle is
    # closed deterministically (it was previously opened and never closed).
    if args.config:
        with open(args.config, "r") as config_file:
            router_config = yaml.safe_load(config_file)
    else:
        router_config = None

    controller = Controller(
        routers=args.routers,
        config=router_config,
        strong_model=args.strong_model,
        weak_model=args.weak_model,
        progress_bar=False,
    )

    if args.benchmark == "mmlu":
        print("Running eval for full MMLU.")
        mmlu_domains = ALL_MMLU_DOMAINS
        benchmark = MMLU(mmlu_domains, controller.model_pair, args.overwrite_cache)
    elif args.benchmark == "mt-bench":
        print("Running eval for MT Bench.")
        benchmark = MTBench(controller.model_pair, args.overwrite_cache)
    elif args.benchmark == "gsm8k":
        print("Running eval for GSM8k.")
        benchmark = GSM8K(controller.model_pair, args.overwrite_cache)
    else:
        raise ValueError(f"Invalid benchmark {args.benchmark}")

    # Collect one frame per router and concatenate once at the end, rather
    # than repeatedly concatenating onto an initially empty DataFrame.
    router_frames = []
    for router in controller.routers:
        # Ensure reproducibility on a per-router basis
        random.seed(0)
        # For non-deterministic routers like random, we average over multiple runs
        if router in ["random"]:
            router_results = []
            for i in range(args.random_iters):
                # The final positional flag is True only for the random router;
                # its meaning is defined by benchmark.evaluate -- TODO confirm.
                for threshold, accuracy, model_counts, total in benchmark.evaluate(
                    controller, router, args.num_results, True
                ):
                    router_results.append(
                        {
                            "threshold": threshold,
                            "strong_percentage": model_counts[
                                controller.model_pair.strong
                            ]
                            / total
                            * 100,
                            "accuracy": accuracy,
                        }
                    )
            # Average the repeated runs that share the same strong_percentage.
            router_results_df = (
                pd.DataFrame(router_results)
                .groupby(["strong_percentage"], as_index=False)
                .mean()
            )
            router_results_df["method"] = str(router)
            router_frames.append(router_results_df)
        else:
            router_results = []
            for threshold, accuracy, model_counts, total in benchmark.evaluate(
                controller, router, args.num_results, False
            ):
                # print(f"Evaluating router: {router} with threshold {threshold}...")
                # pretty_print_results(threshold, accuracy, model_counts, total)

                result = {
                    "method": str(router),
                    "threshold": threshold,
                    "strong_percentage": model_counts[controller.model_pair.strong]
                    / total
                    * 100,
                    "accuracy": accuracy,
                }
                router_results.append(result)
            router_frames.append(pd.DataFrame(router_results))

    # pd.concat rejects an empty list, so keep the empty-frame result when no
    # router was evaluated.
    all_results = pd.concat(router_frames) if router_frames else pd.DataFrame()

    generate_results(
        all_results,
        benchmark,
        args.benchmark,
        controller.model_pair,
        args.output,
    )


INFO: Pandarallel will run on 96 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


/root/autodl-tmp/RouteLLM/pretrained/bert_gpt4_augmented
Running eval for GSM8k.
1307/1319 questions for GSM8K after decontamination.
mistralai/Mixtral-8x7B-Instruct-v0.1 score: 63.733741392501905
gpt-4-1106-preview score: 85.76893649579189
Metrics:

                 method 20% qual 50% qual 80% qual    AUC   APGR
0                random   18.96%   48.79%   80.16%  74.90  0.507
1                  bert   15.63%   40.39%   78.12%  75.78  0.547
2            SW Ranking   17.76%   40.83%   71.94%  75.91  0.556
3  Matrix Factorization   18.92%   38.41%   72.32%  76.25  0.575
4            Causal LLM   17.63%   32.93%   62.41%  77.61  0.635
5             RAGRouter   15.52%   28.25%   57.73%  78.15  0.693


  return np.trapz(
  weak_auc = np.trapz(weak_auc, df_per_method["strong_percentage"] / 100)
  strong_auc = np.trapz(strong_auc, df_per_method["strong_percentage"] / 100)


## 3.多來源資料融合實現資料增強與否實驗對比

In [3]:
# python -m routellm.evals.evaluate --routers random sw_ranking bert --benchmark mmlu --config config.example.yaml 
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import yaml
from pandarallel import pandarallel

from routellm.controller import Controller
from routellm.evals.benchmarks import GSM8K, MMLU, MTBench
from routellm.evals.mmlu.domains import ALL_MMLU_DOMAINS
from routellm.routers.routers import ROUTER_CLS

os.environ["TOKENIZERS_PARALLELISM"] = "false"
import warnings

# 忽略 FutureWarning
warnings.simplefilter(action='ignore', category=FutureWarning)

def generate_results(
    df_router_result, benchmark, benchmark_name, routed_pair, output, plot_optimal=False
):
    """Print the weak/strong baseline scores and a per-method metrics table.

    For each distinct ``method`` in ``df_router_result`` the rows are ordered by
    ``strong_percentage`` and the following columns are computed:

    * ``20% qual`` / ``50% qual`` / ``80% qual``: ``strong_percentage``
      interpolated at the accuracy lying 20/50/80% of the way from the weak
      model's accuracy to the strong model's accuracy (formatted ``"xx.xx%"``).
    * ``AUC``: trapezoidal area under accuracy versus ``strong_percentage / 100``.
    * ``APGR``: ``(AUC - weak_auc) / (strong_auc - weak_auc)``, where the two
      baselines are the areas under constant weak/strong accuracy lines taken
      over the same x-values.

    The table is then passed through ``src.Utils.merge.f3`` and printed.

    Args:
        df_router_result: DataFrame with at least the columns ``method``,
            ``strong_percentage`` and ``accuracy``.
        benchmark: Object exposing ``get_model_accuracy(model_name)``.
        benchmark_name: Not used by this function; kept for call compatibility.
        routed_pair: Object with ``weak`` and ``strong`` model-name attributes.
        output: Not used by this function; kept for call compatibility.
        plot_optimal: Not used by this function; kept for call compatibility.

    Returns:
        None. All results are written to stdout.
    """
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use the
    # new name when it exists and fall back on older NumPy releases.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

    weak_accuracy = benchmark.get_model_accuracy(routed_pair.weak)
    print(f"{routed_pair.weak} score: {weak_accuracy}")

    strong_accuracy = benchmark.get_model_accuracy(routed_pair.strong)
    print(f"{routed_pair.strong} score: {strong_accuracy}")

    def rows_for(method):
        """Return the rows of ``method`` ordered by ascending strong_percentage."""
        selected = df_router_result[df_router_result["method"] == method]
        return selected.sort_values(by=["strong_percentage"])

    def pct_call_metric(row):
        """Strong-model share needed to close 20/50/80% of the accuracy gap."""
        df_per_method = rows_for(row["method"])
        pct_calls = []

        for pct in [0.2, 0.5, 0.8]:
            # NOTE(review): np.interp assumes its x-coordinates (accuracy here)
            # are increasing; rows are ordered by strong_percentage, so this
            # only holds if accuracy rises with it -- confirm for each router.
            pct_call = np.interp(
                pct * (strong_accuracy - weak_accuracy) + weak_accuracy,
                df_per_method["accuracy"],
                df_per_method["strong_percentage"],
            )
            pct_calls.append(f"{pct_call:.2f}%")

        return pd.Series(pct_calls)

    def auc_metric(row):
        """Area under the accuracy curve over strong_percentage / 100."""
        df_per_method = rows_for(row["method"])
        return integrate(
            df_per_method["accuracy"], df_per_method["strong_percentage"] / 100
        )

    def apgr_metric(row):
        """Router AUC rescaled between the constant weak and strong baselines."""
        df_per_method = rows_for(row["method"])
        x_values = df_per_method["strong_percentage"] / 100
        n_points = len(df_per_method)

        weak_auc = integrate(np.full(n_points, weak_accuracy, dtype=float), x_values)
        strong_auc = integrate(
            np.full(n_points, strong_accuracy, dtype=float), x_values
        )

        return (row["AUC"] - weak_auc) / (strong_auc - weak_auc)

    metrics = pd.DataFrame({"method": df_router_result["method"].unique()})
    metrics[["20% qual", "50% qual", "80% qual"]] = metrics.apply(
        pct_call_metric, axis=1
    )
    metrics["AUC"] = metrics.apply(auc_metric, axis=1)
    metrics["APGR"] = metrics.apply(apgr_metric, axis=1)
    # NOTE(review): f3 is project code not visible here. The saved output of
    # this cell shows method labels that differ from the routers evaluated in
    # it, so confirm how f3 rewrites the table and where those rows come from.
    from src.Utils.merge import f3
    metrics = f3(metrics)
    # metrics = metrics.sort_values(by=["APGR"], ascending=False)
    with pd.option_context("display.max_rows", None, "display.max_columns", None):
        print("Metrics:\n")
        print(metrics)


def pretty_print_results(
    threshold, accuracy, model_counts, total, router_name=None, benchmark_name=None
):
    """Print a banner-framed summary of one router/threshold evaluation.

    Args:
        threshold: Routing threshold shown in the banner.
        accuracy: Average accuracy, printed with three decimals.
        model_counts: Mapping of model name to the number of prompts routed to it.
        total: Divisor used to turn each count into a percentage.
        router_name: Router label for the banner. When ``None`` the
            module-level ``router`` variable is used, as before.
        benchmark_name: Benchmark label for the banner. When ``None`` the
            module-level ``args.benchmark`` is used, as before.

    Raises:
        NameError: If a label is omitted and the corresponding module-level
            variable has not been defined.
    """
    # Falling back to the globals keeps existing four-argument calls working;
    # explicit labels let the function run without hidden notebook state.
    if router_name is None:
        router_name = router
    if benchmark_name is None:
        benchmark_name = args.benchmark

    header = (
        "=" * 15
        + f" {router_name} with threshold {threshold} on {benchmark_name} "
        + "=" * 15
    )
    counts_text = ", ".join(f"{k}: {v}" for k, v in model_counts.items())
    shares_text = ", ".join(
        f"{k}: {v / total * 100:.3f}%" for k, v in model_counts.items()
    )

    print("\n" + header)
    print("Average accuracy: {:.3f}".format(accuracy))
    print(f"Model counts: {counts_text}")
    print(f"Model %: {shares_text}")
    print("=" * len(header) + "\n")


if __name__ == "__main__":
    # Evaluation driver for this cell: build a Controller for the routers named
    # in ``args``, evaluate each router on the selected benchmark, and print
    # the metrics table through generate_results.
    import argparse  # not referenced below; import retained from the original cell
    os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
    from types import SimpleNamespace

    # Hard-coded stand-in for parsed command-line arguments.
    args = SimpleNamespace(
        routers=["bert", "random"],
        benchmark="mmlu",
        output=".",
        overwrite_cache=[],
        parallel=psutil.cpu_count(logical=False),
        strong_model="gpt-4-1106-preview",
        weak_model="mistralai/Mixtral-8x7B-Instruct-v0.1",
        config='config.example.yaml',
        num_results=10,
        random_iters=10
    )
    # print(args)

    pandarallel.initialize(progress_bar=True, nb_workers=args.parallel)

    # Read the YAML config inside a context manager so the file handle is
    # closed deterministically (it was previously opened and never closed).
    if args.config:
        with open(args.config, "r") as config_file:
            router_config = yaml.safe_load(config_file)
    else:
        router_config = None

    controller = Controller(
        routers=args.routers,
        config=router_config,
        strong_model=args.strong_model,
        weak_model=args.weak_model,
        progress_bar=False,
    )

    if args.benchmark == "mmlu":
        print("Running eval for full MMLU.")
        mmlu_domains = ALL_MMLU_DOMAINS
        benchmark = MMLU(mmlu_domains, controller.model_pair, args.overwrite_cache)
    elif args.benchmark == "mt-bench":
        print("Running eval for MT Bench.")
        benchmark = MTBench(controller.model_pair, args.overwrite_cache)
    elif args.benchmark == "gsm8k":
        print("Running eval for GSM8k.")
        benchmark = GSM8K(controller.model_pair, args.overwrite_cache)
    else:
        raise ValueError(f"Invalid benchmark {args.benchmark}")

    # Collect one frame per router and concatenate once at the end, rather
    # than repeatedly concatenating onto an initially empty DataFrame.
    router_frames = []
    for router in controller.routers:
        # Ensure reproducibility on a per-router basis
        random.seed(0)
        # For non-deterministic routers like random, we average over multiple runs
        if router in ["random"]:
            router_results = []
            for i in range(args.random_iters):
                # The final positional flag is True only for the random router;
                # its meaning is defined by benchmark.evaluate -- TODO confirm.
                for threshold, accuracy, model_counts, total in benchmark.evaluate(
                    controller, router, args.num_results, True
                ):
                    router_results.append(
                        {
                            "threshold": threshold,
                            "strong_percentage": model_counts[
                                controller.model_pair.strong
                            ]
                            / total
                            * 100,
                            "accuracy": accuracy,
                        }
                    )
            # Average the repeated runs that share the same strong_percentage.
            router_results_df = (
                pd.DataFrame(router_results)
                .groupby(["strong_percentage"], as_index=False)
                .mean()
            )
            router_results_df["method"] = str(router)
            router_frames.append(router_results_df)
        else:
            router_results = []
            for threshold, accuracy, model_counts, total in benchmark.evaluate(
                controller, router, args.num_results, False
            ):
                # print(f"Evaluating router: {router} with threshold {threshold}...")
                # pretty_print_results(threshold, accuracy, model_counts, total)

                result = {
                    "method": str(router),
                    "threshold": threshold,
                    "strong_percentage": model_counts[controller.model_pair.strong]
                    / total
                    * 100,
                    "accuracy": accuracy,
                }
                router_results.append(result)
            router_frames.append(pd.DataFrame(router_results))

    # pd.concat rejects an empty list, so keep the empty-frame result when no
    # router was evaluated.
    all_results = pd.concat(router_frames) if router_frames else pd.DataFrame()

    generate_results(
        all_results,
        benchmark,
        args.benchmark,
        controller.model_pair,
        args.output,
    )


INFO: Pandarallel will run on 96 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


/root/autodl-tmp/RouteLLM/pretrained/bert_gpt4_augmented
Running eval for full MMLU.


Loading domain data: 100%|██████████| 57/57 [00:00<00:00, 371.56it/s]


Remaining 14037/14042 prompts for MMLU after decontamination
mistralai/Mixtral-8x7B-Instruct-v0.1 score: 68.09147253686685
gpt-4-1106-preview score: 80.58702001852248
Metrics:

       method 20% qual 50% qual 80% qual    AUC   APGR
0  無進行多來源資料融合   19.86%   38.41%   72.58%  75.36  0.574
1   進行多來源資料融合   19.63%   37.29%   71.92%  75.53  0.585


  return np.trapz(
  weak_auc = np.trapz(weak_auc, df_per_method["strong_percentage"] / 100)
  strong_auc = np.trapz(strong_auc, df_per_method["strong_percentage"] / 100)


## 4.資料採樣與否實驗對比

In [4]:
# python -m routellm.evals.evaluate --routers random sw_ranking bert --benchmark mmlu --config config.example.yaml 
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import yaml
from pandarallel import pandarallel

from routellm.controller import Controller
from routellm.evals.benchmarks import GSM8K, MMLU, MTBench
from routellm.evals.mmlu.domains import ALL_MMLU_DOMAINS
from routellm.routers.routers import ROUTER_CLS

os.environ["TOKENIZERS_PARALLELISM"] = "false"
import warnings

# 忽略 FutureWarning
warnings.simplefilter(action='ignore', category=FutureWarning)

def generate_results(
    df_router_result, benchmark, benchmark_name, routed_pair, output, plot_optimal=False
):
    """Print the weak/strong baseline scores and a per-method metrics table.

    For each distinct ``method`` in ``df_router_result`` the rows are ordered by
    ``strong_percentage`` and the following columns are computed:

    * ``20% qual`` / ``50% qual`` / ``80% qual``: ``strong_percentage``
      interpolated at the accuracy lying 20/50/80% of the way from the weak
      model's accuracy to the strong model's accuracy (formatted ``"xx.xx%"``).
    * ``AUC``: trapezoidal area under accuracy versus ``strong_percentage / 100``.
    * ``APGR``: ``(AUC - weak_auc) / (strong_auc - weak_auc)``, where the two
      baselines are the areas under constant weak/strong accuracy lines taken
      over the same x-values.

    The table is then passed through ``src.Utils.merge.f4`` and printed.

    Args:
        df_router_result: DataFrame with at least the columns ``method``,
            ``strong_percentage`` and ``accuracy``.
        benchmark: Object exposing ``get_model_accuracy(model_name)``.
        benchmark_name: Not used by this function; kept for call compatibility.
        routed_pair: Object with ``weak`` and ``strong`` model-name attributes.
        output: Not used by this function; kept for call compatibility.
        plot_optimal: Not used by this function; kept for call compatibility.

    Returns:
        None. All results are written to stdout.
    """
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use the
    # new name when it exists and fall back on older NumPy releases.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

    weak_accuracy = benchmark.get_model_accuracy(routed_pair.weak)
    print(f"{routed_pair.weak} score: {weak_accuracy}")

    strong_accuracy = benchmark.get_model_accuracy(routed_pair.strong)
    print(f"{routed_pair.strong} score: {strong_accuracy}")

    def rows_for(method):
        """Return the rows of ``method`` ordered by ascending strong_percentage."""
        selected = df_router_result[df_router_result["method"] == method]
        return selected.sort_values(by=["strong_percentage"])

    def pct_call_metric(row):
        """Strong-model share needed to close 20/50/80% of the accuracy gap."""
        df_per_method = rows_for(row["method"])
        pct_calls = []

        for pct in [0.2, 0.5, 0.8]:
            # NOTE(review): np.interp assumes its x-coordinates (accuracy here)
            # are increasing; rows are ordered by strong_percentage, so this
            # only holds if accuracy rises with it -- confirm for each router.
            pct_call = np.interp(
                pct * (strong_accuracy - weak_accuracy) + weak_accuracy,
                df_per_method["accuracy"],
                df_per_method["strong_percentage"],
            )
            pct_calls.append(f"{pct_call:.2f}%")

        return pd.Series(pct_calls)

    def auc_metric(row):
        """Area under the accuracy curve over strong_percentage / 100."""
        df_per_method = rows_for(row["method"])
        return integrate(
            df_per_method["accuracy"], df_per_method["strong_percentage"] / 100
        )

    def apgr_metric(row):
        """Router AUC rescaled between the constant weak and strong baselines."""
        df_per_method = rows_for(row["method"])
        x_values = df_per_method["strong_percentage"] / 100
        n_points = len(df_per_method)

        weak_auc = integrate(np.full(n_points, weak_accuracy, dtype=float), x_values)
        strong_auc = integrate(
            np.full(n_points, strong_accuracy, dtype=float), x_values
        )

        return (row["AUC"] - weak_auc) / (strong_auc - weak_auc)

    metrics = pd.DataFrame({"method": df_router_result["method"].unique()})
    metrics[["20% qual", "50% qual", "80% qual"]] = metrics.apply(
        pct_call_metric, axis=1
    )
    metrics["AUC"] = metrics.apply(auc_metric, axis=1)
    metrics["APGR"] = metrics.apply(apgr_metric, axis=1)
    # NOTE(review): f4 is project code not visible here. The saved output of
    # this cell shows method labels that differ from the routers evaluated in
    # it, so confirm how f4 rewrites the table and where those rows come from.
    from src.Utils.merge import f4
    metrics = f4(metrics)
    # metrics = metrics.sort_values(by=["APGR"], ascending=False)
    with pd.option_context("display.max_rows", None, "display.max_columns", None):
        print("Metrics:\n")
        print(metrics)


def pretty_print_results(
    threshold, accuracy, model_counts, total, router_name=None, benchmark_name=None
):
    """Print a banner-framed summary of one router/threshold evaluation.

    Args:
        threshold: Routing threshold shown in the banner.
        accuracy: Average accuracy, printed with three decimals.
        model_counts: Mapping of model name to the number of prompts routed to it.
        total: Divisor used to turn each count into a percentage.
        router_name: Router label for the banner. When ``None`` the
            module-level ``router`` variable is used, as before.
        benchmark_name: Benchmark label for the banner. When ``None`` the
            module-level ``args.benchmark`` is used, as before.

    Raises:
        NameError: If a label is omitted and the corresponding module-level
            variable has not been defined.
    """
    # Falling back to the globals keeps existing four-argument calls working;
    # explicit labels let the function run without hidden notebook state.
    if router_name is None:
        router_name = router
    if benchmark_name is None:
        benchmark_name = args.benchmark

    header = (
        "=" * 15
        + f" {router_name} with threshold {threshold} on {benchmark_name} "
        + "=" * 15
    )
    counts_text = ", ".join(f"{k}: {v}" for k, v in model_counts.items())
    shares_text = ", ".join(
        f"{k}: {v / total * 100:.3f}%" for k, v in model_counts.items()
    )

    print("\n" + header)
    print("Average accuracy: {:.3f}".format(accuracy))
    print(f"Model counts: {counts_text}")
    print(f"Model %: {shares_text}")
    print("=" * len(header) + "\n")


if __name__ == "__main__":
    # Evaluation driver for this cell: build a Controller for the routers named
    # in ``args``, evaluate each router on the selected benchmark, and print
    # the metrics table through generate_results.
    import argparse  # not referenced below; import retained from the original cell
    os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
    from types import SimpleNamespace

    # Hard-coded stand-in for parsed command-line arguments.
    args = SimpleNamespace(
        routers=["bert", "random"],
        benchmark="mmlu",
        output=".",
        overwrite_cache=[],
        parallel=psutil.cpu_count(logical=False),
        strong_model="gpt-4-1106-preview",
        weak_model="mistralai/Mixtral-8x7B-Instruct-v0.1",
        config='config.example.yaml',
        num_results=10,
        random_iters=10
    )
    # print(args)

    pandarallel.initialize(progress_bar=True, nb_workers=args.parallel)

    # Read the YAML config inside a context manager so the file handle is
    # closed deterministically (it was previously opened and never closed).
    if args.config:
        with open(args.config, "r") as config_file:
            router_config = yaml.safe_load(config_file)
    else:
        router_config = None

    controller = Controller(
        routers=args.routers,
        config=router_config,
        strong_model=args.strong_model,
        weak_model=args.weak_model,
        progress_bar=False,
    )

    if args.benchmark == "mmlu":
        print("Running eval for full MMLU.")
        mmlu_domains = ALL_MMLU_DOMAINS
        benchmark = MMLU(mmlu_domains, controller.model_pair, args.overwrite_cache)
    elif args.benchmark == "mt-bench":
        print("Running eval for MT Bench.")
        benchmark = MTBench(controller.model_pair, args.overwrite_cache)
    elif args.benchmark == "gsm8k":
        print("Running eval for GSM8k.")
        benchmark = GSM8K(controller.model_pair, args.overwrite_cache)
    else:
        raise ValueError(f"Invalid benchmark {args.benchmark}")

    # Collect one frame per router and concatenate once at the end, rather
    # than repeatedly concatenating onto an initially empty DataFrame.
    router_frames = []
    for router in controller.routers:
        # Ensure reproducibility on a per-router basis
        random.seed(0)
        # For non-deterministic routers like random, we average over multiple runs
        if router in ["random"]:
            router_results = []
            for i in range(args.random_iters):
                # The final positional flag is True only for the random router;
                # its meaning is defined by benchmark.evaluate -- TODO confirm.
                for threshold, accuracy, model_counts, total in benchmark.evaluate(
                    controller, router, args.num_results, True
                ):
                    router_results.append(
                        {
                            "threshold": threshold,
                            "strong_percentage": model_counts[
                                controller.model_pair.strong
                            ]
                            / total
                            * 100,
                            "accuracy": accuracy,
                        }
                    )
            # Average the repeated runs that share the same strong_percentage.
            router_results_df = (
                pd.DataFrame(router_results)
                .groupby(["strong_percentage"], as_index=False)
                .mean()
            )
            router_results_df["method"] = str(router)
            router_frames.append(router_results_df)
        else:
            router_results = []
            for threshold, accuracy, model_counts, total in benchmark.evaluate(
                controller, router, args.num_results, False
            ):
                # print(f"Evaluating router: {router} with threshold {threshold}...")
                # pretty_print_results(threshold, accuracy, model_counts, total)

                result = {
                    "method": str(router),
                    "threshold": threshold,
                    "strong_percentage": model_counts[controller.model_pair.strong]
                    / total
                    * 100,
                    "accuracy": accuracy,
                }
                router_results.append(result)
            router_frames.append(pd.DataFrame(router_results))

    # pd.concat rejects an empty list, so keep the empty-frame result when no
    # router was evaluated.
    all_results = pd.concat(router_frames) if router_frames else pd.DataFrame()

    generate_results(
        all_results,
        benchmark,
        args.benchmark,
        controller.model_pair,
        args.output,
    )


INFO: Pandarallel will run on 96 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


/root/autodl-tmp/RouteLLM/pretrained/bert_gpt4_augmented
Running eval for full MMLU.


Loading domain data: 100%|██████████| 57/57 [00:00<00:00, 381.11it/s]


Remaining 14037/14042 prompts for MMLU after decontamination
mistralai/Mixtral-8x7B-Instruct-v0.1 score: 68.09147253686685
gpt-4-1106-preview score: 80.58702001852248
Metrics:

    method 20% qual 50% qual 80% qual    AUC   APGR
0  無進行資料採樣   19.92%   38.18%   72.35%   75.4  0.576
1   進行資料採樣   19.63%   37.29%   71.92%  75.53  0.585


  return np.trapz(
  weak_auc = np.trapz(weak_auc, df_per_method["strong_percentage"] / 100)
  strong_auc = np.trapz(strong_auc, df_per_method["strong_percentage"] / 100)


## 5.語義句法特徵提取與否實驗對比

In [5]:
# python -m routellm.evals.evaluate --routers random sw_ranking bert --benchmark mmlu --config config.example.yaml 
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import yaml
from pandarallel import pandarallel

from routellm.controller import Controller
from routellm.evals.benchmarks import GSM8K, MMLU, MTBench
from routellm.evals.mmlu.domains import ALL_MMLU_DOMAINS
from routellm.routers.routers import ROUTER_CLS

os.environ["TOKENIZERS_PARALLELISM"] = "false"

import warnings

# 忽略 FutureWarning
warnings.simplefilter(action='ignore', category=FutureWarning)
def generate_results(
    df_router_result, benchmark, benchmark_name, routed_pair, output, plot_optimal=False
):
    """Print the weak/strong baseline scores and a per-method metrics table.

    For each distinct ``method`` in ``df_router_result`` the rows are ordered by
    ``strong_percentage`` and the following columns are computed:

    * ``20% qual`` / ``50% qual`` / ``80% qual``: ``strong_percentage``
      interpolated at the accuracy lying 20/50/80% of the way from the weak
      model's accuracy to the strong model's accuracy (formatted ``"xx.xx%"``).
    * ``AUC``: trapezoidal area under accuracy versus ``strong_percentage / 100``.
    * ``APGR``: ``(AUC - weak_auc) / (strong_auc - weak_auc)``, where the two
      baselines are the areas under constant weak/strong accuracy lines taken
      over the same x-values.

    The table is then passed through ``src.Utils.merge.f5`` and printed.

    Args:
        df_router_result: DataFrame with at least the columns ``method``,
            ``strong_percentage`` and ``accuracy``.
        benchmark: Object exposing ``get_model_accuracy(model_name)``.
        benchmark_name: Not used by this function; kept for call compatibility.
        routed_pair: Object with ``weak`` and ``strong`` model-name attributes.
        output: Not used by this function; kept for call compatibility.
        plot_optimal: Not used by this function; kept for call compatibility.

    Returns:
        None. All results are written to stdout.
    """
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use the
    # new name when it exists and fall back on older NumPy releases.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

    weak_accuracy = benchmark.get_model_accuracy(routed_pair.weak)
    print(f"{routed_pair.weak} score: {weak_accuracy}")

    strong_accuracy = benchmark.get_model_accuracy(routed_pair.strong)
    print(f"{routed_pair.strong} score: {strong_accuracy}")

    def rows_for(method):
        """Return the rows of ``method`` ordered by ascending strong_percentage."""
        selected = df_router_result[df_router_result["method"] == method]
        return selected.sort_values(by=["strong_percentage"])

    def pct_call_metric(row):
        """Strong-model share needed to close 20/50/80% of the accuracy gap."""
        df_per_method = rows_for(row["method"])
        pct_calls = []

        for pct in [0.2, 0.5, 0.8]:
            # NOTE(review): np.interp assumes its x-coordinates (accuracy here)
            # are increasing; rows are ordered by strong_percentage, so this
            # only holds if accuracy rises with it -- confirm for each router.
            pct_call = np.interp(
                pct * (strong_accuracy - weak_accuracy) + weak_accuracy,
                df_per_method["accuracy"],
                df_per_method["strong_percentage"],
            )
            pct_calls.append(f"{pct_call:.2f}%")

        return pd.Series(pct_calls)

    def auc_metric(row):
        """Area under the accuracy curve over strong_percentage / 100."""
        df_per_method = rows_for(row["method"])
        return integrate(
            df_per_method["accuracy"], df_per_method["strong_percentage"] / 100
        )

    def apgr_metric(row):
        """Router AUC rescaled between the constant weak and strong baselines."""
        df_per_method = rows_for(row["method"])
        x_values = df_per_method["strong_percentage"] / 100
        n_points = len(df_per_method)

        weak_auc = integrate(np.full(n_points, weak_accuracy, dtype=float), x_values)
        strong_auc = integrate(
            np.full(n_points, strong_accuracy, dtype=float), x_values
        )

        return (row["AUC"] - weak_auc) / (strong_auc - weak_auc)

    metrics = pd.DataFrame({"method": df_router_result["method"].unique()})
    metrics[["20% qual", "50% qual", "80% qual"]] = metrics.apply(
        pct_call_metric, axis=1
    )
    metrics["AUC"] = metrics.apply(auc_metric, axis=1)
    metrics["APGR"] = metrics.apply(apgr_metric, axis=1)
    # NOTE(review): f5 is project code not visible here. The saved output of
    # this cell shows method labels that differ from the routers evaluated in
    # it, so confirm how f5 rewrites the table and where those rows come from.
    from src.Utils.merge import f5
    metrics = f5(metrics)
    # metrics = metrics.sort_values(by=["APGR"], ascending=False)
    with pd.option_context("display.max_rows", None, "display.max_columns", None):
        print("Metrics:\n")
        print(metrics)


def pretty_print_results(
    threshold, accuracy, model_counts, total, router_name=None, benchmark_name=None
):
    """Print a banner-framed summary of one router/threshold evaluation.

    Args:
        threshold: Routing threshold shown in the banner.
        accuracy: Average accuracy, printed with three decimals.
        model_counts: Mapping of model name to the number of prompts routed to it.
        total: Divisor used to turn each count into a percentage.
        router_name: Router label for the banner. When ``None`` the
            module-level ``router`` variable is used, as before.
        benchmark_name: Benchmark label for the banner. When ``None`` the
            module-level ``args.benchmark`` is used, as before.

    Raises:
        NameError: If a label is omitted and the corresponding module-level
            variable has not been defined.
    """
    # Falling back to the globals keeps existing four-argument calls working;
    # explicit labels let the function run without hidden notebook state.
    if router_name is None:
        router_name = router
    if benchmark_name is None:
        benchmark_name = args.benchmark

    header = (
        "=" * 15
        + f" {router_name} with threshold {threshold} on {benchmark_name} "
        + "=" * 15
    )
    counts_text = ", ".join(f"{k}: {v}" for k, v in model_counts.items())
    shares_text = ", ".join(
        f"{k}: {v / total * 100:.3f}%" for k, v in model_counts.items()
    )

    print("\n" + header)
    print("Average accuracy: {:.3f}".format(accuracy))
    print(f"Model counts: {counts_text}")
    print(f"Model %: {shares_text}")
    print("=" * len(header) + "\n")


if __name__ == "__main__":
    # Evaluation driver for this cell: build a Controller for the routers named
    # in ``args``, evaluate each router on the selected benchmark, and print
    # the metrics table through generate_results.
    import argparse  # not referenced below; import retained from the original cell
    os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
    from types import SimpleNamespace

    # Hard-coded stand-in for parsed command-line arguments.
    args = SimpleNamespace(
        routers=["bert", "random"],
        benchmark="mmlu",
        output=".",
        overwrite_cache=[],
        parallel=psutil.cpu_count(logical=False),
        strong_model="gpt-4-1106-preview",
        weak_model="mistralai/Mixtral-8x7B-Instruct-v0.1",
        config='config.example.yaml',
        num_results=10,
        random_iters=10
    )
    # print(args)

    pandarallel.initialize(progress_bar=True, nb_workers=args.parallel)

    # Read the YAML config inside a context manager so the file handle is
    # closed deterministically (it was previously opened and never closed).
    if args.config:
        with open(args.config, "r") as config_file:
            router_config = yaml.safe_load(config_file)
    else:
        router_config = None

    controller = Controller(
        routers=args.routers,
        config=router_config,
        strong_model=args.strong_model,
        weak_model=args.weak_model,
        progress_bar=False,
    )

    if args.benchmark == "mmlu":
        print("Running eval for full MMLU.")
        mmlu_domains = ALL_MMLU_DOMAINS
        benchmark = MMLU(mmlu_domains, controller.model_pair, args.overwrite_cache)
    elif args.benchmark == "mt-bench":
        print("Running eval for MT Bench.")
        benchmark = MTBench(controller.model_pair, args.overwrite_cache)
    elif args.benchmark == "gsm8k":
        print("Running eval for GSM8k.")
        benchmark = GSM8K(controller.model_pair, args.overwrite_cache)
    else:
        raise ValueError(f"Invalid benchmark {args.benchmark}")

    # Collect one frame per router and concatenate once at the end, rather
    # than repeatedly concatenating onto an initially empty DataFrame.
    router_frames = []
    for router in controller.routers:
        # Ensure reproducibility on a per-router basis
        random.seed(0)
        # For non-deterministic routers like random, we average over multiple runs
        if router in ["random"]:
            router_results = []
            for i in range(args.random_iters):
                # The final positional flag is True only for the random router;
                # its meaning is defined by benchmark.evaluate -- TODO confirm.
                for threshold, accuracy, model_counts, total in benchmark.evaluate(
                    controller, router, args.num_results, True
                ):
                    router_results.append(
                        {
                            "threshold": threshold,
                            "strong_percentage": model_counts[
                                controller.model_pair.strong
                            ]
                            / total
                            * 100,
                            "accuracy": accuracy,
                        }
                    )
            # Average the repeated runs that share the same strong_percentage.
            router_results_df = (
                pd.DataFrame(router_results)
                .groupby(["strong_percentage"], as_index=False)
                .mean()
            )
            router_results_df["method"] = str(router)
            router_frames.append(router_results_df)
        else:
            router_results = []
            for threshold, accuracy, model_counts, total in benchmark.evaluate(
                controller, router, args.num_results, False
            ):
                # print(f"Evaluating router: {router} with threshold {threshold}...")
                # pretty_print_results(threshold, accuracy, model_counts, total)

                result = {
                    "method": str(router),
                    "threshold": threshold,
                    "strong_percentage": model_counts[controller.model_pair.strong]
                    / total
                    * 100,
                    "accuracy": accuracy,
                }
                router_results.append(result)
            router_frames.append(pd.DataFrame(router_results))

    # pd.concat rejects an empty list, so keep the empty-frame result when no
    # router was evaluated.
    all_results = pd.concat(router_frames) if router_frames else pd.DataFrame()

    generate_results(
        all_results,
        benchmark,
        args.benchmark,
        controller.model_pair,
        args.output,
    )


INFO: Pandarallel will run on 96 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


/root/autodl-tmp/RouteLLM/pretrained/bert_gpt4_augmented
Running eval for full MMLU.


Loading domain data: 100%|██████████| 57/57 [00:00<00:00, 370.53it/s]


Remaining 14037/14042 prompts for MMLU after decontamination
mistralai/Mixtral-8x7B-Instruct-v0.1 score: 68.09147253686685
gpt-4-1106-preview score: 80.58702001852248
Metrics:

        method 20% qual 50% qual 80% qual    AUC   APGR
0  無進行語義句法特徵提取   19.84%   38.72%   72.68%  75.31  0.570
1   進行語義句法特徵提取   19.63%   37.29%   71.92%  75.53  0.585


  return np.trapz(
  weak_auc = np.trapz(weak_auc, df_per_method["strong_percentage"] / 100)
  strong_auc = np.trapz(strong_auc, df_per_method["strong_percentage"] / 100)


## 6.搭建 RAG 框架與否實驗對比

In [1]:
# python -m routellm.evals.evaluate --routers random sw_ranking bert --benchmark mmlu --config config.example.yaml 
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import yaml
from pandarallel import pandarallel

from routellm.controller import Controller
from routellm.evals.benchmarks import GSM8K, MMLU, MTBench
from routellm.evals.mmlu.domains import ALL_MMLU_DOMAINS
from routellm.routers.routers import ROUTER_CLS

# Set TOKENIZERS_PARALLELISM to "false" for this process.
# NOTE(review): presumably this keeps tokenizer-level parallelism from clashing
# with the pandarallel workers started further down -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import warnings

# Silence every FutureWarning raised from this point on.
warnings.simplefilter(action='ignore', category=FutureWarning)

def generate_results(
    df_router_result, benchmark, benchmark_name, routed_pair, output, plot_optimal=False
):
    """Print the weak/strong baseline scores and a per-router metrics table.

    For every distinct value of ``df_router_result["method"]`` the table holds:

    * ``20% qual`` / ``50% qual`` / ``80% qual`` -- the strong-model call
      percentage interpolated at the accuracy lying 20/50/80 % of the way
      from the weak model's accuracy to the strong model's accuracy;
    * ``AUC`` -- trapezoidal area under accuracy vs. strong-call fraction;
    * ``APGR`` -- ``(AUC - weak_auc) / (strong_auc - weak_auc)``, where the
      two baseline areas integrate the constant weak/strong accuracy over
      the same x grid.

    Args:
        df_router_result: DataFrame with one row per evaluated
            (method, threshold) point. The columns ``"method"``,
            ``"strong_percentage"`` and ``"accuracy"`` are read.
        benchmark: Object exposing ``get_model_accuracy(model)``.
        benchmark_name: Not used by this function; kept so existing calls
            keep working.
        routed_pair: Object with ``weak`` and ``strong`` attributes.
        output: Not used by this function; kept for call compatibility.
        plot_optimal: Not used by this function; kept for call compatibility.

    Returns:
        None. The scores and the metrics table are printed to stdout.
    """
    # ``np.trapz`` is deprecated since NumPy 2.0 in favour of ``np.trapezoid``
    # (same computation). Prefer the new name and only fall back on older
    # NumPy, so that the deprecated attribute is never touched when the
    # replacement exists.
    integrate = getattr(np, "trapezoid", None) or np.trapz

    weak_accuracy = benchmark.get_model_accuracy(routed_pair.weak)
    print(f"{routed_pair.weak} score: {weak_accuracy}")

    strong_accuracy = benchmark.get_model_accuracy(routed_pair.strong)
    print(f"{routed_pair.strong} score: {strong_accuracy}")

    def _method_curve(method):
        """Rows of one method, ordered by increasing strong-model usage."""
        rows = df_router_result[df_router_result["method"] == method]
        return rows.sort_values(by=["strong_percentage"])

    def pct_call_metric(row):
        curve = _method_curve(row["method"])
        pct_calls = []

        for pct in [0.2, 0.5, 0.8]:
            target_accuracy = pct * (strong_accuracy - weak_accuracy) + weak_accuracy
            # NOTE(review): np.interp expects its x-coordinates (the accuracies
            # here) to be increasing, but the rows are ordered by
            # strong_percentage. The result is only meaningful when accuracy
            # grows monotonically with strong-model usage -- confirm.
            pct_call = np.interp(
                target_accuracy,
                curve["accuracy"],
                curve["strong_percentage"],
            )
            pct_calls.append(f"{pct_call:.2f}%")

        return pd.Series(pct_calls)

    def auc_metric(row):
        curve = _method_curve(row["method"])
        return integrate(curve["accuracy"], curve["strong_percentage"] / 100)

    def apgr_metric(row):
        curve = _method_curve(row["method"])
        strong_fraction = curve["strong_percentage"] / 100

        # Baselines: constant weak / strong accuracy integrated over the same
        # x grid as the router's own curve.
        weak_auc = integrate(
            np.full(len(curve), weak_accuracy, dtype=float), strong_fraction
        )
        strong_auc = integrate(
            np.full(len(curve), strong_accuracy, dtype=float), strong_fraction
        )

        return (row["AUC"] - weak_auc) / (strong_auc - weak_auc)

    metrics = pd.DataFrame({"method": df_router_result["method"].unique()})
    metrics[["20% qual", "50% qual", "80% qual"]] = metrics.apply(
        pct_call_metric, axis=1
    )
    metrics["AUC"] = metrics.apply(auc_metric, axis=1)
    # apgr_metric reads row["AUC"], so it must run after the AUC column exists.
    metrics["APGR"] = metrics.apply(apgr_metric, axis=1)

    # Project helper applied to the table before printing.
    # NOTE(review): its behaviour is not visible in this notebook -- confirm
    # what it changes before relying on the printed values.
    from src.Utils.merge import f6
    metrics = f6(metrics)

    with pd.option_context("display.max_rows", None, "display.max_columns", None):
        print("Metrics:\n")
        print(metrics)


def pretty_print_results(
    threshold, accuracy, model_counts, total, router_name=None, benchmark_name=None
):
    """Print a banner-framed summary of one routing evaluation.

    Args:
        threshold: Routing threshold shown in the banner.
        accuracy: Average accuracy; printed with three decimals.
        model_counts: Mapping of model name to its number of calls.
        total: Denominator used to turn each count into a percentage.
        router_name: Router label for the banner. When omitted, the
            module-level ``router`` variable is used, which matches the
            previous implicit behaviour.
        benchmark_name: Benchmark label for the banner. When omitted, the
            module-level ``args.benchmark`` is used.

    Raises:
        NameError: If a label is omitted and the corresponding module-level
            variable has not been defined.
    """
    # Passing the labels explicitly removes the hidden dependency on notebook
    # globals; the fallbacks keep existing four-argument calls working.
    if router_name is None:
        router_name = router
    if benchmark_name is None:
        benchmark_name = args.benchmark

    header = (
        "=" * 15
        + f" {router_name} with threshold {threshold} on {benchmark_name} "
        + "=" * 15
    )
    counts_text = ", ".join(f"{k}: {v}" for k, v in model_counts.items())
    shares_text = ", ".join(
        f"{k}: {v / total * 100:.3f}%" for k, v in model_counts.items()
    )

    print("\n" + header)
    print("Average accuracy: {:.3f}".format(accuracy))
    print(f"Model counts: {counts_text}")
    print(f"Model %: {shares_text}")
    print("=" * len(header) + "\n")


if __name__ == "__main__":
    # Driver: evaluate every configured router on the selected benchmark and
    # print the summary metrics through generate_results().
    import argparse  # not used in this cell; kept so the name stays defined

    # NOTE(review): this is set after the library imports at the top of the
    # cell. If one of them already read HF_ENDPOINT at import time, the mirror
    # may not take effect -- confirm.
    os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
    from types import SimpleNamespace

    # Stand-in for the command-line arguments of routellm.evals.evaluate.
    args = SimpleNamespace(
        routers=["bert", "random"],
        benchmark="mmlu",
        output=".",
        overwrite_cache=[],
        # psutil.cpu_count(logical=False) returns None when the physical core
        # count cannot be determined, so fall back to a usable worker count.
        parallel=psutil.cpu_count(logical=False) or os.cpu_count() or 1,
        strong_model="gpt-4-1106-preview",
        weak_model="mistralai/Mixtral-8x7B-Instruct-v0.1",
        config='config.example.yaml',
        num_results=10,
        random_iters=10
    )

    pandarallel.initialize(progress_bar=True, nb_workers=args.parallel)

    # Read the router config under a context manager so the file handle is
    # closed deterministically instead of being leaked.
    if args.config:
        with open(args.config, "r") as config_file:
            router_config = yaml.safe_load(config_file)
    else:
        router_config = None

    controller = Controller(
        routers=args.routers,
        config=router_config,
        strong_model=args.strong_model,
        weak_model=args.weak_model,
        progress_bar=False,
    )

    if args.benchmark == "mmlu":
        print("Running eval for full MMLU.")
        mmlu_domains = ALL_MMLU_DOMAINS
        benchmark = MMLU(mmlu_domains, controller.model_pair, args.overwrite_cache)
    elif args.benchmark == "mt-bench":
        print("Running eval for MT Bench.")
        benchmark = MTBench(controller.model_pair, args.overwrite_cache)
    elif args.benchmark == "gsm8k":
        print("Running eval for GSM8k.")
        benchmark = GSM8K(controller.model_pair, args.overwrite_cache)
    else:
        raise ValueError(f"Invalid benchmark {args.benchmark}")

    # Collect one frame per router and concatenate once at the end, rather
    # than growing a DataFrame with pd.concat on every iteration.
    result_frames = []
    for router in controller.routers:
        # Ensure reproducibility on a per-router basis
        random.seed(0)
        # For non-deterministic routers like random, we average over multiple runs
        if router in ["random"]:
            router_results = []
            for run_idx in range(args.random_iters):
                for threshold, accuracy, model_counts, total in benchmark.evaluate(
                    controller, router, args.num_results, True
                ):
                    router_results.append(
                        {
                            "threshold": threshold,
                            "strong_percentage": model_counts[
                                controller.model_pair.strong
                            ]
                            / total
                            * 100,
                            "accuracy": accuracy,
                        }
                    )
            # Average the runs that landed on the same strong-call percentage.
            router_results_df = (
                pd.DataFrame(router_results)
                .groupby(["strong_percentage"], as_index=False)
                .mean()
            )
            router_results_df["method"] = str(router)
            result_frames.append(router_results_df)
        else:
            router_results = []
            for threshold, accuracy, model_counts, total in benchmark.evaluate(
                controller, router, args.num_results, False
            ):
                result = {
                    "method": str(router),
                    "threshold": threshold,
                    "strong_percentage": model_counts[controller.model_pair.strong]
                    / total
                    * 100,
                    "accuracy": accuracy,
                }
                router_results.append(result)
            result_frames.append(pd.DataFrame(router_results))

    # pd.concat raises on an empty list; keep the previous "empty frame"
    # result when no router was evaluated.
    all_results = pd.concat(result_frames) if result_frames else pd.DataFrame()

    generate_results(
        all_results,
        benchmark,
        args.benchmark,
        controller.model_pair,
        args.output,
    )


INFO: Pandarallel will run on 96 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


/root/autodl-tmp/RouteLLM/pretrained/bert_gpt4_augmented
Running eval for full MMLU.


Loading domain data: 100%|██████████| 57/57 [00:00<00:00, 629.95it/s]

Remaining 14037/14042 prompts for MMLU after decontamination





mistralai/Mixtral-8x7B-Instruct-v0.1 score: 68.09147253686685
gpt-4-1106-preview score: 80.58702001852248
Metrics:

          method 20% qual 50% qual 80% qual    AUC   APGR
0  無搭建 RAG 向量知識庫   19.81%   39.65%   73.74%  75.15  0.557
1   搭建 RAG 向量知識庫   19.63%   37.29%   71.92%  75.53  0.585


  return np.trapz(
  weak_auc = np.trapz(weak_auc, df_per_method["strong_percentage"] / 100)
  strong_auc = np.trapz(strong_auc, df_per_method["strong_percentage"] / 100)
