In [1]:
import pandas as pd 
from langchain_chroma import Chroma 
from langchain_community.vectorstores import FAISS

from retriever import Retriever
from evaluator import RetrievalEvaluator

test data

In [2]:
# Load the sample score table that serves as test data for the retriever evaluation.
df = pd.read_csv('sample_score.csv') 

In [None]:
# Show the available column names.
# NOTE(review): the `relevant_docs` labels used in the evaluation below appear to be
# drawn from these names — confirm they match exactly.
df.columns

### evaluate retriever

In [None]:
# Grid of retriever configurations to compare.
modes = ['embed', 'bm25', 'hybrid']
embed_model_names = ["all-MiniLM-L6-v2", "BAAI/LLM-Embedder", "BAAI/bge-small-en-v1.5"]
dbs = [FAISS, Chroma]
db_names = ['faiss', 'chroma']  # display labels, parallel to `dbs`
TOP_K = 5                       # forwarded to Retriever as top_k

# Test queries and, for each one, the labels handed to RetrievalEvaluator as the
# relevant set (presumably column names of `df` — verify against df.columns).
queries = ["What are the lowest writing scores?", 
           "What are the highest reading scores?", 
           "What are the average math scores?", 
           "If parental level of education has the impact for reading score?", 
           "What's the best comprehensive score?",
           "If food impacts writing score?"]

relevant_docs = [['writing score'], 
                 ['reading score'], 
                 ['math score'], 
                 ['parental level of education','reading score'], 
                 ['writing score','reading score', 'math score'],
                 ['lunch','writing score']]

# The parallel lists must line up; zip() below would otherwise truncate silently.
assert len(dbs) == len(db_names), "dbs and db_names must have the same length"
assert len(queries) == len(relevant_docs), "queries and relevant_docs must have the same length"

# Column-oriented accumulator; every list must end up with the same length so that
# pd.DataFrame(retriever_results) can be built from it.
retriever_results = {'modes': [], 
                     'embed_model_names': [], 
                     'dbs': [], 
                     'Recall@k': [],
                     'MRR': [],
                     'queries': [],
                     'retrieved_docs': []}

for mode in modes:
    for embed_model_name in embed_model_names:
        for db, db_name in zip(dbs, db_names):
            config = f"mode={mode}, embed_model={embed_model_name}, db={db_name}"

            # A configuration that cannot be built is reported and skipped.
            try:
                retriever = Retriever(mode=mode, embed_model_name=embed_model_name, db=db, top_k=TOP_K)
            except Exception as e:
                print(f"Error ({config}): {type(e).__name__}: {e}")
                continue

            for query, relevant_doc in zip(queries, relevant_docs):
                # Errors are handled per query (as in the other evaluation loops) so one
                # failing query does not drop the remaining queries of this configuration.
                try:
                    retrieved_doc = retriever.retrieve_schema(query, df, evaluate=True)
                    metrics = RetrievalEvaluator(retrieved_doc, relevant_doc).evaluate()
                    row = {'modes': mode,
                           'embed_model_names': embed_model_name,
                           'dbs': db_name,
                           'queries': query,
                           'retrieved_docs': retrieved_doc,
                           **metrics}
                    # Reject rows whose metric keys differ from the expected columns
                    # instead of leaving the accumulator lists with unequal lengths.
                    if row.keys() != retriever_results.keys():
                        raise ValueError(f"unexpected metric keys: {sorted(metrics)}")
                except Exception as e:
                    print(f"Error ({config}, query={query!r}): {type(e).__name__}: {e}")
                    continue

                # The row is complete and validated: append it in one go.
                for key, value in row.items():
                    retriever_results[key].append(value)


In [5]:
# One row per (configuration, query) collected by the evaluation loop.
retr_df = pd.DataFrame(retriever_results)

In [None]:
# Mean and number of scored queries for each metric, per retriever configuration.
# The counts show how many queries actually produced a row for that configuration.
retr_group_keys = ['modes', 'embed_model_names', 'dbs']
retr_metric_cols = ['Recall@k', 'MRR']

(
    retr_df
    .groupby(retr_group_keys)[retr_metric_cols]
    .agg(
        recall_5_mean=('Recall@k', 'mean'),
        recall_5_count=('Recall@k', 'count'),
        mrr_mean=('MRR', 'mean'),
        mrr_count=('MRR', 'count'),
    )
)

"embed + Chroma" isn't a good combination choice.


In [None]:

# Rows whose Recall@k + MRR is below 2, i.e. at least one of the two metrics
# is not at its maximum (assuming both are capped at 1).
imperfect_mask = retr_df[['Recall@k', 'MRR']].sum(axis=1) < 2
retr_df[imperfect_mask]

In [None]:
# Retrieved documents for the rows where Recall@k + MRR is below 2.
imperfect_mask = retr_df[['Recall@k', 'MRR']].sum(axis=1) < 2
retr_df.loc[imperfect_mask, 'retrieved_docs'].tolist()

All configurations perform well; pick one to use and briefly explain why.

### evaluate code generator



In [1]:
import pandas as pd 
from langchain_community.vectorstores import FAISS

from model import Model
from retriever import Retriever
from prompts import get_prompt, combined_template
from agent import RAGAgent
from evaluator import GenerCodeEvaluator
from execute import extract_code

import warnings
warnings.filterwarnings("ignore")


test data

In [2]:
# Load the sample score table that serves as test data for the code-generator evaluation.
df = pd.read_csv('sample_score.csv') 

evaluate code generator 

In [3]:
# Retriever configuration and prompt shared by every model evaluated below.
retriever = Retriever(
    mode='hybrid',
    embed_model_name="BAAI/bge-small-en-v1.5",
    db=FAISS,
    top_k=5,
)
prompt = get_prompt(combined_template)

In [4]:
# Models to compare as code generators.
model_names = ["llama3.1", "llama-3.3-70b-versatile","mistral", "gemma2-9b-it"]

# Sampling presets: each entry maps a preset label to the keyword arguments
# forwarded to Model(...). 'default' passes no overrides.
Kwargs = [{'stable': {'temperature': 0.1, 'top_p': 0.1}},
          {'diverse': {'temperature': 0.8, 'top_p': 0.95}},
          {'default': {}}]

queries = ["What are the lowest writing scores?", 
           "What are the highest reading scores?", 
           "What are the average math scores?", 
           "If parental level of education has the impact for reading score?", 
           "What's the best comprehensive score?",
           "If food impacts writing score?"]

# Reference pandas expression for each query, parallel to `queries`.
reference_codes = ["df['writing score'].min()", 
                 "df['reading score'].max()", 
                 "df['math score'].mean()", 
                 "df.groupby('parental level of education')['reading score'].mean()", 
                 "df[['reading score', 'writing score', 'math score']].sum(axis=1).max()",
                 "df.groupby('lunch')['writing score'].mean()"]

# The parallel lists must line up; zip() below would otherwise truncate silently.
assert len(queries) == len(reference_codes), "queries and reference_codes must have the same length"

# Column-oriented accumulator; every list must end up with the same length so that
# pd.DataFrame(code_generator_results) can be built from it.
code_generator_results = {'model_names': [], 
                         'Exact Match': [],
                         'F1 Score': [],
                         'queries': [],
                         'generateds': [],
                         'param_types': []}


for model_name in model_names:
    for param_group in Kwargs:
        for param_type, kwargs in param_group.items():
            model = Model(model_name=model_name, **kwargs)
            processor = RAGAgent(retriever, prompt, model, df)
            for query, reference_code in zip(queries, reference_codes):
                try:
                    generated = processor.processor(query)
                    generated_code = extract_code(generated)
                    metrics = GenerCodeEvaluator(generated_code, reference_code).evaluate()
                    row = {'model_names': model_name,
                           'queries': query,
                           'generateds': generated,
                           'param_types': param_type,
                           **metrics}
                    # Reject rows whose metric keys differ from the expected columns
                    # instead of leaving the accumulator lists with unequal lengths.
                    if row.keys() != code_generator_results.keys():
                        raise ValueError(f"unexpected metric keys: {sorted(metrics)}")
                except Exception as e:
                    # Say which run failed: skipped runs (e.g. API rate limits) lower the
                    # per-group counts in the summary and must be traceable.
                    print(f"Error (model={model_name}, params={param_type}, query={query!r}): "
                          f"{type(e).__name__}: {e}")
                    continue

                # The row is complete and validated: append it in one go.
                for key, value in row.items():
                    code_generator_results[key].append(value)



Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.3-70b-versatile` in organization `org_01jhnywfrpfrhb5r935e6ewy2h` service tier `on_demand` on : Limit 100000, Used 99291, Requested 1721. Please try again in 14m34.032999999s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': '', 'code': 'rate_limit_exceeded'}}
Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.3-70b-versatile` in organization `org_01jhnywfrpfrhb5r935e6ewy2h` service tier `on_demand` on : Limit 100000, Used 99291, Requested 1718. Please try again in 14m31.122999999s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': '', 'code': 'rate_limit_exceeded'}}
Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.3-70b-versatile` in organization `org_01jhnywfrpfrhb5r935e6ewy2h` service tier `on_demand` on : Limit 100000, Used 99290, Requested 1721. Please try again in 

In [5]:
# One row per (model, parameter preset, query) that completed without an error.
gen_df = pd.DataFrame(code_generator_results)

In [6]:

# Mean and number of scored queries for each metric, per model and parameter preset.
# A count below the number of queries means some runs failed and were skipped.
gen_group_keys = ['model_names', 'param_types']
gen_metric_cols = ['Exact Match', 'F1 Score']

(
    gen_df
    .groupby(gen_group_keys)[gen_metric_cols]
    .agg(
        em_mean=('Exact Match', 'mean'),
        em_count=('Exact Match', 'count'),
        f1_mean=('F1 Score', 'mean'),
        f1_count=('F1 Score', 'count'),
    )
)


Unnamed: 0_level_0,Unnamed: 1_level_0,em_mean,em_count,f1_mean,f1_count
model_names,param_types,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
gemma2-9b-it,default,1.0,6,1.0,6
gemma2-9b-it,diverse,1.0,6,1.0,6
gemma2-9b-it,stable,1.0,6,1.0,6
llama-3.3-70b-versatile,diverse,0.75,4,0.875,4
llama-3.3-70b-versatile,stable,1.0,6,1.0,6
llama3.1,default,1.0,6,1.0,6
llama3.1,diverse,0.833333,6,0.875,6
llama3.1,stable,1.0,6,1.0,6
mistral,default,0.833333,6,0.916667,6
mistral,diverse,1.0,6,1.0,6


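Choose llama3.1 + stable: it reaches an Exact Match and F1 of 1.0 on all 6 queries. (gemma2-9b-it also scores 1.0 under every preset.)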


### evaluate interpretation generator

In [1]:
import pandas as pd 
from langchain_community.vectorstores import FAISS

from model import Model
from retriever import Retriever
from prompts import get_prompt, combined_template, interp_template
from agent import RAGAgent, InterpAgent
from evaluator import bert_score_f1
from execute import parse_response

import warnings
warnings.filterwarnings("ignore")


test data




In [2]:
# Load the sample score table that serves as test data for the interpretation evaluation.
df = pd.read_csv('sample_score.csv') 

evaluate interpretation generator

In [3]:
# Fixed code-generation pipeline that produces the context for every
# interpretation model evaluated below.
retriever = Retriever(
    mode='hybrid',
    embed_model_name="BAAI/bge-small-en-v1.5",
    db=FAISS,
    top_k=5,
)
prompt = get_prompt(combined_template)

# NOTE(review): the code-generator evaluation concludes "llama3.1 + stable"
# (temperature 0.1, top_p 0.1), but this uses gemma2-9b-it with temperature 0.2 —
# confirm this is the intended code model.
code_model = Model(
    model_name="gemma2-9b-it",
    temperature=0.2,
    top_p=0.1,
)
processor = RAGAgent(retriever, prompt, code_model, df)

# Prompt used by the interpretation agent.
interp_prompt = get_prompt(interp_template)



In [4]:
# Models to compare as interpretation generators.
model_names = ["llama3.1", "mistral", "gemma2-9b-it","llama-3.3-70b-versatile"]

# Sampling presets: each entry maps a preset label to the keyword arguments
# forwarded to Model(...). 'default' passes no overrides.
Kwargs = [{'stable': {'temperature': 0.2,'top_p': 0.1}},
          {'diverse': {'temperature': 0.8, 'top_p': 0.9}},
          {'default': {}}]

queries = ["What are the lowest writing scores?", 
           "What are the highest reading scores?", 
           "What are the average math scores?", 
           "If parental level of education has the impact for reading score?", 
           "What's the best comprehensive score?",
           "If food impacts writing score?"]

# Reference answer(s) for each query, parallel to `queries`; passed to bert_score_f1.
references = [["The relative result indicates that the lowest writing scores are 22."], 
              ["The highest reading scores achieved are 86."], 
              ["The calculated result indicates that the average math scores among the given data is approximately 63.85, suggesting a relatively moderate performance in this subject area."], 
              ["The relative result indicates that there is a significant difference in reading scores across different levels of parental education, suggesting that parental level of education does have an impact on reading performance."], 
              ["The best comprehensive score is 261, indicating that the sum of reading, writing, and math scores for this particular group or individual is the highest among all available data."],
              ["The relative data suggests that students who received free or reduced lunch had a significantly higher mean writing score (65.7) compared to those who received standard lunch (62.2), indicating a positive impact of food on writing performance."]]

# The parallel lists must line up; zip() below would otherwise truncate silently.
assert len(queries) == len(references), "queries and references must have the same length"

# Column-oriented accumulator; every list must end up with the same length so that
# pd.DataFrame(generator_results) can be built from it.
generator_results = {'model_names': [], 
                    'bert_score_f1': [],
                    'queries': [],
                    'responses': [],
                    'param_types': []}

# `processor` is not rebuilt inside the loops, so the context for a query does not
# depend on the interpretation model or preset. Computing it once per query avoids
# repeated (rate-limited) calls and gives every configuration the same context.
# A failed lookup is not cached, so the next configuration retries it.
# NOTE(review): assumes interp.processor does not mutate the context object — confirm.
ctx_cache = {}

for model_name in model_names:
    for param_group in Kwargs:
        for param_type, kwargs in param_group.items():
            model = Model(model_name=model_name, **kwargs)
            interp = InterpAgent(interp_prompt, model)
            for query, reference in zip(queries, references):
                try:
                    if query not in ctx_cache:
                        ctx_cache[query] = processor.invoke(query)
                    ctx = ctx_cache[query]
                    answer = interp.processor(ctx, query)
                    # Raises KeyError when the answer lacks the expected section.
                    response = parse_response(answer)['The concluding response:']
                    f1 = bert_score_f1([response], reference)
                except Exception as e:
                    # Say which run failed: skipped runs lower the per-group counts
                    # in the summary and must be traceable.
                    print(f"Error (model={model_name}, params={param_type}, query={query!r}): "
                          f"{type(e).__name__}: {e}")
                    continue

                # All values are available: append the full row in one go.
                generator_results['bert_score_f1'].append(f1)
                generator_results['model_names'].append(model_name)
                generator_results['queries'].append(query)
                generator_results['responses'].append(response)
                generator_results['param_types'].append(param_type)



Error: 'The concluding response:'
Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.3-70b-versatile` in organization `org_01jhnywfrpfrhb5r935e6ewy2h` service tier `on_demand` on : Limit 100000, Used 99902, Requested 398. Please try again in 4m18.676s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': '', 'code': 'rate_limit_exceeded'}}
Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.3-70b-versatile` in organization `org_01jhnywfrpfrhb5r935e6ewy2h` service tier `on_demand` on : Limit 100000, Used 99895, Requested 426. Please try again in 4m36.747s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': '', 'code': 'rate_limit_exceeded'}}
Error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.3-70b-versatile` in organization `org_01jhnywfrpfrhb5r935e6ewy2h` service tier `on_demand` on : Limit 100000, Used 99887, Requested 375. Ple

In [5]:
# One row per (model, parameter preset, query) that completed without an error.
interp_df = pd.DataFrame(generator_results)

In [6]:
# Mean BERTScore F1 and number of scored queries, per model and parameter preset.
# A count below the number of queries means some runs failed and were skipped.
interp_group_keys = ['model_names', 'param_types']

(
    interp_df
    .groupby(interp_group_keys)[['bert_score_f1']]
    .agg(
        f1_mean=('bert_score_f1', 'mean'),
        f1_count=('bert_score_f1', 'count'),
    )
)



Unnamed: 0_level_0,Unnamed: 1_level_0,f1_mean,f1_count
model_names,param_types,Unnamed: 2_level_1,Unnamed: 3_level_1
gemma2-9b-it,default,0.811667,6
gemma2-9b-it,diverse,0.811667,6
gemma2-9b-it,stable,0.81,6
llama-3.3-70b-versatile,default,0.84,1
llama-3.3-70b-versatile,stable,0.825,4
llama3.1,default,0.866667,6
llama3.1,diverse,0.871667,6
llama3.1,stable,0.89,6
mistral,default,0.818333,6
mistral,diverse,0.8,5


Choose llama3.1 + stable: it has the highest mean BERTScore F1 (0.89) with all 6 queries scored.