In [1]:
import os
from tryRAG.framework import RAGFramework

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
doc_path = os.path.join(os.getcwd(), "../..", "ref_docs", "eecs_20250606_text_bs_rewritten.jsonl")
idx_path = os.path.join(os.getcwd(), "..", "ref_idx", "paragraph_p")

cfg = {
    "lm_model_name": "../../gemma-3-4b-it", 
    "emb_model_name": "all-MiniLM-L6-v2", 
    "mode": "hybrid", #@ "hybrid" / "sparse" / "hybrid"
    "chunk_level": "paragraph", #@ "web_page" / "paragraph" / "sentence"
    "more_info": True, #@ True / False
    # "doc_path": doc_path, 
    "idx_path": idx_path, 
    "device": "cuda", 
}
USE_UPPER_TEXT = False
USE_PRE_ANSWER = False
TOP_K = 5

rag = RAGFramework.from_config(cfg)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.00s/it]


Loaded ['dense', 'sparse'] index from /home/yasaisen/Desktop/29_research/research_main/lab_05/to_git/tryRAG/../ref_idx/paragraph_p
RAGFramework initialized



In [3]:
question = 'For how long has Berkeley EECS been at the forefront of research? Available options:\n    (A) Less than a decade\n    (B) Over a quarter of a century\n    (C) Over half a century\n    (D) Over a century'

response = rag.ask(question, top_k=TOP_K)
print(response['response'])

print('=' * 20)
for doc in response['relevant_docs']:
    print(doc.idx, doc.url)

Answer: (C) Over half a century. The text states that Berkeley EECS has been at the forefront of research for “over half a century.”
13572 https://eecs.berkeley.edu/research/colloquium/archives/spring2021
17551 https://eecs.berkeley.edu/category/research/page/4
58708 https://eecs.berkeley.edu/category/research/page/76
17550 https://eecs.berkeley.edu/category/research/page/4
44627 https://eecs.berkeley.edu/category/research/page/12
18856 http://www.eecs.berkeley.edu/IPRO/BEARS/2011
18855 http://www.eecs.berkeley.edu/IPRO/BEARS/2011
18881 http://www.eecs.berkeley.edu/IPRO/BEARS/2010
18882 http://www.eecs.berkeley.edu/IPRO/BEARS/2010
19411 http://www.eecs.berkeley.edu/IPRO/BEARS/open-house/12/trust.html


In [4]:
question = 'Who is Lee Julian Purnell'

response = rag.ask(question, top_k=TOP_K)
print(response['response'])

print('=' * 20)
for doc in response['relevant_docs']:
    print(doc.idx, doc.url)

Lee Julian Purnell is the first Black student known to have graduated from the EECS department at UC Berkeley. He was born in Washington, D.C. in 1896, graduated from Berkeley High in 1915, earned a B.A. from Cal in 1919, and later obtained a B.S. in Electrical Engineering from MIT in 1921 and an M.S. in Electrical Engineering from Berkeley in 19
63613 https://eecs.berkeley.edu/category/people/page/29
63461 https://eecs.berkeley.edu/news/page/27/?field_eecs_news_topics_target_id_entityreference_filter=61
63279 https://eecs.berkeley.edu/news/page/27/?field_eecs_news_topics_target_id_entityreference_filter=68
14766 https://eecs.berkeley.edu/blog/page/7
63462 https://eecs.berkeley.edu/news/page/27/?field_eecs_news_topics_target_id_entityreference_filter=61
63154 https://eecs.berkeley.edu/news/page/27/?field_eecs_news_topics_target_id_entityreference_filter=60
63023 https://eecs.berkeley.edu/news/page/27/?field_eecs_news_topics_target_id_entityreference_filter=64
65052 https://eecs.berkele

In [5]:
import json
import evaluate
from tqdm import tqdm

bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

def load_jsonl2list(data_path):
    data_list = []
    with open(data_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line:
                data = json.loads(line)
                data_list += [data]

    print(len(data_list), data_list[0].keys())
    return data_list

In [6]:
data_path = os.path.join(os.getcwd(), "..", "dataset", "ucb_eecs_rag_eval_dataset.jsonl")
data_list = load_jsonl2list(data_path)

103 dict_keys(['question', 'answer', 'url'])


In [7]:
url_acc_list = []
con_acc_list = []
cand = []
ref = []
for idx, sample in tqdm(enumerate(data_list), total=len(data_list)):

    if idx in [50]:
        continue

    response = rag.ask(
        sample['question'], 
        top_k=TOP_K, 
        use_upper_text=USE_UPPER_TEXT, 
        pre_answer=USE_PRE_ANSWER, 
    )

    url_pred_list = []
    doc_pred_list = []
    for doc in response['relevant_docs']:
        url_pred_list += [doc.url]
        if USE_UPPER_TEXT:
            doc_pred_list += [doc.upper_text]
        else:
            doc_pred_list += [doc.content]
    url_gt = sample['url']

    ans_pred = response['response']
    ans_gt = sample['answer']

    if url_gt in url_pred_list:
        url_acc_list += [1]
    else:
        url_acc_list += [0]

    hit_list = []
    for content in doc_pred_list:
        if ans_gt in content:
            hit_list += [1]
        else:
            hit_list += [0]
    con_acc_list += [sum(hit_list) / len(hit_list)]

    cand += [ans_pred.split('Answer:')[-1]]
    ref += [ans_gt]


100%|██████████| 103/103 [02:26<00:00,  1.43s/it]


In [8]:
_bleu = bleu.compute(predictions=cand, references=ref)
print(f"{_bleu['bleu']}")
_rouge = rouge.compute(predictions=cand, references=ref)
print(f"{_rouge['rouge1']}\n{_rouge['rouge2']}\n{_rouge['rougeL']}\n{_rouge['rougeLsum']}")
bs_res = bertscore.compute(
    predictions=cand,
    references=ref,
    lang="en"
)
print(f"{bs_res['f1'][0]:.4f}")
print(sum(url_acc_list) / len(url_acc_list))
print(sum(con_acc_list) / len(con_acc_list))

0.04198787635885376
0.5147775331166334
0.31880252100840334
0.5129326715831906
0.5153271469015414


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.8579
0.4411764705882353
0.32435807656395893




In [9]:
import random

data_path = os.path.join(os.getcwd(), "..", "dataset", "multiple_choice_qa_dataset.jsonl")
_data_list = load_jsonl2list(data_path)

random.seed(42)
random.shuffle(_data_list)
_data_list = _data_list[:200]

map_dict = {
    "? (A) ": "? Available options:\n    (A) ", 
    " (B) ": "\n    (B) ", 
    " (C) ": "\n    (C) ",
    " (D) ": "\n    (D) ",
}

data_list = []
for sample in _data_list:

    for key, value in map_dict.items():
        question = sample['question'].replace(key, value)
    data_list += [{
        'question': question,
        'answer': sample['answer'],
        'url': sample['chunk_url']
    }]


14241 dict_keys(['chunk_url', 'content', 'question', 'answer'])


In [10]:
url_acc_list = []
con_acc_list = []
cand = []
ref = []
for idx, sample in tqdm(enumerate(data_list), total=len(data_list)):

    if idx in []:
        continue

    response = rag.ask(
        sample['question'], 
        top_k=TOP_K, 
        use_upper_text=USE_UPPER_TEXT, 
        pre_answer=USE_PRE_ANSWER, 
    )

    url_pred_list = []
    doc_pred_list = []
    for doc in response['relevant_docs']:
        url_pred_list += [doc.url]
        if USE_UPPER_TEXT:
            doc_pred_list += [doc.upper_text]
        else:
            doc_pred_list += [doc.content]
    url_gt = sample['url']

    ans_pred = response['response']
    ans_gt = sample['answer']

    if url_gt in url_pred_list:
        url_acc_list += [1]
    else:
        url_acc_list += [0]

    hit_list = []
    for content in doc_pred_list:
        if ans_gt in content:
            hit_list += [1]
        else:
            hit_list += [0]
    con_acc_list += [sum(hit_list) / len(hit_list)]

    cand += [ans_pred.split('Answer:')[-1]]
    ref += [ans_gt]


100%|██████████| 200/200 [04:35<00:00,  1.38s/it]


In [11]:
_bleu = bleu.compute(predictions=cand, references=ref)
print(f"{_bleu['bleu']}")
_rouge = rouge.compute(predictions=cand, references=ref)
print(f"{_rouge['rouge1']}\n{_rouge['rouge2']}\n{_rouge['rougeL']}\n{_rouge['rougeLsum']}")
bs_res = bertscore.compute(
    predictions=cand,
    references=ref,
    lang="en"
)
print(f"{bs_res['f1'][0]:.4f}")
print(sum(url_acc_list) / len(url_acc_list))
print(sum(con_acc_list) / len(con_acc_list))

0.0
0.29665829155777124
0.0
0.2963090363459442
0.29632136761694494
0.9033
0.755
0.0




In [12]:
acc_list = []
for pred, gt in zip(cand, ref):
    # print(pred, '/', gt)
    if gt in pred:
        acc_list += [1]
    else:
        acc_list += [0]

print(sum(acc_list) / len(acc_list))

0.88


In [13]:
for pred, gt in zip(cand, ref):
    print(pred, '/', gt)

 (C) FREE / (C)
 (B) He helped shape the industry with the design of Apple’s first line of products.

References indicate that Steve Wozniak “helped shape the computing industry with the design of Apple’s first line of products, co-founding Apple Computer Inc. with Steve Jobs.” / (B)
 (C) Twitter, Instagram, LinkedIn, YouTube / (C)
 (C) By attending a seminar or conference and by making a gift to the university. The references consistently state that you can support EECS by attending seminars/conferences, viewing lectures, or making a gift to the university. / (C)
 (B) T. Lin and M. I. Jordan / (B)
 (B) Prof. Emeritus Leon O. Chua

References 1, 3, 7, 8, and 9 all state that Prof. Emeritus Leon O. Chua has been named a Celebrated Member of the IEEE Electron Devices Society (EDS). / (B)
 (B) Secretary-of-state Web sites around the country

Vy-An Phan focused on various secretary-of-state Web sites around the country, which house tools central to the electoral process. / (B)
 (B) Folding