In [10]:
import math
from collections import Counter, defaultdict
from typing import List
import jieba
import json
import os
from pathlib import Path

import jieba
import pandas as pd
from tqdm import tqdm
from utils import BM25, filter_stop

from langchain_ollama.llms import OllamaLLM
llm = OllamaLLM(model="qwen2.5:1.5b")

# vector retrieval

In [2]:
prompt_template = '''You are a helpful assistant, please answer the following question based on the given content:

Question:
```
{question}
```

Content:
```
{content}
```

Just give a simple answer, do not include any additional information or explaination.
'''

In [3]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embed = HuggingFaceEmbeddings(model_name='BAAI/bge-small-en-v1.5')
vector_db = FAISS.load_local('/Users/wenjiazhai/Documents/GitHub/paper_analyze/store/vanilla_langchain', 
                             embed, allow_dangerous_deserialization=True)

  from tqdm.autonotebook import tqdm, trange


# BM25

In [4]:
all_documents = [doc.page_content for doc in vector_db.docstore._dict.values()]
bm25 = BM25(all_documents)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/77/tngzlz3n44s1cm7spj20fy8m0000gn/T/jieba.cache
Loading model cost 0.398 seconds.
Prefix dict has been built successfully.


# all

In [5]:
with open('../../data/LongBench_original.json') as f:
    data = json.load(f)

res = {'query': [], 'answer': [], 'predict': []}
for q, a in tqdm(zip(data['query'], data['answer'])):
    scores = defaultdict(int)
    
    bm25_retrival_res = bm25.query(q)[:12]
    for doc, score in bm25_retrival_res:
        scores[doc] += score * 0.4
    vector_retrieval_res = vector_db.similarity_search_with_score(q, k=12)
    for doc, score in vector_retrieval_res:
        scores[doc.page_content] += score * 0.6
    
    scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    retrieval = '\n'.join([x[0] for x in scores[:6]])
    response = llm.invoke(prompt_template.format(question=q, content=retrieval))
    res['query'].append(q)
    res['answer'].append(a)
    res['predict'].append(response)

314it [04:42,  1.11it/s]


In [6]:
with open('../../output/hybrid.json', 'w') as f:
    json.dump(res, f, indent=4, ensure_ascii=False)

In [8]:
path = Path('/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/hybrid').rglob('*.json')

for file in path:
    try:
        with open(file, 'r') as f:
            data = json.load(f)
    except:
        print(f'{file} error')
        os.remove(file)

/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/hybrid/76.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/hybrid/308.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/hybrid/297.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/hybrid/151.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/hybrid/192.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/hybrid/219.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/hybrid/300.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/hybrid/195.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/hybrid/121.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/hybrid/273.json error


In [11]:
path = Path('/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/hybrid').rglob('*.json')

data = []
for file in path:
    try:
        with open(file, 'r') as f:
            data.append(json.load(f))
    except:
        pass
    
df = pd.DataFrame(data)
df.to_csv('/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/hybrid_output_consistency.csv', index=False)

In [12]:
consistency = [int(d) for d in df['consistency']]
sum(consistency) / len(consistency)

0.2138157894736842