- cf. [LangChainを使ったRAGをElyza 7bを用いて試してみた](https://note.com/alexweberk/n/n3cffc010e9e9)

In [2]:
import torch


# Report whether PyTorch can use CUDA; the notebook displays the returned bool.
torch.cuda.is_available()

True

# 検索DB をセットアップ

- FAISS

In [None]:
from trafilatura import fetch_url, extract

# Page to index. Uncomment one of the alternatives below to build the search
# DB from a different page.
# url = "https://ja.m.wikipedia.org/wiki/ONE_PIECE"
# url = "https://www.kantei.go.jp/"
# url = "https://ja.wikipedia.org/wiki/%E3%83%80%E3%82%A6%E3%83%B3%E3%82%BF%E3%82%A6%E3%83%B3_(%E3%81%8A%E7%AC%91%E3%81%84%E3%82%B3%E3%83%B3%E3%83%93)"
# url = "https://ja.wikipedia.org/wiki/%E4%BD%8D%E7%9B%B8%E7%A9%BA%E9%96%93"
url = "https://ja.wikipedia.org/wiki/%E3%82%AC%E3%83%AA%E3%83%AC%E3%82%AA_(%E3%83%86%E3%83%AC%E3%83%93%E3%83%89%E3%83%A9%E3%83%9E)"
# The extracted plain text is saved here so the loader cell can read it back.
filename = "../data/wiki.txt"

# fetch_url() returns None when the download fails; stop with a clear message
# instead of handing None on to the extraction step.
document = fetch_url(url)
if document is None:
    raise RuntimeError(f"Failed to download {url}")

# extract() returns None when no main text is found in the page, which would
# otherwise only surface as an opaque TypeError on the slice below.
text = extract(document)
if text is None:
    raise RuntimeError(f"No text could be extracted from {url}")

# Preview the first 1000 characters of the extracted text.
print(text[:1000])

with open(filename, "w", encoding="utf-8") as f:
    f.write(text)

In [None]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores.faiss import FAISS
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts.prompt import PromptTemplate

In [None]:
# Chunking policy: split on newlines and measure chunk length in tiktoken
# tokens (target size 300, overlap 20).
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=20,
    chunk_size=300,
    separator="\n",
)

# Read the saved page text back from disk and cut it into chunks.
loader = TextLoader(filename, encoding="utf-8")
documents = loader.load()
texts = text_splitter.split_documents(documents)

# Number of chunks produced.
print(len(texts))

In [None]:
# Inspect the first three chunks (displayed as the cell's output).
texts[:3]

In [None]:
# Embed every chunk with the multilingual E5 model and index the vectors in
# a FAISS store.
# NOTE(review): the E5 model card recommends "query: " / "passage: " prefixes
# on inputs; this code embeds the raw text - confirm whether that matters here.
embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")
db = FAISS.from_documents(documents=texts, embedding=embeddings)

# k sets how many of the most similar chunks are loaded for each query.
retriever = db.as_retriever(search_kwargs=dict(k=3))

# LLM をセットアップ

In [None]:
import pathlib

# Path of the GGUF model file that is passed to the LLM as model_path.
model_file: str = "../data/ELYZA-japanese-Llama-2-7b-fast-instruct-q8_0.gguf"
# Alternative model files (uncomment one to switch models):
# model_file = "../data/ELYZA-japanese-Llama-2-13b-fast-instruct-q8_0.gguf"
# model_file = "./ELYZA-japanese-Llama-2-7b-fast-instruct-q8_0.gguf"    # self-making
# model_file = "./ELYZA-japanese-CodeLlama-7b-instruct-q8_0.gguf"       # self-making
# Check that the selected file exists (displayed as the cell's output).
pathlib.Path(model_file).exists()

In [None]:
from langchain_core.runnables.config import RunnableConfig
from langchain.callbacks import StreamingStdOutCallbackHandler


# Run configuration passed to the stream()/astream() calls in later cells;
# its only callback is LangChain's stdout streaming handler.
config = RunnableConfig(callbacks=[StreamingStdOutCallbackHandler()])

In [None]:
from app.llama2cpp.component.llama2cpp import LlamaCppCustom


# n_gqa is 8 when the model file name contains "70b", otherwise 1.
if "70b" in model_file:
    n_gqa = 8
else:
    n_gqa = 1

# Build the LLM wrapper around the local model file.
# NOTE(review): n_ctx has to hold the retrieved chunks, the prompt and up to
# max_tokens of output - confirm 1024 is enough for the retriever settings.
# NOTE(review): n_gpu_layers=-1 presumably offloads every layer to the GPU,
# as in llama.cpp - confirm against LlamaCppCustom.
llm = LlamaCppCustom(
    model_path=model_file,
    n_gqa=n_gqa,
    n_gpu_layers=-1,
    n_ctx=1024,
    max_tokens=256,
    temperature=0,
    streaming=True,
    verbose=False,
)

In [None]:
# Instruction and system-prompt markers used to frame the chat prompt.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = "参考情報を元に、ユーザーからの質問にできるだけ正確に答えてください。"
# User turn. {context} and {question} stay as literal placeholders here and
# are only filled in when the finished template is formatted.
text = "{context}\nユーザからの質問は次のとおりです。{question}"
# Layout: [INST] <<SYS>>system prompt<</SYS>> user turn [/INST]
# No BOS token text is prepended. Interpolating `text` inserts it verbatim,
# so its placeholders survive untouched.
template = f"{B_INST} {B_SYS}{DEFAULT_SYSTEM_PROMPT}{E_SYS}{text} {E_INST} "
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
    template_format="f-string",
)


# Extra keyword arguments for the QA chain so that it uses PROMPT.
chain_type_kwargs = dict(prompt=PROMPT)

In [None]:
# Question to ask. The commented-out lines are alternative sample questions
# that pair with the alternative source pages.
# q = "ニコ・ロビンの職業は何ですか？"
# q = "2023年1月時点での日本の首相は誰ですか？"
# q = "ダウンタウンのメンバは？"
# q = "位相空間の定義は？"
q: str = "ガリレオの主人公は？"
# Fill the template with an empty context, i.e. a prompt without any
# retrieved documents, for querying the LLM on its own.
prompt_in: str = template.format(context="", question=q)

In [None]:
# Stream the LLM's answer to the context-free prompt. The loop body is empty
# because the callback handler in `config` prints each token.
for tkn in llm.stream(input=prompt_in, stop=None, config=config):
    # NOTE: printing each token in callback handler
    pass

In [None]:
# Retrieval QA chain: the retriever supplies chunks, the "stuff" chain type
# is used with the custom prompt from chain_type_kwargs, and the source
# documents are returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs,
    verbose=False,
)

In [None]:
# Run the QA chain on q. A top-level `async for` is only valid where
# top-level await is supported (e.g. a notebook cell). After the loop, `tkn`
# holds the last item yielded, which the following cell reads.
async for tkn in qa.astream(q, config=config):
    pass

In [None]:
# Show the generated answer, then every source document returned with it.
print(tkn["result"])
for source_doc in tkn["source_documents"]:
    # One rule line, the `source` metadata in brackets, then the chunk text.
    print(
        "-" * 80,
        f'[{source_doc.metadata["source"]}]',
        source_doc.page_content,
        sep="\n",
    )