In [None]:
pip install langchain
pip install -U langchain-community
pip install -U langchain-aws
pip install chromadb
pip install -U langchain-chroma

In [None]:
import datetime
import dotenv
import os
import time

import langchain
from langchain.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_aws.embeddings import BedrockEmbeddings
from langchain_aws import ChatBedrock
from langchain_chroma import Chroma
from langchain.chains import RetrievalQA

In [None]:
# Show the version of the langchain package that was imported.
print(langchain.__version__)

In [None]:
# Load variables from a .env file into the process environment, then read the
# Bedrock model IDs from it. Each constant is None when its variable is unset.
dotenv.load_dotenv()
EMBEDDING_MODEL_ID = os.environ.get("BEDROCK_EMBEDDING_MODEL_ID")
CLAUDE_MODEL_ID = os.environ.get("BEDROCK_CLAUDE_MODEL_ID")

In [None]:
def set_llm_model_and_vector_db(vector_db_name, vector_db_dir, region_name="ap-northeast-1"):
    """Create the Bedrock clients and open (or create) the Chroma collection.

    Args:
        vector_db_name: Name of the Chroma collection.
        vector_db_dir: Directory handed to Chroma as ``persist_directory``.
        region_name: AWS region used for both Bedrock clients. Defaults to
            the previously hard-coded "ap-northeast-1".

    Returns:
        A 4-tuple ``(embedding model, chat model, Chroma vector store,
        record count)``. The record count is the number of records currently
        in the collection; ``main`` passes it on as the starting number for
        newly generated record IDs.

    Raises:
        ValueError: If either Bedrock model ID constant is empty or None
            (i.e. the corresponding environment variable was not set).
    """
    # Fail early with a readable message instead of an opaque validation
    # error from inside the Bedrock client constructors.
    missing_settings = [
        env_name
        for env_name, value in (("BEDROCK_EMBEDDING_MODEL_ID", EMBEDDING_MODEL_ID),
                                ("BEDROCK_CLAUDE_MODEL_ID", CLAUDE_MODEL_ID))
        if not value
    ]
    if missing_settings:
        raise ValueError("Missing Bedrock model setting(s): {a}".format(a=", ".join(missing_settings)))

    # Configure the Bedrock embedding model and the Claude chat model.
    langchain_bedrock_embedding = BedrockEmbeddings(region_name=region_name,
                                                    model_id=EMBEDDING_MODEL_ID)
    langchain_bedrock_chat_message = ChatBedrock(region_name=region_name,
                                                 model_id=CLAUDE_MODEL_ID,
                                                 provider="anthropic")
    # Create an empty Chroma vector DB, or reload the one already persisted
    # in vector_db_dir.
    langchain_vector_db = Chroma(collection_name=vector_db_name,
                                 persist_directory=vector_db_dir,
                                 embedding_function=langchain_bedrock_embedding)
    # NOTE(review): this reads the private `_collection` attribute, and a
    # count-based starting ID can collide with an existing ID if records were
    # ever deleted from the collection - confirm deletions never happen.
    count_for_unique_record_id = langchain_vector_db._collection.count()
    return langchain_bedrock_embedding, langchain_bedrock_chat_message, langchain_vector_db, count_for_unique_record_id

In [None]:
def get_chunked_document_from_csv(csv_filepath, csv_col_name, chunk_size, chunk_overlap_size, chunk_separator):
    """Load a CSV file, split its rows into chunks and return the chunk texts.

    Args:
        csv_filepath: Path of the UTF-8 CSV file (first row is the header).
        csv_col_name: Column name passed to the loader as ``source_column``.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap_size: Number of characters of neighbouring text that a
            chunk may repeat.
        chunk_separator: List of strings at which the splitter may cut.

    Returns:
        A list of strings, one ``page_content`` per chunk.
    """
    # One CSV row becomes one document.
    # NOTE(review): `source_column` is documented by LangChain as the column
    # used for each document's "source" metadata, not as a content filter -
    # confirm that is the intent here.
    row_documents = CSVLoader(file_path=csv_filepath,
                              source_column=csv_col_name,
                              encoding="utf-8").load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                              chunk_overlap=chunk_overlap_size,
                                              separators=chunk_separator)
    chunk_documents = splitter.split_documents(documents=row_documents)
    # Keep only the text of every chunk, as plain strings.
    return [chunk_document.page_content for chunk_document in chunk_documents]

In [None]:
def embedding_process(chunked_doc_list, langchain_bedrock_embedding, count_for_unique_record_id, sleep_seconds=0.5):
    """Embed every chunk, one request at a time, and give it a sequential ID.

    Args:
        chunked_doc_list: Chunk texts to embed.
        langchain_bedrock_embedding: Object exposing ``embed_query(text)``.
        count_for_unique_record_id: Number used in the first generated ID
            ("id<N>"); it is incremented after each successful embedding.
        sleep_seconds: Pause after every request so the Bedrock API request
            limit is not hit. Defaults to the previously hard-coded 0.5;
            values <= 0 disable the pause.

    Returns:
        ``(unique_record_id_list, embedding_vector_list)``, aligned with
        each other by position.

        A chunk whose request raises is reported and skipped, so both lists
        can be shorter than ``chunked_doc_list``. The caller must then drop
        the failed chunks before storing documents together with these lists.
    """
    unique_record_id_list = []
    embedding_vector_list = []
    print(datetime.datetime.now())
    for doc_index, chunked_doc in enumerate(chunked_doc_list):
        try:
            embedding_vector = langchain_bedrock_embedding.embed_query(chunked_doc)
        except Exception as e:
            # Report the 0-based position of the failing chunk (not the ID
            # counter) together with the error, so it can be found and retried.
            print("要素{a}でエラーになりました。".format(a=doc_index), repr(e))
        else:
            embedding_vector_list.append(embedding_vector)
            unique_record_id_list.append("id{a}".format(a=count_for_unique_record_id))
            count_for_unique_record_id = count_for_unique_record_id + 1
        # Throttle after failed requests too: a failure may itself be a
        # rate-limit response, and retrying immediately would make it worse.
        if sleep_seconds > 0:
            time.sleep(sleep_seconds)
    print(datetime.datetime.now())
    return unique_record_id_list, embedding_vector_list

In [None]:
def add_doc_and_vector_in_vector_db(langchain_vector_db, chunked_doc_list, embedding_vector_list, unique_record_id_list):
    """Store documents, their embedding vectors and their IDs in the vector DB.

    Args:
        langchain_vector_db: Vector store whose ``_collection.add`` is called.
        chunked_doc_list: Chunk texts.
        embedding_vector_list: One embedding vector per chunk.
        unique_record_id_list: One record ID per chunk.

    Returns:
        The same ``langchain_vector_db`` object that was passed in.

    Raises:
        ValueError: If the three lists do not have the same length. Nothing
            is written in that case.
    """
    doc_count = len(chunked_doc_list)
    vector_count = len(embedding_vector_list)
    id_count = len(unique_record_id_list)
    # The three lists are matched by position, so unequal lengths (e.g. after
    # a chunk failed to embed) would attach vectors to the wrong documents.
    if not (doc_count == vector_count == id_count):
        raise ValueError(
            "documents ({a}), embeddings ({b}) and ids ({c}) must have the same length".format(
                a=doc_count, b=vector_count, c=id_count))
    # Nothing to store: skip the call (Chroma is expected to reject an empty
    # batch) and let the caller carry on with the unchanged DB.
    if doc_count == 0:
        return langchain_vector_db
    langchain_vector_db._collection.add(documents=chunked_doc_list,
                                        embeddings=embedding_vector_list,
                                        ids=unique_record_id_list)
    return langchain_vector_db

In [None]:
def carry_out_rag_and_llm(user_prompt, langchain_vector_db, langchain_bedrock_chat_message, retrieval_top_k=7):
    """Answer a question with the LLM, grounded on documents from the vector DB.

    Args:
        user_prompt: The user's question.
        langchain_vector_db: Vector store used to build the retriever.
        langchain_bedrock_chat_message: Chat model that writes the answer.
        retrieval_top_k: How many of the most similar documents to retrieve.
            Defaults to the previously hard-coded 7.

    Returns:
        ``(result_from_llm, retriever_result_documents)``: the chain's result
        dict (the answer is under "result") and the list of retrieved
        documents that were given to the LLM.
    """
    # Use the top-k most similar records as the search result.
    langchain_vector_db_retriever = langchain_vector_db.as_retriever(search_kwargs={"k": retrieval_top_k})
    # "stuff": the retrieved documents are simply concatenated into the prompt.
    # return_source_documents=True makes the chain hand back the documents it
    # actually used, so the retriever does not have to be run a second time.
    langchain_qa_chain = RetrievalQA.from_chain_type(llm=langchain_bedrock_chat_message,
                                                     retriever=langchain_vector_db_retriever,
                                                     chain_type="stuff",
                                                     return_source_documents=True)
    result_from_llm = langchain_qa_chain.invoke(input={"query": user_prompt})
    # Take the documents out of the result dict so its keys stay the same as
    # before ("query" and "result").
    retriever_result_documents = result_from_llm.pop("source_documents")
    return result_from_llm, retriever_result_documents

In [None]:
def main(user_prompt, update_vector_db_flag=False, csv_file_path=None):
    """Run the RAG pipeline: optionally ingest a CSV, then answer a question.

    Prints the LLM's answer followed by each document retrieved from the
    vector DB.

    Args:
        user_prompt: The question to ask.
        update_vector_db_flag: When truthy, the CSV at ``csv_file_path`` is
            chunked, embedded and added to the vector DB before querying.
        csv_file_path: Path of the CSV to ingest; required when
            ``update_vector_db_flag`` is set.

    Raises:
        ValueError: If ingestion is requested without ``csv_file_path``.
    """
    # Check the arguments before any client is created, so a forgotten path
    # fails immediately with a clear message instead of inside the CSV loader.
    if update_vector_db_flag and csv_file_path is None:
        raise ValueError("csv_file_path is required when update_vector_db_flag is set")

    # 1. Set up the LLM models and the vector DB (newly created, or the
    #    existing one reloaded).
    embed_model, message_model, vector_db, record_number = set_llm_model_and_vector_db(
        vector_db_name="hogehoge",
        vector_db_dir="./chroma_db_for_RAG")
    if update_vector_db_flag:
        # 2. Load documents from the CSV file and split them into chunks.
        doc_list = get_chunked_document_from_csv(csv_filepath=csv_file_path,
                                                 csv_col_name="comment",
                                                 chunk_size=800,
                                                 chunk_overlap_size=50,
                                                 chunk_separator=["\n"])
        # 3. Turn the chunks into embedding vectors.
        id_list, vector_list = embedding_process(chunked_doc_list=doc_list,
                                                 langchain_bedrock_embedding=embed_model,
                                                 count_for_unique_record_id=record_number)
        # 4. Add documents, vectors and unique IDs to the vector DB.
        #    NOTE(review): this goes through the collection's `add`; Chroma's
        #    documentation describes `upsert`, not `add`, as the call that
        #    overwrites a record whose ID already exists - confirm which
        #    behaviour is wanted for duplicate IDs.
        vector_db = add_doc_and_vector_in_vector_db(langchain_vector_db=vector_db,
                                                    chunked_doc_list=doc_list,
                                                    embedding_vector_list=vector_list,
                                                    unique_record_id_list=id_list)
    # 5. RAG & LLM
    result_from_llm, retriever_result_documents = carry_out_rag_and_llm(
        user_prompt=user_prompt,
        langchain_vector_db=vector_db,
        langchain_bedrock_chat_message=message_model)
    print("【LLMからの回答】", result_from_llm["result"], sep="\n")
    print("*****")
    for position, retriever_result_document in enumerate(retriever_result_documents, start=1):
        print("【ベクトルDBでの検索結果：{a}つ目】".format(a=position))
        print(retriever_result_document.page_content)
        print("*****")

In [None]:
# Query only: update_vector_db_flag keeps its default (False), so no CSV is
# ingested and the question is answered from what the vector DB already holds.
main(user_prompt="Netflixを解約する理由は？")

In [None]:
# document_csv_file_path = "/home/ec2-user/hogehoge.csv"
# main(user_prompt="Netflixを解約する理由は？",
#      update_vector_db_flag=True,
#      csv_file_path=document_csv_file_path)