In [1]:
# Load the python-dotenv IPython extension, then run its %dotenv magic to read
# variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the Azure OpenAI endpoint/key used by the
# clients created below — confirm against the .env file.
%load_ext dotenv
%dotenv

In [2]:
from langchain_openai import AzureOpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_openai import AzureChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel
from langchain_core.output_parsers import StrOutputParser

In [4]:
# Embedding client; handed to Chroma below as the store's embedding function.
emb = AzureOpenAIEmbeddings(model="text-embedding-ada-002")

# Open the Chroma store persisted under ./data.
vectorstore = Chroma(
    embedding_function=emb,
    persist_directory="./data",
)

In [6]:
# Sanity check: number of documents currently held in the vector store.
len(vectorstore.get()['documents'])

41

In [8]:
# Retriever using maximal marginal relevance (MMR) search.
#   k           -> number of documents returned per query
#   lambda_mult -> relevance/diversity trade-off in [0, 1]
#                  (per the LangChain docs: 1 = minimum diversity, 0 = maximum)
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "lambda_mult": 0.4},
)

In [None]:
# Stuffing: the retrieved documents are inserted into {context} as-is.
# Caveat: when the list of Documents is long, the model tends to weight the
# beginning and the end of the list more heavily.
TEMPLATE = '''
Answer the following question:
{question}

To answer the question, use only the following context:
{context}

At the end of the response, specify the name of the lecture this context is taken from in the format:
Resources: *Lecture Title*
where *Lecture Title* should be substituted with the title of all resource lectures.
'''

# Build the prompt template from TEMPLATE; {question} and {context} are its placeholders.
prompt_template = PromptTemplate.from_template(TEMPLATE)

In [32]:
# Chat model client bound to the "gpt-4o" Azure OpenAI deployment.
llm = AzureChatOpenAI(
    # Which deployment / API version to call.
    azure_deployment="gpt-4o",
    api_version="2025-01-01-preview",
    # Generation settings: temperature 0 to minimise sampling randomness;
    # no explicit cap on completion length is set here.
    temperature=0,
    max_tokens=None,
    # Request handling: no timeout is set here; failed requests are retried up to 6 times.
    timeout=None,
    max_retries=6,
)

In [12]:
# Sample question used to exercise the retriever and the chains in the cells below.
q = 'what software do data scientists use?'

In [15]:
# Goal: build the dict the prompt needs, i.e.
#   {"context": <retriever output>, "question": <the question, unchanged>}
# RunnableParallel hands the same input (the argument of invoke()) to every
# runnable it was given, runs them in parallel, and returns the results as a
# dict keyed by name.
# NOTE: `chain` is rebound to the full prompt/LLM pipeline in a later cell.
chain = RunnableParallel(
    context=retriever,               # documents retrieved for the question
    question=RunnablePassthrough(),  # the question itself, passed through as-is
)
chain.invoke(q)

{'context': [Document(id='2853e743-e11c-461b-9fd8-ce921810fe57', metadata={'section title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need', 'course title': 'Introduction'}, page_content='As you can see from the infographic, R, and Python are the two most popular tools across all columns. Their biggest advantage is that they can manipulate data and are integrated within multiple data and data science software platforms. They are not just suitable for mathematical and statistical computations. In other words, R, and Python are adaptable. They can solve a wide variety of business and data-related problems from beginning to the end'),
  Document(id='e1420b4c-035e-4f02-a644-42e40c09b5e5', metadata={'section title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need', 'course title': 'Introduction'}, page_content='Alright! So… How are the techniques used in data, business intelligence, or predictive analytics applied in r

In [18]:
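# Optional debugging step (left disabled): render the prompt from the retrieval
# output to inspect exactly what would be sent to the LLM. Only valid while
# `chain` is the RunnableParallel defined above (i.e. it returns a dict).
# prompt_value = prompt_template.invoke(chain.invoke(q))
# print(prompt_value)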

In [35]:
# The whole flow written as one chain:
#   retrieve context + pass the question through -> fill the prompt -> call the LLM -> parse to str
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt_template
    | llm
    | StrOutputParser()
)

In [36]:
# Run the full pipeline on the sample question; StrOutputParser makes the result a plain string.
chain.invoke(q)

'Data scientists use a variety of software tools to perform their tasks. The most popular programming languages are **R** and **Python**, which are highly adaptable and capable of handling mathematical, statistical computations, and solving diverse business and data-related problems. For big data applications, tools like **Apache Hadoop**, **Apache Hbase**, and **MongoDB** are commonly employed, with Hadoop being particularly notable for its ability to manage large-scale data processing. Additionally, tools like **Excel** and **SPSS** are widely used for statistical analysis and data visualization, offering ease of use and applicability across multiple categories such as traditional data, business intelligence, and data science.\n\nResources: *Programming Languages & Software Employed in Data Science - All the Tools You Need*'

# Document Refinement
Stuffing の代替策として、Document Refinement がある。これは、LLM に一度に渡す Document を１つに絞り、直前までの Document から得られた回答を次の Prompt に含めて、順に回答を洗練していく方法である。１回あたりのコンテキストサイズは小さくできるが、Document の数だけ LLM のコール数が増える。