## Index

In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from dotenv import load_dotenv
load_dotenv()

AZURE_DEPLOYMENT_NAME= os.environ['AZURE_DEPLOYMENT_NAME']
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_KEY"] = os.environ['AZURE_OPEN_API_KEY']
os.environ["OPENAI_API_BASE"] = os.environ['AZURE_API_BASE']
os.environ["OPENAI_VERSION"] = "2023-03-15-preview"   
    

In [2]:
import wikipedia
wikipedia.set_lang("ja")
wp = wikipedia.page("こちら葛飾区亀有公園前派出所")
with open('./data/summary.txt', 'w', encoding='utf-8') as f:
   f.write(wp.summary)  
with open('./data/content.txt', 'w', encoding='utf-8') as f:
   f.write(wp.content)

In [3]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import TextLoader
from langchain.vectorstores import FAISS

def pretty_print_docs(docs):
    print(f"\n{'-' * 10}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))

documents = TextLoader('data/content.txt').load()
text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=50)  
texts = text_splitter.split_documents(documents)
len(texts)


Created a chunk of size 842, which is longer than the specified 300
Created a chunk of size 1694, which is longer than the specified 300
Created a chunk of size 1183, which is longer than the specified 300
Created a chunk of size 426, which is longer than the specified 300
Created a chunk of size 661, which is longer than the specified 300
Created a chunk of size 995, which is longer than the specified 300
Created a chunk of size 1030, which is longer than the specified 300
Created a chunk of size 415, which is longer than the specified 300
Created a chunk of size 1233, which is longer than the specified 300
Created a chunk of size 480, which is longer than the specified 300
Created a chunk of size 898, which is longer than the specified 300
Created a chunk of size 312, which is longer than the specified 300
Created a chunk of size 1829, which is longer than the specified 300
Created a chunk of size 310, which is longer than the specified 300
Created a chunk of size 774, which is longe

32

In [28]:
from langchain.document_loaders import TextLoader

loader = TextLoader('data/content.txt', encoding='utf8')

In [29]:
from langchain.indexes import VectorstoreIndexCreator

vectorstore= VectorstoreIndexCreator()
vectorstore.embedding = OpenAIEmbeddings(deployment="text-embedding-ada-002",chunk_size=1)
index = vectorstore.from_loaders([loader])

Using embedded DuckDB without persistence: data will be transient


////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200


In [20]:
from langchain.chat_models import AzureChatOpenAI

llm = AzureChatOpenAI(
    client=None,
    deployment_name=AZURE_DEPLOYMENT_NAME,
    openai_api_base=os.environ['AZURE_API_BASE'],
    openai_api_version="2023-03-15-preview",
    openai_api_key=os.environ['AZURE_OPEN_API_KEY'],
    temperature=0,
    request_timeout=180,
) 


In [25]:
query = "両津の職業は？"
index.query(question=query,llm=llm)

////////////
200
////////////
200


'両津勘吉の職業は警察官で、階級は巡査長です。'

In [27]:
query = "中川の職業は？"
index.query_with_sources(query,llm=llm)

////////////
200
////////////
200


{'question': '中川の職業は？', 'answer': '中川の職業は警官。\n', 'sources': 'data/content.txt'}

In [23]:
index.vectorstore

<langchain.vectorstores.chroma.Chroma at 0x14067f310>

In [24]:
index.vectorstore.as_retriever()


VectorStoreRetriever(vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x14067f310>, search_type='similarity', search_kwargs={})

## Use Chroma

In [3]:
from langchain.document_loaders import TextLoader
loader = TextLoader('data/content.txt', encoding='utf8')


In [4]:
documents = loader.load()


In [6]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)


Created a chunk of size 1694, which is longer than the specified 1000
Created a chunk of size 1183, which is longer than the specified 1000
Created a chunk of size 1030, which is longer than the specified 1000
Created a chunk of size 1233, which is longer than the specified 1000
Created a chunk of size 1829, which is longer than the specified 1000


In [10]:
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(deployment="text-embedding-ada-002",chunk_size=1)



In [33]:
import chromadb
from langchain.vectorstores import Chroma

DB_DIR = os.path.join("./", "db")

client_settings = chromadb.config.Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=DB_DIR,
        anonymized_telemetry=False
    )

vectorstore = Chroma(
        collection_name="langchain_store",
        embedding_function=embeddings,
        client_settings=client_settings,
        persist_directory=DB_DIR,
    )

Using embedded DuckDB with persistence: data will be stored in: ./db


In [34]:
from langchain.vectorstores import Chroma
db = vectorstore.from_documents(texts, embeddings)


Using embedded DuckDB without persistence: data will be transient


////////////
200
////////////
200
////////////
200


Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the Get a vector representation of a given input that can be easily consumed by machine learning models and algorithms. Operation under Azure OpenAI API version 2023-03-15-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 1 second. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..


////////////
200
////////////
429
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200
////////////
200


In [36]:
retriever = db.as_retriever()


In [37]:
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI

llm = AzureChatOpenAI(
    client=None,
    deployment_name=AZURE_DEPLOYMENT_NAME,
    openai_api_base=os.environ['AZURE_API_BASE'],
    openai_api_version="2023-03-15-preview",
    openai_api_key=os.environ['AZURE_OPEN_API_KEY'],
    temperature=0,
    request_timeout=180,
) 

qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)


In [38]:
query = "両津の職業は？"
qa.run(query)


Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the Get a vector representation of a given input that can be easily consumed by machine learning models and algorithms. Operation under Azure OpenAI API version 2023-03-15-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 1 second. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..


////////////
429
////////////
200
////////////
200


'両津勘吉の職業は警察官で、階級は巡査長です。'

In [39]:
query = "中川の年練は？"
qa.run(query)


////////////
200
////////////
200


'文章中に中川の年齢や年練に関する情報は記載されていません。申し訳ありませんが、回答することができません。'

In [40]:
query = "web3とはなんですか？"
qa.run(query)


////////////
200
////////////
200


'Web3は、分散型アプリケーション（DApps）やブロックチェーン技術を活用したWebの次世代バージョンです。Web3は、中央集権的なインターネットの問題点を解決するために開発されました。Web3は、ユーザーが自分自身のデータを管理し、プラットフォームや企業に依存することなく、自分自身のデータを所有することができるようになります。Web3は、分散型アプリケーション（DApps）やブロックチェーン技術を活用して、ユーザーが自分自身のデータを管理し、プラットフォームや企業に依存することなく、自分自身のデータを所有することができるようになります。Web3は、分散型アプリケーション（DApps）やブロックチェーン技術を活用して、ユーザーが自分自身のデータを管理し、プラットフォームや企業に依存することなく、自分自身のデータを所有することができるようになります。'

In [43]:
vectorstore = Chroma(
        collection_name="langchain_store",
        embedding_function=embeddings,
        client_settings=client_settings,
        persist_directory=DB_DIR,
    )

Using embedded DuckDB with persistence: data will be stored in: ./db


In [45]:
# import json
# from fastapi.encoders import jsonable_encoder

# result = vectorstore.similarity_search_with_score(query="who is FREDERICK?", k=4)
# jsonable_result = jsonable_encoder(result)
# print(json.dumps(jsonable_result, indent=2))

In [None]:
# import json
# from fastapi.encoders import jsonable_encoder

# result = vectorstore.similarity_search_with_score(query="who is FREDERICK?", k=4)
# jsonable_result = jsonable_encoder(result)
# print(json.dumps(jsonable_result, indent=2))