# Lanchain OpenAI Setup

In [3]:
import os
api_key = open('./openai_key.txt').read()
os.environ['OPENAI_API_KEY'] = api_key

In [17]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader

In [10]:
# Caching
from langchain.cache import InMemoryCache
from langchain.cache import SQLiteCache
from langchain.globals import set_llm_cache


cacheType = 'in_memory'

if cacheType == 'in_memory':
    set_llm_cache(InMemoryCache())
elif cacheType == 'sqlite':
    set_llm_cache(SQLiteCache(database_path=".langchain.db"))

In [19]:
# Set OpenAI API key and create LLM and Chat LLM. Note that key can be stored in a separate file or as an environment variable. Refer to docs.

llm = OpenAI()
chat = ChatOpenAI()

# Load Multiple documents and process

In [43]:
# Load all files in dirl
oader = DirectoryLoader('./new_articles/', glob="./*.txt", loader_cls=TextLoader)

documents = loader.load()

In [47]:
# split 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

In [49]:
texts[3]

Document(page_content='Customers can customize the tools and apps or build their own using Pandoâ€™s APIs. This, along with the platformâ€™s emphasis on no-code capabilities, differentiates Pando from incumbents like SAP, Oracle, Blue Yonder and E2Open, Jayakrishnan asserts.\n\n"Pando comes pre-integrated with leading enterprise resource planning (ERPs) systems and has ready APIs and a professional services team to integrate with any new ERPs and enterprise systems," he added. "Pandoâ€™s no-code capabilities enable business users to customize the apps while maintaining platform integrity â€” reducing the need for IT resources for each customization."', metadata={'source': 'new_articles\\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt'})

# Create DB

In [56]:
persist_dir = 'articles_db'

# Embeddings
embedding = OpenAIEmbeddings()

vecordb = Chroma.from_documents(documents=texts,
                                embedding=embedding,
                                persist_directory=persist_dir)

In [60]:
vecordb.persist()
vecordb = None

In [66]:
# Now load DB from disk and use it as normal
vecordb = Chroma(persist_directory =persist_dir,
                 embedding_function=embedding)

# Retriever

In [83]:
retriever = vecordb.as_retriever(search_kwargs={"k": 2})

In [85]:
docs = retriever.get_relevant_documents("Databricks Okera")

In [86]:
docs[0]

Document(page_content='Databricks today announced that it has acquired Okera, a data governance platform with a focus on AI. The two companies did not disclose the purchase price. According to Crunchbase, Okera previously raised just under $30 million. Investors include Felicis, Bessemer Venture Partners, Cyber Mentor Fund, ClearSky and Emergent Ventures.', metadata={'source': 'new_articles\\05-03-databricks-acquires-ai-centric-data-governance-platform-okera.txt'})

# Make Chain

In [89]:
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                       chain_type="stuff",
                                       retriever=retriever,
                                       return_source_documents=True)

In [91]:
## Cite sources
def process_llm_response(llm_response):
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [93]:
# full example
query = "How much money did Pando raise?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 $30 million


Sources:
new_articles\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
new_articles\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt


In [95]:
# break it down
query = "What is the news about Pando?"
llm_response = qa_chain(query)
# process_llm_response(llm_response)
llm_response

{'query': 'What is the news about Pando?',
 'result': "\nPando, a startup focused on developing fulfillment management technologies, has announced that it raised $30 million in a Series B funding round, bringing its total raised to $45 million. The funding will be used to expand Pando's global sales, marketing, and delivery capabilities. The company was co-launched by Nitin Jayakrishnan and Abhijeet Manohar, who previously worked together at iDelivery, and aims to solve logistics challenges for manufacturers, distributors, and retailers through a software-as-a-service platform.",
 'source_documents': [Document(page_content='Signaling that investments in the supply chain sector remain robust, Pando, a startup developing fulfillment management technologies, today announced that it raised $30 million in a Series B round, bringing its total raised to $45 million.\n\nIron Pillar and Uncorrelated Ventures led the round, with participation from existing investors Nexus Venture Partners, Chira

In [97]:
query = "Who led the round in Pando?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Iron Pillar and Uncorrelated Ventures


Sources:
new_articles\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
new_articles\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt


In [99]:
query = "What did databricks acquire?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Okera, a data governance platform with a focus on AI.


Sources:
new_articles\05-03-databricks-acquires-ai-centric-data-governance-platform-okera.txt
new_articles\05-03-databricks-acquires-ai-centric-data-governance-platform-okera.txt


In [101]:
query = "What is generative ai?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Generative AI is a technology that uses algorithms and machine learning to generate new content such as art, code, and text. It involves training models on large datasets and then using those models to create new content that adheres to certain rules or guidelines. However, there have been legal disputes over the use of copyrighted data to train these models, leading to the development of tools like BrandGuard and BrandGPT to protect brand integrity.


Sources:
new_articles\05-03-nova-is-building-guardrails-for-generative-ai-content-to-protect-brand-integrity.txt
new_articles\05-03-spawning-lays-out-its-plans-for-letting-creators-opt-out-of-generative-ai-training.txt
