# Lanchain OpenAI Setup

In [3]:
import os
api_key = open('./openai_key.txt').read()
os.environ['OPENAI_API_KEY'] = api_key

In [4]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader

In [6]:
# Caching
from langchain.cache import InMemoryCache
from langchain.cache import SQLiteCache
from langchain.globals import set_llm_cache


cacheType = 'in_memory'

if cacheType == 'in_memory':
    set_llm_cache(InMemoryCache())
elif cacheType == 'sqlite':
    set_llm_cache(SQLiteCache(database_path=".langchain.db"))

In [7]:
# Set OpenAI API key and create LLM and Chat LLM. Note that key can be stored in a separate file or as an environment variable. Refer to docs.

llm = OpenAI()
chat = ChatOpenAI()

# Load Multiple documents and process

In [9]:
# Load all files in dirl
loader = DirectoryLoader('./new_articles/', glob="./*.txt", loader_cls=TextLoader)

documents = loader.load()

In [12]:
# split 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

In [13]:
texts[3]

Document(page_content='Customers can customize the tools and apps or build their own using Pandoâ€™s APIs. This, along with the platformâ€™s emphasis on no-code capabilities, differentiates Pando from incumbents like SAP, Oracle, Blue Yonder and E2Open, Jayakrishnan asserts.\n\n"Pando comes pre-integrated with leading enterprise resource planning (ERPs) systems and has ready APIs and a professional services team to integrate with any new ERPs and enterprise systems," he added. "Pandoâ€™s no-code capabilities enable business users to customize the apps while maintaining platform integrity â€” reducing the need for IT resources for each customization."', metadata={'source': 'new_articles\\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt'})

# Create DB

In [16]:
persist_dir = 'articles_db'

# Embeddings
embedding = OpenAIEmbeddings()

vecordb = Chroma.from_documents(documents=texts,
                                embedding=embedding,
                                persist_directory=persist_dir)

  warn_deprecated(


In [17]:
vecordb.persist()
vecordb = None

In [18]:
# Now load DB from disk and use it as normal
vecordb = Chroma(persist_directory =persist_dir,
                 embedding_function=embedding)

In [39]:
from langchain.schema import AIMessage, HumanMessage, SystemMessage

# Import chat templates
from langchain.prompts import (
    ChatPromptTemplate,
    PromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)


In [27]:
test1 = documents[0]

In [123]:
test1.metadata['source']

'new_articles\\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt'

In [29]:
from langchain.chains.summarize import load_summarize_chain
chain = load_summarize_chain(llm=chat, chain_type="stuff")

In [35]:
r1 = chain.run([test1])

In [60]:
q_template = """Create a few questions and answers covering the below text starting and ending with ### :
###{text}###

INSTUCTIONS:
Format each question with 'Q: ' prepended on a new line.
Format each question with 'A: ' prepended on a new line.
Add an additional new line after each pair.
If there is not enough infomation, return an empty string.
"""

chat_prompt = ChatPromptTemplate.from_template(q_template)
request = chat_prompt.format_prompt(text = r1).to_messages()

In [62]:
resp1 = chat(request)

In [63]:
print(resp1.content)

Q: How much funding did Pando raise in its Series B funding round?
A: Pando raised $30 million in its Series B funding round.

Q: What is the total amount raised by Pando so far?
A: Pando has raised a total of $45 million.

Q: What are the intended uses of the funds raised by Pando?
A: The funds raised by Pando will be used to expand the company's sales, marketing, and delivery capabilities.

Q: What does Pando's software-as-a-service platform offer?
A: Pando's software-as-a-service platform consolidates supply chain data and provides tools for freight procurement, trade management, and more.

Q: How does Pando's platform differentiate itself?
A: Pando's platform differentiates itself with its no-code capabilities and integration with leading enterprise resource planning systems.

Q: How much has Pando's revenue increased since its Series A funding?
A: Pando's revenue has increased 8 times since its Series A funding.

Q: How much has Pando's customer base expanded since its Series A fu

## Pydantic 

In [176]:
from pydantic import BaseModel, Field
from typing import List
from langchain.output_parsers import PydanticOutputParser

In [177]:
class QAModel(BaseModel):
    question: str = Field(description="Question")
    answer: str = Field(description="Answer to question")

class QADocument(BaseModel):
    metadata: dict = None
    original: str= None
    source: str = Field(description="Metdata for the source document")
    summary: str = Field(description="Original content being processed")
    items: List[QAModel] = []

In [254]:
from langchain.schema import Document

def creat_document_lc(page_content:str, metadata:dict):
    return Document(page_content=page_content, metadata=metadata)

def format_page_content(response: QAModel):
    return f"""Question: {response.question}\nAnswer: {response.answer}"""

def format_page_metadata(result: QADocument):
    qaCount = 0 if not result.items else len(result.items)
    return {
        'source': result.source,
        'qaCount': qaCount
        # Additional properties here if necessary
    }

def export_document(result: QADocument) -> list :
    metadata = format_page_metadata(result)
    final_str = f"***Summary***:\n\n{result.summary}\n\n***Possible QA***\n"
    
    for item in result.items:
        qa = format_page_content(item)
        final_str += f"\n{qa}\n"
    
    final_str += f"\n***Original***\n\n{result.original}"
    
    doc = creat_document_lc(page_content=final_str, metadata=metadata)
    return doc
        

In [255]:
human_template = """{request}\n{format_instructions}"""
human_prompt = HumanMessagePromptTemplate.from_template(human_template)
chat_prompt = ChatPromptTemplate.from_messages([human_prompt])

In [270]:
def generate_request_for_questions(source_name: str, summary: str):
    q_template1 = f"""Create a few questions and answers covering the below text starting and ending with ### :
    ###{summary}###

    Text metadata to be included in results:
    source: {source_name}
    summary: [This should be the text provided igenerate_request_for_questionsn above with ###]
    original: [This can be left blank]
    metadata: [This can be left blank]
    """

    request = chat_prompt.format_prompt(request=q_template1,
                                       format_instructions=parser.get_format_instructions()).to_messages()
    
    return request


In [260]:
parser = PydanticOutputParser(pydantic_object=QADocument)
# parser.get_format_instructions()

In [282]:
# This will genrate a Summary -> Generate Questions -> Create new document wrapper
def process_document(document: Document) -> Document:
    print("Processing document: ", document.metadata['source'])
    
    try:
        # Create summary
        doc_name = document.metadata['source']
        summary_resp = chain.run([document])

        # Create request for questions
        question_request = generate_request_for_questions(source_name=doc_name, summary=summary_resp)
        question_response = chat(question_request, temperature=0.0)
        parsed_questions_qadoc = parser.parse(question_response.content)

        # Set core details
        parsed_questions_qadoc.original = document.page_content

        # Generate new doc
        final_doc = export_document(parsed_questions_qadoc)

    except Exception as error:
        # handle the exception
        print("An exception occurred:", error)
    
    print("Done processing document: ", document.metadata['source'])
    
    return final_doc
    
    
    

In [289]:
# Run through all documents

enriched_documents = []
for doc in documents:
    enriched = process_document(document=doc)
    enriched_documents.append( enriched )
    

Processing document:  new_articles\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
Done processing document:  new_articles\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
Processing document:  new_articles\05-03-ai-replace-tv-writers-strike.txt
Done processing document:  new_articles\05-03-ai-replace-tv-writers-strike.txt
Processing document:  new_articles\05-03-chatgpt-everything-you-need-to-know-about-the-ai-powered-chatbot.txt
Done processing document:  new_articles\05-03-chatgpt-everything-you-need-to-know-about-the-ai-powered-chatbot.txt
Processing document:  new_articles\05-03-checks-the-ai-powered-data-protection-project-incubated-in-area-120-officially-exits-to-google.txt
Done processing document:  new_articles\05-03-checks-the-ai-powered-data-protection-project-incubated-in-area-120-officially-exits-to-google.txt
Processing document:  new_articles\05-03-databricks-acquires-ai-centric-data-governance-platform-okera.txt
Done processing d

In [291]:
enriched_documents[2]

Document(page_content='***Summary***:\n\nChatGPT, an AI-powered chatbot developed by OpenAI, has gained popularity for its ability to generate text in various domains. OpenAI has made significant updates to ChatGPT, including the integration of GPT-4, the latest language-writing model, and the introduction of plugins that allow access to external knowledge sources. However, there have been some concerns and controversies surrounding ChatGPT, such as the rise of malware posing as ChatGPT, potential misuse for fraud, and issues related to data privacy and plagiarism. Despite these challenges, ChatGPT continues to be widely used and has found applications in ad and marketing copy generation, customer service, and more.\n\n***Possible QA***\n\nQuestion: What is ChatGPT?\nAnswer: ChatGPT is an AI-powered chatbot developed by OpenAI.\n\nQuestion: What updates have been made to ChatGPT?\nAnswer: OpenAI has integrated GPT-4, the latest language-writing model, and introduced plugins that allow 

# Retriever

In [83]:
retriever = vecordb.as_retriever(search_kwargs={"k": 2})

In [85]:
docs = retriever.get_relevant_documents("Databricks Okera")

In [86]:
docs[0]

Document(page_content='Databricks today announced that it has acquired Okera, a data governance platform with a focus on AI. The two companies did not disclose the purchase price. According to Crunchbase, Okera previously raised just under $30 million. Investors include Felicis, Bessemer Venture Partners, Cyber Mentor Fund, ClearSky and Emergent Ventures.', metadata={'source': 'new_articles\\05-03-databricks-acquires-ai-centric-data-governance-platform-okera.txt'})

# Make Chain

In [89]:
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                       chain_type="stuff",
                                       retriever=retriever,
                                       return_source_documents=True)

In [91]:
## Cite sources
def process_llm_response(llm_response):
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [93]:
# full example
query = "How much money did Pando raise?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 $30 million


Sources:
new_articles\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
new_articles\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt


In [95]:
# break it down
query = "What is the news about Pando?"
llm_response = qa_chain(query)
# process_llm_response(llm_response)
llm_response

{'query': 'What is the news about Pando?',
 'result': "\nPando, a startup focused on developing fulfillment management technologies, has announced that it raised $30 million in a Series B funding round, bringing its total raised to $45 million. The funding will be used to expand Pando's global sales, marketing, and delivery capabilities. The company was co-launched by Nitin Jayakrishnan and Abhijeet Manohar, who previously worked together at iDelivery, and aims to solve logistics challenges for manufacturers, distributors, and retailers through a software-as-a-service platform.",
 'source_documents': [Document(page_content='Signaling that investments in the supply chain sector remain robust, Pando, a startup developing fulfillment management technologies, today announced that it raised $30 million in a Series B round, bringing its total raised to $45 million.\n\nIron Pillar and Uncorrelated Ventures led the round, with participation from existing investors Nexus Venture Partners, Chira

In [97]:
query = "Who led the round in Pando?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Iron Pillar and Uncorrelated Ventures


Sources:
new_articles\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
new_articles\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt


In [99]:
query = "What did databricks acquire?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Okera, a data governance platform with a focus on AI.


Sources:
new_articles\05-03-databricks-acquires-ai-centric-data-governance-platform-okera.txt
new_articles\05-03-databricks-acquires-ai-centric-data-governance-platform-okera.txt


In [101]:
query = "What is generative ai?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Generative AI is a technology that uses algorithms and machine learning to generate new content such as art, code, and text. It involves training models on large datasets and then using those models to create new content that adheres to certain rules or guidelines. However, there have been legal disputes over the use of copyrighted data to train these models, leading to the development of tools like BrandGuard and BrandGPT to protect brand integrity.


Sources:
new_articles\05-03-nova-is-building-guardrails-for-generative-ai-content-to-protect-brand-integrity.txt
new_articles\05-03-spawning-lays-out-its-plans-for-letting-creators-opt-out-of-generative-ai-training.txt
