# Lanchain OpenAI Setup

In [3]:
import os
api_key = open('./openai_key.txt').read()
os.environ['OPENAI_API_KEY'] = api_key

In [4]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader

In [6]:
# Caching
from langchain.cache import InMemoryCache
from langchain.cache import SQLiteCache
from langchain.globals import set_llm_cache


cacheType = 'in_memory'

if cacheType == 'in_memory':
    set_llm_cache(InMemoryCache())
elif cacheType == 'sqlite':
    set_llm_cache(SQLiteCache(database_path=".langchain.db"))

In [7]:
# Set OpenAI API key and create LLM and Chat LLM. Note that key can be stored in a separate file or as an environment variable. Refer to docs.

llm = OpenAI()
chat = ChatOpenAI()

# Load Multiple documents and process

In [9]:
# Load all files in dirl
loader = DirectoryLoader('./new_articles/', glob="./*.txt", loader_cls=TextLoader)

documents = loader.load()

In [39]:
from langchain.schema import AIMessage, HumanMessage, SystemMessage

# Import chat templates
from langchain.prompts import (
    ChatPromptTemplate,
    PromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)


In [29]:
from langchain.chains.summarize import load_summarize_chain
chain = load_summarize_chain(llm=chat, chain_type="stuff")

## Pydantic 

In [176]:
from pydantic import BaseModel, Field
from typing import List
from langchain.output_parsers import PydanticOutputParser

In [177]:
class QAModel(BaseModel):
    question: str = Field(description="Question")
    answer: str = Field(description="Answer to question")

class QADocument(BaseModel):
    metadata: dict = None
    original: str= None
    source: str = Field(description="Metdata for the source document")
    summary: str = Field(description="Original content being processed")
    items: List[QAModel] = []

In [254]:
from langchain.schema import Document

def creat_document_lc(page_content:str, metadata:dict):
    return Document(page_content=page_content, metadata=metadata)

def format_page_content(response: QAModel):
    return f"""Question: {response.question}\nAnswer: {response.answer}"""

def format_page_metadata(result: QADocument):
    qaCount = 0 if not result.items else len(result.items)
    return {
        'source': result.source,
        'qaCount': qaCount
        # Additional properties here if necessary
    }

def export_document(result: QADocument) -> list :
    metadata = format_page_metadata(result)
    final_str = f"***Summary***:\n\n{result.summary}\n\n***Possible QA***\n"
    
    for item in result.items:
        qa = format_page_content(item)
        final_str += f"\n{qa}\n"
    
    final_str += f"\n***Original***\n\n{result.original}"
    
    doc = creat_document_lc(page_content=final_str, metadata=metadata)
    return doc
        

In [255]:
human_template = """{request}\n{format_instructions}"""
human_prompt = HumanMessagePromptTemplate.from_template(human_template)
chat_prompt = ChatPromptTemplate.from_messages([human_prompt])

In [270]:
def generate_request_for_questions(source_name: str, summary: str):
    q_template1 = f"""Create a few questions and answers covering the below text starting and ending with ### :
    ###{summary}###

    Text metadata to be included in results:
    source: {source_name}
    summary: [This should be the text provided igenerate_request_for_questionsn above with ###]
    original: [This can be left blank]
    metadata: [This can be left blank]
    """

    request = chat_prompt.format_prompt(request=q_template1,
                                       format_instructions=parser.get_format_instructions()).to_messages()
    
    return request


In [260]:
parser = PydanticOutputParser(pydantic_object=QADocument)
# parser.get_format_instructions()

In [282]:
# This will genrate a Summary -> Generate Questions -> Create new document wrapper
def process_document(document: Document) -> Document:
    print("Processing document: ", document.metadata['source'])
    
    try:
        # Create summary
        doc_name = document.metadata['source']
        summary_resp = chain.run([document])

        # Create request for questions
        question_request = generate_request_for_questions(source_name=doc_name, summary=summary_resp)
        question_response = chat(question_request, temperature=0.0)
        parsed_questions_qadoc = parser.parse(question_response.content)

        # Set core details
        parsed_questions_qadoc.original = document.page_content

        # Generate new doc
        final_doc = export_document(parsed_questions_qadoc)

    except Exception as error:
        # handle the exception
        print("An exception occurred:", error)
    
    print("Done processing document: ", document.metadata['source'])
    
    return final_doc
    
    
    

In [289]:
# Run through all documents

enriched_documents = []
for doc in documents:
    enriched = process_document(document=doc)
    enriched_documents.append( enriched )
    

Processing document:  new_articles\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
Done processing document:  new_articles\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
Processing document:  new_articles\05-03-ai-replace-tv-writers-strike.txt
Done processing document:  new_articles\05-03-ai-replace-tv-writers-strike.txt
Processing document:  new_articles\05-03-chatgpt-everything-you-need-to-know-about-the-ai-powered-chatbot.txt
Done processing document:  new_articles\05-03-chatgpt-everything-you-need-to-know-about-the-ai-powered-chatbot.txt
Processing document:  new_articles\05-03-checks-the-ai-powered-data-protection-project-incubated-in-area-120-officially-exits-to-google.txt
Done processing document:  new_articles\05-03-checks-the-ai-powered-data-protection-project-incubated-in-area-120-officially-exits-to-google.txt
Processing document:  new_articles\05-03-databricks-acquires-ai-centric-data-governance-platform-okera.txt
Done processing d

In [296]:
enriched_documents[2].page_content[:300]

'***Summary***:\n\nChatGPT, an AI-powered chatbot developed by OpenAI, has gained popularity for its ability to generate text in various domains. OpenAI has made significant updates to ChatGPT, including the integration of GPT-4, the latest language-writing model, and the introduction of plugins that a'

# Retriever

In [317]:
# split 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(enriched_documents)

In [319]:
texts[3]

Document(page_content='"We will not expand into new industries or adjacent product areas," he told TechCrunch in an email interview. "Great talent is the foundation of the business â€” we will continue to augment our teams at all levels of the organization. Pando is also open to exploring strategic partnerships and acquisitions with this round of funding."\n\nPando was co-launched by Jayakrishnan and Abhijeet Manohar, who previously worked together at iDelivery, an India-based freight tech marketplace â€” and their first startup. The two saw firsthand manufacturers, distributors and retailers were struggling with legacy tech and point solutions to understand, optimize and manage their global logistics operations â€” or at least, thatâ€™s the story Jayakrishnan tells.', metadata={'source': 'new_articles\\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt', 'qaCount': 6})

In [322]:
persist_dir = 'articles_db2'

# Embeddings
embedding = OpenAIEmbeddings()

vecordb = Chroma.from_documents(documents=texts,
                                embedding=embedding,
                                persist_directory=persist_dir)

vecordb.persist()

In [323]:
retriever = vecordb.as_retriever(search_kwargs={"k": 5})

In [324]:
docs = retriever.get_relevant_documents("Databricks Okera")

In [330]:
len(docs)

5

# Make Chain

In [347]:
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                       chain_type="refine",
                                       retriever=retriever,
                                       return_source_documents=True)

In [349]:
## Cite sources
def process_llm_response(llm_response):
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [351]:
# full example
query = "How much money did Pando raise?"
llm_response = qa_chain(query)
process_llm_response(llm_response)



Pando has raised a total of $45 million in funding, with $30 million coming from its Series B round. The company plans to use these funds to expand its sales, marketing, and delivery capabilities, as well as to explore strategic partnerships and acquisitions. Pando's software-as-a-service platform aims to revolutionize the digital logistics market, which is projected to reach $46.5 billion by 2025. The company has experienced significant growth since its Series A funding in 2020, with revenue increasing 8x and the customer base expanding 5x. According to co-founder Jayakrishnan, Pando has a strong balance sheet and profit and loss statement, and is focused on profitable growth. The company is scaling operations in North America, Europe, and India, and has already secured marquee customer wins and a network of strong partners. Overall, Pando is well-positioned to drive supply chain agility for the 2030 economy.


Sources:
new_articles\05-03-ai-powered-supply-chain-startup-pando-lands-

In [352]:
# break it down
query = "What is the news about Pando?"
llm_response = qa_chain(query)
process_llm_response(llm_response)
# llm_response


The news is that Pando, a global logistics company, has secured new funding to expand its operations and enhance its capabilities in supply chain management. This funding will also be used to develop its no-code capabilities and machine learning algorithms, as well as integrate with leading enterprise resource planning (ERP) systems. Pando's platform also has the ability to detect anomalies and anticipate supply chain risks, making it a strong competitor in the digital logistics market. With a growing list of notable customers and a strong financial standing, Pando is well-positioned to capitalize on the increasing demand for supply chain visibility and management tools.


Sources:
new_articles\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
new_articles\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
new_articles\05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
new_articles\05-03-ai-powered-supply-chain-startup-pando-lands

In [353]:
query = "Who led the round in Pando?"
llm_response = qa_chain(query)
process_llm_response(llm_response)



Iron Pillar and Uncorrelated Ventures led the $30 million Series B funding round in Pando, with participation from other investors. This brings Pando's total funding to $45 million. The company plans to use the funds to expand its sales, marketing, and delivery capabilities. Pando offers a software-as-a-service platform that consolidates supply chain data and provides tools for freight procurement, trade management, and more. The platform differentiates itself with its no-code capabilities and integration with leading enterprise resource planning systems. Pando has seen significant growth since its Series A funding in 2020, with revenue increasing 8x and the customer base expanding 5x. The company aims to capitalize on the growing digital logistics market, which is estimated to reach $46.5 billion by 2025. According to Pando's CEO, the company has a strong financial position and is focused on profitable growth. They have already achieved success with marquee customer wins and a netwo

In [354]:
query = "What did databricks acquire?"
llm_response = qa_chain(query)
process_llm_response(llm_response)



Databricks announced the acquisition of Okera, a data governance platform with a focus on AI, for an undisclosed amount. Okera had previously raised nearly $30 million from investors such as Felicis, Bessemer Venture Partners, Cyber Mentor Fund, ClearSky, and Emergent Ventures. As part of the acquisition, Okera co-founder and CEO Nong Li joined Databricks, bringing his expertise in data storage and governance, including creating the Apache Parquet data storage format and working at Cloudera. This acquisition will allow Databricks to integrate Okera's secure and scalable governance solutions into their offerings, providing a modern, AI-centric governance solution for enterprises dealing with large volumes of data. Li's involvement in the integration process is expected to enhance Databricks' capabilities and bring more value to its customers.


Sources:
new_articles\05-03-databricks-acquires-ai-centric-data-governance-platform-okera.txt
new_articles\05-03-databricks-acquires-ai-centri

In [355]:
query = "What is generative ai?"
llm_response = qa_chain(query)
process_llm_response(llm_response)


Generative AI is a technology that uses machine learning algorithms to generate new content, such as art, music, or text, based on patterns and data it has been trained on. It has gained attention for its ability to create realistic and coherent outputs. One popular example of generative AI is ChatGPT, an AI-powered chatbot developed by OpenAI. ChatGPT uses the latest language-writing model, GPT-4, and has introduced plugins that allow access to external knowledge sources. However, there have been concerns about potential misuse of ChatGPT, such as the rise of malware posing as the chatbot and its potential use for fraud. There have also been ethical and legal issues raised, such as data privacy and plagiarism. Additionally, there have been recent legal disputes over the use of copyrighted data, particularly art, in training generative AI models. To address this, the startup Spawning AI has developed a website, HaveIBeenTrained, which allows creators to opt out of having their work us