# Lanchain Mistral Setup

In [16]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader

In [17]:
# Caching
from langchain.cache import InMemoryCache
from langchain.cache import SQLiteCache
from langchain.globals import set_llm_cache


cacheType = 'in_memory'

if cacheType == 'in_memory':
    set_llm_cache(InMemoryCache())
elif cacheType == 'sqlite':
    set_llm_cache(SQLiteCache(database_path=".langchain.db"))

# Setup LLM - Google Flan T5 Large

In [21]:
from langchain_mistralai.chat_models import ChatMistralAI

chat = ChatMistralAI(endpoint="http://192.168.68.76:1234", 
                     mistral_api_key="not-needed", 
                     temprature=0,
                    verbose=True)


In [22]:
from langchain.schema import AIMessage, HumanMessage, SystemMessage

# Import chat templates
from langchain.prompts import (
    ChatPromptTemplate,
    PromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)


In [24]:
template1 = 'Give me a simple bullet point outline for a blog post on:\n {topic}'
first_prompt = ChatPromptTemplate.from_template(template1)
hmsg = first_prompt.format_prompt(topic='nike shoes').to_messages()


chat.invoke(hmsg)

AIMessage(content="1. Introduction to Nike Shoes: Brief history and popularity\n2. Types of Nike Shoes: Running, Basketball, Training, and Lifestyle\n3. Top Nike Shoes Models: Air Jordan, Air Force 1, Pegasus, etc.\n4. Design and Technology: Innovations in materials, cushioning, and fit\n5. Customization: NikeID and other personalization options\n6. Collaborations: Sneakerheads favorite partnerships with artists, musicians, and athletes\n7. Fashion Trends: How Nike shoes fit into current styles and streetwear culture\n8. Sustainability: Nike's efforts towards more eco-friendly production methods\n9. Buying and Collecting: Tips for purchasing authentic shoes and starting a collection\n10. Conclusion: The enduring appeal of Nike shoes and their impact on popular culture.")

## HF Instructor Embeddings

In [27]:
from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-small-en"
model_kwargs = {"device": "cuda:1"}
encode_kwargs = {"normalize_embeddings": True}
embedding = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)


# Load Multiple documents and process

In [29]:
# Load all files in dirl
loader = DirectoryLoader('./new_articles/', glob="./*.txt", loader_cls=TextLoader)

documents = loader.load()

In [40]:
from langchain.chains.summarize import load_summarize_chain

# Options are - stuff, refine, map_reduce
chain = load_summarize_chain(llm=chat, chain_type="map_reduce")

In [42]:
chain.invoke([documents[0]])

Downloading vocab.json:   0%|          | 0.00/1.04M [00:02<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

{'input_documents': [Document(page_content='Signaling that investments in the supply chain sector remain robust, Pando, a startup developing fulfillment management technologies, today announced that it raised $30 million in a Series B round, bringing its total raised to $45 million.\n\nIron Pillar and Uncorrelated Ventures led the round, with participation from existing investors Nexus Venture Partners, Chiratae Ventures and Next47. CEO and founder Nitin Jayakrishnan says that the new capital will be put toward expanding Pandoâ€™s global sales, marketing and delivery capabilities.\n\n"We will not expand into new industries or adjacent product areas," he told TechCrunch in an email interview. "Great talent is the foundation of the business â€” we will continue to augment our teams at all levels of the organization. Pando is also open to exploring strategic partnerships and acquisitions with this round of funding."\n\nPando was co-launched by Jayakrishnan and Abhijeet Manohar, who previo

In [43]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
texts = text_splitter.split_documents([documents[0]])

chain.run(texts)

'Pando, a supply chain visibility platform founded by Nitin Jayakrishnan and Abhijeet Manohar, raised $30 million in Series B funding led by Iron Pillar and Uncorrelated Ventures. The company aims to expand globally, hire top talent, and explore partnerships or acquisitions. With increasing demand for greater supply chain visibility, Pando consolidates data from various silos using no-code tools and apps for logistics management. Its machine learning algorithms predict and optimize fulfillment strategies for clients like P&G and J&J. The digital logistics market is projected to reach $46.5 billion by 2025 due to disruptions and investments, with Pando expanding in North America, Europe, and India.'

## Pydantic 

In [46]:
from pydantic import BaseModel, Field
from typing import List
from langchain.output_parsers import PydanticOutputParser

In [48]:
class QAModel(BaseModel):
    question: str = Field(description="Question")
    answer: str = Field(description="Answer to question")

class QADocument(BaseModel):
    metadata: dict = None
    original: str= None
    source: str = Field(description="Metdata for the source document")
    summary: str = Field(description="Original content being processed")
    items: List[QAModel] = []

In [50]:
from langchain.schema import Document

def creat_document_lc(page_content:str, metadata:dict):
    return Document(page_content=page_content, metadata=metadata)

def format_page_content(response: QAModel):
    return f"""Question: {response.question}\nAnswer: {response.answer}"""

def format_page_metadata(result: QADocument):
    qaCount = 0 if not result.items else len(result.items)
    return {
        'source': result.source,
        'qaCount': qaCount
        # Additional properties here if necessary
    }

def export_document(result: QADocument) -> list :
    metadata = format_page_metadata(result)
    final_str = f"***Summary***:\n\n{result.summary}\n\n***Possible QA***\n"
    
    for item in result.items:
        qa = format_page_content(item)
        final_str += f"\n{qa}\n"
    
    final_str += f"\n***Original***\n\n{result.original}"
    
    doc = creat_document_lc(page_content=final_str, metadata=metadata)
    return doc
        

In [52]:
human_template = """{request}\n{format_instructions}"""
human_prompt = HumanMessagePromptTemplate.from_template(human_template)
chat_prompt = ChatPromptTemplate.from_messages([human_prompt])

In [54]:
def generate_request_for_questions(source_name: str, summary: str):
    q_template1 = f"""Create a few questions and answers covering the below text starting and ending with ### :
    ###{summary}###

    Text metadata to be included in results:
    source: {source_name}
    summary: [This should be the text provided igenerate_request_for_questionsn above with ###]
    original: [This can be left blank]
    metadata: [This can be left blank]
    """

    request = chat_prompt.format_prompt(request=q_template1,
                                       format_instructions=parser.get_format_instructions()).to_messages()
    
    return request


In [56]:
parser = PydanticOutputParser(pydantic_object=QADocument)
# parser.get_format_instructions()

In [58]:
# This will genrate a Summary -> Generate Questions -> Create new document wrapper
def process_document(document: Document) -> Document:
    print("Processing document: ", document.metadata['source'])
    
    try:
        # Create summary
        doc_name = document.metadata['source']
        summary_resp = chain.run([document])

        # Create request for questions
        question_request = generate_request_for_questions(source_name=doc_name, summary=summary_resp)
        question_response = chat(question_request, temperature=0.0)
        parsed_questions_qadoc = parser.parse(question_response.content)

        # Set core details
        parsed_questions_qadoc.original = document.page_content

        # Generate new doc
        final_doc = export_document(parsed_questions_qadoc)

    except Exception as error:
        # handle the exception
        print("An exception occurred:", error)
    
    print("Done processing document: ", document.metadata['source'])
    
    return final_doc
    
    
    

In [62]:
# question_request[0].content
chat(question_request[1].content, temperature=0.0)


NameError: name 'question_request' is not defined

In [64]:
document = documents[1]

# Create summary
doc_name = document.metadata['source']
summary_resp = chain.run([document])

# Create request for questions
question_request = generate_request_for_questions(source_name=doc_name, summary=summary_resp)
question_response = chat(question_request, temperature=0.0)
parsed_questions_qadoc = parser.parse(question_response.content)

# Set core details
parsed_questions_qadoc.original = document.page_content

# Generate new doc
final_doc = export_document(parsed_questions_qadoc)

  warn_deprecated(


In [68]:
print(final_doc.page_content)

***Summary***:

The Writers Guild of America (WGA) is currently on strike due to various demands including better working conditions and improved streaming residuals. One contentious issue is the use of AI in writers' rooms, with the WGA proposing that they should not be required to adapt or consider AI-generated output as their work. The Alliance of Motion Picture and Television Producers (AMPTP) has refused to engage with this proposal, instead offering annual meetings to discuss technological advances. While current text-generating algorithms like ChatGPT can produce responses based on desired outputs, they cannot create nuanced, entertaining dialogue or navigate the complexities of human collaboration. Some writers see potential for AI to facilitate brainstorming sessions but worry about its impact on working conditions and fair compensation. The legal status of copyrightable AI-generated content is also unclear, potentially putting studios at risk. The primary concerns for the WGA

In [None]:
# Run through all documents

enriched_documents = []
for doc in documents:
    enriched = process_document(document=doc)
    enriched_documents.append( enriched )
    

In [None]:
enriched_documents[2].page_content[:300]

# Retriever

In [None]:
# split 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(enriched_documents)

In [None]:
texts[3]

In [None]:
persist_dir = 'articles_db_flant5'

vecordb = Chroma.from_documents(documents=texts,
                                embedding=embedding,
                                persist_directory=persist_dir)

vecordb.persist()

In [None]:
retriever = vecordb.as_retriever(search_kwargs={"k": 5})

In [None]:
docs = retriever.get_relevant_documents("Databricks Okera")

In [None]:
len(docs)

# Make Chain

In [None]:
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                       chain_type="refine",
                                       retriever=retriever,
                                       return_source_documents=True)

In [None]:
## Cite sources
def process_llm_response(llm_response):
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [None]:
# full example
query = "How much money did Pando raise?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
# break it down
query = "What is the news about Pando?"
llm_response = qa_chain(query)
process_llm_response(llm_response)
# llm_response

In [None]:
query = "Who led the round in Pando?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
query = "What did databricks acquire?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
query = "What is generative ai?"
llm_response = qa_chain(query)
process_llm_response(llm_response)