In [None]:
import secrets
import json
from pathlib import Path
import pprint
import pdb
from typing import Any

from etl import markdown, pdfs, shared, videos

import docstore
import vecstore
from utils import pretty_log

pp = pprint.PrettyPrinter(indent=2)

import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline, TextStreamer
import json
import textwrap
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, RetrievalQA
from langchain.memory import ConversationBufferMemory
import langchain
import time

from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain


%load_ext autoreload
import ciptest_qanda as test

In [None]:
# Embedding engine, tokenizer and causal language model used by the notebook.
embedding_engine = vecstore.get_embedding_engine(allowed_special="all")

# Tokenizer and model weights are loaded from the same checkpoint.
llama_checkpoint = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(llama_checkpoint, token=True)

model = AutoModelForCausalLM.from_pretrained(
    llama_checkpoint,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    token=True,
    # load_in_8bit=True,
    # load_in_4bit=True,
)

# Streamer handed to the pipeline; skip_prompt=True is set so the prompt text
# is not part of the streamed output.
streamer = TextStreamer(tokenizer, skip_prompt=True)

# Decoding settings forwarded unchanged to the text-generation pipeline.
generation_settings = dict(
    max_new_tokens=4096,
    do_sample=True,
    temperature=0.1,
    top_p=0.95,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    # The end-of-sequence token doubles as the padding token.
    pad_token_id=tokenizer.eos_token_id,
)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    streamer=streamer,
    **generation_settings,
)

# LangChain wrapper around the transformers pipeline.
llm = HuggingFacePipeline(pipeline=pipe)


########### Connecting to the vector storage and load it #############

pretty_log("connecting to vector storage")
vector_index = vecstore.connect_to_vector_index(vecstore.INDEX_NAME, embedding_engine)
pretty_log("connected to vector storage")
vector_count = vector_index.index.ntotal
pretty_log(f"found {vector_count} vectors to search over")


In [None]:
########## THE LLAMA 2 DEMO ############## - MORE GENERIC AND CUSTOMIZED
langchain.debug = False

# Generic chat prompt pieces; none of the chains built in this cell consume them.
instruction = "Chat History:\n\n{chat_history} \n\nUser: {user_input}"
# Prompt text fixes: dropped the two stray double quotes that opened the string
# and corrected the typos "with with" and "resonse".
system_prompt = """\
Consider that I'm a beginner in networking and security things. \n
Give me a concise answer with a single step at a time. \n
Limit your response to maximum 128 words.
Do not provide any additional text or presentation. Only steps and actions.
If possible use concrete names of software or tools that could help on each step."""


# Prompt used to answer a question from the retrieved context.
# Prompt text fix: removed the dangling double quote that ended the instruction sentence.
llama_docs_template = """
[INST]Use the following pieces of context to answer the question. If no context provided, tell the user that you did not find any context about the question and you will answer to the question as you know the best as you can
{context}
Question: {question} [/INST]
"""


# Alternative answer prompt that asks for a "SOURCES" section; it expects a
# {summaries} variable and is not wired into any chain in this cell.
llama_docs_template_alternative = """
[INST]Given the following extracted parts of a long document and a question, create a final answer with "SOURCES" that represent exactly the Source name and link given.
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.

QUESTION: {question}

{summaries}

FINAL ANSWER: [/INST]
"""


# "stuff" chain: the retrieved documents are inserted into {context} of the
# answer prompt above.
llama_docs_prompt = PromptTemplate(template=llama_docs_template, input_variables=["context", "question"])
llama_doc_chain = load_qa_with_sources_chain(
    llm,
    chain_type="stuff",
    prompt=llama_docs_prompt,
    document_variable_name="context",
    verbose=False,
)

# Prompt that asks the model to rewrite a follow-up question as a standalone
# question, given the chat history.
llama_condense_template = """
[INST]Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question: [/INST]"""
llama_condense_prompt = PromptTemplate(template=llama_condense_template, input_variables=["chat_history", "question"])
llama_question_generator_chain = LLMChain(llm=llm, prompt=llama_condense_prompt, verbose=False)

# memory_key matches the {chat_history} variable of the condense prompt.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Conversational retrieval: question generator + retriever (k=6 documents per
# query) + the answer chain, with the memory object attached.
llama_v2_chain = ConversationalRetrievalChain(
    retriever=vector_index.as_retriever(search_kwargs={'k': 6}),
    question_generator=llama_question_generator_chain,
    combine_docs_chain=llama_doc_chain,
    memory=memory
)

# VERY USEFUL for checking the sources and context
######################################################
def test_vectorDatasets_similarityScores_and_responses_no_memory(run_llm_chain: bool):
    """Show similarity-search hits for two fixed queries and optionally answer them.

    For each query, the top 3 (source, score) pairs returned by
    ``vector_index.similarity_search_with_score`` are printed. When
    ``run_llm_chain`` is true, the retrieved sources are also passed to
    ``llama_doc_chain`` (no conversation memory is involved) and the
    chain's ``output_text`` is printed.

    Args:
        run_llm_chain: whether to run the question-answering chain on the
            retrieved sources in addition to printing them.
    """
    def sim_que(query: str, run_llm_chain: bool):
        """Print the scored sources for ``query``; answer it when requested."""
        pretty_log("selecting sources by similarity to query")
        sources_and_scores = vector_index.similarity_search_with_score(query, k=3)
        print(sources_and_scores)

        if not run_llm_chain:
            return

        # Keep only the sources. A comprehension also copes with an empty
        # result, whereas unpacking zip(*[]) into two names raises ValueError.
        sources = [source for source, _score in sources_and_scores]
        result = llama_doc_chain(
            {"input_documents": sources, "question": query}, return_only_outputs=True
        )
        print(result["output_text"])

    queries = (
        "What models use human instructions?",
        "Are there any model trained on medical knowledge?",
    )
    for query in queries:
        # Forward the caller's flag; it used to be hard-coded to False here,
        # which made the outer parameter a no-op.
        sim_que(query, run_llm_chain=run_llm_chain)


test_vectorDatasets_similarityScores_and_responses_no_memory(run_llm_chain=False)
######################################################

In [None]:
# Ask three questions in order through the conversational chain; the second
# and third are phrased as follow-ups to the earlier answers.
follow_up_questions = (
    "What models use human instructions?",
    "Which are the advantage of each of these models?",
    "What are the downsides of your last model suggested above ?",
)
for follow_up in follow_up_questions:
    print(llama_v2_chain({"question": follow_up}))


In [None]:
# Calls qanda_llama2_cont() from the ciptest_qanda helper module (imported as `test`);
# the return value is not assigned.
# NOTE(review): the helper's behaviour is not visible in this notebook — presumably a
# continued question-and-answer run; confirm against ciptest_qanda.
test.qanda_llama2_cont()

In [None]:
# Time two direct generate() calls on the helper module's LLM chain.
# (`time` is already imported in the notebook's first cell.)
chat_history = ""
input_list = [{"user_input": "Give me some indications to solve a denial of service attack.", "chat_history": chat_history}]

# perf_counter() is a monotonic clock meant for measuring elapsed intervals,
# unlike time.time(), which follows the (adjustable) system wall clock.
start = time.perf_counter()
test.llm_chain.generate(input_list)

# Supply chat_history here as well, mirroring the first input.
# NOTE(review): if the chain's prompt declares {chat_history}, leaving the key
# out would fail when the prompt is formatted — confirm against ciptest_qanda.
# NOTE(review): generate() is handed the inputs directly, so the history stays
# empty for this follow-up; confirm whether that is the intended experiment.
test.llm_chain.generate([{"user_input": "What question did I asked you previously", "chat_history": chat_history}])

end = time.perf_counter()

print(f"Total time: {end-start}")

In [None]:
# Time two predict() calls on the helper module's LLM chain and print both answers.
# (`time` is already imported in the notebook's first cell.)
chat_history = ""
input_list = [{"user_input": "Give me some indications to solve a denial of service attack.", "chat_history": chat_history}]

# Reuse the question stored in input_list instead of repeating the literal.
first_question = input_list[0]["user_input"]

# perf_counter() is a monotonic clock meant for measuring elapsed intervals,
# unlike time.time(), which follows the (adjustable) system wall clock.
start = time.perf_counter()
answer1 = test.llm_chain.predict(user_input=first_question)
print(answer1)
# NOTE(review): this follow-up only makes sense if the chain keeps conversation
# state between predict() calls — confirm against ciptest_qanda.
answer2 = test.llm_chain.predict(user_input="What question did I asked you previously")
print(answer2)
end = time.perf_counter()

print(f"Total time: {end-start}")

In [None]:
#response = test.qanda_llama2("Can we combine LMMs and OCR?", with_logging=True)


In [None]:
#response = test.qanda_llama2_withRAG("Can we combine LMMs and OCR?", with_logging=True)

In [None]:
#test.qanda_llama2_withRAG("Can we combine LLMs and OCR", with_logging=True)


In [None]:
#test.ask_question_withRAG("Can we combine LLMs and OCR", with_logging=False)

In [None]:
import bs4
from langchain import hub
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.schema.runnable  import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable.utils import ConfigurableField



In [None]:
# Load one blog post, parsing only the elements with the listed CSS classes.
post_sections = bs4.SoupStrainer(class_=("post-content", "post-title", "post-header"))
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": post_sections},
)
docs = loader.load()

# Split into chunks of 1000 with an overlap of 200 (units as defined by the splitter).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Index the chunks with the helper module's embedding engine; the retriever
# returns the 6 most similar chunks per query.
vectorstore = Chroma.from_documents(documents=splits, embedding=test.embedding_engine)
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6},
)

prompt = hub.pull("rlm/rag-prompt-llama")
# NOTE(review): this rebinds the notebook-level name `llm` that the
# model-loading cell assigned earlier — confirm the shadowing is intended.
llm = test.base_llm

def format_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line."""
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)

from operator import itemgetter

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableParallel

# Plain RAG chain: the incoming question is sent to the retriever (whose hits
# are flattened by format_docs) and also passed through unchanged.
retrieval_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
rag_chain = retrieval_inputs | prompt | llm | StrOutputParser()


###########
# Variant with a custom prompt that also reports the retrieved documents' metadata.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use 4 sentences maximum and keep the answer as concise as possible.
{context}
Question: {question}
Helpful Answer:"""
rag_prompt_custom = ChatPromptTemplate.from_template(template)

#Always say "thanks for asking!" at the end of the answer.

# Answers from a mapping that already holds the retrieved "documents" and the "question".
answer_inputs = {
    "context": lambda inputs: format_docs(inputs["documents"]),
    "question": itemgetter("question"),
}
rag_chain_from_docs = answer_inputs | rag_prompt_custom | llm | StrOutputParser()

# Step 1 retrieves documents and forwards the raw input as the question;
# step 2 returns the documents' metadata alongside the generated answer.
retrieve_step = RunnableParallel(
    {"documents": retriever, "question": RunnablePassthrough()}
)
answer_step = {
    "documents": lambda inputs: [doc.metadata for doc in inputs["documents"]],
    "answer": rag_chain_from_docs,
}
rag_chain_with_source = retrieve_step | answer_step


In [None]:
# The chain's first step hands its raw input both to the retriever and to the
# "question" slot (via RunnablePassthrough), so it is invoked with the question
# string itself. The previous `{question: ...}` used an undefined name
# (NameError) and would have passed a dict to the retriever.
# Optional extra instruction: "Always say ```thanks for asking!``` at the end of the answer."
rag_chain_with_source.invoke("What is Task Decomposition?")

In [None]:
# Render the pulled prompt with placeholder values to inspect its final text.
filler_inputs = {"context": "filler context", "question": "filler question"}
rendered_prompt = prompt.invoke(filler_inputs)
print(rendered_prompt.to_string())

In [None]:
# Scratch cell: runs a transformers text-generation pipeline and a direct
# generate() call on the first entry of `prompts`.
# NOTE(review): `prompts`, `device` and `model_fintuned` are not defined in any
# cell visible in this notebook, so this cell raises NameError on a fresh
# Restart & Run All unless they come from elsewhere — confirm or remove.
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer, pipeline
# Rebinds the notebook-level `streamer` and `pipe` created in the model-loading cell.
streamer = TextStreamer(test.tokenizer, skip_prompt=True)
pipe = pipeline(
    "text-generation",
    model=test.base_llm,
    tokenizer=test.tokenizer,
    max_length=2048,
    temperature=0.6,
    pad_token_id=test.tokenizer.eos_token_id,
    top_p=0.95,
    repetition_penalty=1.2,
    device=0,
    streamer=streamer
)
pipe(prompts[0])

# Second variant: tokenize the same prompt and call generate() directly.
inputs = test.tokenizer(prompts[0], return_tensors="pt").to(device)
# NOTE(review): this line uses the notebook-level `tokenizer`, while the lines
# above use `test.tokenizer` — confirm which one matches `model_fintuned`.
streamer = TextStreamer(tokenizer, skip_prompt=True)
# The result is bound to `_`; the streamer is passed so output is streamed.
_ = model_fintuned.generate(**inputs, streamer=streamer, pad_token_id=tokenizer.eos_token_id, max_length=248, temperature=0.8, top_p=0.8,
                        repetition_penalty=1.25)