In [None]:
!pip install --quiet langchain_community langchainhub chromadb langchain langchain_core
!pip install --quiet transformers bitsandbytes accelerate
!pip install --quiet sentence-transformers
!pip install --quiet gradio

In [None]:
# https://huggingface.co/docs/transformers/en/llm_tutorial
# super helpful for understanding tokenizers: https://www.linkedin.com/pulse/demystifying-tokenization-preparing-data-large-models-rany-2nebc#:~:text=tokenizer.,the%20end%20of%20a%20sequence.
# For attention masks: https://www.linkedin.com/pulse/what-attention-mask-dataspeckle#:~:text=An%20attention%20mask%20is%20a%20binary%20mask%20that%20designates%20which,specific%20tokens%20while%20disregarding%20others.



import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline


model_name = "mistralai/Mistral-7B-Instruct-v0.2"
def load_quantized_model(model_name: str):
    """Load a causal language model with 4-bit NF4 quantization.

    Args:
        model_name: Hugging Face Hub repository id (or local path) of the
            checkpoint to load.

    Returns:
        The model object returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    # 4-bit NF4 weights with double quantization; matmuls are computed in bfloat16.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # Quantization is requested through `quantization_config` only. Passing
    # `load_in_4bit=True` here as well is redundant, and recent transformers
    # releases raise a ValueError when both are supplied.
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",  # let accelerate place the weights on the available device(s)
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_config,
    )

# Tokenizer first (small download), then the quantized model weights.
# Left padding keeps the prompt adjacent to the generated continuation.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
model = load_quantized_model(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
from langchain.llms import HuggingFacePipeline

# Use a distinct name for the pipeline object so the imported `pipeline`
# factory is not overwritten; this keeps the cell safe to re-run.
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=8000,  # upper bound on prompt tokens + generated tokens
    do_sample=True,
    top_k=5,  # sample only from the 5 most likely next tokens
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# LangChain wrapper so the pipeline can be composed into a chain.
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

In [None]:
from langchain_community.document_loaders import WebBaseLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

def vdbRetriever(
    docs,
    persist_directory,
    chunk_size=500,
    chunk_overlap=100,
    embedding_model_name="sentence-transformers/all-mpnet-base-v2",
    device=None,
):
  """Split documents into chunks, embed them into Chroma and return a retriever.

  Args:
    docs: Documents to index (as returned by a LangChain document loader).
    persist_directory: Directory in which the Chroma store is persisted.
    chunk_size: Maximum number of characters per chunk.
    chunk_overlap: Number of characters shared between consecutive chunks.
    embedding_model_name: Sentence-transformers model used for the embeddings.
    device: Device for the embedding model. ``None`` selects ``"cuda"`` when
      available and ``"cpu"`` otherwise.

  Returns:
    The retriever produced by ``vectorstore.as_retriever()``.
  """
  # NOTE(review): the store is persisted, so calling this again with the same
  # directory presumably adds the same chunks a second time - confirm.
  if device is None:
    device = "cuda" if torch.cuda.is_available() else "cpu"

  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
  all_splits = text_splitter.split_documents(docs)

  embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs={"device": device})

  vectorstore = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory=persist_directory)

  return vectorstore.as_retriever()

# Article used as the knowledge base for retrieval.
# Alternative source: https://lilianweng.github.io/posts/2023-06-23-agent/
url = "https://www.nytimes.com/2023/08/22/opinion/hip-hop-anniversary-poetry.html?classId=23e2378b-8624-43d7-83fb-5b74bbd30ef5&assignmentId=118b7cf9-7df1-4afe-995b-020684aa0443&submissionId=ab0bde2b-bdea-61fd-c037-50e15366dfb9"
loader = WebBaseLoader(url)
docs = loader.load()
context_retriever = vdbRetriever(docs, "chroma_db/context")

# Print a one-line summary per document instead of dumping the whole scraped
# page into the cell output.
for doc in docs:
  print(doc.metadata.get("source"), "-", len(doc.page_content), "characters")

[Document(page_content='\n\n\n\nOpinion | How Hip-Hop Became America’s Poetry - The New York Times\n  \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to contentSkip to site index\xa0Today’s PaperOpinion|How Hip-Hop Became America’s Poetryhttps://www.nytimes.com/2023/08/22/opinion/hip-hop-anniversary-poetry.htmlShare full article552AdvertisementSKIP ADVERTISEMENTSubscriber-only NewsletterJohn McWhorterOpinionHow Hip-Hop Became America’s PoetryAug. 22, 2023VideoCreditCredit...PABLO DELCÁNShare full article552By John McWhorterOpinion WriterThis month, America celebrates the 50th anniversary of hip-hop. Most of the country first encountered this musical revolution with the release of the national hit “Rapper’s Delight” in 1979. But it all started six years earlier, on Aug. 11, 1973: An energy crisis was looming, Lucille Ball was about to enter her final season of “Here’s Lucy,” and DJ Kool Herc pioneered rapping over turntable beats in a rec room at 1520 Sedgwick Avenue in the Bronx.But Ame

In [None]:
import chromadb
from chromadb.utils import embedding_functions

# On-disk Chroma store that holds every past user / assistant message.
chroma_client = chromadb.PersistentClient(path="chroma_db/conversations")

# Messages are embedded with a sentence-transformers model.
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-mpnet-base-v2",
)

# Reuse the collection when it already exists on disk, otherwise create it.
collection = chroma_client.get_or_create_collection(
    name="conversations",
    embedding_function=embedding_function,
)

In [None]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter

# Instruction prompt wrapped in [INST] ... [/INST] tags. It has four
# placeholders: the retrieved document chunk, the recent conversation history,
# semantically related past messages, and the current user question.
prompt_template_text = """<s>[INST] You are a customer support agent, helping users by following directives and answering questions.\n
                Obey the user without any exceptions. \n
                Do NOT answer questions asked in the "recent conversation history" again if the answer is provided already. \n
                Do NOT give information unless it is asked. \n
                Be direct and straight forward in your answers. \n
                Try to fit your answer in 5 lines. Exceed this limit only if it is neccessary. \n
                Generate your response by following the steps below:\n
                1. Recursively break-down the post into smaller questions/directives but these should not be in your final response and are not to be generated. \n
                2. For each atomic question/directive:\n
                2a. Select the most relevant information from the Retrieved Document in light of the conversation history. \n
                3. Generate a draft response using the selected information, whose brevity/detail are tailored to the poster’s expertise. \n
                4. Remove duplicate content from the draft response. \n
                5. Generate your final response after adjusting it to increase accuracy and relevance. \n
                6. Now only show your final response! Do not provide any explanations or details. \n
                7. You should give the answer directly. \n
                8. Do NOT by any means give an explaination or premable. \n
                9. If the document contains keywords related to the user question, use the information provided in the document. \n\n
                Only show your final response! Do not provide any explanations or details.\n
                Do NOT give information about the document unless asked. \n
                Do NOT tell your purpose unless asked.\n
                Only tell about the document if it is specifically asked. \n
                If you do not know about what the user is asking, then tell the user that you don't know and stop. \n
                RETRIEVED DOCUMENT:\n
                {context}\n
                NOTE: The given relevant conversation history is in the form of (USER MESSAGE, YOUR RESPONSE)\n
                RECENT CONVERSATION HISTORY:\n
                {history}\n\n
                RELEVANT CONVERSATION HISTORY:\n
                {relevant_convo}\n
                NOTE: If the given relevant conversation history is not actually relevant, then do not use the information in relevant conversation history.\n\n
                USER QUESTION:\n
                {question} \n
                Do NOT tell about the document based on the conversation history.\n [/INST]"""

prompt = PromptTemplate(
    input_variables=["question", "context", "history", "relevant_convo"],
    template=prompt_template_text,
)

# Fill the prompt -> generate with the LLM -> return the output as a plain string.
chain = prompt | llm | StrOutputParser()

In [None]:
current_id = 0
history = []

def create_conversation(question: str, chat_history: list):
  """Answer one chat message and record the exchange.

  The prompt is filled with the best-matching chunk of the indexed document,
  the most recent exchanges, and semantically similar past messages from the
  persistent `conversations` collection. The question and the response are
  then stored in that collection.

  Args:
    question: The user's message.
    chat_history: History supplied by the Gradio ChatInterface; unused here
      because the module-level `history` list is used instead.

  Returns:
    The generated response, or None when `question` is 'quit'.
  """
  import uuid

  global current_id, history
  if question == 'quit':
    return

  results = collection.query(
      query_texts=[question],
      n_results=6
  )

  # Keep only the stored messages whose distance to the question is below the cut-off.
  rel_convo = [
      document
      for document, distance in zip(results['documents'][0], results['distances'][0])
      if distance < 1.25
  ]
  if not rel_convo:
    rel_convo.append("No relevant Conversations. Use recent conversation history and context as a guide.")

  context = context_retriever.get_relevant_documents(question)
  # The retriever may return nothing; fall back to an empty context instead of
  # raising IndexError.
  context_text = context[0].page_content if context else ""
  response = chain.invoke({"question": question, "context": context_text, "history": history[-5:], "relevant_convo": rel_convo})

  # Sliding window: keep the five most recent exchanges rather than wiping the
  # whole list every fifth turn.
  history.append((question, response))
  del history[:-5]

  # `current_id` restarts at 0 with every new kernel while the collection is
  # persisted on disk, so counter-only ids would clash with ids stored by an
  # earlier session. A random suffix keeps every id unique.
  turn_id = uuid.uuid4().hex
  collection.add(
      documents=[question, response],
      metadatas=[{'role': 'user'}, {'role': 'assistant'}],
      ids=[f"id{current_id}-{turn_id}", f"id{current_id+1}-{turn_id}"]
  )
  current_id += 2

  return response


In [None]:
import gradio as gr

# Chat window component shown by the interface.
chatbot = gr.Chatbot(label='Chat with a Chatbot')

# Wire the chat window to create_conversation; the retry / undo / clear
# buttons are disabled.
website = gr.ChatInterface(
    fn=create_conversation,
    chatbot=chatbot,
    textbox=gr.Textbox(),
    retry_btn=None,
    undo_btn=None,
    clear_btn=None,
)

# debug=True keeps the cell running and streams server errors to the output.
website.launch(debug=True)

For Streamlit (Not Working Properly):

In [None]:
%%writefile app.py

# https://huggingface.co/docs/transformers/en/llm_tutorial
# super helpful for understanding tokenizers: https://www.linkedin.com/pulse/demystifying-tokenization-preparing-data-large-models-rany-2nebc#:~:text=tokenizer.,the%20end%20of%20a%20sequence.
# For attention masks: https://www.linkedin.com/pulse/what-attention-mask-dataspeckle#:~:text=An%20attention%20mask%20is%20a%20binary%20mask%20that%20designates%20which,specific%20tokens%20while%20disregarding%20others.



import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline


model_name = "mistralai/Mistral-7B-Instruct-v0.2"
def load_quantized_model(model_name: str):
    """Load a causal language model with 4-bit NF4 quantization.

    Args:
        model_name: Hugging Face Hub repository id (or local path) of the
            checkpoint to load.

    Returns:
        The model object returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    # 4-bit NF4 weights with double quantization; matmuls are computed in bfloat16.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # Quantization is requested through `quantization_config` only. Passing
    # `load_in_4bit=True` here as well is redundant, and recent transformers
    # releases raise a ValueError when both are supplied.
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",  # let accelerate place the weights on the available device(s)
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_config,
    )

import streamlit as st


@st.cache_resource
def _load_model_and_tokenizer(name: str):
    """Load the quantized model and its tokenizer once per server process.

    Streamlit re-executes this script from the top on every user interaction;
    ``st.cache_resource`` keeps the loaded objects so the checkpoint is not
    reloaded on each rerun.
    """
    loaded_model = load_quantized_model(name)
    loaded_tokenizer = AutoTokenizer.from_pretrained(name, padding_side="left")
    # Reuse the end-of-sequence token as the padding token.
    loaded_tokenizer.pad_token = loaded_tokenizer.eos_token
    return loaded_model, loaded_tokenizer


model, tokenizer = _load_model_and_tokenizer(model_name)


from langchain.llms import HuggingFacePipeline

# Use a distinct name for the pipeline object so the imported `pipeline`
# factory is not shadowed by its own result.
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=8000,  # upper bound on prompt tokens + generated tokens
    do_sample=True,
    top_k=5,  # sample only from the 5 most likely next tokens
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# LangChain wrapper so the pipeline can be composed into a chain.
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)


from langchain_community.document_loaders import WebBaseLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

def vdbRetriever(
    docs,
    persist_directory,
    chunk_size=500,
    chunk_overlap=100,
    embedding_model_name="sentence-transformers/all-mpnet-base-v2",
    device=None,
):
  """Split documents into chunks, embed them into Chroma and return a retriever.

  Args:
    docs: Documents to index (as returned by a LangChain document loader).
    persist_directory: Directory in which the Chroma store is persisted.
    chunk_size: Maximum number of characters per chunk.
    chunk_overlap: Number of characters shared between consecutive chunks.
    embedding_model_name: Sentence-transformers model used for the embeddings.
    device: Device for the embedding model. ``None`` selects ``"cuda"`` when
      available and ``"cpu"`` otherwise.

  Returns:
    The retriever produced by ``vectorstore.as_retriever()``.
  """
  # NOTE(review): the store is persisted, so calling this again with the same
  # directory presumably adds the same chunks a second time - confirm.
  if device is None:
    device = "cuda" if torch.cuda.is_available() else "cpu"

  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
  all_splits = text_splitter.split_documents(docs)

  embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs={"device": device})

  vectorstore = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory=persist_directory)

  return vectorstore.as_retriever()

import streamlit as st

# Article used as the knowledge base for retrieval.
# Alternative source: https://lilianweng.github.io/posts/2023-06-23-agent/
url = "https://www.nytimes.com/2023/08/22/opinion/hip-hop-anniversary-poetry.html?classId=23e2378b-8624-43d7-83fb-5b74bbd30ef5&assignmentId=118b7cf9-7df1-4afe-995b-020684aa0443&submissionId=ab0bde2b-bdea-61fd-c037-50e15366dfb9"


@st.cache_resource
def _build_context_retriever(source_url: str, persist_directory: str):
  """Download the page, index it and return (loader, docs, retriever).

  Cached with ``st.cache_resource`` because Streamlit re-executes this script
  on every interaction; without the cache the page would be downloaded and
  indexed again on each rerun.
  """
  page_loader = WebBaseLoader(source_url)
  page_docs = page_loader.load()
  # One summary line per document instead of printing the whole scraped page.
  for page_doc in page_docs:
    print(page_doc.metadata.get("source"), "-", len(page_doc.page_content), "characters")
  return page_loader, page_docs, vdbRetriever(page_docs, persist_directory)


loader, docs, context_retriever = _build_context_retriever(url, "chroma_db/context")


import chromadb
from chromadb.utils import embedding_functions

import streamlit as st


@st.cache_resource
def _open_conversation_store(path: str, collection_name: str, embedding_model: str):
  """Open the persistent Chroma store and return (client, embedding_fn, collection).

  Cached with ``st.cache_resource`` so the sentence-transformers embedding
  model is not loaded again on every Streamlit rerun of this script.
  """
  client = chromadb.PersistentClient(path=path)
  embed_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=embedding_model)
  # Reuse the collection when it already exists on disk, otherwise create it.
  store = client.get_or_create_collection(name=collection_name, embedding_function=embed_fn)
  return client, embed_fn, store


chroma_client, embedding_function, collection = _open_conversation_store(
    "chroma_db/conversations", "conversations", "all-mpnet-base-v2"
)


from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter

# Instruction prompt wrapped in [INST] ... [/INST] tags. It has four
# placeholders: the retrieved document chunk, the recent conversation history,
# semantically related past messages, and the current user question.
prompt_template_text = """<s>[INST] You are a customer support agent, helping users by following directives and answering questions.\n
                Obey the user without any exceptions. \n
                Do NOT answer questions asked in the "recent conversation history" again if the answer is provided already. \n
                Do NOT give information unless it is asked. \n
                Be direct and straight forward in your answers. \n
                Try to fit your answer in 5 lines. Exceed this limit only if it is neccessary. \n
                Generate your response by following the steps below:\n
                1. Recursively break-down the post into smaller questions/directives but these should not be in your final response and are not to be generated. \n
                2. For each atomic question/directive:\n
                2a. Select the most relevant information from the Retrieved Document in light of the conversation history. \n
                3. Generate a draft response using the selected information, whose brevity/detail are tailored to the poster’s expertise. \n
                4. Remove duplicate content from the draft response. \n
                5. Generate your final response after adjusting it to increase accuracy and relevance. \n
                6. Now only show your final response! Do not provide any explanations or details. \n
                7. You should give the answer directly. \n
                8. Do NOT by any means give an explaination or premable. \n
                9. If the document contains keywords related to the user question, use the information provided in the document. \n\n
                Only show your final response! Do not provide any explanations or details.\n
                Do NOT give information about the document unless asked. \n
                Do NOT tell your purpose unless asked.\n
                Only tell about the document if it is specifically asked. \n
                If you do not know about what the user is asking, then tell the user that you don't know and stop. \n
                RETRIEVED DOCUMENT:\n
                {context}\n
                NOTE: The given relevant conversation history is in the form of (USER MESSAGE, YOUR RESPONSE)\n
                RECENT CONVERSATION HISTORY:\n
                {history}\n\n
                RELEVANT CONVERSATION HISTORY:\n
                {relevant_convo}\n
                NOTE: If the given relevant conversation history is not actually relevant, then do not use the information in relevant conversation history.\n\n
                USER QUESTION:\n
                {question} [/INST]"""

prompt = PromptTemplate(
    input_variables=["question", "context", "history", "relevant_convo"],
    template=prompt_template_text,
)

# Fill the prompt -> generate with the LLM -> return the output as a plain string.
chain = prompt | llm | StrOutputParser()


def create_conversation(question: str, history: list, current_id: int):
  """Answer one chat message and record the exchange.

  The prompt is filled with the best-matching chunk of the indexed document,
  the most recent exchanges, and semantically similar past messages from the
  persistent `conversations` collection. The question and the response are
  then stored in that collection.

  Args:
    question: The user's message.
    history: Recent (question, response) pairs; not modified in place.
    current_id: Running counter of messages stored so far.

  Returns:
    A tuple ``(response, history, current_id)`` holding the generated
    response, the updated history (at most five exchanges) and the advanced
    counter. For the message 'quit' the response is an empty string and the
    other two values are returned unchanged.
  """
  import uuid

  if question == 'quit':
    # Callers unpack three values, so keep the tuple shape here as well.
    return "", history, current_id

  results = collection.query(
      query_texts=[question],
      n_results=6
  )

  # Keep only the stored messages whose distance to the question is below the cut-off.
  rel_convo = [
      document
      for document, distance in zip(results['documents'][0], results['distances'][0])
      if distance < 1.25
  ]
  if not rel_convo:
    rel_convo.append("No relevant Conversations. Use recent conversation history and context as a guide.")

  context = context_retriever.get_relevant_documents(question)
  # The retriever may return nothing; fall back to an empty context instead of
  # raising IndexError.
  context_text = context[0].page_content if context else ""
  response = chain.invoke({"question": question, "context": context_text, "history": history[-5:], "relevant_convo": rel_convo})

  # Sliding window over the five most recent exchanges, built as a new list so
  # the caller's list is never mutated.
  history = (history + [(question, response)])[-5:]

  # The collection is persisted on disk while the counter can restart at 0, so
  # counter-only ids would clash with ids stored earlier. A random suffix
  # keeps every id unique.
  turn_id = uuid.uuid4().hex
  collection.add(
      documents=[question, response],
      metadatas=[{'role': 'user'}, {'role': 'assistant'}],
      ids=[f"id{current_id}-{turn_id}", f"id{current_id+1}-{turn_id}"]
  )

  return response, history, current_id + 2



import streamlit as st

st.title("Echo Bot")

# Streamlit re-runs this script from the top on every interaction, so anything
# that must survive between messages is kept in st.session_state rather than
# in plain module-level variables.
if "messages" not in st.session_state:
    st.session_state.messages = []
if "history" not in st.session_state:
    st.session_state.history = []
if "current_id" not in st.session_state:
    # Continue numbering after whatever is already stored in the persistent collection.
    st.session_state.current_id = collection.count()

history = st.session_state.history
current_id = st.session_state.current_id

# Display chat messages from history on app rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# React to user input. A separate name is used for the input so the
# module-level `prompt` template is not rebound.
if user_message := st.chat_input("What is up?"):
    # Display user message in chat message container
    st.chat_message("user").markdown(user_message)
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": user_message})

    response, history, current_id = create_conversation(user_message, history, current_id)
    # Persist the updated conversation state for the next rerun.
    st.session_state.history = history
    st.session_state.current_id = current_id

    # Display assistant response in chat message container
    with st.chat_message("assistant"):
        st.markdown(response)
    # Add assistant response to chat history
    st.session_state.messages.append({"role": "assistant", "content": response})


Writing app.py


In [None]:
!pip install --quiet streamlit
!pip install --quiet pyngrok
from pyngrok import ngrok
!ngrok config add-authtoken 1flODqmi50WmfbgDuJ20LvQpC6z_5KBf3uiZdBxH84vauwTks
!nohup streamlit run app.py --server.port 80 &
public_url = ngrok.connect()
public_url

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.0/83.0 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
nohup: appending output to 'nohup.out'


<NgrokTunnel: "https://e2b0-34-136-142-31.ngrok-free.app" -> "http://localhost:80">