## Creating a Bot from a Saved Hugging Face Model

### Install requirements

First, run the cells below to install the requirements:

## Installing / Importing Packages

In [1]:
# Runtime dependencies for loading and running the model.
# NOTE(review): none of these installs is version-pinned, and transformers is
# built from the `main` branch, so a fresh run may pull different code than the
# run that produced the saved outputs.
%pip install -q bitsandbytes accelerate einops
%pip install -q git+https://github.com/huggingface/transformers.git@main
%pip install -q xformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━

## Imports

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

## Initial Settings

In [3]:
# Pick the device first: every CUDA query below is only valid when a GPU exists.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# Use bfloat16 only on GPUs whose compute-capability major version is 8 or
# newer; otherwise fall back to float16. The capability query is guarded by
# `is_available()` because it raises on a CPU-only runtime, which previously
# made this cell fail before the CPU fallback for DEVICE could take effect.
dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
    else torch.float16
)

In [4]:
# Repository id passed to `from_pretrained` for both the model and the tokenizer.
# The commented-out ids are alternatives kept for quick switching.
#MODEL_ID = 'tiiuae/falcon-7b-instruct'
#MODEL_ID = 'TariqJamil/falcon-7b-peft-qlora-finetuned-0706-r1'
MODEL_ID = 'TariqJamil/falcon-7b-peft-qlora-finetuned-0704-instruct-r1'

In [None]:
# Load the causal language model identified by MODEL_ID.
# NOTE(review): `trust_remote_code=True` pulls model code from the Hub; the
# download log printed by this cell warns to double-check those files and
# suggests pinning a revision.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    load_in_8bit=True,  # request 8-bit loading (bitsandbytes is installed in the first cell)
    device_map="auto",
    torch_dtype = dtype, #torch.bfloat16
)
# Switch to evaluation mode; the notebook only runs inference.
model = model.eval()

print(f"Model device: {model.device}")
# Tokenizer from the same repository, configured to pad on the left.
# NOTE(review): `truncation=True` is passed at load time here — confirm it has
# the intended effect, since no truncation argument is given when tokenizing.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left", truncation=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/773 [00:00<?, ?B/s]

Downloading (…)/configuration_RW.py:   0%|          | 0.00/2.61k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b-instruct:
- configuration_RW.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)main/modelling_RW.py:   0%|          | 0.00/47.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b-instruct:
- modelling_RW.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

### Helper Functions / Classes

In [None]:
from transformers import GenerationConfig, TextStreamer, pipeline
from pprint import pprint
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
from transformers.generation.utils import StoppingCriteria, List, StoppingCriteriaList

class StopGenerationCriteria(StoppingCriteria):
    """Stop generation once the sequence ends with one of the stop-token sequences.

    Args:
        tokens: Stop sequences, each a list of token strings; every list is
            converted to ids with ``tokenizer.convert_tokens_to_ids``.
        tokenizer: Tokenizer used for that conversion.
        device: Device on which the stop-id tensors are created.
    """

    def __init__(
        self, tokens: List[List[str]], tokenizer: AutoTokenizer, device: torch.device
    ):
        stop_token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
        self.stop_token_ids = [
            torch.tensor(x, dtype=torch.long, device=device) for x in stop_token_ids
        ]

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        """Return True when the first sequence of the batch ends with a stop sequence.

        Only ``input_ids[0]`` is inspected; ``scores`` and ``kwargs`` are unused.
        """
        sequence = input_ids[0]
        for stop_ids in self.stop_token_ids:
            stop_len = len(stop_ids)
            # An empty stop sequence, or one longer than what has been produced
            # so far, can never match. Skipping it also avoids comparing tensors
            # of different lengths, which would either broadcast or raise.
            if stop_len == 0 or sequence.shape[0] < stop_len:
                continue
            tail = sequence[-stop_len:]
            # Compare on the device the generated ids live on, which may differ
            # from the device given to the constructor.
            if torch.equal(tail, stop_ids.to(tail.device)):
                return True
        return False

# Each inner list is one stop sequence, given as token strings. Generation is
# halted when the generated ids end with the ids of any of these sequences,
# i.e. when the model starts writing the next speaker label.
stop_tokens = [["Human", ":"], ["AI", ":"]]
# NOTE(review): the stop-id tensors are created on DEVICE, while the model is
# placed with device_map="auto" — confirm both refer to the same device.
stopping_criteria = StoppingCriteriaList(
    [StopGenerationCriteria(stop_tokens, tokenizer, DEVICE)])

In [None]:
# `generation_config` is the model's own config object (not a copy), so every
# assignment below also changes `model.generation_config`.
generation_config = model.generation_config
# NOTE(review): near-zero temperature, presumably to make the output close to
# deterministic; `do_sample` is not set in this cell — confirm the temperature
# is actually applied.
generation_config.temperature = 0.00001
generation_config.num_return_sequences = 1
generation_config.max_new_tokens = 150  #256
# NOTE(review): caching is switched off here; the reason is not visible in the
# notebook — confirm it is intentional.
generation_config.use_cache = False
generation_config.repetition_penalty = 1.2
# The end-of-sequence token id is used for both padding and termination.
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id
# Bare expression so the notebook displays the resulting configuration.
generation_config

In [None]:
# Minimal conversation prompt used to smoke-test raw generation.
prompt = """
The following is a friendly conversation between a human and an AI. The AI is
talkative and provides lots of specific details from its context.

Current conversation:

H: Hi?
AI:
""".strip()

# Tokenize the prompt into PyTorch tensors and move the ids to the model's device.
encoded_prompt = tokenizer(prompt, return_tensors="pt")
input_ids = encoded_prompt.input_ids.to(model.device)

# Generate with the shared generation settings inside torch's inference mode.
with torch.inference_mode():
    outputs = model.generate(input_ids=input_ids, generation_config=generation_config)

In [None]:
# Decode the first (and only requested) generated sequence back to text,
# dropping special tokens, and print it.
first_sequence = outputs[0]
response = tokenizer.decode(first_sequence, skip_special_tokens=True)
print(response)

## Langchain Imports

In [None]:
import locale
def getpreferredencoding(do_setlocale=True):
    """Report "UTF-8" as the preferred encoding; ``do_setlocale`` is ignored."""
    forced_encoding = "UTF-8"
    return forced_encoding


# Replace the standard-library lookup with the fixed-answer version above.
# Presumably a workaround so the install cells that follow do not fail on a
# non-UTF-8 locale — TODO confirm.
locale.getpreferredencoding = getpreferredencoding

In [None]:
%pip install -Uqqq pip --progress-bar off
%pip install -qqq langchain==0.0.228 --progress-bar off
%pip install -qqq chromadb==0.3.26 --progress-bar off
%pip install -qqq sentence-transformers==2.2.2 --progress-bar off
!pip install -qqq unstructured==0.8.0 --progress-bar off

In [None]:
from pathlib import Path

from langchain import PromptTemplate, LLMChain

from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings

from langchain.chains import RetrievalQA
from langchain.chains import ConversationalRetrievalChain
from langchain.chains import ConversationChain
from langchain.chains.question_answering import load_qa_chain

from langchain.memory import ConversationBufferMemory
from langchain.chains.conversation.memory import ConversationBufferWindowMemory

from langchain.document_loaders import WebBaseLoader, DirectoryLoader
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter

from langchain.schema import BaseOutputParser
from langchain.chains.conversation.memory import ConversationSummaryBufferMemory

## Model Pipeline / LLM

In [None]:
from langchain import HuggingFacePipeline
from transformers import pipeline

# Build the text-generation pipeline around the already loaded model and
# tokenizer. The instance gets its own name so that the imported `pipeline`
# factory function is not overwritten by its own result.
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    task="text-generation",
    stopping_criteria=stopping_criteria,
    generation_config=generation_config,
)

# LangChain wrapper used as the `llm` by the chains defined later.
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

In [None]:
# Smoke test: send a free-form question straight to the wrapped model.
llm('Who is Dwight K Schrute?')

In [None]:
# Run the conversation prompt string defined earlier through the wrapped model.
# NOTE(review): `prompt` is later rebound to a PromptTemplate, so this cell
# depends on being executed before that cell.
llm(prompt)

In [None]:
!pip install -qq gradio
import gradio as gr
import random
import time

### Embed Documents

In [None]:
# Sentence-embedding model used to index and query the documents; it runs on
# the same device string as the rest of the notebook.
embedding_model_kwargs = {"device": DEVICE}
hf_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs=embedding_model_kwargs,
)

Reference notebook: https://github.com/curiousily/Get-Things-Done-with-Prompt-Engineering-and-LangChain/blob/master/11.chatbot-with-local-llm-falcon-7b.ipynb

In [None]:
# Download a shared Google Drive file with gdown.
!gdown --fuzzy 'https://drive.google.com/file/d/19WmTpoBk68uDkgeIHE-UNijop3QM-Vjq/view?usp=sharing'
# NOTE(review): `file_path` is not referenced by any later cell visible in this
# notebook; the documents indexed below come from a web page instead.
file_path = "/content/Pak History.pdf"

In [None]:
from langchain.document_loaders import WebBaseLoader, UnstructuredPDFLoader
from langchain.chains import RetrievalQA
import os

# Directory in which the Chroma index is persisted.
# NOTE(review): os.path.dirname('/content/sample_data') evaluates to
# '/content', so DB_DIR is '/content/db1' — confirm that is the intended place.
ABS_PATH: str = os.path.dirname('/content/sample_data')
DB_DIR: str = os.path.join(ABS_PATH, "db1")

# Load data from the specified URL
url = 'https://www.daraz.pk/'
#url = 'https://www.bbc.com/weather'
#url = 'https://www.dawn.com/'
data = WebBaseLoader(url).load()

# Split the loaded data
# Chunks are cut on newlines, with chunk_size=2000 and chunk_overlap=200.
text_splitter = CharacterTextSplitter(separator='\n', chunk_size=2000, chunk_overlap=200)
docs = text_splitter.split_documents(data)

# Embed the chunks with `hf_embeddings` and store them in a Chroma index under DB_DIR.
vectordb = Chroma.from_documents(documents=docs, embedding=hf_embeddings, persist_directory=DB_DIR)
vectordb.persist()

# Retriever over the index, configured with k=2 for each search.
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [None]:
history=''
prompt_template ="""
The following is a friendly conversation between a human and an AI. The AI is
talkative and provides lots of specific details from its context. If the AI does not
know the answer to a question, it truthfully says it does not know.

Current conversation:
{history}

Human: {input}
AI:""".strip()

prompt = PromptTemplate(template=prompt_template, input_variables=["history", "input"])

# Windowed conversation memory, stored under the "history" key with k=3.
# It is created BEFORE the chain because the chain constructor below receives
# it as an argument; building the chain first raised NameError on a fresh
# kernel.
# (Alternatives tried earlier and dropped from this cell: ConversationBufferMemory,
# ConversationSummaryBufferMemory, and a plain ConversationChain.)
# NOTE(review): ai_prefix is "AI:" including the colon — confirm the history is
# not rendered with a doubled colon.
memory = ConversationBufferWindowMemory(
    memory_key='history',
    k=3,
    ai_prefix="AI:",
)

# Retrieval-augmented QA chain over the indexed documents, using the "stuff"
# chain type. The custom `prompt` template is deliberately not passed here, so
# the chain uses its default prompt.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    memory=memory,
    retriever=retriever,
)
def get_conversation(text):
    """Ask the retrieval chain `text` and return the answer as a string.

    Args:
        text: The user's question.

    Returns:
        The chain's answer. If the answer contains the marker '<bot>', only the
        part before the marker is returned, with surrounding whitespace removed;
        otherwise the answer is returned unchanged.
    """
    resp = chain(text)
    # Calling the chain yields a mapping of outputs rather than a string, and a
    # mapping cannot be sliced. A RetrievalQA chain stores its answer under the
    # "result" key.
    answer = resp["result"] if isinstance(resp, dict) else str(resp)
    start_index = answer.find('<bot>')
    if start_index != -1:
        # Extract the substring before '<bot>'
        return answer[:start_index].strip()
    return answer

In [None]:
# Smoke test: query the retrieval chain directly. As the last expression of the
# cell, the chain's return value is displayed as the cell output.
#get_conversation('Top world news at the moment')
chain('what is price of Eggs in Karachi, today')

In [None]:
# Module-level log of the exchange as {"role", "content"} dicts. It is appended
# to by `chat` and reset by `clear_messages`; nothing else in this cell reads it.
messages = []

with gr.Blocks() as mychatbot:  # Blocks is a low-level API that allows
                                # you to create custom web applications
    chatbot = gr.Chatbot(height=750)      # displays a chatbot
    question = gr.Textbox()     # for user to ask a question
    clear = gr.Button("Clear Conversation")  # Clear button

    # function to clear the conversation
    def clear_messages():
        """Reset the module-level `messages` list.

        Returns None implicitly; that value is what the click handler below
        sends to the `chatbot` output. The LangChain `memory` object is not
        touched here.
        """
        global messages
        messages = []    # reset the messages list

    def chat(message, chat_history):
        """Answer one user message and extend the visible chat history.

        Args:
            message: Text submitted in the `question` textbox.
            chat_history: Current value of the `chatbot` component; a
                (message, answer) tuple is appended to it in place.

        Returns:
            A pair ("", chat_history): the empty string is sent to the textbox
            output and the updated history to the chatbot output.
        """
        global messages
        messages.append({"role": "user", "content": message})
        response = get_conversation(message)
        content = response#['choices'][0]['message']['content']
        messages.append({"role":"assistant", "content": content})

        chat_history.append((message, content))
        return "", chat_history

    # wire up the event handler for Submit button (when user press Enter)
    question.submit(fn = chat,
                    inputs = [question, chatbot],
                    outputs = [question, chatbot])

    # wire up the event handler for the Clear Conversation button
    clear.click(fn = clear_messages,
                inputs = None,
                outputs = chatbot,
                queue = False)

# NOTE(review): share=True requests a shareable link for the app — confirm that
# exposing the demo this way is intended.
mychatbot.launch(debug=True, share=True)

## Support Chatbot

In [None]:
def process_output(text):
    """Return the part of `text` that precedes the first '<bot>' marker.

    When the marker is present, the leading part is returned with surrounding
    whitespace stripped. When it is absent, `text` is returned unchanged.
    """
    head, marker, _tail = text.partition('<bot>')
    if marker:
        return head.strip()
    return text

# Console chat loop: read a question, run it through the retrieval chain and
# pretty-print the cleaned answer. Typing "bye" or "goodbye" ends the loop.
while True:
    user_input = input("You: ")
    # Ignore surrounding whitespace so that e.g. "bye " still ends the loop.
    if user_input.strip().lower() in ["bye", "goodbye"]:
        break
    answer = chain(user_input)
    # `chain` is a RetrievalQA chain: its answer is stored under "result"
    # ("response" is not one of its output keys and raised KeyError).
    pprint(process_output(answer['result']))
    #print('>>',answer)
    print()

In [None]:
!pip install -qq gradio

In [None]:
# NOTE(review): `chat_history` is not read by any code visible in this cell.
chat_history = ''


def process_output(text):
    """Return everything in `text` before the first '<bot>' marker.

    Unlike the earlier definition of the same name, the result is not stripped.
    Without a marker, `text` is returned unchanged.
    """
    before_marker, _marker, _after = text.partition('<bot>')
    return before_marker

# Second console loop: prints both the cleaned and the raw answer. Typing "bye"
# or "goodbye" ends the loop.
while True:
    user_input = input("You: ")
    if user_input.lower() in ["bye", "goodbye"]:
        break
    # NOTE(review): in this notebook `chatbot` is bound to the gr.Chatbot
    # component created in the Gradio cell, not to a chain or function that
    # answers questions — confirm which callable was meant here.
    answer = chatbot(user_input)
    pprint(process_output(answer))
    print('>>', answer)
    print()

In [None]:
# NOTE(review): neither `print_response` nor `result` is defined in any cell
# visible in this notebook; on a fresh kernel this line raises NameError unless
# they are defined elsewhere — confirm or remove.
print_response(result["response"])