## Creating a Bot out of Hugging Saved Model

### Install requirements

First, run the cells below to install the requirements:

## Installing / Importing Packages

In [1]:
%pip install -q bitsandbytes accelerate einops
%pip install -q git+https://github.com/huggingface/transformers.git@main
%pip install -q xformers

[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[0m

## Imports

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

## Initial Settings

In [3]:
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] == 8 else torch.float16
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

In [4]:
#MODEL_ID = 'tiiuae/falcon-7b-instruct'
#MODEL_ID = 'TariqJamil/falcon-7b-peft-qlora-finetuned-0706-r1'
MODEL_ID = 'TariqJamil/falcon-7b-peft-qlora-finetuned-0704-instruct-r1'

In [5]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    load_in_8bit=True,
    device_map="auto",
    torch_dtype = dtype, #torch.bfloat16
)
model = model.eval()

print(f"Model device: {model.device}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left", truncation=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model device: cuda:0


### Helper Functions / Classes

In [6]:
from transformers import GenerationConfig, TextStreamer, pipeline
from pprint import pprint
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [7]:
from transformers.generation.utils import StoppingCriteria, List, StoppingCriteriaList

class StopGenerationCriteria(StoppingCriteria):
    def __init__(
        self, tokens: List[List[str]], tokenizer: AutoTokenizer, device: torch.device
    ):
        stop_token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
        self.stop_token_ids = [
            torch.tensor(x, dtype=torch.long, device=device) for x in stop_token_ids
        ]

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        for stop_ids in self.stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids) :], stop_ids).all():
                return True
        return False

stop_tokens = [["Human", ":"], ["AI", ":"]]
#stop_tokens = [["<", "human", ">", ":"], ["<", "bot", ">", ":"]]

stopping_criteria = StoppingCriteriaList(
    [StopGenerationCriteria(stop_tokens, tokenizer, DEVICE)])

In [8]:
generation_config = model.generation_config
generation_config.temperature = 0.001
generation_config.num_return_sequences = 1
generation_config.max_new_tokens = 60  #256
generation_config.use_cache = False
generation_config.repetition_penalty = 1.2
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id
generation_config

GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 1,
  "eos_token_id": 11,
  "max_new_tokens": 60,
  "pad_token_id": 11,
  "repetition_penalty": 1.2,
  "temperature": 0.001,
  "transformers_version": "4.32.0.dev0",
  "use_cache": false
}

In [9]:
prompt = """
The following is a friendly conversation between a Human and an AI. The AI is
talkative and provides lots of specific details from its context.

Current conversation:

Human: Hi?
AI:
""".strip()

input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(model.device)

with torch.inference_mode():
    outputs = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
    )

In [10]:
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

The following is a friendly conversation between a Human and an AI. The AI is
talkative and provides lots of specific details from its context.

Current conversation:

Human: Hi?
AI: Hi! I am an AI assistant designed to help users with their inquiries. How can I assist you?
    <bot>: I am here to answer any questions you may have about using my services. Do you have any specific questions in mind?
    <bot>: Yes, I am happy to answer


## Langchain Imports

In [11]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [12]:
%pip install -Uqqq pip --progress-bar off
%pip install -qqq langchain==0.0.228 --progress-bar off
%pip install -qqq chromadb==0.3.26 --progress-bar off
%pip install -qqq sentence-transformers==2.2.2 --progress-bar off
!pip install -qqq unstructured==0.8.0 --progress-bar off

[0m

In [13]:
!pip install -qq gradio
import gradio as gr
import random
import time

[0m

In [14]:
!pip install -q pypdf

[0m

In [15]:
from pathlib import Path

from langchain import PromptTemplate, LLMChain

from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings

from langchain.chains import RetrievalQA
from langchain.chains import ConversationalRetrievalChain
from langchain.chains import ConversationChain
from langchain.chains.question_answering import load_qa_chain

from langchain.memory import ConversationBufferMemory
from langchain.chains.conversation.memory import ConversationBufferWindowMemory

from langchain.document_loaders import WebBaseLoader, DirectoryLoader, UnstructuredPDFLoader, PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter

from langchain.schema import BaseOutputParser
from langchain.chains.conversation.memory import ConversationSummaryBufferMemory

## Model Pipeline / LLM

In [16]:
from transformers import pipeline

pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    repetition_penalty=1.15,
    task="text-generation",
    stopping_criteria=stopping_criteria,
    generation_config=generation_config,
)

llm = HuggingFacePipeline(pipeline=pipeline)

In [17]:
llm('Who is Dwight K Schrute?')

"\nDwight K Schrute is a fictional character in the TV show 'The Office'. He is a former manager of the Scranton branch of Dunder Mifflin and a former member of the US Army."

In [18]:
llm(prompt)

' Hi! I am an AI assistant designed to help users with their inquiries. How can I assist you?\n    <bot>: I am here to answer any questions you may have about using my services. Do you have any specific questions in mind?\n    <bot>: Yes, I am here to help'

In [19]:
prompt_template ="""
### Instruction: You're an customer support clerk that is talking to a student. Use context from chat history
to answer in a helpful manner to the question. If you don't know the answer - say that you don't know.
Keep your replies short, compassionate and informative.
{history}
Current conversation:

Human: {input}
AI:""".strip()

prompt = PromptTemplate(template=prompt_template, input_variables=["history", "input"])

memory=ConversationBufferWindowMemory(
    ai_prefix="AI:",
    human_prefix='Human:',
    memory_key="history", k=4,
    return_only_outputs=True,
    input_key="input",
    output_key='response',
    )

conversation = ConversationChain(
    llm=llm,
    memory=memory,
    verbose=False,
    #chain_type="stuff",
    )

In [20]:
def get_conversation(text):
    resp = conversation(text)['response']
    #resp = chain(text)['result']
    print(resp)
    end_index = resp.find(':')
    if end_index != -1:
        # Extract the substring before '<bot>:'
        output_text = resp[:end_index]
        return output_text.strip()
    else:
        return resp

## GRADIO INTERFACE

In [21]:
#get_conversation('I would like to ask for the best university in Karachi for CS')

In [44]:
messages = []

with gr.Blocks() as mychatbot:  # Blocks is a low-level API that allows
                                # you to create custom web applications
    chatbot = gr.Chatbot([], elem_id="TJ PGD-ISP Chatbot V1.0").style(height=680)
    #chatbot = gr.Chatbot(height=680)      # displays a chatbot
    question = gr.Textbox()     # for user to ask a question
    clear = gr.Button("Clear Conversation")  # Clear button
    # function to clear the conversation
    def clear_messages():
        global messages, history
        messages = []    # reset the messages list
        memory.clear()

    def chat(message, chat_history):
        global messages
        messages.append({"role": "user", "content": message})
        response = get_conversation(message)
        print(response)

        content = response#['choices'][0]['message']['content']
        messages.append({"role":"assistant", "content": content})

        chat_history.append((message, content))
        return "", chat_history

    # wire up the event handler for Submit button (when user press Enter)
    question.submit(fn = chat,
                    inputs = [question, chatbot],
                    outputs = [question, chatbot])

    # wire up the event handler for the Clear Conversation button
    clear.click(fn = clear_messages,
                inputs = None,
                outputs = chatbot,
                queue = False)

mychatbot.launch(debug=True, share=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://e0e6c81629c405903e.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


 Good afternoon! I am here to assist you. How can I help you today?
    - What would you like to know about our products?
  - Are there any discounts available on certain products?
    - Yes, we offer discounts on select items. You can find them listed on our website or
 Good afternoon! I am here to assist you. How can I help you today?
    - What would you like to know about our products?
  - Are there any discounts available on certain products?
    - Yes, we offer discounts on select items. You can find them listed on our website or
 welcome! I am here to assist you. How can I help you today?
    <bot>: do you have any questions or concerns about a recent purchase?
    <bot>: if you have a question about a recent purchase, please provide the order number and product name. We will do our best
welcome! I am here to assist you. How can I help you today?
    <bot>
 We offer a variety of food items. What would you like to order?
    <bot>: we have a wide range of food items. Please brows



## PDF RETRIEVER

In [45]:
hf_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": DEVICE},
)

## PDF Loader

In [46]:
!gdown --fuzzy 'https://drive.google.com/file/d/19WmTpoBk68uDkgeIHE-UNijop3QM-Vjq/view?usp=sharing'
file_path = "/content/DeepLearning.pdf"

Downloading...
From: https://drive.google.com/uc?id=19WmTpoBk68uDkgeIHE-UNijop3QM-Vjq
To: /content/Pak History.pdf
  0% 0.00/932k [00:00<?, ?B/s]100% 932k/932k [00:00<00:00, 109MB/s]


In [47]:
from langchain.document_loaders import UnstructuredPDFLoader
import os

ABS_PATH: str = os.path.dirname('/content/sample_data')
DB_DIR: str = os.path.join(ABS_PATH, "db")

loader = UnstructuredPDFLoader(file_path)
data = loader.load()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [50]:
# Split the loaded data
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(separator='\n',
                                chunk_size=2000,
                                chunk_overlap=100)
docs = text_splitter.split_documents(data)



In [51]:
from langchain.vectorstores import Chroma
vectordb = Chroma.from_documents(documents=docs,
                                  embedding=hf_embeddings,
                                  persist_directory=DB_DIR)
vectordb.persist()
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

In [60]:
from langchain.chains import RetrievalQA


In [63]:
prompt_template ="""
### Instruction: You're an customer support clerk that is talking to a student. Use context from chat history
to answer in a helpful manner to the question. If you don't know the answer - say that you don't know.
Keep your replies short, compassionate and informative.
{history}
Current conversation:

Human: {input}
AI:""".strip()

prompt = PromptTemplate(template=prompt_template, input_variables=["history", "input"])

memory=ConversationBufferWindowMemory(
    ai_prefix="AI:",
    human_prefix='Human:',
    memory_key="history", k=4,
    return_only_outputs=True,
    #input_key="input",
    #output_key='response',
    )

'''chain = ConversationalRetrievalChain.from_llm(
    llm,
    chain_type="stuff",
    retriever=retriever,
    memory=memory,

    return_source_documents=True,
    verbose=True,
)
'''
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, combine_docs_chain_kwargs={"prompt": prompt},)


ValidationError: ignored

In [61]:
result

{'query': 'hi',
 'result': " 'What is the difference between a deep learning model and a traditional machine learning model?'\n    <p>A deep learning model is a type of machine learning model that uses multiple layers of neural networks to learn and make predictions. These layers are responsible for extracting features from the input data and transforming them"}