## **1. Packages & Liberaries**
### *1a. Import of Packages*

In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U einops
!pip install -q langchain, gradio

[31mERROR: Invalid requirement: 'langchain,'[0m[31m
[0m

### *1b. Import of Packages*

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

## **2. Model Definitions**
### *2a. Model Selection & some initializations*

In [3]:
base_model_id = 'HuggingFaceH4/zephyr-7b-beta'

In [4]:
compute_dtype = getattr(torch, "float16")
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] == 8 else compute_dtype
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

print(dtype, DEVICE)

torch.float16 cuda:0


### *2b. Model Configuration Settings*

### *2b. Model & tokenizer Instantiation*

1.   I didn't use bitsandbytes, / bnb_configuration as model size is already too less.

In [5]:
model = AutoModelForCausalLM.from_pretrained(
          base_model_id,
          trust_remote_code=True,
          load_in_8bit=True,
          torch_dtype = dtype,
          device_map="auto"
          )

tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=True, padding_side='left')

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

### *2c. Setting up Text Generation Config*

In [6]:
from transformers import GenerationConfig, TextStreamer

text_generation_config = GenerationConfig(
    temperature = 0.1,
    max_new_tokens = 128,
    repetition_penalty = 1.7,
    num_return_sequences = 1,
    do_sample = True,
    pad_token_id = tokenizer.eos_token_id,
    eos_token_id = tokenizer.eos_token_id,
)

streamer = TextStreamer(
    tokenizer, skip_prompt=True, skip_special_tokens=True, use_multiprocessing=False)

In [7]:
from transformers.generation.utils import StoppingCriteria, StoppingCriteriaList, List

class StopGenerationCriteria(StoppingCriteria):
    def __init__(self, tokens: List[List[str]], tokenizer: AutoTokenizer, device: torch.device):

        stop_token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
        self.stop_token_ids = [
            torch.tensor(x, dtype=torch.long, device=device) for x in stop_token_ids]

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in self.stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids) :], stop_ids).all():
                return True
        return False

stop_tokens = [["<|"]]
stopping_criteria = StoppingCriteriaList(
    [StopGenerationCriteria(stop_tokens, tokenizer, model.device)])

### *2d. Checking model / tokenizer loading

In [8]:
# @title
#magic code to enable text-wrap in google colab
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

## 3. **Inference Pipelines**

### *3b. Inference Pipeline*

In [9]:
from transformers import pipeline
from langchain import HuggingFacePipeline

pipe = pipeline(
    "text-generation",
    model = model,
    tokenizer = tokenizer,
    do_sample=True,
    generation_config=text_generation_config,
    streamer=streamer,
    stopping_criteria = stopping_criteria,
    batch_size=1,
)

llm = HuggingFacePipeline(pipeline = pipe)

In [10]:
# @title
# this code is to suppress package loading once it is done in between the code.
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

# just suppress unwanted warnings
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [11]:
!pip3 install -qqq chromadb
!pip3 install -qqq sentence_transformers pypdf

### LANGCHAIN: an essential for conversation bots

In [12]:
# Imports
#from chromadb.config import Settings
import numpy as np
import chromadb
import os # operating system dependent functionality, to walk through directories and files

from chromadb.utils import embedding_functions # loads Chroma's embedding functions from OpenAI, HuggingFace, SentenceTransformer and others
from langchain.vectorstores import Chroma # wrapper around ChromaDB embeddings platform
from langchain.text_splitter import RecursiveCharacterTextSplitter # recursively tries to split by different characters to find one that works
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import LLMChain, RetrievalQA, RetrievalQAWithSourcesChain
from langchain.prompts import PromptTemplate

Creating prompts using template

In [13]:
template='''
<|system|>
You are a proficient assistant who replies in a helpful manner.
If you don't know the reply, simply say 'Please refer to help desk'
Please keep you reply shorter and precise.
Only reply from the given context.
{context}
<|user|>
{question}
<|assistant|>'''.strip()

prompt = PromptTemplate.from_template(template)

In [14]:
prompt

PromptTemplate(input_variables=['context', 'question'], template="<|system|>\nYou are a proficient assistant who replies in a helpful manner.\nIf you don't know the reply, simply say 'Please refer to help desk'\nPlease keep you reply shorter and precise.\nOnly reply from the given context.\n{context}\n<|user|>\n{question}\n<|assistant|>")

In [15]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

model_name = 'BAAI/bge-small-en-v1.5'
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cuda'},
    encode_kwargs=encode_kwargs
)

### Loading deep learning boom TJ's drive

In [16]:
!gdown --fuzzy "https://drive.google.com/file/d/1pE5b92Ucv5YyT24rG3vQZ6ZIdik2dRUg/view?usp=sharing"

Downloading...
From: https://drive.google.com/uc?id=1pE5b92Ucv5YyT24rG3vQZ6ZIdik2dRUg
To: /content/Deep Learning with Python, 2nd Edition (Final Release) by Francois Chollet.pdf
  0% 0.00/15.1M [00:00<?, ?B/s]100% 15.1M/15.1M [00:00<00:00, 223MB/s]


Reading the book

In [17]:
loader = PyPDFDirectoryLoader('/content')
docs = loader.load()
len(docs)

504

Chunking

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
doc_chunks = text_splitter.split_documents(docs)
len(doc_chunks)

1364

verctor storage

In [19]:
from langchain.vectorstores import Chroma

!rm -rf chromadb
persist_directory="./chromadb/"

vector_db = Chroma.from_documents(
    documents = doc_chunks, # text data that you want to embed and store
    embedding=embeddings, # used to convert the documents into embeddings
    persist_directory=persist_directory, # this tells Chroma where to store its data
    collection_name="Deep_Learning_Concepts" #  gives a name to the collection of embeddings, which will be helpful for retrieving specific groups of embeddings later.
)

vector_db.persist() # will make the database save any changes to the disk

Getting vectordb again from persistance storage, I have deliberately used different name

In [20]:
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)  # delibrately changed name to check it actually works
retriever=vectordb.as_retriever(search_kwargs={"k": 3})

In [21]:
from langchain.memory import ConversationBufferWindowMemory

memory = ConversationBufferWindowMemory(return_messages=True, k=5, memory_key='history')

In [22]:
template = """
You are a proficient assistant who replies in a helpful manner.
If you don't know the reply, simply say 'Please refer to help desk'
Please keep you reply shorter and precise.
Only reply from the given context.
{context}
{chat_history}
Human:{question}
Assistant:
"""

qa_chain_ws = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    memory=memory,
    chain_type_kwargs={
        "prompt": PromptTemplate(
            template=template,
            input_variables=["context", "question", "chat_history"],
        ),
    },
)

ValidationError: ignored

In [None]:
qa_chain_ws('what is deep learning')

In [None]:
def chat(query):
    return qa_chain_ws(query)['answer']

In [None]:
import gradio as gr
messages = []

with gr.Blocks() as mychatbot:  # Blocks is a low-level API that allows
                                # you to create custom web applications
    chatbot = gr.Chatbot([], elem_id="NED SECC Chatbot V1.0")
    #chatbot = gr.Chatbot(height=680)      # displays a chatbot
    question = gr.Textbox()     # for user to ask a question
    clear = gr.Button("Clear Conversation")  # Clear button
    # function to clear the conversation
    def clear_messages():
        global messages, history
        messages = []    # reset the messages list
        memory.clear()

    def chat(message, chat_history):
        global messages
        messages.append({"role": "user", "content": message})
        response = chat(message)
        print(response)

        content = response#['choices'][0]['message']['content']
        messages.append({"role":"assistant", "content": content})

        chat_history.append((message, content))
        return "", chat_history

    # wire up the event handler for Submit button (when user press Enter)
    question.submit(fn = chat,
                    inputs = [question, chatbot],
                    outputs = [question, chatbot])

    # wire up the event handler for the Clear Conversation button
    clear.click(fn = clear_messages,
                inputs = None,
                outputs = chatbot,
                queue = False)

mychatbot.launch(debug=True, share=True)