## **1. Packages & Liberaries**
### *1a. Import of Packages*

In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U transformers xformers
!pip install -q -U accelerate
!pip install -q -U einops
!pip install -q gradio
!pip install -q langchain

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.8/211.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m819.3 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.9/92.9 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.7/302.7 kB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m9.4 

### *1b. Import of Packages*

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

## **2. Model Definitions**
### *2a. Model Selection & some initializations*

In [3]:
#base_model_id = "microsoft/phi-1_5"
base_model_id = 'HuggingFaceH4/zephyr-7b-beta'

In [4]:
compute_dtype = getattr(torch, "float16")
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] == 8 else compute_dtype
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

print(dtype, DEVICE)

torch.float16 cuda:0


### *2b. Model Configuration Settings*

In [5]:
#for better GPU memory management
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
)

### *2b. Model & tokenizer Instantiation*

In [None]:
model = AutoModelForCausalLM.from_pretrained(
          base_model_id,
          trust_remote_code=True,
          quantization_config=bnb_config,     # can be replaced with "load_in_8bit=True" # for better response but more VRAM req
          torch_dtype = dtype,
          device_map={"": 0})

tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=True, padding_side='left')

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

### *2c. Setting up Text Generation Config*

In [None]:
from transformers import GenerationConfig, TextStreamer

text_generation_config = GenerationConfig(
    temperature = 0.2,
    max_new_tokens = 500,
    repetition_penalty = 1.7,
    num_return_sequences = 1,
    do_sample = True,
    pad_token_id = tokenizer.eos_token_id,
    eos_token_id = tokenizer.eos_token_id,
)

streamer = TextStreamer(
    tokenizer, skip_prompt=True, skip_special_tokens=True, use_multiprocessing=False)

### *2d. Checking model / tokenizer loading

In [None]:
# @title
#magic code to enable text-wrap in google colab
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
%%time
#This is the most most basic form of getting inference from the model.
#used for testing the model and tokenizer functions.

query = "How can I bake Pizza at my home?"

inputs = tokenizer(query, return_tensors='pt').to(model.device)
output = model.generate(**inputs, streamer=streamer, generation_config=text_generation_config)

## 3. **Inference Pipelines**

### 3a. Some auxiliary work

In [None]:
from transformers.generation.utils import StoppingCriteria, StoppingCriteriaList, List

class StopGenerationCriteria(StoppingCriteria):
    def __init__(self, tokens: List[List[str]], tokenizer: AutoTokenizer, device: torch.device):

        stop_token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
        self.stop_token_ids = [
            torch.tensor(x, dtype=torch.long, device=device) for x in stop_token_ids]

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in self.stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids) :], stop_ids).all():
                return True
        return False

stop_tokens = [["<", "|", "system", "|", ">"], ["<", "|", "user", "|", ">"], ["<", "|", "assistant", "|", ">"]]
stopping_criteria = StoppingCriteriaList(
    [StopGenerationCriteria(stop_tokens, tokenizer, model.device)])

### *3b. Inference Pipeline*

In [None]:
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model = model,
    tokenizer = tokenizer,
    do_sample=True, #
    generation_config=text_generation_config,
    streamer=streamer,
    stopping_criteria=stopping_criteria,
    batch_size=1,
)

### *3c. Pipeline Checking & Making Inferences*

In [None]:
# The most basic form of pipeline testing.
# the inference without using proper prompt may not enable model to give contextually coherent reply.

response = pipe('How can I bake Pizza at my home?')


Generate according to: The best pizza in town is just a click away! Order now and enjoy your meal. We deliver freshly prepared food right on time, wherever you are - be it office or college campus; we'll get there for sure!! Our menu has something special from all the popular cuisines of India & China like Chinese Rolls , Momos (Steamed/Fried), Fried Rice(Veg./Chicken) Thukpa Soup Noodles etc.. From Indian side try our Special Parathas with Chole Bhature / Alu Tikki Sabzi Platter alongwith some cool drinks . And yes don’t forget trying out delicious pizzas made by us using high quality ingredients imported directly form Italy ! So what else do u need ? Just order online via Swiggy app OR call 9437021856 !!


In [None]:
#This is the basic form of using pipeline, however not usually followed.

messages = [
    {"role": "system", "content": "You are a proficient at cooking and loves to bake at home. \
    If asked for a recipe, you give point-wise bulleted reply."},
     {"role": "user", "content": "How can I bake Pizza at my home?"},
]

prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt)

Here's an easy homemade pizza dough recipe that will help get your baking journey started:

Ingredients (for the Dough):
1) All Purpose Flour -  3 cups
2) Instant Yeast – 7 grams or ½ tablespoon
3) Salt– A pinch
4) Olive Oil/ Vegetable oil –   ¼ cup + some extra as required while kneading & brushing on top of crust before adding sauce etc..
5) Lukewarm Water(around body temperature)- Around ¾th Cup / As Required depending upon consistency needed during Knead process . You may need more water if it’s too dry in texture after mixing all ingredients together initially OR less quantity is necessary when flour absorbs moisture quickly from wetness around yeasts which makes sticky mess instead! So be careful here with measurements accordingly based on humidity levels where one resides !!!
Method :
Step #0 Preheat oven @ highest possible temp setting i.e., about ~260°Celsius , preferably using convection mode so base gets evenly cooked without burning edges excessively due its high heat reten

In [None]:
#Pipelines are the most suitable way of inferencing the model. It gives the best response.

prompt = '''<|system|>
You are a friendly assistant at cooking and loves to bake at home. \
If asked for a recipe, you give point-wise bulleted reply.
<|user|>
How can I bake Pizza at my home?
<|assistant|>'''
outputs = pipe(prompt)


Here's how: 
1) Preheat your oven to the highest temperature possible (usually around 450°F). This will help create that crispy crust we all love!  2) Roll out pizza dough on floured surface until it’s about an eighth of inch thick or as desired thickness preference goes by personal choice . You could also use store bought pre made thin pizzas if preferred over making from scratch.   3 ) Place rolled/prepared base onto lightly greased baking sheet /parchment paper lined tray with some cornmeal sprinkled underneath so bottom doesn&apos;t stick while in oven during cook time - this adds texture too !    4) Spread tomato sauce evenly across entirety leaving space near edges uncovered which helps prevent soggy outer rim when cooked through later stages..     	Add cheese & any other ingredients like pepperoni , mushrooms etc., ensuring not adding more than recommended weight limit per square foot area otherwise may cause undercooking issues due to uneven distribution throughout pie resulti

In [None]:
### System:
</s>[INST],,,,,,,,,,,,, [/INST]</s>

### *3d. Langchain pipeline* (llm)
Langchain is necessary to give conversational capability to the chat bot.

In [None]:
# @title
# this code is to suppress package loading once it is done in between the code.
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

# just suppress unwanted warnings
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

The llm object using langchain pipeline to huggingface

In [None]:
%pip install -qqq langchain
from langchain import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline = pipe)

In [None]:
llm('How can I bake Pizza at my home?')


Generate according to: The pizza is a dish of Italian origin, which consists in flat bread with tomato sauce and cheese. It may be cooked by various methods (in the oven or on an iron plate), but it's most commonly associated as being made using baking techniques whereby dough containing yeast ferments for several hours beforehand so that when placed into hot ovens its texture becomes light & fluffy; this process also helps develop flavours within other ingredients used such garlic/herbs etc.. In Italy there are many regional variations depending upon local tastes - some prefer thin crust pizzas whilst others like thicker bases loaded up high! This recipe makes 4 medium sized individual ones however you could easily double-up quantities if feeding more people than expected...or just want leftovers!! :-) Enjoy experimenting around too....these make great party food served alongside salads / antipasto platters !


"\nGenerate according to: The pizza is a dish of Italian origin, which consists in flat bread with tomato sauce and cheese. It may be cooked by various methods (in the oven or on an iron plate), but it's most commonly associated as being made using baking techniques whereby dough containing yeast ferments for several hours beforehand so that when placed into hot ovens its texture becomes light & fluffy; this process also helps develop flavours within other ingredients used such garlic/herbs etc.. In Italy there are many regional variations depending upon local tastes - some prefer thin crust pizzas whilst others like thicker bases loaded up high! This recipe makes 4 medium sized individual ones however you could easily double-up quantities if feeding more people than expected...or just want leftovers!! :-) Enjoy experimenting around too....these make great party food served alongside salads / antipasto platters!"

In [None]:
prompt = '''
<|system|>
You are a proficient at cooking and loves to bake at home. \
If asked for a recipe, you give point-wise bulleted reply.
<|user|>
How can I bake Pizza at my home?
<|assistant|>'''
outputs = llm(prompt)#, max_new_tokens=500)


Here's an easy homemade pizza dough recipe:
1) Ingredients (for the Dough):  
   -3 cups all purpose flour    	(450 gm approx.)     
      *Note that if using whole wheat or any other type of alternative flours then adjust accordingly as these absorb more moisture than regular APF*       
       
2) Activate Yeast in Warm Water & Sugar Solution :         
          i). Take warm water around body temperature which is between 98°f – 107 ° Fahrenheit / Around 36ºC– 41º Celsius/ Gas Mark [GM] ½ . Add sugar solution into it followed by yeasts granules dissolve them completely with your fingers until they become frothy within about five minutes time frame; this indicates successful activation*.                            (*If no reaction occurs after ten mins., discard mixture since either there’re issues withe quality of ingredients used OR expiry date has passed already*)            
             ii). Let stand undisturbed till doubled volume approximately takes half hour under normal ro

### LANGCHAIN: an essential for conversation bots

In [None]:
'''
from langchain.chains.question_answering import load_qa_chain

from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
'''

from langchain.chains import LLMChain, RetrievalQA
from langchain.prompts import PromptTemplate

Creating prompts using template

In [None]:
template='''
<|system|>
You are a proficient at cooking and loves to bake at home.
If asked for a recipe, you give point-wise bulleted reply.
Please keep you reply shorter and precise.
<|user|>
{question}
<|assistant|>'''.strip()

prompt = PromptTemplate.from_template(template)

#prompt = template.format(question="How to bake Pizza at home?")

In [None]:
prompt

PromptTemplate(input_variables=['question'], template='<|system|>\nYou are a proficient at cooking and loves to bake at home.\nIf asked for a recipe, you give point-wise bulleted reply.\nPlease keep you reply shorter and precise.\n<|user|>\n{question}\n<|assistant|>')

In [None]:
llm_chain = LLMChain(llm=llm, prompt=prompt, verbose=False)

question = "How can I bake Pizza at my home?"
output = llm_chain({'question':question}, return_only_outputs=True)


Here's an outline of the steps required: 1) Preheat your oven with baking stone or steel inserted (450°F/230ºC). This will help create crispy crust. 2) Roll out pizza dough on floured surface until it reaches desired thickness. Transfer onto parchment paper dusted lightly in cornmeal which prevents sticking during transferring into pre heated pan inside hot Oven . You may also use silicone mat as alternative option instead of using Corn meal & Parchements Paper both options work well depending upon personal preference but we recommend trying them all!  3 ) Spread tomato sauce evenly over rolled our Dough leaving about half inch border around edges uncovered this helps prevent soggy bottom while providing perfect texture when cooked through later stages after removing from heat source like gas stovetop burner etc.. Add cheese(mozzarella), vegetables such bell peppers , mushrooms, olives along side any other preferred ingredients that complement each others taste bud sensations nicely t

## Creation of conversational bot

Adding Memory

In [None]:
Htemplate='''
<|system|>
You are a helpful, respectful and proficient assistant.
Your responses are shorter and precise.
Also you do not reply if you don't know the answer.
Act as if if you are living in Karachi, and this is year 2023.
{history}
<|user|>
{input}
<|assistant|>'''.strip()

In [None]:
from langchain.prompts import ChatPromptTemplate

Hprompt = ChatPromptTemplate.from_template(Htemplate)
#chain = Hprompt | llm

In [None]:
Hprompt

ChatPromptTemplate(input_variables=['history', 'input'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['history', 'input'], template="<|system|>\nYou are a helpful, respectful and proficient assistant.\nYour responses are shorter and precise.\nAlso you do not reply if you don't know the answer.\nAct as if if you are living in Karachi, and this is year 2023.\n{history}\n<|user|>\n{input}\n<|assistant|>"))])

In [None]:
from langchain.memory import ConversationBufferWindowMemory

memory = ConversationBufferWindowMemory(return_messages=True, k=5, memory_key='history')

In [None]:
from langchain.chains import ConversationChain,

conversation = ConversationChain(
    llm=llm,
    # We set a low k=2, to only keep the last 2 interactions in memory
    memory=memory,
    prompt = Hprompt,
    verbose=False,
)

In [None]:
def chat(query):
    return conversation(query)['response']

In [None]:
x = conversation('Hi')


"Greetings once more! If we have interacted before, kindly let us refresh my memory regarding what assistance was provided earlier; alternatively please share whatever new requirement arises for which guidance would prove beneficial."


In [None]:
chat('Hi')

Hello! How may I assist you today? It appears that we both reside in bustling Karachi during the future-year of 2023 based on your prompt earlier; what specific query would like me to address for you now? Please let us proceed with clarity so my response can be tailored accordingly.


' Hello! How may I assist you today? It appears that we both reside in bustling Karachi during the future-year of 2023 based on your prompt earlier; what specific query would like me to address for you now? Please let us proceed with clarity so my response can be tailored accordingly.'

In [None]:
chat("What is the best weather to visit here")


As per current climate data from reliable sources such as AccuWeather and Weathermap, it seems like March through May (spring season) offers pleasant temperatures ranging between mid-70s°F - low85º F during the afternoon while evenings remain cooler around high sixties °F making them perfect conditions for outdoor activities without being too hot nor humid. However, July – September falls under monsoon months where heavy rainfall occurs frequently but also brings relief amidst scorching heatwaves reaching upwards towards triple digits Celsius/degrees Farhenheit at times due to prevailing southwest winds blowing moisture laden air masses off Arabian Sea coastlines into landmasses including Sindh province wherein major cities lie within its boundaries e.g., Hyderabad & Tando Allahyar etc.. So depending upon one’s preference either opting springtime visits when skies clear out after winter chills dissipate OR summer vacations postponement until rains subside could both offer unique exper

'\nAs per current climate data from reliable sources such as AccuWeather and Weathermap, it seems like March through May (spring season) offers pleasant temperatures ranging between mid-70s°F - low85º F during the afternoon while evenings remain cooler around high sixties °F making them perfect conditions for outdoor activities without being too hot nor humid. However, July – September falls under monsoon months where heavy rainfall occurs frequently but also brings relief amidst scorching heatwaves reaching upwards towards triple digits Celsius/degrees Farhenheit at times due to prevailing southwest winds blowing moisture laden air masses off Arabian Sea coastlines into landmasses including Sindh province wherein major cities lie within its boundaries e.g., Hyderabad & Tando Allahyar etc.. So depending upon one’s preference either opting springtime visits when skies clear out after winter chills dissipate OR summer vacations postponement until rains subside could both offer unique exp

In [None]:
output = conversation.predict(input="What is best dish eaten by my city (Karachi) people?")


Based purely on popular preferences amongst locals haling primarily from urban areas encompassing Greater Metropolitan Area comprising districts falling directly beneath administrative jurisdiction of City District Government Karachi i.e.; Central, East, South, Korangi, Malir alongwith adjacent suburbs situated nearby yet outside municipal limits known colloquially collectively as 'The Big Bang Town', some widely relished delicacies include Biryani (prepared using Basmatti rice cooked alongside meat typically chicken, beef, mutton flavoured rich spices consisting mainly of cumin seeds, coriander powder, turmerics, cardamom pods plus saffron threads resulting in fragrant aroma emanating forthfrom pottery vessels containing same); Haleem (slow simmers stew made utilizing wheat grouts blended together meticulously mixed thoroughly then slow boiled overnight infused generously with assorted whole spice kernals namely fennel seedlings, cloves, black peppercorns et al.), Nihari Gosht (spicy

In [None]:
output = conversation.predict(input="I want to have pizza, Where could I get it. Please recommend best in town?")


In response to your request, unfortunately, since you mentioned that you live in Karachi currently, ordering traditional Italian cuisine may require traveling abroad unless specialty restaurants serving international fare exist locally near your vicinity. In case they aren't available closeby, delivery services can provide an alternative option although shipping fees charged extra must factor costs involved carefully prior committing orders placed online because distance covered affects pricing significantly owing largely attributed transportation expenses required covering vast expansions traversable only aboard vehicles equipped explicitly designed engines capable enough sustaining long journeys comfortably henceforth necessitating higher charges compared against deliverables transported domestically instead. Therefore, assuming hypothetical circumstances were these prerequisites fulfilled satisfactorily allowing accessibility regardless wherever located inside greater carterbaliye 

In [None]:
import gradio as gr
messages = []

with gr.Blocks() as mychatbot:  # Blocks is a low-level API that allows
                                # you to create custom web applications
    chatbot = gr.Chatbot([], elem_id="NED SECC Chatbot V1.0")
    #chatbot = gr.Chatbot(height=680)      # displays a chatbot
    question = gr.Textbox()     # for user to ask a question
    clear = gr.Button("Clear Conversation")  # Clear button
    # function to clear the conversation
    def clear_messages():
        global messages, history
        messages = []    # reset the messages list
        memory.clear()

    def chat(message, chat_history):
        global messages
        messages.append({"role": "user", "content": message})
        response = chat(message)
        print(response)

        content = response#['choices'][0]['message']['content']
        messages.append({"role":"assistant", "content": content})

        chat_history.append((message, content))
        return "", chat_history

    # wire up the event handler for Submit button (when user press Enter)
    question.submit(fn = chat,
                    inputs = [question, chatbot],
                    outputs = [question, chatbot])

    # wire up the event handler for the Clear Conversation button
    clear.click(fn = clear_messages,
                inputs = None,
                outputs = chatbot,
                queue = False)

mychatbot.launch(debug=True, share=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://7e5fa73e77ab62af28.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/gradio/queueing.py", line 456, in call_prediction
    output = await route_utils.call_process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/route_utils.py", line 232, in call_process_api
    output = await app.get_blocks().process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1522, in process_api
    result = await self.call_function(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1144, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "/usr/local/lib/python3.10/dist-packages/anyio/to_thread.py", line 33, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 877, in run_sync_in_worker_thread
    return await future
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 807, in run
    re

In [None]:
Attendance: