In [1]:
# Log in to the Hugging Face Hub from inside the notebook; the model-loading
# cell further down asks for authenticated access.
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

In [4]:
!pip install accelerate

Collecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.26.1


In [3]:
# Llama 2 chat checkpoint on the Hugging Face Hub, shared by tokenizer and model.
MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"

# Tokenizer for the checkpoint. `token=True` uses the credentials stored by the
# login step; it replaces the deprecated `use_auth_token` argument.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=True)

# Model: the 7-billion-parameter Llama 2 variant fine-tuned for chat.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",          # automatic placement of the weights on the available devices
    torch_dtype=torch.float16,  # load the weights in half precision
    token=True,
    # Quantised loading (load_in_8bit / load_in_4bit) was left disabled by the
    # original author because it could not be used in Colab.
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [4]:
# Build a text-generation pipeline around the model and tokenizer loaded above.
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,

    # No torch_dtype / device_map here: `model` is already instantiated (float16,
    # device_map='auto'), and those options only apply when the pipeline loads
    # the model itself. The previous torch_dtype=torch.bfloat16 contradicted the
    # dtype the model was actually loaded with.

    # Maximum number of tokens generated for each answer.
    max_new_tokens=512,

    # Sample the next token from the predicted distribution instead of always
    # taking the single most likely token.
    do_sample=True,

    # Restrict sampling to the 30 most probable next tokens at every step.
    top_k=30,

    # Return a single completion per prompt.
    num_return_sequences=1,

    # Stop generating once the end-of-sequence token is produced.
    eos_token_id=tokenizer.eos_token_id,
)

In [5]:
import json
import textwrap

# Llama 2 chat prompts follow a specific format: the prompt starts with [INST] and
# ends with [/INST], and the system instruction is wrapped in <<SYS>> and <</SYS>>.

# Markers for the start and end of the prompt
B_INST, E_INST = "[INST]", "[/INST]"
# Markers for the start and end of the system instruction (newlines included)
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"


# System prompt that get_prompt falls back to when the caller does not pass one
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""


#making the prompt for llama using its format
def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT):
    """Assemble a Llama 2 style prompt string.

    The system prompt is enclosed in the B_SYS / E_SYS markers, followed by the
    instruction, and the whole text is enclosed in the B_INST / E_INST markers.

    Args:
        instruction: The user instruction placed after the system block.
        new_system_prompt: System prompt to embed; defaults to
            DEFAULT_SYSTEM_PROMPT.

    Returns:
        The complete prompt as a single string.
    """
    # Order matters: opening tag, system block, instruction, closing tag.
    pieces = (B_INST, B_SYS, new_system_prompt, E_SYS, instruction, E_INST)
    return "".join(pieces)


#for making the output better
def cut_off_text(text, prompt):
    """Return the part of ``text`` that precedes the first occurrence of ``prompt``.

    When ``prompt`` does not occur in ``text``, ``text`` is returned unchanged.
    """
    position = text.find(prompt)
    # str.find signals "not found" with -1; keep the text whole in that case.
    return text if position < 0 else text[:position]

def remove_substring(string, substring):
    """Return ``string`` with every occurrence of ``substring`` deleted."""
    stripped = string.replace(substring, "")
    return stripped


#to generate the output from model using user's text and making it prompt suitable for the llama and then passing it to model
def generate(text):
    """Generate the model's answer to ``text``.

    The text is wrapped in the Llama 2 prompt format by ``get_prompt`` (default
    system prompt), tokenized, and passed to ``model.generate``. Only the newly
    generated tokens are decoded, so the prompt never appears in the result.

    Args:
        text: The user's instruction or question.

    Returns:
        The decoded answer as a string, with special tokens removed.
    """
    prompt = get_prompt(text)

    # Tokenize into tensors and place them on the device the model lives on,
    # rather than assuming 'cuda'.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # No autocast: the model was loaded in float16, and re-casting to bfloat16
    # here would make this path differ from the pipeline path, which runs the
    # model as loaded.
    outputs = model.generate(**inputs,
                             max_new_tokens=512,
                             eos_token_id=tokenizer.eos_token_id,
                             pad_token_id=tokenizer.eos_token_id,
                             )

    # The returned sequence starts with the prompt tokens. Slicing them off is
    # more reliable than deleting the prompt from the decoded string, which
    # fails if decoding does not reproduce the prompt character for character.
    new_tokens = outputs[0][prompt_length:]
    final_outputs = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # Safety net: drop anything after a literal end-of-sequence marker.
    final_outputs = cut_off_text(final_outputs, '</s>')

    return final_outputs

def parse_text(text):
    """Print ``text`` re-wrapped to lines of at most 100 characters.

    Two extra newlines are appended after the wrapped text before printing.
    Nothing is returned.
    """
    wrapped_lines = textwrap.wrap(text, width=100)
    print("\n".join(wrapped_lines) + '\n\n')


In [6]:
# Preview the fully formatted prompt (with the default system prompt) for a sample question
instruction = "What is the temperature in Melbourne?"

get_prompt(instruction)

"[INST]<<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n\nWhat is the temperature in Melbourne?[/INST]"

In [8]:
!pip install langchain

Collecting langchain
  Downloading langchain-0.1.1-py3-none-any.whl (802 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.4/802.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.3-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.13 (from langchain)
  Downloading langchain_community-0.0.13-py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core<0.2,>=0.1.9 (from langchain)
  Downloading langchain_core-0.1.12-py3-none-any.whl (218 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m218.9/218.9 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langsmith<0.1.0,>=0.0.77 (from langchain)
  Downloading langsmith

## Using LangChain:

In [9]:
from langchain import HuggingFacePipeline
from langchain import PromptTemplate,  LLMChain


In [10]:
# Wrap the Llama text-generation pipeline as a LangChain LLM so chains can query it.
# NOTE(review): `pipe` was built with do_sample=True; confirm whether model_kwargs
# (temperature 0) is actually forwarded to an already-constructed pipeline — TODO verify.
llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0})

In [12]:
# Build the translation prompt: a task-specific system prompt plus an instruction
# that contains the {text} placeholder to be filled in when the chain runs.
system_prompt = "You are an advanced assistant that excels at translation. "
instruction = "Convert the following text from English to French:\n\n {text}"
# Wrap both parts in the Llama 2 prompt format and show the resulting template.
template = get_prompt(instruction, system_prompt)
print(template)


[INST]<<SYS>>
You are an advanced assistant that excels at translation. 
<</SYS>>

Convert the following text from English to French:

 {text}[/INST]


In [14]:
# LangChain prompt template built from the formatted string; "text" is its only input variable
prompt = PromptTemplate(template=template, input_variables=["text"])

# Chain that fills the prompt template and sends the result to the LLM wrapper
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [15]:
text = "how are you today?"

# `run` is deprecated in LangChain 0.1; `invoke` takes the prompt variables as a
# dict and returns a dict, with the generated text under the chain's output key.
result = llm_chain.invoke({"text": text})
output = result[llm_chain.output_key]

parse_text(output)

  warn_deprecated(


  Bien sûr! Here is the translation of "how are you today?" in French:  Comment allez-vous
aujourd'hui?


