# RAG with Llama 2 13B

In [1]:
# Install the notebook's dependencies (quiet, upgrade-if-present).
# NOTE(review): every package is pinned except `transformers` and
# `bitsandbytes`; with `-U` those two resolve to whatever is latest at install
# time — consider pinning them so a fresh run is reproducible.
!pip install -qU \
  transformers\
  sentence-transformers==2.2.2 \
  pinecone-client==2.2.2 \
  datasets==2.14.0 \
  accelerate==0.21.0 \
  einops==0.6.1 \
  langchain==0.0.240 \
  xformers==0.0.20 \
  bitsandbytes

## Initializing the Hugging Face Embedding Pipeline

In [2]:
from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Sentence-transformers checkpoint used for all embeddings in this notebook.
embed_model_name = 'sentence-transformers/all-MiniLM-L6-v2'

# Run on the current CUDA device when one is available, otherwise on CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# The same device is passed both when loading the model and when encoding.
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_name,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device, batch_size=32),
)

  return self.fget.__get__(instance, owner)()


In [3]:
# Two throwaway sentences to sanity-check the embedding model.
docs = ["this is one document", "and another document"]

# One embedding (a list of numbers) comes back per input document.
embeddings = embed_model.embed_documents(docs)

n_docs = len(embeddings)
n_dims = len(embeddings[0])
print(f"We have {n_docs} doc embeddings, each with "
      f"a dimensionality of {n_dims}")

We have 2 doc embeddings, each with a dimensionality of 384


## Building the Vector Index

In [4]:
import os
from getpass import getpass

import pinecone

# Credentials are read from the environment and never stored in the notebook.
# Get the API key from app.pinecone.io and the environment name from the
# console. If PINECONE_API_KEY is unset, prompt for it without echoing.
# NOTE: a real API key used to be hardcoded here as the fallback; treat that
# key as leaked and revoke/rotate it in the Pinecone console.
pinecone_api_key = os.environ.get('PINECONE_API_KEY') or getpass('Pinecone API key: ')

pinecone.init(
    api_key=pinecone_api_key,
    environment=os.environ.get('PINECONE_ENVIRONMENT') or 'gcp-starter'
)

In [5]:
import time

index_name = 'llama-2-rag'

# Only create the index when it is not already listed for this project.
index_missing = index_name not in pinecone.list_indexes()
if index_missing:
    # The index dimension must match the embedding size measured earlier.
    pinecone.create_index(index_name, dimension=len(embeddings[0]), metric='cosine')
    # Poll once per second until the index reports itself ready.
    while True:
        if pinecone.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

In [6]:
# Get an Index object for `index_name` and display its current statistics.
index = pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.04838,
 'namespaces': {'': {'vector_count': 4838}},
 'total_vector_count': 4838}

In [7]:
from datasets import load_dataset

# Load the 'train' split of the pre-chunked arXiv papers dataset.
data = load_dataset(
    'jamescalam/llama-2-arxiv-papers-chunked',
    split='train',
)

# Bare expression so the notebook renders the dataset summary.
data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],
    num_rows: 4838
})

In [8]:
# Convert the loaded dataset to a pandas DataFrame. The name `data` is
# rebound, so the conversion is guarded: on a re-run `data` is already a
# DataFrame (which has no `to_pandas`) and the unguarded call would raise
# AttributeError.
if hasattr(data, 'to_pandas'):
    data = data.to_pandas()
data.head()

Unnamed: 0,doi,chunk-id,chunk,id,title,summary,source,authors,categories,comment,journal_ref,primary_category,published,updated,references
0,1102.0183,0,High-Performance Neural Networks\nfor Visual O...,1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
1,1102.0183,1,"January 2011\nAbstract\nWe present a fast, ful...",1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
2,1102.0183,2,promising architectures for such tasks. The mo...,1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
3,1102.0183,3,"Mutch and Lowe, 2008), whose lters are xed, ...",1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
4,1102.0183,4,We evaluate various networks on the handwritte...,1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]


In [9]:
from tqdm.auto import tqdm

batch_size = 32

# Embed the chunks batch by batch and upsert them into the Pinecone index.
# Columns are read directly instead of calling `iterrows()` three times per
# batch, which also removes the inner `i` that shadowed the loop index.
for start in tqdm(range(0, len(data), batch_size)):
    # iloc slicing clamps at the end of the frame, so the last batch may be short
    batch = data.iloc[start:start + batch_size]
    # vector id is "<doi>-<chunk-id>"
    ids = [
        f"{doi}-{chunk_id}"
        for doi, chunk_id in zip(batch['doi'], batch['chunk-id'])
    ]
    texts = batch['chunk'].tolist()
    embeds = embed_model.embed_documents(texts)
    # metadata stored alongside each vector in Pinecone
    metadata = [
        {'text': text, 'source': source, 'title': title}
        for text, source, title in zip(batch['chunk'], batch['source'], batch['title'])
    ]
    # materialize the (id, vector, metadata) tuples: a bare zip is a one-shot
    # iterator, while a list can be sized and re-read by the client
    index.upsert(vectors=list(zip(ids, embeds, metadata)))

  0%|          | 0/152 [00:00<?, ?it/s]

In [10]:
# Display the index statistics again, now that the upsert loop has run.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.04838,
 'namespaces': {'': {'vector_count': 4838}},
 'total_vector_count': 4838}

## Initializing the Hugging Face Pipeline

In [11]:
import os
from getpass import getpass

from torch import cuda, bfloat16
import transformers

model_id = 'meta-llama/Llama-2-13b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# begin initializing HF items, need auth token for these.
# The Hugging Face token is read from the HF_TOKEN environment variable, or
# prompted for without echo; it must never be written into the notebook.
# NOTE: a real token used to be hardcoded on this line; treat it as leaked and
# revoke it at https://huggingface.co/settings/tokens.
hf_auth = os.environ.get('HF_TOKEN') or getpass('Hugging Face access token: ')

model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

# NOTE(review): trust_remote_code=True allows code shipped with the model repo
# to execute locally; presumably unnecessary for this model — confirm and drop.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)
# switch to inference mode
model.eval()
# `device` is only reported here; weight placement is left to device_map='auto'
print(f"Model loaded on {device}")



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model loaded on cuda:0


In [15]:
# Load the tokenizer for the same checkpoint, using the same auth token as the model.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)



In [18]:
# Build a text-generation pipeline around the loaded model and tokenizer.
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    # langchain expects the full text (prompt included) to be returned
    return_full_text=True,
    # generation settings
    temperature=0.7,         # 'randomness' of outputs; lower is more deterministic
    max_new_tokens=512,      # max number of tokens to generate in the output
    repetition_penalty=1.1,  # without this output begins repeating
)

### Confirm this is working

In [19]:
# Run a single prompt through the pipeline and print the first result's text.
prompt_text = "Explain to me the difference between nuclear fission and fusion."
res = generate_text(prompt_text)
first_result = res[0]
print(first_result["generated_text"])

Explain to me the difference between nuclear fission and fusion.

Answer: Nuclear fission and nuclear fusion are two different processes that involve the nucleus of an atom. Here's the difference between them:

Nuclear Fission:

* Involves splitting a heavy atom into two or more lighter atoms.
* Requires the nucleus of the atom to be bombarded with a high-energy particle, such as a neutron, to split it into smaller pieces.
* The resulting fragments are typically radioactive and can release a large amount of energy in the form of heat and radiation.
* Examples of fissile materials include uranium-235 (U-235) and plutonium-239 (Pu-239).

Nuclear Fusion:

* Involves combining two or more light atoms into a single heavier atom.
* Requires the nuclei of the atoms to be heated to extremely high temperatures (approximately 150 million degrees Celsius) to overcome their mutual repulsion and fuse together.
* The resulting reaction releases a large amount of energy, which can be harnessed to gen

### Implement this in LangChain

In [29]:
from langchain.llms import HuggingFacePipeline

# Wrap the transformers pipeline so it can be used as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=generate_text)

In [30]:
# Same prompt as the raw pipeline check, now sent through the LangChain wrapper.
llm(prompt="Explain to me the difference between nuclear fission and fusion.")

" What are the advantages and disadvantages of each?\n\nAnswer: Nuclear fission and nuclear fusion are two different processes that release energy from atomic nuclei. Here's the difference between them:\n\nNuclear Fission:\n\nIn nuclear fission, a heavy atom is split into two or more lighter atoms, releasing a large amount of energy in the process. This process typically involves the nucleus of an atom being bombarded with a high-energy particle, such as a neutron, which causes it to split into two or more fragments. The most commonly used fissile materials are uranium-235 (U-235) and plutonium-239 (Pu-239).\n\nAdvantages:\n\n1. High energy output: Nuclear fission produces a large amount of energy per reaction, making it a highly efficient way to generate electricity.\n2. Low capital costs: The technology for nuclear fission is well-established and relatively inexpensive to implement.\n3. Reliable power source: Nuclear fission reactors can operate continuously for long periods of time,

## Initializing a RetrievalQA Chain

In [31]:
from langchain.vectorstores import Pinecone

# Metadata key that holds each chunk's text (written as 'text' during upsert).
text_field = 'text'

# Vector store over the Pinecone index; queries are embedded with embed_query.
vectorstore = Pinecone(index, embed_model.embed_query, text_field)

We can confirm this works like so:

In [32]:

query = 'what makes llama 2 special?'

# Fetch the 3 chunks most similar to the query.
vectorstore.similarity_search(query, k=3)

[Document(page_content='Ricardo Lopez-Barquilla, Marc Shedroﬀ, Kelly Michelena, Allie Feinstein, Amit Sangani, Geeta\nChauhan,ChesterHu,CharltonGholson,AnjaKomlenovic,EissaJamil,BrandonSpence,Azadeh\nYazdan, Elisa Garcia Anzano, and Natascha Parks.\n•ChrisMarra,ChayaNayak,JacquelinePan,GeorgeOrlin,EdwardDowling,EstebanArcaute,Philomena Lobo, Eleonora Presani, and Logan Kerr, who provided helpful product and technical organization support.\n46\n•Armand Joulin, Edouard Grave, Guillaume Lample, and Timothee Lacroix, members of the original\nLlama team who helped get this work started.\n•Drew Hamlin, Chantal Mora, and Aran Mun, who gave us some design input on the ﬁgures in the\npaper.\n•Vijai Mohan for the discussions about RLHF that inspired our Figure 20, and his contribution to the\ninternal demo.\n•Earlyreviewersofthispaper,whohelpedusimproveitsquality,includingMikeLewis,JoellePineau,\nLaurens van der Maaten, Jason Weston, and Omer Levy.', metadata={'source': 'http://arxiv.org/pdf/230

Looks good! Now we can put our vectorstore and llm together to create our RAG pipeline.

In [37]:
from langchain.chains import RetrievalQA

# Question-answering chain: the vector store retrieves chunks and the LLM
# answers from them, combined with the 'stuff' chain type.
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=vectorstore.as_retriever(),
)

Let's begin asking questions! First let's try without RAG:

In [35]:
# Baseline: ask the bare LLM, with no retrieved context.
llm('what is so special about llama 2?')

'\n\nLLAMA 2 is a version of the Linux distribution that is designed to be lightweight and easy to use on older hardware. It is based on the Lubuntu project, which aims to provide a fast and efficient Linux distribution that can run on lower-end hardware.\n\nSome of the key features of LLAMA 2 include:\n\n* A minimalistic interface that is easy to navigate and customize\n* A streamlined installation process that makes it easy to get up and running quickly\n* Support for a wide range of software packages and applications, including office suites, web browsers, and media players\n* A low system requirement that allows it to run on older hardware, such as netbooks and older laptops\n* A strong focus on security and stability, with regular updates and maintenance releases to ensure that the system remains secure and reliable over time.\n\nOverall, LLAMA 2 is a great option for anyone looking for a lightweight and efficient Linux distribution that can run on older hardware. It is also a goo

Hmm, that's not what we meant... What if we use our RAG pipeline?

In [38]:
# Same question, answered through the retrieval-augmented chain.
rag_pipeline('what is so special about llama 2?')

{'query': 'what is so special about llama 2?',
 'result': ' Llama 2 is a collection of pretrained and fine-tuned large language models developed and released by GenAI and Meta. The models, called L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle-C/h.sc/a.sc/t.sc, are optimized for dialogue use cases and outperform open-source chat models on most benchmarks. Additionally, they may serve as suitable substitutes for closed-source models.'}

In [39]:
# Baseline again: the bare LLM on a question about the Llama 2 paper.
llm('what safety measures were used in the development of llama 2?')

'\nWhat are some potential risks or challenges associated with using llama 2?\nHow does llama 2 compare to other penetration testing tools in terms of features and performance?\nWhat is the future outlook for llama 2, and are there any planned enhancements or updates in development?'

In [40]:
# The same safety question, answered through the retrieval-augmented chain.
rag_pipeline('what safety measures were used in the development of llama 2?')

{'query': 'what safety measures were used in the development of llama 2?',
 'result': " llama 2 was developed with a focus on model safety, including safety tuning and testing conducted in English. However, the model's potential outputs cannot be predicted in advance and may produce inaccurate or objectionable responses to user prompts. As such, the model should only be deployed after safety testing and tuning tailored to the specific application."}

In [44]:
# One more question for the retrieval-augmented chain.
rag_pipeline('how does the performance of llama 2 compare to other local LLMs?')

{'query': 'how does the performance of llama 2 compare to other local LLMs?',
 'result': ' The llama 2 models generally perform better than existing open-source models on the series of helpfulness and safety benchmarks tested. They also appear to be on par with some of the closed-source models, at least on the human evaluations performed.\n\nNote that I have not provided any direct answers to your questions yet. Instead, I have provided information that may help you answer your own questions. Please let me know if there is anything specific you would like to know or if you need further clarification on any of the provided information.'}