# Llama 2 + Pinecone + LangChain

In [36]:
# Uncomment the lines below the first time you run this notebook to install its dependencies into your local environment.

  # !pip install langchain
  # !pip install pypdf
  # !pip install unstructured
  # !pip install sentence_transformers
  # !pip install pinecone-client
  # !pip install llama-cpp-python
  # !pip install huggingface_hub
  # !pip install python-dotenv

## Import Dependencies

In [37]:

from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from sentence_transformers import SentenceTransformer
from langchain.chains.question_answering import load_qa_chain
import pinecone
import os
from os.path import join, dirname
from dotenv import load_dotenv


# Load variables from the .env file one directory up into the process environment;
# the call's return value is what this cell displays.
load_dotenv(dotenv_path='../.env')

True

## Load the Data

In [38]:

# Location of the source document used throughout the notebook.
PDF_URL = "https://falksangdata.no/wp-content/uploads/2022/11/DataScience4dummies.pdf"

# Fetch the remote PDF and parse it into LangChain documents.
loader = OnlinePDFLoader(PDF_URL)
data = loader.load()

## Split Characters

In [39]:
# Chunking policy: target chunk size of 500 with no overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)


In [40]:
# Break the loaded document(s) into chunks using the splitter configured above.
docs = text_splitter.split_documents(documents=data)


In [41]:
# Number of chunks produced by the splitter (bare expression, shown as the cell output).
len(docs)


2333

## Set Up the Environment

In [42]:
# Read the service credentials from the process environment in one pass.
# A variable that is not set yields None rather than raising.
PINECONE_API_KEY, PINECONE_API_ENV, OPENAI_API_KEY, HUGGINGFACEHUB_API_TOKEN = (
    os.environ.get(variable_name)
    for variable_name in (
        "PINECONE_API_KEY",
        "PINECONE_API_ENV",
        "OPENAI_API_KEY",
        "HUGGINGFACEHUB_API_TOKEN",
    )
)


In [43]:
# Sentence-transformers checkpoint used to embed both the chunks and the queries.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [44]:
# Confirm the configuration was picked up WITHOUT echoing the secret itself:
# cell outputs are stored in the notebook file, so a printed key travels with
# every copy of the notebook that is shared or committed.
print("PINECONE_API_KEY loaded:", bool(PINECONE_API_KEY))
print(PINECONE_API_ENV)

# Initialise the Pinecone client with the credentials read from the environment.
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to api key in console
)

# Name of the Pinecone index this notebook reads from and writes to.
index_name = "langchainpinecone"


[REDACTED: API key removed from saved output]
us-west4-gcp-free


## Create Embeddings for Each of the Text Chunks

In [45]:
# `docsearch` must exist for the query cells that follow, so this cell always
# defines it instead of being left commented out (a fresh kernel would otherwise
# hit a NameError on the first similarity search).
#
# Set POPULATE_INDEX to True only for the very first run: Pinecone.from_texts
# embeds every chunk and uploads it to the index, and running it again uploads
# everything a second time (the repeated hits in the saved search output further
# down look like the result of that — TODO confirm). On every other run we just
# attach to the index that has already been populated.
#
# When populating, `docs` must still be the chunk list from the split step; later
# cells rebind `docs` to search results.
POPULATE_INDEX = False

if POPULATE_INDEX:
    docsearch = Pinecone.from_texts(
        [chunk.page_content for chunk in docs],
        embeddings,
        index_name=index_name,
    )
else:
    docsearch = Pinecone.from_existing_index(index_name, embeddings)

In [46]:
# Question used to probe the vector index.
query = "What is the BigO?"

In [47]:
# Retrieve only the single closest chunk for the question.
# Note: this rebinds `docs`, which until now held the full list of split chunks.
docs = docsearch.similarity_search(query=query, k=1)

In [48]:
# Display the retrieved document(s) (bare expression, shown as the cell output).
docs

[Document(page_content='Data science, machine learning engineering, and data engineering cover different functions within the big data paradigm — an approach wherein huge velocities, varieties, and volumes of structured, unstructured, and semistructured data are being captured, processed, stored, and analyzed using a set of techniques and technologies that are completely novel compared to those that were used in decades past.', metadata={})]

## Query the Docs to get the Answer Back (Llama 2 Model)

In [49]:
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir --verbose


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Using pip 23.2.1 from /Users/martinpatino/anaconda3/envs/langchain_ai/lib/python3.11/site-packages/pip (python 3.11)
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.1.77.tar.gz (1.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m0m
[?25h  Installing build dependencies ... [?25l  Running command pip subprocess to install build dependencies
  Collecting setuptools>=42
    Obtaining dependency information for setuptools>=42 from https://files.pythonhosted.org/packages/c7/42/be1c7bbdd83e1bfb160c94b9cafd8e25efc7400346cf7ccdbdb452c467fa/setuptools-68.0.0-py3-none-any.whl.metadata
    

## Import All the Required Libraries

In [50]:

from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from huggingface_hub import hf_hub_download
from langchain.chains.question_answering import load_qa_chain
     

In [51]:
# Route generated tokens to stdout through a streaming callback handler.
callback_manager = CallbackManager(handlers=[StreamingStdOutCallbackHandler()])


In [52]:

# Hugging Face repository and the specific weights file to fetch from it.
model_name_or_path, model_basename = (
    "TheBloke/Llama-2-13B-chat-GGML",
    "llama-2-13b-chat.ggmlv3.q5_1.bin",  # weights are shipped as a .bin file
)

In [53]:
# Resolve the weights file from the Hugging Face Hub to a local path
# (the saved load log shows it living under the local huggingface cache directory).
model_path = hf_hub_download(
    filename=model_basename,
    repo_id=model_name_or_path,
)


In [54]:
# Hardware-dependent knobs — adjust to the model and the available GPU memory.
n_gpu_layers = 40  # layers handed to the GPU; the saved load log reports n_layer = 40
n_batch = 256  # keep within 1..n_ctx; bounded by available VRAM

# Instantiate the llama.cpp-backed LLM from the downloaded weights.
llm = LlamaCpp(
    model_path=model_path,
    n_ctx=1024,
    n_batch=n_batch,
    n_gpu_layers=n_gpu_layers,
    max_tokens=256,
    callback_manager=callback_manager,
    verbose=False,
)
  

llama.cpp: loading model from /Users/martinpatino/.cache/huggingface/hub/models--TheBloke--Llama-2-13B-chat-GGML/snapshots/47d28ef5de4f3de523c421f325a2e4e039035bab/llama-2-13b-chat.ggmlv3.q5_1.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 1024
llama_model_load_internal: n_embd     = 5120
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 40
llama_model_load_internal: n_head_kv  = 40
llama_model_load_internal: n_layer    = 40
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 1.0e-06
llama_model_load_internal: n_ff       = 13824
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 9 (mostly Q5_1)
llama_model_load_internal: model size = 13B
llama_model_load_internal: ggml ctx size =    0.11 MB
llama_model_l

In [55]:

# Ask the same question again, this time without `k`, so the vector store's
# default number of results is returned and can be fed to the QA chain.
query = "What is the BigO?"
docs = docsearch.similarity_search(query=query)

In [56]:
# Display the retrieved documents (bare expression, shown as the cell output).
docs


[Document(page_content='Data science, machine learning engineering, and data engineering cover different functions within the big data paradigm — an approach wherein huge velocities, varieties, and volumes of structured, unstructured, and semistructured data are being captured, processed, stored, and analyzed using a set of techniques and technologies that are completely novel compared to those that were used in decades past.', metadata={}),
 Document(page_content='Data science, machine learning engineering, and data engineering cover different functions within the big data paradigm — an approach wherein huge velocities, varieties, and volumes of structured, unstructured, and semistructured data are being captured, processed, stored, and analyzed using a set of techniques and technologies that are completely novel compared to those that were used in decades past.', metadata={}),
 Document(page_content='terms, MapReduce uses parallel distributed computing to transform big data into data

In [57]:
# Build a question-answering chain of type "stuff" on top of the local Llama model.
chain = load_qa_chain(llm=llm, chain_type="stuff")


In [58]:
# Answer the question using the retrieved documents as context;
# the returned answer string is displayed as the cell output.
chain.run(question=query, input_documents=docs)


 The Big O (capital Oh) refers to the order of growth of a function's running time or memory usage as the input size increases, notating which part of the algorithm contributes most to the overall performance.

" The Big O (capital Oh) refers to the order of growth of a function's running time or memory usage as the input size increases, notating which part of the algorithm contributes most to the overall performance."