# Llama 2 + Pinecone + LangChain

In [15]:
# Uncomment the lines below the first time you run this notebook to install the dependencies into your local environment.

# !pip install langchain
# !pip install pypdf
# !pip install unstructured
# !pip install sentence_transformers
# !pip install pinecone-client
# !pip install llama-cpp-python
# !pip install huggingface_hub
# !pip install python-dotenv

## Import Dependencies

In [16]:

from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from sentence_transformers import SentenceTransformer
from langchain.chains.question_answering import load_qa_chain
import pinecone
import os
from os.path import join, dirname
from dotenv import load_dotenv


# Load API keys and other settings from the .env file one directory above
# this notebook; the call's return value is displayed as the cell output.
load_dotenv(dotenv_path='../.env')

True

## Load the Data

In [17]:

# Source PDF, fetched over HTTP by the loader and read into `data`.
PDF_URL = "https://falksangdata.no/wp-content/uploads/2022/11/DataScience4dummies.pdf"

loader = OnlinePDFLoader(PDF_URL)
data = loader.load()

## Split the Text into Chunks

In [18]:
# Splitter configured with a chunk size of 500 and no overlap between
# consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)


In [19]:
# Run the splitter over the loaded PDF content; `docs` holds the resulting chunks.
docs = text_splitter.split_documents(documents=data)


In [20]:
# Number of chunks produced by the splitter (shown as the cell output).
num_chunks = len(docs)
num_chunks


2333

## Set Up the Environment

In [21]:
# Read the service credentials from the process environment (filled in by the
# earlier load_dotenv call). A variable that is not set comes back as None.
(
    PINECONE_API_KEY,
    PINECONE_API_ENV,
    OPENAI_API_KEY,
    HUGGINGFACEHUB_API_TOKEN,
) = (
    os.environ.get(setting)
    for setting in (
        "PINECONE_API_KEY",
        "PINECONE_API_ENV",
        "OPENAI_API_KEY",
        "HUGGINGFACEHUB_API_TOKEN",
    )
)


In [22]:
# Hugging Face model used to turn each text chunk into an embedding vector.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [23]:
# Sanity-check the Pinecone settings without echoing the secret itself:
# notebook outputs are saved with the file, so printing the API key would
# leak it to anyone the notebook is shared with. Only report whether a key
# was loaded; the environment name is not sensitive and is shown as before.
print("PINECONE_API_KEY loaded:", bool(PINECONE_API_KEY))
print(PINECONE_API_ENV)

# Initialise the Pinecone client with the credentials read from the environment.
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV,  # next to api key in console
)

# Name of the Pinecone index the embeddings are stored in.
index_name = "langchainpinecone"


[REDACTED]
us-west4-gcp-free


## Create Embeddings for Each Text Chunk

In [24]:
# Collect the raw text of every chunk, then embed the texts and build the
# Pinecone-backed vector store on the index named by `index_name`.
chunk_texts = [chunk.page_content for chunk in docs]
docsearch = Pinecone.from_texts(chunk_texts, embeddings, index_name=index_name)