In [None]:
!pip install ctransformers==0.2.5
!pip install sentence-transformers==2.2.2
!pip install pinecone-client
!pip install flask
!pip install pypdf
!pip install python-dotenv
!pip install langchain==0.0.225

In [1]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
#from langchain.vectorstores import Pinecone
from pinecone import Pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

  from tqdm.autonotebook import tqdm


In [2]:
PINECONE_API_KEY = "1ac95287-4650-4ecf-adad-25ac642dd101"
PINECONE_API_ENV = "gcp-starter"

In [3]:
#Extract data from the PDF
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()

    return documents

In [5]:
extracted_data = load_pdf("data/")

In [6]:
#Create text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 800, chunk_overlap = 50)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [29]:
print(type(text_split(extracted_data)))

<class 'list'>


In [7]:
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))


length of my chunk: 5370


In [8]:
print(text_chunks)

[Document(page_content='FUNDAMENT ALS OF\nDatabase \nSystems\nSEVENTH EDITION', metadata={'source': 'data\\Azal2020-01-22-12-28-11-76901 book.pdf', 'page': 1}), Document(page_content='This page intentionally left blank', metadata={'source': 'data\\Azal2020-01-22-12-28-11-76901 book.pdf', 'page': 2}), Document(page_content='FUNDAMENT ALS OF \nDatabase \nSystems\nSEVENTH EDITION\nRamez Elmasri\nDepartment of Computer Science and Engineering\nThe University of Texas at Arlington\nShamkant B. Navathe\nCollege of ComputingGeorgia Institute of Technology\nBoston  Columbus  Indianapolis  New Y ork  San Francisco  Hoboken \nAmsterdam  Cape Town  Dubai  London  Madrid  Milan  Munich  Paris  Montreal  Toronto \nDelhi  Mexico City  São Paulo  Sydney  Hong Kong  Seoul  Singapore  Taipei  Tokyo', metadata={'source': 'data\\Azal2020-01-22-12-28-11-76901 book.pdf', 'page': 3}), Document(page_content='Vice President and Editorial Director, ECS:  \n Marcia J. Horton\nAcquisitions Editor: Matt Goldstein

In [9]:
for index, text in enumerate(text_chunks):
    print(index, text)
    #print (text.page_content)

0 page_content='FUNDAMENT ALS OF\nDatabase \nSystems\nSEVENTH EDITION' metadata={'source': 'data\\Azal2020-01-22-12-28-11-76901 book.pdf', 'page': 1}
1 page_content='This page intentionally left blank' metadata={'source': 'data\\Azal2020-01-22-12-28-11-76901 book.pdf', 'page': 2}
2 page_content='FUNDAMENT ALS OF \nDatabase \nSystems\nSEVENTH EDITION\nRamez Elmasri\nDepartment of Computer Science and Engineering\nThe University of Texas at Arlington\nShamkant B. Navathe\nCollege of ComputingGeorgia Institute of Technology\nBoston  Columbus  Indianapolis  New Y ork  San Francisco  Hoboken \nAmsterdam  Cape Town  Dubai  London  Madrid  Milan  Munich  Paris  Montreal  Toronto \nDelhi  Mexico City  São Paulo  Sydney  Hong Kong  Seoul  Singapore  Taipei  Tokyo' metadata={'source': 'data\\Azal2020-01-22-12-28-11-76901 book.pdf', 'page': 3}
3 page_content='Vice President and Editorial Director, ECS:  \n Marcia J. Horton\nAcquisitions Editor: Matt Goldstein\nEditorial Assistant: Kelsey Loanes\n

In [13]:
pc_all = []
# Iterate through the documents and access metadata
for d in text_chunks:
  pc = d.page_content
  pc_all.append(pc)

print(f"Content of first page: {pc_all[2]}")

Content of first page: 7


In [10]:
#download embedding model
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [11]:
embeddings = download_hugging_face_embeddings()

In [12]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [14]:
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))
print(query_result)

Length 384
[-0.03447723761200905, 0.031023172661662102, 0.006734967231750488, 0.02610894665122032, -0.03936203941702843, -0.1603025197982788, 0.06692396849393845, -0.006441415287554264, -0.0474504753947258, 0.014758882112801075, 0.07087530195713043, 0.05552757531404495, 0.019193356856703758, -0.02625138871371746, -0.010109545662999153, -0.026940535753965378, 0.02230738289654255, -0.022226667031645775, -0.14969263970851898, -0.01749306730926037, 0.007676335051655769, 0.05435227230191231, 0.003254442475736141, 0.03172587975859642, -0.08462145179510117, -0.02940605953335762, 0.05159563571214676, 0.04812406376004219, -0.0033148345537483692, -0.058279212564229965, 0.04196925461292267, 0.022210633382201195, 0.1281888782978058, -0.02233893983066082, -0.011656169779598713, 0.06292835623025894, -0.03287630155682564, -0.09122607856988907, -0.031175361946225166, 0.05269959196448326, 0.04703481122851372, -0.0842030867934227, -0.030056176707148552, -0.02074478380382061, 0.009517849422991276, -0.003

In [15]:
from tqdm.auto import tqdm
# Get API key from environment variable
api_key = PINECONE_API_KEY

# Create a Pinecone instance
pc = Pinecone(api_key=api_key)

index_name = "personal-data-engineer"

index = pc.Index(index_name)
#docsearch=Pinecone.from_texts([t.page_content for t in text_chunks], embeddings, index_name=index_name)
for i, text in tqdm(enumerate(text_chunks)):
    # Use the index as the ID
    vector_id = f"doc_{i}"  # Create a unique ID using f-string
    embed = embeddings.embed_query(text.page_content)
    
    # Create metadata dictionary
    metadata = {
        'content': text.page_content,  # Assuming 'page_content' as 'title'
        'source': text.metadata['source'],  # Assuming 'source' from metadata
        'page': text.metadata['page']  # Assuming 'page' from metadata
    }
    result = [vector_id, embed, metadata]
    #print(result)
    #print(result[0])
    #print(result[1])
    #print(result[2])
    
    # Append metadata to metadatas list
    #metadatas.append(metadata)
    
    # Add everything to pinecone
    index.upsert(vectors=[(vector_id, embed, metadata)])


4666it [3:41:16,  2.85s/it]


ProtocolError: Failed to connect; did you specify the correct index name?

In [None]:
#import os
#from pinecone import Pinecone

# Get API key from environment variable
api_key = PINECONE_API_KEY

# Create a Pinecone instance
pc = Pinecone(api_key=api_key)

index_name = "personal-data-engineer"

index = pc.Index(index_name)



for i, text in enumerate(text_chunks):
  # Use the index as the ID
  print(text)
  vector_id = f"doc_{i}"  # Create a unique ID using f-string
  print(vector_id)
  print(text)
  embed = embeddings.embed_query(text.page_content)
  print(embed)
  #print(list(zip(vector_id, embed)))
  result = [vector_id, embed]
  print(result)
  print(result[0])
  print(result[1])
  #index.upsert(vectors=[(result[0], result[1])])


In [12]:
%pip install --upgrade --quiet  langchain-pinecone langchain-openai langchain

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\megas\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [26]:
from langchain.vectorstores import Pinecone

text_field = "text"  # the metadata field that contains our text

# initialize the vector store object
vectorstore = Pinecone(
    index, embed.embed_query, text_field
)

PineconeConfigurationError: You haven't specified an Api-Key.

In [11]:
docsearch=Pinecone.from_existing_index(index_name, embeddings)

query = "What are Allergies"

docs=docsearch.similarity_search(query, k=3)

print("Result", docs)

AttributeError: type object 'Pinecone' has no attribute 'from_existing_index'

In [71]:
# Define two lists
list1 = ['a', 'b', 'c']
list2 = [1, 2, 3]

# Use zip to combine elements from both lists
combined = zip(list1, list2)

# Convert the result to a list (optional, depending on use case)
combined_list = list(combined)
print(combined)
# Print the combined list
print(combined_list)


<zip object at 0x000002569E2DA9C0>
[('a', 1), ('b', 2), ('c', 3)]


In [35]:
#Initializing the Pinecone
pinecone.init(api_key=PINECONE_API_KEY,
              environment=PINECONE_API_ENV)

index_name="personal-data-engineer"

#Creating Embeddings for Each of The Text Chunks & storing
docsearch=Pinecone.from_texts([t.page_content for t in text_chunks], embeddings, index_name=index_name)

AttributeError: init is no longer a top-level attribute of the pinecone package.

Please create an instance of the Pinecone class instead.

Example:

    import os
    from pinecone import Pinecone, ServerlessSpec

    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY")
    )

    # Now do stuff
    if 'my_index' not in pc.list_indexes().names():
        pc.create_index(
            name='my_index', 
            dimension=1536, 
            metric='euclidean',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-west-2'
            )
        )

