In [7]:
from pinecone import Pinecone, ServerlessSpec
from sklearn.feature_extraction.text import TfidfVectorizer
from dotenv import load_dotenv
import os


load_dotenv()

# Credentials are read only from the environment (populated from .env by
# load_dotenv above). Never paste a literal key into the notebook, not even in
# a commented-out line: notebooks get shared and committed.
# NOTE(review): an earlier revision of this cell carried a literal API key in a
# comment; treat that key as exposed and rotate it.
api_key = os.getenv("PINECONE_API_KEY")
environment = os.getenv("PINECONE_ENV")

# Fail early with an actionable message instead of a client-side error later.
if not api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your environment or .env file."
    )

# Create the Pinecone client used by every following cell.
pc = Pinecone(api_key=api_key)
# Kept so that any code reading pc.environment keeps working.
# NOTE(review): the index-creation cell passes the region explicitly through
# ServerlessSpec, so this attribute may no longer be needed; confirm.
pc.environment = environment

In [3]:
# Name of the index and the vector length it accepts. The dimension must match
# the length of the vectors that are upserted into the index.
index_name = "pinecone"
dimension = 18

# Create the index only when it is missing. list_indexes() returns index
# descriptions rather than bare strings, so the membership test has to be made
# against the list of names; comparing the string to the raw result would not
# detect an existing index and create_index would then fail on a re-run.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
else:
    print(f"Index '{index_name}' already exists.")


In [4]:
# Open a handle to the index; all reads and writes below go through it.
index = pc.Index(index_name)

# Ask the service for the index statistics and keep the reported dimension,
# which the vectorizer cell uses to size the document vectors.
index_info = index.describe_index_stats()
expected_dim = index_info["dimension"]
print(
    f"Connected to index '{index_name}' with dimension: {expected_dim}"
)


Connected to index 'pinecone' with dimension: 18


In [5]:
# Documents to be indexed, keyed by the ID each one is stored under in Pinecone.
documents = {
    "doc1": "Machine learning is fascinating.",
    "doc2": "Artificial intelligence is evolving rapidly.",
    "doc3": "Deep learning improves neural networks.",
    "doc4": "Python programming is essential in AI development.",
    "doc5": "region of success."
}

# Fit TF-IDF on the corpus, capping the vocabulary at the index dimension.
vectorizer = TfidfVectorizer(max_features=expected_dim)
doc_ids = list(documents.keys())
doc_vectors = vectorizer.fit_transform(documents.values()).toarray()

# max_features is only an upper bound: a corpus with a smaller vocabulary
# yields shorter vectors, which the index would reject. Surface that here with
# a clear message rather than as an API error in the middle of the upsert.
if doc_vectors.shape[1] != expected_dim:
    raise ValueError(
        f"TF-IDF produced {doc_vectors.shape[1]}-dimensional vectors but index "
        f"'{index_name}' expects {expected_dim}."
    )

# Determine which IDs are already stored by fetching them by ID. A similarity
# query cannot answer this: it returns the nearest vector whatever its ID, so
# it would report "exists" as soon as any document is in the namespace.
fetch_response = index.fetch(ids=doc_ids, namespace="ns-1")
existing_ids = set(fetch_response.vectors or {})

# Report per document whether it is new or replaced, and collect the payload.
vectors_to_upsert = []
for doc_id, doc_vector in zip(doc_ids, doc_vectors):
    if doc_id in existing_ids:
        print(f"Updating document ID '{doc_id}' in the index.")
    else:
        print(f"Inserting new document ID '{doc_id}' into the index.")
    # tolist() turns the NumPy row into plain Python floats for the request.
    vectors_to_upsert.append({"id": doc_id, "values": doc_vector.tolist()})

# Upsert (insert or update) all documents in a single request.
index.upsert(vectors=vectors_to_upsert, namespace="ns-1")

print("All documents have been upserted successfully.")


Inserting new document ID 'doc1' into the index.
Inserting new document ID 'doc2' into the index.
Inserting new document ID 'doc3' into the index.
Inserting new document ID 'doc4' into the index.
Inserting new document ID 'doc5' into the index.
All documents have been upserted successfully.


In [6]:
# Free-text query to look up in the index.
query = "AI"

# Embed the query with the already-fitted vectorizer so it lives in the same
# feature space as the documents, then take the single row as a plain list.
query_vector = vectorizer.transform([query]).toarray()[0].tolist()

# Ask Pinecone for the single closest document in the namespace.
search_results = index.query(
    vector=query_vector,
    top_k=1,
    namespace="ns-1",
)

# Show each hit together with the locally stored text for its ID.
for match in search_results["matches"]:
    doc_id, score = match["id"], match["score"]
    content = documents.get(doc_id, "Document content not found.")
    print(f"Document ID: {doc_id}\nScore: {score}\nContent: {content}\n")


Document ID: doc4
Score: 0.393795
Content: Python programming is essential in AI development.



In [6]:
# Replace the local text of doc4 with its new content.
documents["doc4"] = "Python is widely used for AI and ML development."

# Re-embed with transform (not fit_transform) so the new vector stays in the
# feature space the index was populated with.
updated_vector = vectorizer.transform([documents["doc4"]]).toarray()

# Upsert under the same ID, which overwrites the stored vector. The row is
# converted with tolist() so the payload holds plain Python floats, matching
# how every other cell in this notebook sends vectors.
index.upsert(
    vectors=[{"id": "doc4", "values": updated_vector[0].tolist()}],
    namespace="ns-1",
)
print("Document 'doc4' has been updated successfully.")


Document 'doc4' has been updated successfully.


In [7]:
# Document ID to be deleted
doc_id_to_delete = "doc5"

# Check for the document by ID. A similarity query is not an existence check:
# it returns the nearest stored vector whatever its ID, so it would report
# "found" whenever the namespace holds any document. Fetching by ID also avoids
# needing the document's text in the local `documents` dict.
fetch_response = index.fetch(ids=[doc_id_to_delete], namespace="ns-1")

if doc_id_to_delete in (fetch_response.vectors or {}):
    # The ID is present in the index, so delete it.
    index.delete(ids=[doc_id_to_delete], namespace="ns-1")
    print(f"Document ID '{doc_id_to_delete}' has been successfully deleted.")
else:
    print(f"Document ID '{doc_id_to_delete}' not found in the index. No deletion performed.")


Document ID 'doc5' has been successfully deleted.
