In [1]:
!pip install pinecone-client cohere datasets transformers



Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting cohere
  Downloading cohere-5.9.4-py3-none-any.whl.metadata (3.4 kB)
Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting boto3<2.0.0,>=1.34.0 (from cohere)
  Downloading boto3-1.35.24-py3-none-any.whl.metadata (6.6 kB)
Collecting fastavro<2.0.0,>=1.9.4 (from cohere)
  Downloading fastavro-1.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting httpx>=0.21.2 (from cohere)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx-sse==0.4.0 (from cohere)
  Downloading httpx_sse-

In [27]:
from pinecone import Pinecone, ServerlessSpec
from google.colab import userdata
userdata.get('pinecode_api')

pc = Pinecone(api_key="api_key")
index_name = "quickstart"

# Check if the index exists and delete it if needed
if index_name in pc.list_indexes():
    pc.delete_index(index_name)
    print(f"Index '{index_name}' deleted.")

# Create the index with the correct dimension
pc.create_index(
    name=index_name,
    dimension=384,  # Set to 384 to match your embeddings
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

print(f"Index '{index_name}' created with dimension 384.")

# Now connect to the newly created index
index = pc.Index(index_name)



Index 'quickstart' created with dimension 384.


In [4]:
!pip install transformers torch sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.1.1


In [36]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained model from Hugging Face
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Example document list (you can load any dataset here)
documents = [
    "Artificial Intelligence (AI) is transforming the way businesses operate. Companies are leveraging AI to enhance decision-making, streamline operations, and personalize customer experiences. For instance, AI-powered chatbots provide 24/7 customer service, while machine learning algorithms analyze data to identify trends.",
    "In today's digital age, data security is paramount. Organizations must protect sensitive information from cyber threats. Implementing robust security measures such as encryption, firewalls, and regular security audits can mitigate risks. Additionally, employee training on data protection practices is essential.",
    "Renewable energy sources like solar, wind, and hydroelectric power are vital for sustainable development. They reduce reliance on fossil fuels, decrease greenhouse gas emissions, and promote energy independence. Governments and businesses are investing in renewable technologies to combat climate change and enhance energy security.",
    "E-commerce has revolutionized the retail landscape. Online shopping offers convenience, variety, and competitive pricing. Businesses are increasingly adopting e-commerce platforms to reach a global audience. Effective digital marketing strategies, including SEO and social media marketing, are crucial for success in this space.",
    "Remote work is becoming a permanent feature of the modern workplace. The COVID-19 pandemic accelerated this trend, highlighting the benefits of flexibility and work-life balance. Companies are adopting hybrid models, allowing employees to work from home or the office. This shift requires robust communication tools and management practices to ensure productivity.",
    "Healthcare technology is advancing rapidly, with innovations like telemedicine, wearable devices, and artificial intelligence transforming patient care. Telemedicine allows patients to consult healthcare providers remotely, while wearables monitor vital signs. AI is being used for diagnostics, personalized treatment plans, and predictive analytics.",
    "Mental health awareness is crucial for reducing stigma and promoting well-being. Public campaigns, workplace initiatives, and educational programs can help individuals recognize mental health issues and seek help. Employers are increasingly prioritizing mental health support as part of their employee wellness programs.",
    "The exploration of space is one of humanity's greatest endeavors. Missions to Mars, the study of exoplanets, and the search for extraterrestrial life capture the imagination. Advances in rocket technology and international collaborations are paving the way for future discoveries, expanding our understanding of the universe."
]

# Generate embeddings for each document
embeddings = model.encode(documents)

print(f"Generated embeddings shape: {embeddings.shape}")
print(embeddings)


Generated embeddings shape: (8, 384)
[[-0.05545413 -0.02868909  0.03191197 ...  0.02552249  0.06639387
  -0.04507197]
 [-0.06483939  0.14133471  0.01476007 ... -0.04766766  0.01049874
  -0.01045628]
 [-0.02017534  0.11678743  0.06168872 ...  0.01414315  0.04522388
   0.0090406 ]
 ...
 [-0.06970906  0.03562601  0.00828199 ... -0.0110762   0.05770411
  -0.04286506]
 [ 0.03848302  0.04813217  0.0199913  ... -0.02255969  0.02538407
   0.05896911]
 [-0.02541891  0.01361316  0.00559285 ... -0.01891292 -0.07098498
  -0.02272352]]


In [42]:

# Upsert embeddings with metadata
for i, embedding in enumerate(embeddings):
    index.upsert(vectors=[(f"doc_{i}", embedding.tolist(), {"text": documents[i]})])  # Storing original text


Available indexes: {'indexes': [{'deletion_protection': 'disabled',
              'dimension': 8,
              'host': 'quickstart-hciszb3.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'quickstart',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}
Index 'quickstart' does not exist.


In [39]:
def query_index(query):

    query_embedding = model.encode([query]).tolist()[0]


    results = index.query(
        vector=query_embedding,
        top_k=3,
        include_values=True,
        include_metadata=True
    )

    return results


In [44]:
def generate_answer(context, query):
    return f"Answer based on context: {context} for query: {query}"

query = "What is ai?"
results = query_index(query)


if results['matches']:
    # Get the top match
    top_match = results['matches'][0]
    context = top_match['metadata']
    answer = generate_answer(context, query)
    print(answer)
else:
    print("No matches found.")


Answer based on context: {'text': 'Artificial Intelligence (AI) is transforming the way businesses operate. Companies are leveraging AI to enhance decision-making, streamline operations, and personalize customer experiences. For instance, AI-powered chatbots provide 24/7 customer service, while machine learning algorithms analyze data to identify trends.'} for query: What is ai?
