In [4]:
from pinecone_datasets import load_dataset

# Load the pre-embedded Quora question dataset
dataset = load_dataset('quora_all-MiniLM-L6-bm25')

# Discard the two columns this demo does not use: the original `metadata`
# column (its name is reused for `blob` just below) and `sparse_values`.
dataset.documents.drop(columns=['metadata', 'sparse_values'], inplace=True)

# Expose the `blob` column under the name `metadata`
dataset.documents.rename(columns={'blob': 'metadata'}, inplace=True)

# To speed things up in this demo, keep only the 80K rows at positions 240K -> 320K.
# The tail is trimmed first so that the leading positions are still valid
# when the head is trimmed.
window_start, window_end = 240_000, 320_000
dataset.documents.drop(dataset.documents.index[window_end:], inplace=True)
dataset.documents.drop(dataset.documents.index[:window_start], inplace=True)
print(dataset.head())

Loading documents parquet files:   0%|          | 0/10 [00:00<?, ?it/s]

            id                                             values  \
240000  515997  [-0.00531694, 0.06937869, -0.0092854, 0.003286...   
240001  515998  [-0.09243751, 0.065432355, -0.06946959, 0.0669...   
240002  515999  [-0.021924071, 0.032280188, -0.020190848, 0.07...   
240003  516000  [-0.120020054, 0.024080949, 0.10693012, -0.018...   
240004  516001  [-0.095293395, -0.048446465, -0.017618902, -0....   

                                                 metadata  
240000  {'text': ' Why is a "law of sciences" importan...  
240001  {'text': ' Is it possible to format a BitLocke...  
240002  {'text': ' Can formatting a hard drive stress ...  
240003  {'text': ' Are the new Samsung Galaxy J7 and J...  
240004  {'text': ' I just watched an add for Indonesia...  


In [5]:
# Confirm how many rows remain in the dataset
row_count = len(dataset)
print(f"Rows in dataset: {row_count}")

Rows in dataset: 80000


In [6]:
from pinecone import Pinecone
from dotenv import load_dotenv
import os

# Pull variables from the environment (and from a .env file, via python-dotenv)
load_dotenv()

# Look up the Pinecone API key; the lookup yields None when the variable is unset
api_key = os.environ.get("PINECONE_API_KEY")

if api_key:
    # Build the Pinecone client used by the rest of the notebook
    pc = Pinecone(api_key=api_key)
    print("Successfully connected to Pinecone!")
else:
    # Stop right away instead of failing later with a less obvious error
    raise ValueError("PINECONE_API_KEY is missing from environment variables!")


Successfully connected to Pinecone!


In [7]:
# Take the first record as a plain dict and measure its embedding vector;
# the index created later must use the same dimensionality.
first_rows = dataset.documents.iloc[0:1]
row1 = first_rows.to_dict(orient="records")[0]
embedding = row1['values']
dimension = len(embedding)
print(f"These embeddings have dimension {dimension}")

These embeddings have dimension 384


In [8]:
from pinecone import ServerlessSpec

index_name = 'gen-qa-openai-fast'

# Serverless deployment target for the index
serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')

# Create the index only when it is not there yet (it should not exist the
# first time this demo is run)
index_exists = pc.has_index(name=index_name)
if not index_exists:
    pc.create_index(
        name=index_name,
        dimension=dimension,  # measured from the dataset's own embeddings
        metric='cosine',
        spec=serverless_spec,
    )

# Instantiate a client bound to this index
index = pc.Index(name=index_name)

# Show the index stats as the cell output
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

In [9]:
from tqdm import tqdm

batch_size = 100

# Starting offset of every batch; the final batch may hold fewer rows
total_rows = len(dataset.documents)
batch_starts = range(0, total_rows, batch_size)

for start in tqdm(batch_starts, desc="Upserting records batch"):
    stop = start + batch_size
    # Convert the slice of rows into a list of record dicts for the upsert call
    batch = dataset.documents.iloc[start:stop].to_dict(orient="records")
    index.upsert(vectors=batch)


Upserting records batch: 100%|██████████| 800/800 [04:26<00:00,  3.00it/s]


In [22]:
from sentence_transformers import SentenceTransformer
import torch

# Prefer the GPU when CUDA is available, otherwise run on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Sentence-embedding model used to encode queries
model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
    device=device,
)
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [23]:
def find_similar_questions(question, top_k=5):
    """Return the indexed questions most similar to `question`.

    Args:
        question: Text to search for. It is embedded with the notebook-level
            `model`.
        top_k: Number of matches to request from the index. Defaults to 5,
            the value that used to be hard-coded, so existing calls behave
            the same.

    Returns:
        The response of `index.query`, requested with metadata included.
    """
    # Embed the question and convert the result to a plain Python list
    query_vector = model.encode(question).tolist()

    # Query Pinecone for the closest stored vectors, metadata included
    return index.query(vector=query_vector, top_k=top_k, include_metadata=True)

In [25]:
# Question to look up; adjacent string literals are joined into one string
query = (
    "Which training method should I use for sentence transformers when "
    "I only have pairs of related sentences?"
)

# NOTE: `xq` holds the query *response* returned by find_similar_questions
xq = find_similar_questions(query)
xq


{'matches': [{'id': '522021',
              'metadata': {'text': ' How can I combine those sentences as a '
                                   'sentence?'},
              'score': 0.543346882,
              'values': []},
             {'id': '536966',
              'metadata': {'text': ' What are the best tools to use for '
                                   'Natural Language Processing currently?'},
              'score': 0.483892381,
              'values': []},
             {'id': '61616',
              'metadata': {'text': ' I know the basics of English and I can '
                                   "speak it normally, but I don't know about "
                                   'sentence structure. What can help me to '
                                   'know sentence structure?'},
              'score': 0.47395274,
              'values': []},
             {'id': '101891',
              'metadata': {'text': ' How do I learn Natural Language '
                                   'P

In [27]:
def print_query_results(results):
    """Print one "<score>: <text>" line per entry in `results['matches']`.

    Each match must provide a `score` (shown rounded to two decimals) and a
    `metadata` mapping with a `text` entry.
    """
    for match in results['matches']:
        score = round(match['score'], 2)
        text = match['metadata']['text']
        print(f"{score}: {text}")

# Print the matches held in `xq`, one "score: text" line per match
print_query_results(xq)

0.54:  How can I combine those sentences as a sentence?
0.48:  What are the best tools to use for Natural Language Processing currently?
0.47:  I know the basics of English and I can speak it normally, but I don't know about sentence structure. What can help me to know sentence structure?
0.45:  How do I learn Natural Language Processing?
0.45:  What is the best way to learn phrasal verbs?


In [48]:
import os
import openai
from dotenv import load_dotenv
from openai import OpenAI
# Load environment variables (including any defined in a .env file)
load_dotenv()

# Retrieve the OpenAI API key
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Never print the key itself: cell outputs are saved inside the notebook file
# and would leak the secret to anyone the notebook is shared with.
# Fail fast when it is missing and otherwise only confirm that it was found.
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY is missing from environment variables!")

print("OpenAI API key loaded.")

def generate_answer(query, retrieved_docs):
    """Answer `query` with an OpenAI chat model, using retrieved text as context.

    Args:
        query: The question to answer; inserted verbatim into the prompt.
        retrieved_docs: Iterable of matches, each providing
            `["metadata"]["text"]`.

    Returns:
        The message content of the first choice in the chat completion.
    """
    # One retrieved text per line
    passages = [doc["metadata"]["text"] for doc in retrieved_docs]
    context = "\n".join(passages)

    # Assembled piece by piece so the exact whitespace sent to the model is
    # visible in the code.
    prompt = (
        "\n"
        "    You are an AI assistant. Use the following context to answer the query.\n"
        "    \n"
        "    Context:\n"
        f"    {context}\n"
        "    \n"
        "    Query:\n"
        f"    {query}\n"
        "    \n"
        "    Answer:\n"
        "    "
    )

    # The whole prompt is sent as a single system message
    chat_messages = [{"role": "system", "content": prompt}]

    client = OpenAI(api_key=OPENAI_API_KEY)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=chat_messages,
        temperature=0.5,
    )

    first_choice = response.choices[0]
    return first_choice.message.content


[REDACTED: an OpenAI API key was printed in this cell output. Revoke and rotate that key, and clear the output before sharing the notebook.]


In [49]:
# Pass the retrieved matches to the model as context for the question
retrieved_matches = xq['matches']
answer = generate_answer(query, retrieved_matches)
print(answer)

When you only have pairs of related sentences and want to train sentence transformers, one effective method is to use a supervised learning approach with contrastive loss functions, such as the Siamese network architecture. This involves using a dataset of sentence pairs where each pair is labeled with a similarity score or a binary label indicating whether they are related. You can then train the model to minimize the distance between embeddings of related sentence pairs while maximizing the distance between unrelated pairs. Popular frameworks like Hugging Face's Transformers library provide tools and pre-trained models that can be fine-tuned for this purpose using your dataset.
