In [25]:
import json
import os
import sys
from dotenv import load_dotenv
from pinecone import Pinecone
from tqdm.auto import tqdm
from openai import OpenAI
import logging
import time

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load environment variables
load_dotenv('.env.local')

# Initialize Pinecone
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

# Connect to the index
index_name = os.getenv('PINECONE_INDEX_NAME')

# Check if the index exists, if not create it
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # OpenAI's text-embedding-ada-002 uses 1536 dimensions
        metric='cosine'
    )
    logging.info(f"Created new index: {index_name}")

index = pc.Index(index_name)

# OpenAI client initialization
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

def get_embedding(text, model="text-embedding-ada-002"):
    try:
        text = text.replace("\n", " ")
        response = client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except Exception as e:
        logging.error(f"An error occurred while getting embedding: {e}")
        if "Rate limit" in str(e):
            logging.warning("Rate limit exceeded. Waiting for 60 seconds before retrying.")
            time.sleep(60)
            return get_embedding(text, model)
        raise

# Load and process the JSON data
try:
    with open('data/english-dev.json', 'r') as f:
        data = json.load(f)
except FileNotFoundError:
    logging.error("The data file 'data/english-dev.json' was not found.")
    sys.exit(1)
except json.JSONDecodeError:
    logging.error("Error decoding the JSON file. Please check if it's valid JSON.")
    sys.exit(1)

# Prepare the data for upsert
vectors_to_upsert = []
for i, item in enumerate(tqdm(data)):
    # Combine all text fields into a single string
    text = item['description'] + ' ' + ' '.join(item['utterances'])
    
    try:
        # Get the embedding using OpenAI API
        vector = get_embedding(text)
        
        # Prepare the vector for upsert
        vectors_to_upsert.append((str(i), vector, {"text": text}))

        # Upsert in batches of 100
        if len(vectors_to_upsert) == 100:
            index.upsert(vectors=vectors_to_upsert)
            vectors_to_upsert = []
    except Exception as e:
        logging.error(f"Error processing item {i}: {e}")
        continue

# Upsert any remaining vectors
if vectors_to_upsert:
    try:
        index.upsert(vectors=vectors_to_upsert)
    except Exception as e:
        logging.error(f"Error upserting final batch: {e}")

logging.info("Data upload complete!")

2024-08-15 16:33:58,436 - INFO - Discovering subpackages in _NamespacePath(['c:\\Python310\\lib\\site-packages\\pinecone_plugins'])
2024-08-15 16:33:58,438 - INFO - Looking for plugins in pinecone_plugins.inference
2024-08-15 16:33:58,439 - INFO - Installing plugin inference into Pinecone


  0%|          | 0/60 [00:00<?, ?it/s]

2024-08-15 16:33:58,917 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-08-15 16:33:59,061 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-08-15 16:33:59,210 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-08-15 16:33:59,350 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-08-15 16:33:59,497 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-08-15 16:33:59,618 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-08-15 16:33:59,816 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-08-15 16:33:59,984 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-08-15 16:34:00,129 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-08-15 16:34:00,236 - INFO - HTTP

In [None]:
#####################################
# Upload https://huggingface.co/datasets/ruslanmv/ai-medical-chatbot to pineceone 
#####################################

import os
import time
from dotenv import load_dotenv
from pinecone import Pinecone
from tqdm.auto import tqdm
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

# Load environment variables
load_dotenv('.env.local')

# Initialize Pinecone
api_key = os.getenv('PINECONE_API_KEY')
if not api_key:
    raise ValueError("PINECONE_API_KEY not found in environment variables")

pc = Pinecone(api_key=api_key)

# Connect to the index
index_name = os.getenv('PINECONE_INDEX_NAME')
if not index_name:
    raise ValueError("PINECONE_INDEX_NAME not found in environment variables")

index = pc.Index(index_name)

# Function to safely load dataset
def safe_load_dataset(dataset_name, split="train"):
    try:
        return load_dataset(dataset_name, split=split)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        print("Attempting to load dataset in streaming mode...")
        return load_dataset(dataset_name, split=split, streaming=True)

# Load the dataset from Hugging Face
print("Loading dataset from Hugging Face...")
dataset = safe_load_dataset("ruslanmv/ai-medical-chatbot")
print("Dataset loaded successfully.")

# Initialize the SentenceTransformer model for embeddings
print("Initializing SentenceTransformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to create embeddings
def create_embedding(text):
    return model.encode(text).tolist()

# Prepare and upsert data
batch_size = 100
vectors_to_upsert = []
total_vectors = 0
start_time = time.time()

print(f"Starting data upload in batches of {batch_size}...")
for i, item in enumerate(tqdm(dataset, desc="Processing items")):
    try:
        # Combine all text fields into a single string
        text = f"{item['Description']} {item['Patient']} {item['Doctor']}"
        
        # Create embedding
        vector = create_embedding(text)
        
        # Prepare the vector for upsert
        vectors_to_upsert.append((str(i), vector, {"text": text}))
        
        # Upsert in batches
        if len(vectors_to_upsert) == batch_size:
            index.upsert(vectors=vectors_to_upsert)
            total_vectors += len(vectors_to_upsert)
            vectors_to_upsert = []
    
    except Exception as e:
        print(f"Error processing item {i}: {e}")
        continue

# Upsert any remaining vectors
if vectors_to_upsert:
    index.upsert(vectors=vectors_to_upsert)
    total_vectors += len(vectors_to_upsert)

end_time = time.time()
total_time = end_time - start_time

print(f"Data upload complete!")
print(f"Total vectors uploaded: {total_vectors}")
print(f"Total time taken: {total_time:.2f} seconds")
print(f"Average upload rate: {total_vectors / total_time:.2f} vectors/second")

In [30]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone
from tqdm.auto import tqdm
from openai import OpenAI
import logging
import time
from datasets import load_dataset

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load environment variables
load_dotenv('.env.local')

# Initialize Pinecone
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

# Connect to the secondary index
index_name = os.getenv('PINECONE_INDEX_SECONDARY')

# Check if the index exists, if not create it
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # OpenAI's text-embedding-ada-002 uses 1536 dimensions
        metric='cosine'
    )
    logging.info(f"Created new index: {index_name}")

index = pc.Index(index_name)

# OpenAI client initialization
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

def get_embedding(text, model="text-embedding-ada-002"):
    try:
        text = text.replace("\n", " ")
        response = client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except Exception as e:
        logging.error(f"An error occurred while getting embedding: {e}")
        if "Rate limit" in str(e):
            logging.warning("Rate limit exceeded. Waiting for 60 seconds before retrying.")
            time.sleep(60)
            return get_embedding(text, model)
        raise

# Load the dataset
try:
    dataset = load_dataset("ruslanmv/ai-medical-chatbot", split="train", streaming=True)
    logging.info("Dataset loaded successfully.")
except Exception as e:
    logging.error(f"Error loading dataset: {e}")
    raise

# Prepare the data for upsert
vectors_to_upsert = []
for i, item in enumerate(tqdm(dataset)):
    # Combine all text fields into a single string
    text = f"Description: {item['Description']} Patient: {item['Patient']} Doctor: {item['Doctor']}"
    
    try:
        # Get the embedding using OpenAI API
        vector = get_embedding(text)
        
        # Prepare the vector for upsert
        vectors_to_upsert.append((str(i), vector, {
            "description": item['Description'],
            "patient": item['Patient'],
            "doctor": item['Doctor']
        }))

        # Upsert in batches of 100
        if len(vectors_to_upsert) == 100:
            index.upsert(vectors=vectors_to_upsert)
            vectors_to_upsert = []

        # Optional: Break after processing a certain number of items (e.g., 1000) for testing
        if i >= 1000:
            break

    except Exception as e:
        logging.error(f"Error processing item {i}: {e}")
        continue

# Upsert any remaining vectors
if vectors_to_upsert:
    try:
        index.upsert(vectors=vectors_to_upsert)
    except Exception as e:
        logging.error(f"Error upserting final batch: {e}")

logging.info("Data upload complete!")

IndentationError: expected an indented block after 'if' statement on line 80 (103639978.py, line 83)