In [1]:
# Standard library first, then third-party packages, all at column 0.
import os

from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.autonotebook import tqdm

# Load variables from a local .env file into the process environment.
# This must run before the cells below construct any API client:
# PINECONE_API_KEY is read explicitly via os.getenv, and OpenAI() presumably
# picks up its key from the environment as well (confirm against your setup).
load_dotenv()


In [2]:
# Connect to Pinecone with the key loaded from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Creating an index that already exists is rejected by the service, so only
# create it when it is missing. This keeps the cell safe to re-run
# (Restart Kernel -> Run All) without deleting the index by hand first.
if "rag-diseases" not in pc.list_indexes().names():
    pc.create_index(
        name="rag-diseases",
        dimension=1536,  # must equal the OpenAI embedding size (1536)
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [3]:
import json

# Read the disease records from disk. The context manager closes the file
# handle deterministically (the bare open() call left it to the garbage
# collector), and the explicit encoding avoids depending on the platform's
# locale default when decoding the JSON text.
with open("data_disease.json", encoding="utf-8") as disease_file:
    data = json.load(disease_file)

# Display the list of disease records stored under the "diseases" key.
data['diseases']

[{'disease': 'Influenza',
  'symptoms': ['Fever', 'Cough', 'Sore throat', 'Runny nose', 'Body aches'],
  'description': 'A viral infection that attacks the respiratory system.',
  'treatment': ['Rest', 'Fluids', 'Antiviral medications']},
 {'disease': 'Diabetes Type 2',
  'symptoms': ['Increased thirst',
   'Frequent urination',
   'Fatigue',
   'Blurred vision'],
  'description': 'A chronic condition that affects the way the body processes blood sugar (glucose).',
  'treatment': ['Diet', 'Exercise', 'Insulin therapy', 'Oral medications']},
 {'disease': 'Hypertension',
  'symptoms': ['Headache', 'Shortness of breath', 'Nosebleeds', 'Fatigue'],
  'description': 'A condition in which the force of the blood against the artery walls is too high.',
  'treatment': ['Lifestyle changes', 'Antihypertensive medications']},
 {'disease': 'Asthma',
  'symptoms': ['Wheezing',
   'Shortness of breath',
   'Chest tightness',
   'Coughing'],
  'description': 'A condition in which the airways narrow and

In [4]:
# Embed every disease name with OpenAI and build one Pinecone-ready record
# per disease: {"values": <embedding>, "id": <disease name>, "metadata": {...}}.
#
# NOTE(review): only the disease *name* is embedded; symptoms, description and
# treatment travel as metadata. If queries will be phrased as symptom text,
# embedding a richer string may retrieve better -- confirm intended usage.
processed_data = []
client = OpenAI()

diseases = data['diseases']
embed_batch_size = 100  # names sent per request; keeps each request small

# Send the names in batches instead of one network round trip per disease.
# An empty disease list simply results in zero requests.
for start in range(0, len(diseases), embed_batch_size):
    batch = diseases[start:start + embed_batch_size]
    response = client.embeddings.create(
        input = [entry['disease'] for entry in batch],
        model = "text-embedding-3-small"
    )
    # Each returned item records the position of its input; sort on it so the
    # embeddings line up with `batch` regardless of the response ordering.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    assert len(ordered_items) == len(batch), "embedding count does not match input count"

    for disease, item in zip(batch, ordered_items):
        embedding = item.embedding
        processed_data.append({
            "values": embedding,
            "id": disease["disease"],
            "metadata" : {
                "symptoms": disease["symptoms"],
                "description": disease["description"],
                "treatment": disease["treatment"]
            }
        })

In [13]:
# Spot-check the first assembled record: its "id", embedding "values", and "metadata".
# NOTE(review): this displays the entire embedding list; slice "values" if a
# short preview is enough, to keep the saved notebook output small.
processed_data[0]

{'values': [-0.062478933,
  0.012959809,
  -0.017740643,
  0.02136142,
  -0.017201627,
  0.0034391535,
  -0.010387767,
  0.015162742,
  -0.04173855,
  0.03060671,
  0.026130537,
  0.066978544,
  0.004628503,
  0.034332946,
  -0.010844759,
  0.0157955,
  -0.024466619,
  -0.0018997369,
  -0.017963279,
  0.037824832,
  -0.011202149,
  0.010979513,
  -0.012432511,
  0.0064506102,
  -0.033559576,
  0.01584237,
  -0.023083927,
  0.009069523,
  0.027677277,
  -0.0058910884,
  0.055401426,
  -0.027911631,
  0.04670687,
  0.008096951,
  0.017869538,
  -0.02988021,
  -0.019498302,
  -0.008009068,
  -0.01496354,
  -0.03520006,
  0.03454387,
  0.017963279,
  0.0035182482,
  0.031356644,
  0.0065502105,
  0.024677537,
  0.014201888,
  -0.043214984,
  0.01393238,
  0.054932717,
  -0.014623727,
  0.014401089,
  -0.014459678,
  0.041785423,
  0.011500952,
  0.033629883,
  0.030419225,
  0.04572258,
  -0.02099817,
  -0.030395791,
  0.032856513,
  -0.030372355,
  0.028450647,
  -0.0072122626,
  -0.01694

In [6]:
# Get a handle to the index, then write every prepared record into the
# "ns1" namespace in a single upsert call. The call's result is the cell output.
index = pc.Index('rag-diseases')
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 21}

In [7]:
# Show index statistics (dimension, per-namespace vector counts, total count)
# to confirm the upsert into "ns1" is reflected in the index.
# NOTE(review): counts may lag briefly right after a write -- presumably the
# stats are eventually consistent; confirm before asserting on them.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 21}},
 'total_vector_count': 21}