In [1]:
import json
import os
from dotenv import load_dotenv
from pinecone import Pinecone
from tqdm.auto import tqdm

# Read settings from the local .env.local file before any of them are looked up
load_dotenv('.env.local')

# Build the Pinecone client from the API key found in the environment
pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))

# Open a handle to the index whose name is configured in the environment
index_name = os.environ.get('PINECONE_INDEX_NAME')
index = pc.Index(index_name)

# Placeholder text encoder (swap in a real embedding model for meaningful results)
def simple_encode(text, dim=1536):
    """Encode ``text`` as a constant vector of its mean Unicode code point.

    Every component of the returned vector holds the same number: the
    average of ``ord(c)`` over the characters of ``text``. This is only a
    stand-in for a real embedding.

    Args:
        text: The string to encode.
        dim: Length of the returned vector. Defaults to 1536.
            NOTE(review): presumably this has to match the dimension of the
            target index -- confirm against the index configuration.

    Returns:
        A list of ``dim`` identical floats. Empty ``text`` yields a vector of
        zeros instead of raising ``ZeroDivisionError``.
    """
    if not text:
        # The mean of zero characters is undefined; fall back to 0.0.
        return [0.0] * dim
    mean_code_point = sum(ord(c) for c in text) / len(text)
    return [mean_code_point] * dim

# Number of vectors sent to the index per upsert call
UPSERT_BATCH_SIZE = 100

# Load the dialogue records. The encoding is given explicitly so decoding does
# not depend on the platform's default locale encoding.
with open('data/MedDialouge-english_train.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Encode each record and send it to the index in fixed-size batches
vectors_to_upsert = []
for i, item in enumerate(tqdm(data)):
    # Each record contributes its 'description' plus all of its 'utterances',
    # joined with single spaces into one string
    text = item['description'] + ' ' + ' '.join(item['utterances'])

    # Placeholder encoding (replace with a proper embedding model for better results)
    vector = simple_encode(text)

    # Tuple layout: (id, values, metadata); the id is the record's position in the file
    vectors_to_upsert.append((str(i), vector, {"text": text}))

    # Flush as soon as a full batch has accumulated
    if len(vectors_to_upsert) == UPSERT_BATCH_SIZE:
        index.upsert(vectors=vectors_to_upsert)
        vectors_to_upsert = []

# Upsert whatever is left over after the last full batch
if vectors_to_upsert:
    index.upsert(vectors=vectors_to_upsert)

print("Data upload complete!")

AttributeError: init is no longer a top-level attribute of the pinecone package.

Please create an instance of the Pinecone class instead.

Example:

    import os
    from pinecone import Pinecone, ServerlessSpec

    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY")
    )

    # Now do stuff
    if 'my_index' not in pc.list_indexes().names():
        pc.create_index(
            name='my_index', 
            dimension=1536, 
            metric='euclidean',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-west-2'
            )
        )



In [3]:
import requests
import json
import os
import xml.etree.ElementTree as ET
from tqdm import tqdm
import time
import re

# Root of the API; request paths such as "GHO" and "GHO/<indicator>" are appended to it
BASE_URL = "https://apps.who.int/gho/athena/api/"

def get_indicators(timeout=30):
    """Fetch the list of indicator codes from the ``GHO`` endpoint.

    The response body is parsed as XML and the text of every ``Code`` element
    found anywhere in the document is returned.

    NOTE(review): this reads each element's text content. If the service
    carries the identifier in an attribute instead, the values returned here
    may be blank or ``None`` -- verify against a real response.

    Args:
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        A list with the text of each ``Code`` element, or an empty list when
        the request fails or the server answers with a non-200 status.
    """
    try:
        response = requests.get(f"{BASE_URL}GHO", timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures follow the same print-and-return-[] path as
        # a bad HTTP status.
        print(f"Failed to fetch indicators. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch indicators. Status code: {response.status_code}")
        return []

    root = ET.fromstring(response.content)
    return [code.text for code in root.findall('.//Code')]

def xml_to_dict(element):
    """Recursively convert the children of an XML element into a dict.

    Each child contributes one entry keyed by its tag. A child without
    sub-elements maps to its text (which may be ``None``); a child with
    sub-elements maps to the dict produced by converting it recursively.
    Attributes of elements are not included.

    When several siblings share a tag, their values are collected into a list
    in document order, so repeated elements are not silently overwritten by
    the last one.

    Args:
        element: An ``xml.etree.ElementTree.Element`` (or any iterable of
            objects exposing ``tag``, ``text`` and ``len()``).

    Returns:
        A dict mapping child tags to text, nested dicts, or lists of those.
    """
    result = {}
    repeated_tags = set()  # tags whose entry has already been promoted to a list
    for child in element:
        value = child.text if len(child) == 0 else xml_to_dict(child)
        tag = child.tag
        if tag not in result:
            result[tag] = value
        elif tag in repeated_tags:
            result[tag].append(value)
        else:
            # Second occurrence: promote the single value to a list
            result[tag] = [result[tag], value]
            repeated_tags.add(tag)
    return result

def get_data(indicator, timeout=30):
    """Download the data for one indicator from ``GHO/<indicator>``.

    The payload is decoded according to the response's ``Content-Type``
    header: JSON bodies are parsed with ``response.json()``, XML bodies are
    parsed and converted with ``xml_to_dict``.

    Args:
        indicator: Indicator code appended to the ``GHO/`` path.
        timeout: Seconds to wait for the server before giving up, so one
            stalled request cannot hang a long download run.

    Returns:
        The decoded payload, or ``None`` when the request fails, the status is
        not 200, or the content type is neither JSON nor XML.
    """
    url = f"{BASE_URL}GHO/{indicator}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # A network error is reported and skipped like any other failed
        # indicator rather than aborting the caller's loop.
        print(f"Failed to fetch data for {indicator}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch data for {indicator}. Status code: {response.status_code}")
        return None

    content_type = response.headers.get('Content-Type', '')
    if 'json' in content_type:
        return response.json()
    if 'xml' in content_type:
        root = ET.fromstring(response.content)
        return xml_to_dict(root)

    print(f"Unknown content type for {indicator}: {content_type}")
    return None

def save_data(data, filename):
    """Write ``data`` to ``filename`` as UTF-8 JSON indented by two spaces.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes.
    An existing file at ``filename`` is overwritten.
    """
    with open(filename, 'w', encoding='utf-8') as out_file:
        json.dump(
            data,
            out_file,
            indent=2,
            ensure_ascii=False,
        )

def sanitize_filename(filename):
    """Return ``filename`` made safe for use as a file name.

    The characters ``\\ / * ? : " < > |`` and newline, carriage return and
    tab are dropped; each remaining space becomes an underscore.
    """
    stripped = re.sub(r'[\\/*?:"<>|\n\r\t]', '', filename)
    underscored = stripped.replace(' ', '_')
    return underscored

def main():
    """Download every indicator's data into the ``who_gho_data`` directory.

    Fetches the indicator list, then requests each indicator in turn and
    writes every non-empty result to ``who_gho_data/<sanitized name>.json``.
    Indicators whose download returns nothing are skipped. The loop pauses
    one second after each request.

    NOTE(review): two indicators that sanitize to the same name would write
    to the same file, the later one overwriting the earlier -- confirm this
    cannot occur with the real indicator list.
    """
    output_dir = 'who_gho_data'
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(output_dir, exist_ok=True)

    indicators = get_indicators()
    print(f"Found {len(indicators)} indicators")

    for indicator in tqdm(indicators, desc="Downloading data"):
        data = get_data(indicator)
        if data:
            safe_indicator = sanitize_filename(indicator)
            filename = os.path.join(output_dir, f"{safe_indicator}.json")
            save_data(data, filename)
        time.sleep(1)  # Add a delay to avoid overwhelming the server

    print("Download complete. Data saved in 'who_gho_data' directory.")

# Start the download when this code runs as the main module; the recorded
# output below shows that main() ran when this cell was executed.
if __name__ == "__main__":
    main()

Found 3145 indicators


Downloading data: 100%|██████████| 3145/3145 [1:15:49<00:00,  1.45s/it]

Download complete. Data saved in 'who_gho_data' directory.



