In [None]:
!pip install pandas couchdb requests


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
import pandas as pd
import couchdb
import requests
from io import StringIO
import math

# Step 1: Connect to CouchDB

In [None]:

# Open the target CouchDB database, creating it on first use.
# NOTE(review): `<db_name>` and `<couch server name>` are redacted placeholders,
# not valid Python; substitute real values before running (presumably read from
# environment variables rather than hard-coded, if the server URL carries
# credentials; confirm).
db_name = <db_name>  
couch = couchdb.Server(<couch server name>)
# Membership test on the server object decides between reuse and creation.
if db_name in couch:


    db = couch[db_name]
    print(f" Connected to existing database: {db_name}")
else:
    db = couch.create(db_name)
    print(f" Created new database: {db_name}")

✅ Connected to existing database: oecd_health_expenditure


# Step 2: Fetch First Dataset (Health Protection that was uploaded earlier)


In [None]:
# SDMX REST query against the DSD_HEALTH_PROT@DF_HEALTH_PROT dataflow:
# every series ("all"), observations from 2010 onward, returned as CSV with
# label columns.
url_protection = (
    "https://sdmx.oecd.org/public/rest/data/"
    "OECD.ELS.HD,DSD_HEALTH_PROT@DF_HEALTH_PROT,1.0/"
    "all?startPeriod=2010&dimensionAtObservation=AllDimensions&format=csvfilewithlabels"
)

# Browser-like User-Agent sent with the request (presumably because the
# endpoint rejects the default python-requests agent; confirm).
headers = {
    "User-Agent": "Mozilla/5.0"
}

# Without a timeout, requests waits forever on an unresponsive server and the
# cell never finishes; 60 seconds bounds the wait.
response_protection = requests.get(url_protection, headers=headers, timeout=60)
# Fail loudly on 4xx/5xx instead of parsing an error page as CSV.
response_protection.raise_for_status()

# Parse the CSV payload straight from memory.
df_protection = pd.read_csv(StringIO(response_protection.text))
print(f" Health Protection dataset: {df_protection.shape[0]} rows, {df_protection.shape[1]} columns.")


✅ Health Protection dataset: 4620 rows, 24 columns.


In [4]:
# Display the downloaded health-protection frame.
df_protection

Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,ACTION,REF_AREA,Reference area,FREQ,Frequency of observation,MEASURE,Measure,...,TIME_PERIOD,Time period,OBS_VALUE,Observation value,OBS_STATUS,Observation status,UNIT_MULT,Unit multiplier,DECIMALS,Decimals
0,DATAFLOW,OECD.ELS.HD:DSD_HEALTH_PROT@DF_HEALTH_PROT(1.0),Healthcare coverage,I,NLD,Netherlands,A,Annual,HIC,Health insurance coverage,...,2010,,99.6,,E,Estimated value,0,Units,,
1,DATAFLOW,OECD.ELS.HD:DSD_HEALTH_PROT@DF_HEALTH_PROT(1.0),Healthcare coverage,I,NLD,Netherlands,A,Annual,HIC,Health insurance coverage,...,2011,,99.6,,E,Estimated value,0,Units,,
2,DATAFLOW,OECD.ELS.HD:DSD_HEALTH_PROT@DF_HEALTH_PROT(1.0),Healthcare coverage,I,NLD,Netherlands,A,Annual,HIC,Health insurance coverage,...,2012,,99.7,,E,Estimated value,0,Units,,
3,DATAFLOW,OECD.ELS.HD:DSD_HEALTH_PROT@DF_HEALTH_PROT(1.0),Healthcare coverage,I,NLD,Netherlands,A,Annual,HIC,Health insurance coverage,...,2013,,99.8,,E,Estimated value,0,Units,,
4,DATAFLOW,OECD.ELS.HD:DSD_HEALTH_PROT@DF_HEALTH_PROT(1.0),Healthcare coverage,I,NLD,Netherlands,A,Annual,HIC,Health insurance coverage,...,2014,,99.8,,E,Estimated value,0,Units,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4615,DATAFLOW,OECD.ELS.HD:DSD_HEALTH_PROT@DF_HEALTH_PROT(1.0),Healthcare coverage,I,BGR,Bulgaria,A,Annual,HIC,Health insurance coverage,...,2019,,88.6,,A,Normal value,0,Units,,
4616,DATAFLOW,OECD.ELS.HD:DSD_HEALTH_PROT@DF_HEALTH_PROT(1.0),Healthcare coverage,I,BGR,Bulgaria,A,Annual,HIC,Health insurance coverage,...,2020,,88.0,,A,Normal value,0,Units,,
4617,DATAFLOW,OECD.ELS.HD:DSD_HEALTH_PROT@DF_HEALTH_PROT(1.0),Healthcare coverage,I,BGR,Bulgaria,A,Annual,HIC,Health insurance coverage,...,2021,,88.2,,A,Normal value,0,Units,,
4618,DATAFLOW,OECD.ELS.HD:DSD_HEALTH_PROT@DF_HEALTH_PROT(1.0),Healthcare coverage,I,BGR,Bulgaria,A,Annual,HIC,Health insurance coverage,...,2022,,93.5,,A,Normal value,0,Units,,


# Step 3: Fetch Second Dataset (Health Expenditure that you want)

# CouchDB is a NoSQL document database. It stores data in JSON format only.

In [None]:

def safe_convert(df):
    """Turn every row of ``df`` into a plain dict with non-finite floats nulled.

    Args:
        df: pandas DataFrame to convert.

    Returns:
        list[dict]: one dict per row, in row order, keyed by column name.
        Float values that are NaN, +inf or -inf become ``None``; every other
        value is passed through unchanged.
    """
    def _nullify_non_finite(value):
        # NaN and the two infinities are exactly the floats that fail isfinite.
        if isinstance(value, float) and not math.isfinite(value):
            return None
        return value

    return [
        {column: _nullify_non_finite(value) for column, value in row.to_dict().items()}
        for _, row in df.iterrows()
    ]

# Convert both downloaded frames into lists of dicts with NaN/inf replaced by None.
# NOTE(review): `df_expenditure` is not assigned in any cell visible here;
# presumably the "Step 3" fetch cell that builds it is missing. Without it this
# line raises NameError on a fresh kernel. Confirm and restore that cell.
protection_records = safe_convert(df_protection)
expenditure_records = safe_convert(df_expenditure)

In [118]:
# Spot-check the first converted health-protection record.
protection_records[0]

{'STRUCTURE': 'DATAFLOW',
 'STRUCTURE_ID': 'OECD.ELS.HD:DSD_HEALTH_PROT@DF_HEALTH_PROT(1.0)',
 'STRUCTURE_NAME': 'Healthcare coverage',
 'ACTION': 'I',
 'REF_AREA': 'NLD',
 'Reference area': 'Netherlands',
 'FREQ': 'A',
 'Frequency of observation': 'Annual',
 'MEASURE': 'HIC',
 'Measure': 'Health insurance coverage',
 'UNIT_MEASURE': 'PT_POP',
 'Unit of measure': 'Percentage of population',
 'INSURANCE_TYPE': 'TPRIBASI',
 'Insurance type': 'Public and primary voluntary health insurance',
 'TIME_PERIOD': 2010,
 'Time period': None,
 'OBS_VALUE': 99.6,
 'Observation value': None,
 'OBS_STATUS': 'E',
 'Observation status': 'Estimated value',
 'UNIT_MULT': 0,
 'Unit multiplier': 'Units',
 'DECIMALS': None,
 'Decimals': None}

In [119]:
# Spot-check the first converted health-expenditure record.
expenditure_records[0]

{'STRUCTURE': 'DATAFLOW',
 'STRUCTURE_ID': 'OECD.ELS.HD:DSD_SHA@DF_SHA(1.0)',
 'STRUCTURE_NAME': 'Health expenditure and financing',
 'ACTION': 'I',
 'REF_AREA': 'USA',
 'Reference area': 'United States',
 'FREQ': 'A',
 'Frequency of observation': 'Annual',
 'MEASURE': 'EXP_HEALTH',
 'Measure': 'Expenditure',
 'UNIT_MEASURE': 'PT_B1GQ',
 'Unit of measure': 'Percentage of GDP',
 'FINANCING_SCHEME': '_T',
 'Financing scheme': 'Total',
 'FINANCING_SCHEME_REV': '_Z',
 'Revenues of financing schemes': 'Not application',
 'FUNCTION': '_T',
 'Health function': 'Total',
 'MODE_PROVISION': '_T',
 'Mode of provision': 'Total',
 'PROVIDER': '_T',
 'Health care provider': 'Total',
 'FACTOR_PROVISION': '_Z',
 'Factor of provision': 'Not applicable',
 'ASSET_TYPE': '_Z',
 'Asset type': 'Not applicable',
 'PRICE_BASE': '_Z',
 'Price base': 'Not applicable',
 'TIME_PERIOD': 1988.0,
 'Time period': None,
 'OBS_VALUE': 10.251,
 'Observation value': None,
 'BASE_PER': None,
 'Base period': None,
 'CURREN

# dump complete protection_records

In [127]:
import json
# Serialize the whole record list into one pretty-printed JSON string.
# NOTE(review): `to_add` is rebound to a list in the following cell, so this
# string is presumably only a serializability check; confirm it is still needed.
to_add = json.dumps(protection_records, indent=2)

In [128]:
# Round-trip every protection record through JSON text so that each entry of
# `to_add` is a freshly decoded dict.
to_add = [json.loads(json.dumps(entry)) for entry in protection_records]

In [123]:
# Spot-check the first JSON round-tripped document.
to_add[0]

{'STRUCTURE': 'DATAFLOW',
 'STRUCTURE_ID': 'OECD.ELS.HD:DSD_HEALTH_PROT@DF_HEALTH_PROT(1.0)',
 'STRUCTURE_NAME': 'Healthcare coverage',
 'ACTION': 'I',
 'REF_AREA': 'NLD',
 'Reference area': 'Netherlands',
 'FREQ': 'A',
 'Frequency of observation': 'Annual',
 'MEASURE': 'HIC',
 'Measure': 'Health insurance coverage',
 'UNIT_MEASURE': 'PT_POP',
 'Unit of measure': 'Percentage of population',
 'INSURANCE_TYPE': 'TPRIBASI',
 'Insurance type': 'Public and primary voluntary health insurance',
 'TIME_PERIOD': 2010,
 'Time period': None,
 'OBS_VALUE': 99.6,
 'Observation value': None,
 'OBS_STATUS': 'E',
 'Observation status': 'Estimated value',
 'UNIT_MULT': 0,
 'Unit multiplier': 'Units',
 'DECIMALS': None,
 'Decimals': None}

In [124]:
# Round-trip every expenditure record through JSON text so that each entry of
# `to_add` is a freshly decoded dict.
to_add = [json.loads(json.dumps(entry)) for entry in expenditure_records]

# for remaining datasets

# duplicating data in every write

# Step 1: Read/Download and Clean Data (Run Once Per Dataset)


In [None]:
import requests
import pandas as pd
from io import StringIO

headers = {"User-Agent": "Mozilla/5.0"}
# Each entry describes one SDMX CSV download: the query URL, the prefix used
# later for CouchDB document ids, and a human-readable name for log output.
new_datasets = [
    {
        "url": "https://sdmx.oecd.org/public/rest/data/OECD.ELS.HD,DSD_SHA@DF_SHA_FP,1.0/.A..PT_EXP_PRV.....HP1.FP1../2010?format=csvfilewithlabels",
        "doc_id": "health_expenditure_private_financing_2010",
        "dataset_name": "Health Expenditure Private Financing 2010"
    },
    {
        "url": "https://sdmx.oecd.org/public/rest/data/OECD.ELS.HD,DSD_SHA@DF_SHA_HK,1.0/.A.CAPITAL_FORM.PT_B1GQ....._T.._T./2015?format=csvfilewithlabels",
        "doc_id": "health_expenditure_capital_formation_2015",
        "dataset_name": "Health Expenditure Capital Formation 2015"
    }
]

# doc_id -> list of cleaned record dicts; consumed by the upload cell.
cleaned_datasets = {}

for dataset in new_datasets:
    try:
        print(f" Downloading {dataset['dataset_name']}...")
        # Bound the wait: without a timeout an unresponsive server would hang
        # the cell indefinitely.
        response = requests.get(dataset["url"], headers=headers, timeout=60)
        response.raise_for_status()
        df = pd.read_csv(StringIO(response.text))
        print(f" Downloaded {df.shape[0]} rows and {df.shape[1]} columns.")

        # Replace NaN/inf with None so the records are valid JSON documents.
        cleaned_data = safe_convert(df)
        cleaned_datasets[dataset['doc_id']] = cleaned_data

    except Exception as e:
        # Best effort: a failed dataset is reported and left out of
        # `cleaned_datasets`, while the remaining downloads still run.
        print(f" Error downloading/cleaning {dataset['dataset_name']}: {e}")



📥 Downloading Health Expenditure Private Financing 2010...
✅ Downloaded 62 rows and 46 columns.

📥 Downloading Health Expenditure Capital Formation 2015...
✅ Downloaded 1109 rows and 46 columns.


# Step 2: Write/Upload Cleaned Data to CouchDB (Run Once Per Dataset)


In [None]:
import couchdb


# Upload every cleaned record as its own CouchDB document, skipping ids that
# already exist so that re-running the cell does not duplicate data.
for doc_id, records in cleaned_datasets.items():
    print(f"Uploading records for {doc_id}...")
    uploaded, skipped = 0, 0
    seen_ids = set()      # ids generated so far in this batch
    id_collisions = 0     # records whose id repeats an earlier one in this batch
    for record in records:
        # The document id combines the dataset prefix with REF_AREA and TIME_PERIOD.
        # NOTE(review): this assumes that pair is unique per record. If a dataset
        # has several rows per area/period, the later rows hit ResourceConflict
        # and are counted as "already exists"; the collision counter below makes
        # that case visible.
        unique_id = f"{doc_id}_{record.get('REF_AREA', '')}_{record.get('TIME_PERIOD', '')}"
        if unique_id in seen_ids:
            id_collisions += 1
        seen_ids.add(unique_id)

        # Save a copy rather than the record itself: db.save() writes `_id` and
        # `_rev` back into the dict it is given, and a record that carries a
        # `_rev` would be sent as an update (not a conflict) on the next run.
        doc = {**record, '_id': unique_id}

        try:
            db.save(doc)
            uploaded += 1
        except couchdb.http.ResourceConflict:
            # A document with this id is already stored; skip it.
            skipped += 1
        except Exception as e:
            print(f" Error uploading record: {e}")
    print(f" Uploaded: {uploaded}, Skipped (already exists): {skipped}")
    if id_collisions:
        print(f" Warning: {id_collisions} record(s) in {doc_id} reused an _id already generated in this batch.")



⬆️ Uploading records for health_expenditure_private_financing_2010...
✅ Uploaded: 0, Skipped (already exists): 62

⬆️ Uploading records for health_expenditure_capital_formation_2015...
✅ Uploaded: 0, Skipped (already exists): 1109
