# Step 1 — Data Collection & Integration (Starter Notebook)
Use this notebook to download datasets, clean names, and export clean tables for Power BI.
Run cells top-to-bottom.

In [1]:
# 0) Imports & paths
import os, json, zipfile, io
from pathlib import Path
import pandas as pd
import requests

DATA_DIR = Path('./data')
RAW = DATA_DIR/'raw'
EXT = DATA_DIR/'external'
PROC = DATA_DIR/'processed'
for d in (DATA_DIR, RAW, EXT, PROC): d.mkdir(parents=True, exist_ok=True)

print('Folders ready:', RAW, EXT, PROC)

Folders ready: data\raw data\external data\processed


## 1) RxNorm (US) — download Current Prescribable Content (CPC)
- Files are weekly zip; we only need the **prescribe** zip and a few tables.
- This cell expects you to paste the **download URL** into `RXNORM_URL`. See README.

In [None]:
# Paste latest prescribe zip URL (from RxNorm Files page)
RXNORM_URL = ''  # e.g., 'https://download.nlm.nih.gov/umls/kss/rxnorm/RxNorm_weekly_prescribe_YYYYMMDD.zip'
rx_zip = RAW/'rxnorm_prescribe.zip'

if RXNORM_URL:
    r = requests.get(RXNORM_URL, timeout=120)
    r.raise_for_status()
    rx_zip.write_bytes(r.content)
    print('Downloaded to', rx_zip)
else:
    print('Set RXNORM_URL before running.')

### Parse RxNorm RRF (optional in Step 1)
If you don't want to parse RRF yet, you can skip and use a prepared CSV later.

In [None]:
# TODO: parse RRF files to build rxnorm_products table
# Placeholder: create an empty frame with correct columns
rx_cols = ['rxcui','drug_name','dose_form','strength','ingredient_rxcui','ingredient_name','country','source_file','as_of_date']
rxnorm_products = pd.DataFrame(columns=rx_cols)
rxnorm_products.to_csv(PROC/'rxnorm_products.csv', index=False)
rxnorm_products.head()

## 2) Canada DPD — download All Files extract
- Download the ZIP from Health Canada portal and place it in `data/raw/dpd_all_files.zip`.
- We will read Active, Drug, and Ingredient tables and join.

In [None]:
# If you manually downloaded, set the path here
DPD_ZIP_PATH = RAW/'dpd_all_files.zip'

if DPD_ZIP_PATH.exists():
    print('Found DPD zip at', DPD_ZIP_PATH)
else:
    print('Place the DPD zip at', DPD_ZIP_PATH, 'then rerun this cell.')

In [None]:
# TODO: parse DPD tables into a single dpd_products.csv
dpd_cols = ['drug_code','brand_name','proper_name','active_ingredient','strength','route','schedule','class','status','country','source_file','as_of_date']
dpd_products = pd.DataFrame(columns=dpd_cols)
dpd_products.to_csv(PROC/'dpd_products.csv', index=False)
dpd_products.head()

## 3) DailyMed Labels — API fetch (sample)
Fetch label sections (e.g., Warnings, Adverse Reactions) for selected NDCs or setids.

In [None]:
# Example: stub for pulling labels by setid list
setids = []  # add DailyMed setids here
dailymed_cols = ['setid','ndc','rxcui','brand_name','generic_name','section_name','section_text','version_date','source_url']
labels_df = pd.DataFrame(columns=dailymed_cols)
labels_df.to_csv(PROC/'dailymed_labels.csv', index=False)
labels_df.head()

## 4) FAERS / Canada Vigilance / CAERS — place files
- Put quarterly FAERS extracts or API pulls in `data/raw/faers/`.
- Put Canada Vigilance CSV extracts in `data/raw/cv/`.
- Put CAERS CSV in `data/raw/caers/`.
We'll standardize columns to the template.

In [None]:
# Create empty standardized CSVs for joining later
cols = {
    'faers_events': ['safetyreportid','receivedate','primarysourcecountry','patient_age','patient_sex','drug_name','rxcui','role_code','reaction_meddra_pt','serious','outcome','report_source'],
    'canada_vigilance_events': ['report_id','received_date','age','sex','drug_name','dpd_drug_code','reaction_term','serious','outcome','reporter_type'],
    'caers_events': ['report_id','created_date','product_type','brand_name','ingredients','adverse_event_term','serious','outcome','notes']
}
for name, c in cols.items():
    path = (PROC/f"{name}.csv")
    if not path.exists():
        pd.DataFrame(columns=c).to_csv(path, index=False)
        print('Created', path)

## 5) Lookups & joins
- Build a **brand → ingredient** lookup table.
- Save unified tables for Power BI: `products.csv`, `ingredients.csv`, `side_effects.csv`.

In [None]:
# TODO joins (placeholder)
lookups = pd.DataFrame(columns=['brand_name','normalized_ingredient','rxcui_or_inci','country','notes'])
lookups.to_csv(PROC/'lookups.csv', index=False)

print('Placeholders created in', PROC)

In [3]:
# Read NDC "product" Excel and save a clean table as rxnorm_products.csv

from pathlib import Path
import pandas as pd
from datetime import date

RAW = Path("data/raw")
PROC = Path("data/processed")
PROC.mkdir(parents=True, exist_ok=True)

# find the product file (you have product.xls)
candidates = list(RAW.glob("ndc_products.xlsx")) + list(RAW.glob("product.*"))
if not candidates:
    raise FileNotFoundError("Put your NDC product Excel in data/raw/ (e.g., product.xls or ndc_products.xlsx)")
src = candidates[0]
print("Reading:", src)

# if you get an error for .xls, first run:  !pip install xlrd
df = pd.read_excel(src)

# pull the useful columns (some names may not exist depending on version)
def col(name):
    return df[name] if name in df.columns else pd.Series([None]*len(df))

brand = col("PROPRIETARYNAME")
brand_suffix = col("PROPRIETARYNAMESUFFIX")
generic = col("NONPROPRIETARYNAME")
dose_form = col("DOSAGEFORMNAME")
num_strength = col("ACTIVE_NUMERATOR_STRENGTH")
unit_strength = col("ACTIVE_INGRED_UNIT")
substance = col("SUBSTANCENAME")  # can be "A; B; C"

# make display name (brand + suffix, else generic)
drug_name = brand.fillna("").str.strip()
drug_name = (drug_name + (" " + brand_suffix.fillna("").str.strip())).str.strip()
drug_name = drug_name.mask(drug_name.eq(""), generic)

# strength text
strength_txt = (
    num_strength.fillna("").astype(str).str.strip()
    + " "
    + unit_strength.fillna("").astype(str).str.strip()
).str.strip()
strength_txt = strength_txt.replace({"nan nan": ""})

base = pd.DataFrame({
    "rxcui": "",                    # not in NDC
    "drug_name": drug_name,
    "dose_form": dose_form,
    "strength": strength_txt,
    "ingredient_rxcui": "",         # not in NDC
    "ingredient_name": substance,   # may be multiple separated by ";"
    "country": "US",
    "source_file": src.name,
    "as_of_date": date.today().isoformat(),
})

# split multi-ingredient rows ("A; B; C") into separate rows
f = base.copy()
f["ingredient_name"] = f["ingredient_name"].fillna("").astype(str)
f["ingredient_name"] = (
    f["ingredient_name"]
    .str.replace(r"\s*;\s*", "|", regex=True)
    .str.split("|")
)
f = f.explode("ingredient_name")
f["ingredient_name"] = f["ingredient_name"].str.strip()
f = f[f["ingredient_name"] != ""]

cols = ["rxcui","drug_name","dose_form","strength","ingredient_rxcui","ingredient_name","country","source_file","as_of_date"]
out = f.loc[:, cols].drop_duplicates()

out_path = PROC / "rxnorm_products.csv"
out.to_csv(out_path, index=False)
print(f"Saved {len(out):,} rows to {out_path}")
out.head(10)


Reading: data\raw\product.xls


ValueError: Excel file format cannot be determined, you must specify an engine manually.

In [4]:
!pip install xlrd openpyxl


Collecting xlrd
  Downloading xlrd-2.0.2-py2.py3-none-any.whl.metadata (3.5 kB)
Downloading xlrd-2.0.2-py2.py3-none-any.whl (96 kB)
   ---------------------------------------- 0.0/96.6 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/96.6 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/96.6 kB ? eta -:--:--
   ------------------------- -------------- 61.4/96.6 kB 465.5 kB/s eta 0:00:01
   ---------------------------------------- 96.6/96.6 kB 612.2 kB/s eta 0:00:00
Installing collected packages: xlrd
Successfully installed xlrd-2.0.2


In [5]:
import os

src = "product.xls"  # or the full path if needed
print(os.path.splitext(src)[1])  # This will show the extension


.xls


In [10]:
import pandas as pd

src = r"C:\Users\Hp\Downloads\drug-safety-explorer-step0\drug-safety-explorer\data\raw\product.xlsx"
df = pd.read_excel(src, engine='openpyxl')
print(df.head())


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Hp\\Downloads\\drug-safety-explorer-step0\\drug-safety-explorer\\data\\raw\\product.xlsx'