# Header

In [None]:
import os

## Notebook Configuration

In [None]:
# True when the string form of the active IPython shell contains "google.colab";
# evaluated once so every path below is derived from the same answer.
IN_COLAB = "google.colab" in str(get_ipython())

# path to the project folder (the mounted Drive folder on Colab, the working directory otherwise)
# NOTE: Replace this with your project path if needed
PROJECT_PATH = "/content/drive/My Drive/W210" if IN_COLAB else "."

# path to the data folder
# NOTE: Replace this with your data path if needed
# NOTE(review): outside Colab this is PROJECT_PATH itself, not PROJECT_PATH/data -- confirm that is intended
DATA_PATH = f"{PROJECT_PATH}/data" if IN_COLAB else PROJECT_PATH

# path to the raw data folder; it resolves to "<PROJECT_PATH>/data" in every environment
# NOTE(review): an earlier comment said Colab should use /content to stay off Google Drive
# storage, but the value has always pointed inside PROJECT_PATH -- confirm which is wanted
RAW_DATA_PATH = f"{PROJECT_PATH}/data"

## Colab Setup

In [None]:
# Colab only: make Google Drive visible to the notebook. Local runs skip this branch.
if "google.colab" in str(get_ipython()):
    from google.colab import drive

    drive.mount("/content/drive")

    # Extra libraries needed by the notebook would be installed here, e.g.:
    #   os.system("pip install -q kaggle")

# From here on, relative paths are resolved against the project folder.
os.chdir(PROJECT_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Library Import

In [None]:
import gc
import itertools
import json
import shutil
import typing
import zipfile

from io import BytesIO
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import requests

from tqdm import tqdm

## Functions

In [None]:
def download_from_web(
    save_path: typing.Union[str, Path, typing.IO[bytes], typing.BinaryIO],
    url: str,
    block_size: int = 300 * 1024,
) -> typing.Union[typing.IO[bytes], typing.BinaryIO]:
    """
    Download a file from a public web address.

    :param save_path: destination of the download; either a filesystem path
        (``str`` / ``Path``) or an already opened, writable binary buffer
    :param url: address where data is stored
    :param block_size: size in bytes of each chunk read from the response
    :return: the buffer the data was written to. It is closed when it was opened
        here from a path and left open when the caller supplied it, so an
        in-memory buffer can still be read afterwards
    :raises requests.HTTPError: if the server answers with an error status; nothing
        is written in that case
    """
    # only a file opened by this function is closed (and cleaned up) by it
    owns_file = isinstance(save_path, (str, Path))

    # the context manager releases the streamed connection even on failure
    with requests.get(url, stream=True) as response:
        # fail before touching the destination so an error page is never saved as data
        response.raise_for_status()
        total_size_in_bytes = int(response.headers.get("content-length", 0))

        file_path = open(save_path, "wb") if owns_file else save_path
        try:
            with tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) as progress_bar:
                for data in response.iter_content(block_size):
                    progress_bar.update(len(data))
                    file_path.write(data)
        except BaseException:
            if owns_file:
                # drop the truncated file so a later "already downloaded" check
                # does not mistake it for a complete download
                file_path.close()
                try:
                    Path(save_path).unlink()
                except OSError:
                    pass
            raise
        if owns_file:
            file_path.close()

    # returns the buffer object
    return file_path

---

# Data Pre-Processing

In [None]:
# Number of archive parts to fetch for each quarter; the count is also part of
# every file name ("<part>-of-<count>"), so it has to match what the server publishes.
download_numbers = {
    "2023q4": 35,
    "2023q3": 32,
    "2023q2": 32,
    "2023q1": 32,
    "2022q4": 35,
    "2022q3": 33,
    "2022q2": 31,
    "2022q1": 34,
    "2021q4": 31,
    "2021q3": 36,
    "2021q2": 34,
    "2021q1": 33,
    "2020q4": 30,
    # "2020q3": 29,
    # "2020q2": 30,
    # "2020q1": 33,
}


def _extract_event(event: dict) -> dict:
    """
    Reduce one raw event record to the fields kept for the analysis.

    :param event: one entry of the ``results`` list of a drug-event file
    :return: dict with ``event_id``, ``event_date``, ``event_serious`` and
        ``package_ndc`` (empty list when no drug of the event carries one)
    """
    output = {
        "event_id": event["safetyreportid"],
        "event_date": event["receivedate"],
        "event_serious": event["serious"],
        "package_ndc": list(),
    }
    # NOTE(review): only the first drug that has a package_ndc is kept; the NDCs
    # of any further drugs in the same event are ignored -- confirm this is intended
    for drug_info in event["patient"]["drug"]:
        if "openfda" in drug_info and "package_ndc" in drug_info["openfda"]:
            output["package_ndc"] = drug_info["openfda"]["package_ndc"]
            break
    return output


outputs = list()
for quarter, files in tqdm(download_numbers.items()):
    path = Path(f"{DATA_PATH}/drug-events/{quarter}")
    path.mkdir(parents=True, exist_ok=True)
    for i in range(1, files + 1):
        part_name = f"{i:04d}-of-{files:04d}"
        archive_path = path / f"{part_name}.json.zip"
        # archives already on disk are reused so a re-run does not download them again
        if not archive_path.exists():
            download_from_web(
                save_path=archive_path,
                url=f"https://download.open.fda.gov/drug/event/{quarter}/drug-event-{part_name}.json.zip",
            )
        # both the archive and the member inside it are closed as soon as the JSON is parsed
        with zipfile.ZipFile(archive_path) as z:
            with z.open(f"drug-event-{part_name}.json") as member:
                raw_events = json.load(member)
        outputs.extend(_extract_event(event) for event in raw_events["results"])

100%|██████████| 13/13 [1:10:59<00:00, 327.68s/it]


In [None]:
# Build a polars DataFrame from the event dicts collected in `outputs`.
# NOTE(review): column dtypes are left to polars' schema inference -- confirm that
# `package_ndc` comes out as a list column, since the later explode step relies on it.
adverse_events = pl.DataFrame(outputs)

In [None]:
# Release the list of per-event dicts; the data now lives in `adverse_events`.
try:
    del outputs
except NameError:
    # already released by an earlier run of this cell
    pass

# gc.collect() returns the number of unreachable objects it found, so keep
# collecting until a pass finds nothing left to free
while gc.collect():
    pass

In [None]:
# Turn the list-valued `package_ndc` column into one row per list entry.
# NOTE(review): the result is bound to the same name, so running this cell a second
# time would explode an already exploded column -- confirm it is only run once per kernel.
adverse_events = adverse_events.explode("package_ndc")

# Export

In [None]:
# Make sure the target folder exists first: nothing earlier in the notebook creates
# "preprocessed", so the write below would fail on a fresh data folder.
Path(f"{DATA_PATH}/preprocessed").mkdir(parents=True, exist_ok=True)
adverse_events.write_parquet(f"{DATA_PATH}/preprocessed/adverse_events.parquet")

---