In [45]:
import os
import requests
import pandas as pd
import json
import time
import random
from datetime import datetime, timedelta

import pymysql as mysql
import mysql.connector
import warnings
import matplotlib.pyplot as plt
import csv
import pyodbc
import sqlite3
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy import event
from tabulate import tabulate
import plotly.express as px
import seaborn as sns
from string import punctuation

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Build the punctuation set only AFTER the last import of `punctuation`.
# Previously `from string import punctuation` was repeated after the
# conversion, which silently rebound the name to the plain str again.
punctuation = set(punctuation)

lemmatizer = WordNetLemmatizer()


# mysql password
# Read from the MYSQL_PASSWORD environment variable so the real credential does
# not have to be stored in the notebook; the previous literal is kept only as a
# fallback so existing local setups keep working.
PASSWORD = os.environ.get("MYSQL_PASSWORD", "password")

#!pip install nbimporter
import nbimporter
from Functions import process_text

# FDA Data

### API Request for Historical Text Documents

https://open.fda.gov/apis/other/historicaldocument/

In [29]:
# Endpoint of the OpenFDA historical-documents API
BASE_URL = "https://api.fda.gov/other/historicaldocument.json"

# Query-string parameters sent with every request:
#   limit - number of records requested per page (adjust as needed)
#   skip  - offset of the first record to return
params = dict(limit=1000, skip=0)

# Throttle helper so consecutive requests do not overwhelm the server
def pause_execution():
    """Sleep for a random duration of at least 5 and less than 15 seconds."""
    jitter = 10 * random.random()
    time.sleep(5 + jitter)

# Function to fetch data from the OpenFDA API
def fetch_data(max_requests):
    """Fetch historical documents from the OpenFDA API, one page per request.

    Reads the module-level ``BASE_URL`` and ``params``. ``params['skip']`` is
    advanced by ``params['limit']`` after every successful page, so the
    module-level dict is mutated: a second call continues where the previous
    one stopped unless ``params`` is reset first.

    Args:
        max_requests: Maximum number of pages (HTTP requests) to fetch.

    Returns:
        A list of ``{'doc_type', 'year', 'text'}`` dicts with exact duplicates
        removed, in first-seen order. Paging stops early on a network error,
        a non-200 status, an undecodable body or an empty page; whatever was
        collected up to that point is returned.
    """
    documents = []
    seen = set()  # (doc_type, year, text) keys already collected, for O(1) duplicate checks
    request_count = 0  # Number of pages fetched so far

    while request_count < max_requests:
        print(f"Requesting data with params: {params}")  # Debugging statement
        try:
            # The timeout keeps a stalled connection from hanging the notebook
            response = requests.get(BASE_URL, params=params, timeout=60)
        except requests.RequestException as exc:
            print(f"Failed to fetch data: {exc}")  # Debugging statement
            break
        print(f"Response status code: {response.status_code}")  # Debugging statement

        if response.status_code != 200:
            print(f"Failed to fetch data: {response.status_code}")  # Debugging statement
            break

        try:
            data = response.json()
        except ValueError:
            print("Failed to decode JSON response.")  # Debugging statement
            break

        results = data.get('results', [])
        if not results:
            print("No more results found.")  # Debugging statement
            break

        for item in results:
            document = {
                'doc_type': item.get('doc_type', 'N/A'),
                'year': item.get('year', 'N/A'),
                'text': item.get('text', 'N/A'),
            }
            key = (document['doc_type'], document['year'], document['text'])
            if key not in seen:
                seen.add(key)
                documents.append(document)

        params['skip'] += params['limit']
        request_count += 1
        # Only wait when another request will actually follow
        if request_count < max_requests:
            pause_execution()

    return documents

# Download two pages of historical documents
documents = fetch_data(2)

# Load the fetched records into a DataFrame and preview the first rows
documents_df = pd.DataFrame(data=documents)
documents_df.head(n=5)

Requesting data with params: {'limit': 1000, 'skip': 0}
Response status code: 200
Requesting data with params: {'limit': 1000, 'skip': 1000}
Response status code: 200


Unnamed: 0,doc_type,year,text
0,pr,2006,FDA NEWS RELEASE\nFOR IMMEDIATE RELEASE\n\nFeb...
1,pr,2006,FDA NEWS RELEASE\nFOR IMMEDIATE RELEASE\n\nFeb...
2,talk,1991,I FOOD _-AND DRUG ADMINISTRATION ‘ 'i\nU. S. D...
3,pr,1934,iNFORMATION FOR THE PRESS\n\nu 8. DEPARTMENT O...
4,pr,1975,\n\nU. S. DEPARTMENT OF HEALTH. EDUCATION. AN...


In [76]:
# Print the column names, dtypes and non-null counts of the documents frame
documents_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   doc_type  2000 non-null   object
 1   year      2000 non-null   int64 
 2   text      2000 non-null   object
dtypes: int64(1), object(2)
memory usage: 47.0+ KB


In [30]:
# Save csv file to data library
#csv_file_path = 'DataLibrary/raw_documents.csv'
#documents_df.to_csv(csv_file_path, index=False)

# Save the data to a JSON file
#json_file_path = 'DataLibrary/raw_documents.json'
#with open(json_file_path, 'w') as json_file:
#    json.dump(documents, json_file, indent=4)

#print(f"Data saved to {csv_file_path} and {json_file_path}")

### API Request for Adverse Events Data

https://open.fda.gov/apis/drug/event/

In [31]:
# Endpoint of the OpenFDA drug adverse-event API
BASE_URL = "https://api.fda.gov/drug/event.json"

# Query-string parameters sent with every request:
#   limit - number of records requested per page (maximum per request)
#   skip  - offset of the first record to return
params = dict(limit=1000, skip=0)

# Throttle helper so consecutive requests do not overwhelm the server
def pause_execution():
    """Sleep for a random duration of at least 5 and less than 15 seconds."""
    jitter = 10 * random.random()
    time.sleep(5 + jitter)

# Function to fetch data from the OpenFDA API
def fetch_data(max_requests):
    """Fetch drug adverse-event records from the OpenFDA API, one page per request.

    Reads the module-level ``BASE_URL`` and ``params``. ``params['skip']`` is
    advanced by ``params['limit']`` after every successful page, so the
    module-level dict is mutated: a second call continues where the previous
    one stopped unless ``params`` is reset first.

    Args:
        max_requests: Maximum number of pages (HTTP requests) to fetch.

    Returns:
        A list with every item of each page's ``results`` array, unmodified.
        Paging stops early on a network error, a non-200 status, an
        undecodable body or an empty page; whatever was collected up to that
        point is returned.
    """
    events = []
    request_count = 0  # Number of pages fetched so far

    while request_count < max_requests:
        print(f"Requesting data with params: {params}")  # Debugging statement
        try:
            # The timeout keeps a stalled connection from hanging the notebook
            response = requests.get(BASE_URL, params=params, timeout=60)
        except requests.RequestException as exc:
            print(f"Failed to fetch data: {exc}")  # Debugging statement
            break
        print(f"Response status code: {response.status_code}")  # Debugging statement

        if response.status_code != 200:
            print(f"Failed to fetch data: {response.status_code}")  # Debugging statement
            break

        try:
            data = response.json()
        except ValueError:
            print("Failed to decode JSON response.")  # Debugging statement
            break

        results = data.get('results', [])
        if not results:
            print("No more results found.")  # Debugging statement
            break

        events.extend(results)

        params['skip'] += params['limit']
        request_count += 1
        # Only wait when another request will actually follow
        if request_count < max_requests:
            pause_execution()

    return events

# Download two pages of adverse-event reports
events = fetch_data(2)

# Flatten the nested JSON records into a DataFrame (nested keys become dotted column names)
events_df = pd.json_normalize(data=events)

# Preview the first rows
events_df.head(n=5)

Requesting data with params: {'limit': 1000, 'skip': 0}
Response status code: 200
Requesting data with params: {'limit': 1000, 'skip': 1000}
Response status code: 200


Unnamed: 0,safetyreportid,transmissiondateformat,transmissiondate,serious,seriousnessdeath,receivedateformat,receivedate,receiptdateformat,receiptdate,fulfillexpeditecriteria,...,seriousnessother,occurcountry,patient.patientagegroup,seriousnesshospitalization,patient.summary.narrativeincludeclinical,seriousnesslifethreatening,patient.patientweight,primarysource.literaturereference,seriousnesscongenitalanomali,authoritynumb
0,5801206-7,102,20090109,1,1.0,102,20080707,102,20080625,1,...,,,,,,,,,,
1,10003300,102,20141002,1,,102,20140306,102,20140306,2,...,,,,,,,,,,
2,10003301,102,20141002,1,,102,20140228,102,20140228,2,...,1.0,,,,,,,,,
3,10003302,102,20141002,2,,102,20140312,102,20140312,2,...,,US,,,,,,,,
4,10003304,102,20141212,2,,102,20140312,102,20140424,2,...,,US,,,,,,,,


In [77]:
# Print the column names, dtypes and non-null counts of the adverse-events frame
events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 42 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   safetyreportid                               2000 non-null   object 
 1   transmissiondateformat                       2000 non-null   object 
 2   transmissiondate                             2000 non-null   object 
 3   serious                                      2000 non-null   object 
 4   seriousnessdeath                             145 non-null    object 
 5   receivedateformat                            2000 non-null   object 
 6   receivedate                                  2000 non-null   object 
 7   receiptdateformat                            2000 non-null   object 
 8   receiptdate                                  2000 non-null   object 
 9   fulfillexpeditecriteria                      2000 non-null   object 
 10  

In [29]:
# Save the data to a JSON file
#json_file_path = 'DataLibrary/raw_events.json'
#with open(json_file_path, 'w') as json_file:
#    json.dump(events, json_file, indent=4)

# Save the data to a CSV file
#csv_file_path = 'DataLibrary/raw_events.csv'
#events_df.to_csv(csv_file_path, index=False)

#print(f"Data saved to {csv_file_path} and {json_file_path}")

Data saved to DataLibrary/raw_events.csv and DataLibrary/raw_events.json


### API Request for Drug Labels

https://open.fda.gov/apis/drug/label/

In [32]:
# Endpoint of the OpenFDA drug-label API
BASE_URL = "https://api.fda.gov/drug/label.json"

# Query-string parameters sent with every request:
#   limit - number of records requested per page (maximum per request)
#   skip  - offset of the first record to return
params = dict(limit=1000, skip=0)

# Throttle helper so consecutive requests do not overwhelm the server
def pause_execution():
    """Sleep for a random duration of at least 5 and less than 15 seconds."""
    jitter = 10 * random.random()
    time.sleep(5 + jitter)

# Function to fetch data from the OpenFDA API
def fetch_data(max_requests):
    """Fetch drug-label records from the OpenFDA API, one page per request.

    Reads the module-level ``BASE_URL`` and ``params``. ``params['skip']`` is
    advanced by ``params['limit']`` after every successful page, so the
    module-level dict is mutated: a second call continues where the previous
    one stopped unless ``params`` is reset first.

    Args:
        max_requests: Maximum number of pages (HTTP requests) to fetch.

    Returns:
        A list with every item of each page's ``results`` array, unmodified.
        Paging stops early on a network error, a non-200 status, an
        undecodable body or an empty page; whatever was collected up to that
        point is returned.
    """
    labels = []
    request_count = 0  # Number of pages fetched so far

    while request_count < max_requests:
        print(f"Requesting data with params: {params}")  # Debugging statement
        try:
            # The timeout keeps a stalled connection from hanging the notebook
            response = requests.get(BASE_URL, params=params, timeout=60)
        except requests.RequestException as exc:
            print(f"Failed to fetch data: {exc}")  # Debugging statement
            break
        print(f"Response status code: {response.status_code}")  # Debugging statement

        if response.status_code != 200:
            print(f"Failed to fetch data: {response.status_code}")  # Debugging statement
            break

        try:
            data = response.json()
        except ValueError:
            print("Failed to decode JSON response.")  # Debugging statement
            break

        results = data.get('results', [])
        if not results:
            print("No more results found.")  # Debugging statement
            break

        labels.extend(results)

        params['skip'] += params['limit']
        request_count += 1
        # Only wait when another request will actually follow
        if request_count < max_requests:
            pause_execution()

    return labels

# Download two pages of drug labels
labels = fetch_data(2)

# Flatten the nested JSON records into a DataFrame (nested keys become dotted column names)
labels_df = pd.json_normalize(data=labels)

# Preview the first rows
labels_df.head(n=5)

Requesting data with params: {'limit': 1000, 'skip': 0}
Response status code: 200
Requesting data with params: {'limit': 1000, 'skip': 1000}
Response status code: 200


Unnamed: 0,effective_time,inactive_ingredient,purpose,keep_out_of_reach_of_children,warnings,questions,spl_product_data_elements,version,dosage_and_administration,pregnancy_or_breast_feeding,...,components,intended_use_of_the_device,mechanism_of_action_table,general_precautions_table,drug_and_or_laboratory_test_interactions_table,accessories,ask_doctor_table,when_using_table,ask_doctor_or_pharmacist_table,do_not_use_table
0,20210902,[INACTIVE INGREDIENTS Sucrose],"[USES USES: Temporary Relief - Acne, Boils* * ...",[Keep this and all medication out of reach of ...,[WARNINGS This product is to be used for self-...,[QUESTIONS OR COMMENTS www.Rxhomeo.com | 1.888...,[SILICEA SILICEA SUCROSE SILICON DIOXIDE SILIC...,2,"[DOSAGE Adults- Take 4 or 6 Pellets by mouth, ...","[As with any drug, if you are pregnant, or nur...",...,,,,,,,,,,
1,20150109,"[INGREDIENTS: TALC, POLYMETHYL METHACRYLATE, V...",[Purpose Sunscreen],[Keep out of reach of children If product is s...,[Warnings For external use only.],,[CHANTECAILLE PROTECTION NATURELLE BRONZE SPF ...,4,[Directions Protection Naturelle SPF 46 PA+++ ...,,...,,,,,,,,,,
2,20230802,[INACTIVE INGREDIENTS Sucrose/Lactose],[USES To relieve the symptoms of itching.],[KEEP OUT OF REACH OF CHILDREN Keep this and a...,[STOP USE AND ASK DOCTOR If symptoms persist/w...,,[Mezereum DAPHNE MEZEREUM BARK SUCROSE LACTOSE...,3,[DIRECTIONS Adults: Dissolve 3 to 5 under the ...,,...,,,,,,,,,,
3,20230905,,,,[WARNINGS NOT FOR INJECTION. Ofloxacin ophthal...,,[Ofloxacin Ofloxacin OFLOXACIN OFLOXACIN Sodiu...,7,[DOSAGE AND ADMINISTRATION The recommended dos...,,...,,,,,,,,,,
4,20230403,,,,,,[Naproxen Naproxen NAPROXEN NAPROXEN CROSCARME...,27,[2 DOSAGE AND ADMINISTRATION Use the lowest ef...,,...,,,,,,,,,,


In [78]:
# Raise pandas' column limit for .info() to 140 so the per-column summary is
# printed for every column of the labels frame.
# Note: set_option changes the setting for the rest of the session.
pd.set_option('display.max_info_columns', 140)
labels_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 140 columns):
 #    Column                                                            Non-Null Count  Dtype 
---   ------                                                            --------------  ----- 
 0    effective_time                                                    2000 non-null   object
 1    inactive_ingredient                                               1217 non-null   object
 2    purpose                                                           1189 non-null   object
 3    keep_out_of_reach_of_children                                     1185 non-null   object
 5    questions                                                         657 non-null    object
 6    spl_product_data_elements                                         1999 non-null   object
 7    version                                                           2000 non-null   object
 8    dosage_and_administration      

In [31]:
# Save the data to a JSON file
#json_file_path = 'DataLibrary/raw_labels.json'
#with open(json_file_path, 'w') as json_file:
#    json.dump(labels, json_file, indent=4)

# Save the data to a CSV file
#csv_file_path = 'DataLibrary/raw_labels.csv'
#labels_df.to_csv(csv_file_path, index=False)

#print(f"Data saved to {csv_file_path} and {json_file_path}")

Data saved to DataLibrary/raw_labels.csv and DataLibrary/raw_labels.json


# National Library of Medicine - Standardized Drug Names and Information

https://lhncbc.nlm.nih.gov/RxNav/APIs/RxNormAPIs.html

### API Request for RxNorm

In [73]:
# RxTerms REST endpoint; fetch_data appends "/<rxcui>/allinfo.json" for each lookup
BASE_URL = "https://rxnav.nlm.nih.gov/REST/RxTerms/rxcui"

# Throttle helper so consecutive batches do not overwhelm the server
def pause_execution():
    """Sleep for a random duration of at least 1 and less than 3 seconds."""
    jitter = 2 * random.random()
    time.sleep(1 + jitter)

# Function to fetch data from the RxTerms API
def fetch_data(max_requests):
    """Collect RxTerms properties of marketed drugs by probing consecutive RxCUIs.

    RxCUIs are probed in batches of 500, starting at 1. Despite its name,
    ``max_requests`` limits the number of *batches*: each batch issues up to
    500 individual HTTP requests, so even ``max_requests=1`` is slow.

    Args:
        max_requests: Maximum number of 500-RxCUI batches to process.

    Returns:
        A list of ``rxtermsProperties`` dicts, each with an added ``'rxcui'``
        key, for every probed RxCUI whose ``marketed`` value is ``'true'``
        (case-insensitive). Processing also stops after the first batch that
        yields no marketed drug.
    """
    drugs = []
    start_rxcui = 1  # Starting RxCUI for the search
    step = 500       # Number of RxCUIs probed per batch
    request_count = 0  # Number of batches processed so far

    # One session for all lookups so the HTTP connection is reused
    with requests.Session() as session:
        while request_count < max_requests:
            batch_drugs = []
            for rxcui in range(start_rxcui, start_rxcui + step):
                url = f"{BASE_URL}/{rxcui}/allinfo.json"
                try:
                    # The timeout keeps one stalled lookup from hanging the whole batch
                    response = session.get(url, timeout=30)
                except requests.RequestException as exc:
                    print(f"Failed to fetch data: {exc}")  # Debugging statement
                    continue

                if response.status_code != 200:
                    print(f"Failed to fetch data: {response.status_code}")  # Debugging statement
                    continue

                if not response.content.strip():  # Empty body: nothing to parse
                    continue

                try:
                    data = response.json()
                except ValueError:
                    print(f"Failed to decode JSON response for RxCUI: {rxcui}")  # Debugging statement
                    continue

                properties = data.get('rxtermsProperties') if isinstance(data, dict) else None
                if not isinstance(properties, dict):
                    continue

                # str() guards against a non-string 'marketed' value
                if str(properties.get('marketed', '')).lower() == 'true':
                    properties['rxcui'] = rxcui
                    batch_drugs.append(properties)

            drugs.extend(batch_drugs)  # Add the batch results to the main list
            print(f"Processed batch starting with RxCUI: {start_rxcui}")  # Debugging statement for each batch

            # Update the start_rxcui for the next batch
            start_rxcui += step
            request_count += 1

            # Stop as soon as a whole batch produced no marketed drug
            if not batch_drugs:
                break

            # Only wait when another batch will actually follow
            if request_count < max_requests:
                pause_execution()

    return drugs

# Process a single batch of RxCUIs
drugs = fetch_data(1)

# Load the collected property records into a DataFrame
drugs_df = pd.DataFrame(data=drugs)

# Preview the first rows
drugs_df.head(n=5)

KeyboardInterrupt: 

In [None]:
# Print the column names, dtypes and non-null counts of the RxTerms drugs frame
drugs_df.info()

In [56]:
# Save the data to a JSON file
#json_file_path = 'DataLibrary/rxterms_drugs.json'
#with open(json_file_path, 'w') as json_file:
#    json.dump(drugs, json_file, indent=4)

# Save the data to a CSV file
#csv_file_path = 'DataLibrary/rxterms_drugs.csv'
#drugs_df.to_csv(csv_file_path, index=False)

#print(f"Data saved to {csv_file_path} and {json_file_path}")

Data saved to DataLibrary/rxterms_drugs.csv and DataLibrary/rxterms_drugs.json


# Medicaid Drug Prices API 

https://data.medicaid.gov/dataset/99315a95-37ac-4eee-946a-3c523b4c481e#data-table

In [79]:
# SQL endpoint of the Medicaid datastore API, used here for NADAC data
BASE_URL = "https://data.medicaid.gov/api/1/datastore/sql"

# Query template: 500 rows per page; the page offset is filled in with .format()
query_template = "[SELECT * FROM f3b5cf6b-07cc-5f75-8d7b-0a1090b3f7e9][LIMIT 500 OFFSET {}]"

# Throttle helper so consecutive requests do not overwhelm the server
def pause_execution():
    """Sleep for a random duration of at least 1 and less than 3 seconds."""
    jitter = 2 * random.random()
    time.sleep(1 + jitter)

# Function to fetch data from the Medicaid API in batches
def fetch_nadac_data(max_requests=2):
    """Fetch NADAC rows from the Medicaid datastore SQL endpoint page by page.

    Each request formats the module-level ``query_template`` with the current
    row offset and sends it to ``BASE_URL``. The offset advances by the number
    of rows actually received.

    Args:
        max_requests: Maximum number of pages to request. Defaults to 2,
            the value that used to be hard-coded.

    Returns:
        A list with the rows of all fetched pages. Fetching stops early on a
        network error, a non-200 status, an undecodable or non-list body, or
        an empty page; whatever was collected up to that point is returned.
    """
    all_data = []  # Rows collected across all pages
    offset = 0     # Row offset of the next page

    for request_number in range(1, max_requests + 1):
        query = query_template.format(offset)
        print(f"Requesting data with offset: {offset}")  # Debugging statement to show current offset
        try:
            # The timeout keeps a stalled connection from hanging the notebook
            response = requests.get(BASE_URL, params={'query': query}, timeout=60)
        except requests.RequestException as exc:
            print(f"Failed to fetch data: {exc}")  # Debugging statement for network errors
            break
        print(f"Response status code: {response.status_code}")  # Debugging statement to show response status

        if response.status_code != 200:
            print(f"Failed to fetch data: {response.status_code}")  # Debugging statement for error status
            try:
                error_details = response.json()
            except ValueError:
                print("Failed to decode JSON error response.")  # Debugging statement for JSON error
            else:
                # The error body is only inspected when it is a JSON object
                if isinstance(error_details, dict):
                    print(f"Error message: {error_details.get('message', 'No message provided')}")
                    print(f"Error details: {error_details.get('data', 'No additional data provided')}")
            break

        try:
            data = response.json()
        except ValueError:
            print("Failed to decode JSON response.")  # Debugging statement for JSON error
            break

        if not isinstance(data, list):
            print("Expected a list but got a different structure.")
            break

        if not data:
            print("No more results found.")  # Debugging statement to indicate no more data
            break

        all_data.extend(data)
        offset += len(data)  # Advance by the number of rows actually received
        print(f"Fetched {len(data)} records. Total so far: {len(all_data)}")  # Debugging statement to show fetched data count

        # Only wait when another request will actually follow
        if request_number < max_requests:
            pause_execution()

    return all_data

# Fetch the data
prices = fetch_nadac_data()

# Convert the data to a pandas DataFrame
prices_df = pd.DataFrame(prices)

print(f"Total records fetched: {len(prices_df)}")  # Debugging statement

# Display the first few rows of the DataFrame.
# This must be the LAST expression of the cell: Jupyter only renders the value
# of the final expression, so with the print() after it the preview was lost.
prices_df.head()

Requesting data with offset: 0
Response status code: 200
Fetched 500 records. Total so far: 500
Requesting data with offset: 500
Response status code: 200
Fetched 500 records. Total so far: 1000
Total records fetched: 1000


In [80]:
# Print the column names, dtypes and non-null counts of the NADAC prices frame
prices_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                                     Non-Null Count  Dtype 
---  ------                                     --------------  ----- 
 0   NDC Description                            1000 non-null   object
 1   NDC                                        1000 non-null   object
 2   NADAC_Per_Unit                             1000 non-null   object
 3   Effective Date                             1000 non-null   object
 4   Pricing_Unit                               1000 non-null   object
 5   Pharmacy_Type_Indicator                    1000 non-null   object
 6   OTC                                        1000 non-null   object
 7   Explanation_Code                           1000 non-null   object
 8   Classification_for_Rate_Setting            1000 non-null   object
 9   Corresponding_Generic_Drug_NADAC_Per_Unit  1000 non-null   object
 10  Corresponding_Generic_Drug_Effective_

# Select Variables of Interest from Datasets

In [99]:
#documents_df = pd.read_csv('DataLibrary/raw_documents.csv')
# Keep only the columns of interest from the documents frame
documents_table = pd.DataFrame(documents_df, columns = ["doc_type", "year", "text"])

# NOTE(review): the statement below was fused onto the end of the line above,
# which made this cell a SyntaxError. `df` is not used anywhere in the visible
# notebook, so it is kept commented out like the other optional CSV loads.
# TODO confirm whether it was meant to load `drugs_df` from the saved CSV.
#df = pd.read_csv('DataLibrary/rxterms_drugs.csv')


#events_df = pd.read_csv('DataLibrary/raw_events.csv')
# Keep only the columns of interest from the adverse-events frame.
# pd.DataFrame(..., columns=[...]) does not fail on a name that is missing from
# the source frame; it silently creates an all-NaN column, so spelling matters.
events_table = pd.DataFrame(events_df, columns = ["safetyreportid",
                                               "transmissiondate",
                                               "serious",
                                               "seriousnessdeath",
                                               "receivedateformat",
                                               "receivedate",
                                               "primarysource.reportercountry",
                                               # was "patient.patientsetage"; the openFDA field is patientonsetage
                                               "patient.patientonsetage",
                                               "patient.patientsex",
                                               "patient.reaction",
                                               "patient.drug",
                                               "patient.patientagegroup",
                                               "patient.patientweight",
                                               # was "seriousnesscongentialanomali" (typo), which is not a column of events_df
                                               "seriousnesscongenitalanomali",
                                               "authoritynumb",
                                               "reportduplicate"])

#labels_df = pd.read_csv('')

# Columns of interest from the drug-labels frame; names not present in
# labels_df come out as all-NaN columns instead of raising an error.
label_columns = [
    "effective_time",
    "inactive_ingredient",
    "purpose",
    "warnings",
    "questions",
    "spl_product_data_elements",
    "version",
    "package_label_principal_display_panel",
    "active_ingredient",
    "openfda.brand_name",
    "openfda.generic_name",
    "openfda.manufacturer_name",
    "openfda.product_type",
    "openfda.substance_name",
]
labels_table = pd.DataFrame(labels_df, columns=label_columns)

#drugs_table = pd.read_csv('')

#prices_table = pd.read_csv('')

# Preprocess Documents Table

In [None]:
# Overview of the documents table: dtypes and non-null counts
documents_table.info()

# Distinct document types present in the data
doc_type_values = documents_table['doc_type'].unique()
print(doc_type_values)

# Column plan:
#   doc_type (VARCHAR(4))  - no preprocessing needed
#   year (INT)             - no preprocessing needed
#   text (VARCHAR(tbd))    - goes through the text preprocessing pipeline

# NOTE(review): process_text is imported from the Functions notebook and is
# handed the whole 'text' column at once - confirm it accepts a Series rather
# than a single string.
raw_text = documents_table['text']
documents_table['text_processed'] = process_text(raw_text)

documents_table.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8854 entries, 0 to 8853
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   doc_type  8854 non-null   object
 1   year      8854 non-null   int64 
 2   text      8854 non-null   object
dtypes: int64(1), object(2)
memory usage: 207.6+ KB
['pr' 'talk']


# Preprocess Events Table

# Preprocess Labels Table

# Preprocess Drugs Table

# Preprocess Prices Table

# Standardize Variable Names Across Data Sources

In [None]:
# package_ndc = NDC

# Combining Data into SQL Database Based on Keys

In [6]:
# Keys in FDA data = rxcui OR package_ndc
# Key in medicaid data = NDC
# Key in RxNorm data = rxcui

### Create new database to store data

In [None]:
# Connect to the MySQL server (no default database selected yet)
connection = mysql.connector.connect(host="localhost", user="root", password=PASSWORD)
try:
    # Create a cursor object
    cursor = connection.cursor()
    try:
        # Drop the database if it exists - this is so we can start fresh (at least while developing).
        # WARNING: this deletes all data currently stored in pharma_db.
        cursor.execute("DROP DATABASE IF EXISTS pharma_db")

        # Create the 'pharma_db' database
        cursor.execute("CREATE DATABASE pharma_db")

        # Switch to the 'pharma_db' database
        cursor.execute("USE pharma_db")

        # Commit the changes
        connection.commit()
    finally:
        # Close the cursor even when one of the statements above fails
        cursor.close()
finally:
    # Close the connection even when an error occurred
    connection.close()

### Create Read-Only User Access for Public

In [None]:
# public username and password

### Create Edit Privileges for Us

In [None]:
# private username and password

### Create tables from API request dataframes

In [None]:
# Table definitions.
# NOTE(review): these are still templates. `table_name` and `vars` are
# placeholders that must be replaced with a real table name and real column
# definitions before this cell can succeed. All four statements also use the
# same table name, so with IF NOT EXISTS only the first could ever create a table.
create_table1_query = """
CREATE TABLE IF NOT EXISTS `table_name` (
  vars
) ENGINE=InnoDB;
"""

create_table2_query = """
CREATE TABLE IF NOT EXISTS `table_name` (
  vars
) ENGINE=InnoDB;
"""

create_table3_query = """
CREATE TABLE IF NOT EXISTS `table_name` (
  vars
) ENGINE=InnoDB;
"""

create_table4_query = """
CREATE TABLE IF NOT EXISTS `table_name` (
  vars
) ENGINE=InnoDB;
"""

# Connect to the MySQL server
connection = mysql.connector.connect(
    host="localhost", user="root", password=PASSWORD, database="pharma_db"
)
try:
    # Create a cursor object
    cursor = connection.cursor()
    try:
        # Create the tables
        for create_table_query in (
            create_table1_query,
            create_table2_query,
            create_table3_query,
            create_table4_query,
        ):
            cursor.execute(create_table_query)

        # show tables below
        cursor.execute("SHOW TABLES")

        # Fetch all the rows
        tables = cursor.fetchall()

        # Print the list of tables
        for table in tables:
            print(table[0])

        # Commit the changes
        connection.commit()
    finally:
        # Close the cursor even when one of the statements above fails
        cursor.close()
finally:
    # Close the connection even when an error occurred
    connection.close()