In [1]:
import os
import sys
import time
import requests
import datetime
import pandas as pd
import numpy as np 
np.set_printoptions(threshold=sys.maxsize)

from tqdm.auto import tqdm
from typing import List
from sqlalchemy import create_engine, text

In [2]:
# Load KEY=VALUE pairs from a .env file into os.environ so the database
# settings read in the next cell are available.
# NOTE(review): with its default arguments load_dotenv() keeps variables that
# are already set in the process environment instead of overriding them —
# confirm that is intended for names the OS or shell may predefine.
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# Database connection settings are read from environment variables; a missing
# variable raises KeyError right here instead of failing later at connect time.
# NOTE(review): USERNAME and HOSTNAME are names that operating systems and
# shells often predefine — confirm the values picked up here are the intended
# database ones.
from urllib.parse import quote

username = os.environ["USERNAME"]
password = os.environ["PASSWORD"]
hostname = os.environ["HOSTNAME"]
database = os.environ["DATABASE"]
port = os.environ["PORT"]

# Percent-encode the user name and password before embedding them in the URL:
# characters such as "@", ":", "/" or "%" would otherwise be parsed as URL
# delimiters and silently corrupt the connection string. safe="" makes sure
# "/" is encoded too.
db_credentials = (
    f"postgresql://{quote(username, safe='')}:{quote(password, safe='')}"
    f"@{hostname}:{port}/{database}"
)
engine = create_engine(db_credentials)

In [4]:
# Query selecting every column of every row in ctgov.provided_documents.
# The statement is assembled from explicit line literals so that its exact
# whitespace is visible in the source.
sql_query = text(
    "\n"
    "SELECT\n"
    "    *\n"
    "FROM \n"
    "    ctgov.provided_documents\n"
)

In [5]:
# Run the prepared statement through the engine and collect the whole result
# set into a pandas DataFrame; the statement takes no bind parameters.
df = pd.read_sql_query(sql=sql_query, con=engine, params={})  # type: ignore
df

Unnamed: 0,id,nct_id,document_type,has_protocol,has_icf,has_sap,document_date,url
0,1549700,NCT01688999,Study Protocol and Statistical Analysis Plan,True,False,True,2019-05-10,https://ClinicalTrials.gov/ProvidedDocs/99/NCT...
1,1549701,NCT01688999,Informed Consent Form,False,True,False,2019-06-28,https://ClinicalTrials.gov/ProvidedDocs/99/NCT...
2,1557841,NCT02173548,Study Protocol and Statistical Analysis Plan,True,False,True,2016-04-20,https://ClinicalTrials.gov/ProvidedDocs/48/NCT...
3,1557842,NCT03543878,Study Protocol and Statistical Analysis Plan,True,False,True,2019-11-04,https://ClinicalTrials.gov/ProvidedDocs/78/NCT...
4,1557843,NCT04501640,Study Protocol,True,False,False,2020-09-17,https://ClinicalTrials.gov/ProvidedDocs/40/NCT...
...,...,...,...,...,...,...,...,...
40269,1557836,NCT03626714,Study Protocol and Statistical Analysis Plan,True,False,True,2018-07-02,https://ClinicalTrials.gov/ProvidedDocs/14/NCT...
40270,1557837,NCT03538808,Study Protocol and Statistical Analysis Plan,True,False,True,2017-12-05,https://ClinicalTrials.gov/ProvidedDocs/08/NCT...
40271,1557838,NCT03540134,Study Protocol and Statistical Analysis Plan,True,False,True,2018-10-08,https://ClinicalTrials.gov/ProvidedDocs/34/NCT...
40272,1557839,NCT03542474,Study Protocol and Statistical Analysis Plan,True,False,True,2019-02-06,https://ClinicalTrials.gov/ProvidedDocs/74/NCT...


In [6]:
# Inspect the index of the freshly loaded frame.
df.index

RangeIndex(start=0, stop=40274, step=1)

In [7]:
# Summary statistics of the frame (describe() with its default arguments).
df.describe()

Unnamed: 0,id
count,40274.0
mean,1542771.0
std,11645.02
min,1522595.0
25%,1532685.0
50%,1542774.0
75%,1552856.0
max,1562933.0


In [8]:
# Re-key the frame by nct_id, drop the "id" column and snapshot the result to
# CSV.
# The guards keep the cell safe to re-run: without them a second execution
# raises KeyError because "nct_id" and "id" are no longer columns.
if "nct_id" in df.columns:
    # set_index by column name moves the column into the index in one step.
    df = df.set_index("nct_id")
df = df.drop(columns=["id"], errors="ignore")
df.to_csv("provided_document.csv")

In [9]:
# Split df by the (has_protocol, has_icf, has_sap) flag combination and print
# the size of each subset.

def _rows_with_flags(frame, protocol, icf, sap):
    """Return the rows of `frame` whose three flag columns equal the given values."""
    mask = (
        (frame['has_protocol'] == protocol)
        & (frame['has_icf'] == icf)
        & (frame['has_sap'] == sap)
    )
    return frame[mask]


# Single-document subsets.
df_prot = _rows_with_flags(df, True, False, False)
print(len(df_prot))
df_icf = _rows_with_flags(df, False, True, False)
print(len(df_icf))
df_sap = _rows_with_flags(df, False, False, True)
print(len(df_sap))

# Combined-document subsets.
df_prot_sap = _rows_with_flags(df, True, False, True)
print(len(df_prot_sap))
df_prot_icf = _rows_with_flags(df, True, True, False)
print(len(df_prot_icf))
df_prot_sap_icf = _rows_with_flags(df, True, True, True)
print(len(df_prot_sap_icf))


10667
5782
8989
13797
205
834


In [10]:
# Sanity check: the six flag-combination subsets are mutually exclusive, so
# their sizes should add up to the number of rows in df. The sizes are taken
# from the live subsets rather than typed in by hand, so the check stays valid
# when the underlying table changes.
subset_sizes = [
    len(df_prot),
    len(df_icf),
    len(df_sap),
    len(df_prot_sap),
    len(df_prot_icf),
    len(df_prot_sap_icf),
]
assert sum(subset_sizes) == len(df), (
    "some rows match none of the six has_protocol/has_icf/has_sap combinations"
)
sum(subset_sizes)


40274

In [11]:
# Download the documents of one flag combination into the "pdf" folder, naming
# each file "<nct_id>_<kind>.pdf". Files that already exist are skipped, so the
# cell can be interrupted and re-run to resume.
PDF_DIR = "pdf"
REQUEST_DELAY_S = 1     # pause before every HTTP request
REQUEST_TIMEOUT_S = 60  # seconds before a stalled request is abandoned

os.makedirs(PDF_DIR, exist_ok=True)

# Mapping from (has_protocol, has_icf, has_sap) to the file-name suffix
document_type_mappings = {
    (True, False, False): 'Prot',
    (False, True, False): 'ICF',
    (False, False, True): 'SAP',
    (True, False, True): 'Prot_SAP',
    (True, True, False): 'Prot_ICF',
    (True, True, True): 'Prot_SAP_ICF'
}

# The (has_protocol, has_icf, has_sap) combination to download.  # Modify here
wanted_flags = (True, False, False)
file_type = document_type_mappings.get(wanted_flags, 'Unknown')

# Select the wanted rows up front so the loop (and the progress bar total)
# only covers documents that may actually be downloaded.
to_download = df[
    (df['has_protocol'] == wanted_flags[0])
    & (df['has_icf'] == wanted_flags[1])
    & (df['has_sap'] == wanted_flags[2])
]

# HTTP status code of every request that received a response
codes = []
for nct_id, row in tqdm(to_download.iterrows(), total=len(to_download)):
    # NOTE(review): the name depends only on nct_id and the document kind, so a
    # second row with the same nct_id and flags is skipped as "already exists"
    # — confirm that is acceptable.
    filename = f"{nct_id}_{file_type}.pdf"
    target_path = os.path.join(PDF_DIR, filename)

    if os.path.exists(target_path):
        continue

    # Sleep only when a request is really about to be made; skipped rows
    # cost nothing.
    time.sleep(REQUEST_DELAY_S)

    try:
        response = requests.get(row['url'], timeout=REQUEST_TIMEOUT_S)
    except requests.RequestException as exc:
        # A single network failure must not abort the whole batch.
        print(f"Failed to download {filename}: {exc}")
        continue

    codes.append(response.status_code)
    if response.status_code == 200:
        # Write to a temporary name and rename afterwards, so an interrupted
        # write never leaves a truncated file that later runs would skip.
        partial_path = target_path + ".part"
        with open(partial_path, "wb") as file:
            file.write(response.content)
        os.replace(partial_path, target_path)
    else:
        print(f"Failed to download {filename}")
    

0it [00:00, ?it/s]

KeyboardInterrupt: 

In [None]:
# display(df["document_type"].str.count(r"Study Protocol, Statistical Analysis Plan, and Informed Consent Form: Study Protocol*").sum())
# display(df["document_type"].str.count(r"Study Protocol and Statistical Analysis Plan*").sum())
# Tally rows per document_type family with anchored regular expressions and
# compare the grand total with the number of rows in df.
# NOTE(review): the first pattern only excludes the two continuations named in
# its lookaheads, so it also matches e.g. "Study Protocol and Informed Consent
# Form" — confirm that is intended.
family_patterns = (
    r"^Study Protocol(?! and Statistical Analysis Plan)(?!, Statistical Analysis Plan, and Informed Consent Form).*",
    r"^Statistical Analysis Plan.*",
    r"^Informed Consent Form.*",
    r"^Study Protocol and Statistical Analysis Plan.*",
    r"^Study Protocol, Statistical Analysis Plan, and Informed Consent Form.*",
)
separator = "="*20

print(len(df))
print(separator)
tallies = []
for family_pattern in family_patterns:
    # str.count yields the number of pattern matches per row; summing gives
    # the family total.
    tally = df["document_type"].str.count(family_pattern).sum()
    print(tally)
    tallies.append(tally)
a, b, c, d, e = tallies
print(separator)
print(a+b+c+d+e)


In [None]:
# # Download all the links in pandas and put them in the folder pdf 
# # Create the "pdf" folder if it doesn't exist
# if not os.path.exists("pdf"):
#     os.makedirs("pdf")

# # Define the document type mappings
# document_type_mappings = {
#     r'^Study Protocol(?! and Statistical Analysis Plan)(?!, Statistical Analysis Plan, and Informed Consent Form).*': 'Prot',
#     r'^Statistical Analysis Plan.*': 'SAP',
#     r'^Informed Consent Form.*': 'ICF',
#     r'^Study Protocol and Statistical Analysis Plan.*': 'Prot_SAP',
#     r'^Study Protocol, Statistical Analysis Plan, and Informed Consent Form.*': 'Prot_SAP_ICF'
# }

# # Download and save the PDF files
# codes = []
# for index, row in tqdm(df.iterrows()):
#     # Introduce a delay of 1 second
#     # time.sleep(1)
    
#     link = row['url']  # Replace 'link' with the actual column name in your DataFrame

#     document_type = row['document_type']  # Replace 'document_type' with the actual column name in your DataFrame

#     # Get the filename from the URL
#     filename = link.split("/")[-1]

#     # Map the document type to a specific name
#     new_document_type = None
#     for pattern, mapped_type in document_type_mappings.items():
#         if pd.Series(document_type).str.contains(pattern, regex=True).bool():
#             new_document_type = mapped_type
#             break

#     if new_document_type is None:
#         print(f"Could not map document type: {document_type}")
#         continue
    
#     if new_document_type!="Prot": # Modify here
#         continue
    
#     # Generate the new filename as "nct_id_document_type.pdf"
#     nct_id = index  # Replace 'nct_id' with the actual column name in your DataFrame
#     new_filename = f"{nct_id}_{new_document_type}.pdf"

#     # Check if the file already exists in the "pdf" folder
#     if os.path.exists(os.path.join("pdf", new_filename)):
#         continue


#     # Make the request to download the file
#     response = requests.get(link)
#     codes.append(response.status_code)
#     # Check if the request was successful
#     if response.status_code == 200:
#         # Save the file to the "pdf" folder
#         with open(f"pdf/{new_filename}", "wb") as file:
#             file.write(response.content)
#             # print(f"Downloaded {filename} to {new_filename}")
#     else:
#         print(f"Failed to download {filename} to {new_filename}")
    