In [1]:
from dotenv import load_dotenv
load_dotenv()


True

## Parse the completed trials

In [2]:
import os
import datetime
import psycopg2
import pandas as pd

from typing import List
from sqlalchemy import create_engine, text


In [3]:
username = os.environ["USERNAME"]
password = os.environ["PASSWORD"]
hostname = os.environ["HOSTNAME"]
database = "aact"
port = os.environ["PORT"]

db_credentials = f"postgresql://{username}:{password}@{hostname}:{port}/{database}"
engine = create_engine(db_credentials)
db_connections = engine.connect()

In [4]:
sql_query = text(
    f"""
SELECT
    studies.nct_id,
    MAX(studies.brief_title) AS brief_title,
    MAX(studies.official_title) AS official_title,
    MAX(studies.overall_status) AS overall_status,
    STRING_AGG(DISTINCT baseline_measurements.description, ' ') AS baseline_measurements,
    STRING_AGG(DISTINCT brief_summaries.description, ' ') AS brief_summaries,
    STRING_AGG(DISTINCT detailed_descriptions.description, ' ') AS detailed_descriptions,
    MAX(eligibilities.criteria) AS criteria, 
    MAX(eligibilities.gender) AS gender, 
    MAX(eligibilities.minimum_age) AS minimum_age, 
    MAX(eligibilities.maximum_age) AS maximum_age, 
    MAX(facilities.name) AS facilities, 
    MAX(facilities.city) AS city, 
    MAX(facilities.state) AS state, 
    MAX(facilities.zip) AS zip, 
    MAX(facilities.country) AS country, 
    MAX(participant_flows.recruitment_details) AS recruitment_details, 
    MAX(participant_flows.pre_assignment_details) AS pre_assignment_details, 
    MAX(studies.study_type) AS study_type
FROM 
    ctgov.studies
INNER JOIN ctgov.baseline_measurements ON baseline_measurements.nct_id = studies.nct_id 
INNER JOIN ctgov.brief_summaries ON brief_summaries.nct_id = studies.nct_id 
INNER JOIN ctgov.detailed_descriptions ON detailed_descriptions.nct_id = studies.nct_id 
INNER JOIN ctgov.eligibilities ON eligibilities.nct_id = studies.nct_id 
INNER JOIN ctgov.facilities ON facilities.nct_id = studies.nct_id 
INNER JOIN ctgov.participant_flows ON participant_flows.nct_id = studies.nct_id 
WHERE
    studies.overall_status = 'Completed'
GROUP BY studies.nct_id;
"""
)

# Execute the SQL query and create a pandas DataFrame from the result
df = pd.read_sql_query(
    sql_query,
    engine,
    params={}  # type: ignore
)
df


Unnamed: 0,nct_id,brief_title,official_title,overall_status,baseline_measurements,brief_summaries,detailed_descriptions,criteria,gender,minimum_age,maximum_age,facilities,city,state,zip,country,recruitment_details,pre_assignment_details,study_type
0,NCT00000143,Studies of Ocular Complications of AIDS (SOCA)...,Studies of Ocular Complications of AIDS (SOCA)...,Completed,,"To compare the newest CMV retinitis drug, cido...",Cytomegalovirus (CMV) is among the most freque...,Inclusion criteria:\n\nAge 13 years or older\n...,All,13 Years,,"University of South Florida, MDC Box 21",Tampa,Texas,94143,United States,June 1997,,Interventional
1,NCT00000378,Antidepressant Treatment of Melancholia in Lat...,Antidepressant Treatment of Melancholia in Lat...,Completed,,The purpose of this study is to compare the sa...,To compare the efficacy and safety of a select...,Inclusion Criteria:\n\n-\n\nPatients must have...,All,60 Years,95 Years,1051 Riverside Drive,New York,New York,10032,United States,,,Interventional
2,NCT00000620,Action to Control Cardiovascular Risk in Diabe...,Action to Control Cardiovascular Risk in Diabe...,Completed,,The purpose of this study is to prevent major ...,"BACKGROUND:\n\nCurrently, about 17 million Ame...",Inclusion Criteria:\n\nDiagnosed with type 2 d...,All,40 Years,79 Years,Wake Forest University,Winston-Salem,Washington,98195,United States,All participants had established type 2 diabet...,Eligible participants provided evidence of abi...,Interventional
3,NCT00001213,Cysteamine Eye Drops to Treat Corneal Crystals...,Trial of Topical Cysteamine in the Treatment o...,Completed,Although 328 participants were initially enrol...,Cystinosis is an inherited disease that result...,"Protocol 86-EI-0062 began as a randomized, dou...",INCLUSION CRITERIA:\n\nPatients must have a do...,All,2 Years,,"National Institutes of Health Clinical Center,...",Bethesda,Maryland,20892,United States,,,Interventional
4,NCT00001259,A Treatment Study for Premenstrual Syndrome (PMS),The Treatment of Menstrually-Related Mood Diso...,Completed,,This study examines the effects of estrogen an...,This protocol is designed to accompany Clinica...,INCLUSION CRITERIA:\n\nThe subjects of this st...,Female,18 Years,45 Years,"National Institutes of Health Clinical Center,...",Bethesda,Maryland,20892,United States,One participant signed consent but withdrew pr...,"8 of 46 participants who completed Study 2, Ph...",Interventional
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28618,NCT05451225,Survey of Correction Officers Before and After...,Reducing Duration of Untreated Psychosis Throu...,Completed,,The investigators will implement a multifacete...,Because Correction Officers in the three jails...,Inclusion Criteria:\n\nCorrection Officers;\no...,All,21 Years,,Columbia University,New York,New York,10027,United States,Recruitment of the study sample took place in ...,Correction officers (COs) were recruited at ea...,Interventional
28619,NCT05489146,t-RNS After Hand Recovery in Chronic Stroke,Combining Transcranial Random Noise Stimulatio...,Completed,Fugl Meyer Upper Extremity Assessment (FMUE) m...,Upper extremity (UE) paresis or weakness is on...,The primary purpose of this study was to inves...,Inclusion Criteria:\n\nAge 18 or older\nEpisod...,All,18 Years,80 Years,Neuromotor Recovery and Rehabilitation Lab,Pittsburgh,Pennsylvania,15260,United States,,,Interventional
28620,NCT05502081,Clinical Study to Compare Efficacy and Safety ...,Clinical Study to Evaluate the Possible Effica...,Completed,0. Uninfected\n\nAmbulatory mild disease\n\nAs...,Introduction:\n\nCorona Virus induced disease ...,I. INTRODUCTION\n\n1.1. COVID-19 overview and ...,Inclusion Criteria:\n\nage more than 12 years ...,All,12 Years,,El-gomhoria St,Mansoura,El-dkhalia,050,Egypt,from 1/11/2021 to 29/5/2022 at isolation hospi...,assignment is applied after admission of parti...,Interventional
28621,NCT05594173,Chewing and Oral Processing of Solid Food,Chewing and Oral Processing of Solid Food in H...,Completed,,Food texture modification is commonly used as ...,Aim: To explore chewing and oral processing be...,Inclusion Criteria:\n\nHealthy adults under ag...,All,18 Years,60 Years,Toronto Rehabilitation Institute - University ...,Toronto,Ontario,M5G 2A2,Canada,,,Observational


In [5]:
len(df)


28623

In [6]:
df.set_index(df['nct_id'], inplace=True)
df.drop(columns=['nct_id'], inplace=True)


In [7]:
output = "ctgov"
timestamp = datetime.datetime.now().strftime("%Y%m%d")
file_name = f"{output}_{timestamp}.csv"
# Write the data to output filename
df.to_csv(file_name)


## Counting with tiktoken

In [15]:
import tiktoken
model = "cl100k_base"
encoder = tiktoken.get_encoding(model)

df = pd.read_csv("ctgov_20230613.csv", index_col="nct_id")
df


Unnamed: 0_level_0,brief_title,official_title,overall_status,baseline_measurements,brief_summaries,detailed_descriptions,criteria,gender,minimum_age,maximum_age,facilities,city,state,zip,country,recruitment_details,pre_assignment_details,study_type
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
NCT00000143,Studies of Ocular Complications of AIDS (SOCA)...,Studies of Ocular Complications of AIDS (SOCA)...,Completed,,"To compare the newest CMV retinitis drug, cido...",Cytomegalovirus (CMV) is among the most freque...,Inclusion criteria:\n\nAge 13 years or older\n...,All,13 Years,,"University of South Florida, MDC Box 21",Tampa,Texas,94143,United States,June 1997,,Interventional
NCT00000378,Antidepressant Treatment of Melancholia in Lat...,Antidepressant Treatment of Melancholia in Lat...,Completed,,The purpose of this study is to compare the sa...,To compare the efficacy and safety of a select...,Inclusion Criteria:\n\n-\n\nPatients must have...,All,60 Years,95 Years,1051 Riverside Drive,New York,New York,10032,United States,,,Interventional
NCT00000620,Action to Control Cardiovascular Risk in Diabe...,Action to Control Cardiovascular Risk in Diabe...,Completed,,The purpose of this study is to prevent major ...,"BACKGROUND:\n\nCurrently, about 17 million Ame...",Inclusion Criteria:\n\nDiagnosed with type 2 d...,All,40 Years,79 Years,Wake Forest University,Winston-Salem,Washington,98195,United States,All participants had established type 2 diabet...,Eligible participants provided evidence of abi...,Interventional
NCT00001213,Cysteamine Eye Drops to Treat Corneal Crystals...,Trial of Topical Cysteamine in the Treatment o...,Completed,Although 328 participants were initially enrol...,Cystinosis is an inherited disease that result...,"Protocol 86-EI-0062 began as a randomized, dou...",INCLUSION CRITERIA:\n\nPatients must have a do...,All,2 Years,,"National Institutes of Health Clinical Center,...",Bethesda,Maryland,20892,United States,,,Interventional
NCT00001259,A Treatment Study for Premenstrual Syndrome (PMS),The Treatment of Menstrually-Related Mood Diso...,Completed,,This study examines the effects of estrogen an...,This protocol is designed to accompany Clinica...,INCLUSION CRITERIA:\n\nThe subjects of this st...,Female,18 Years,45 Years,"National Institutes of Health Clinical Center,...",Bethesda,Maryland,20892,United States,One participant signed consent but withdrew pr...,"8 of 46 participants who completed Study 2, Ph...",Interventional
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NCT05451225,Survey of Correction Officers Before and After...,Reducing Duration of Untreated Psychosis Throu...,Completed,,The investigators will implement a multifacete...,Because Correction Officers in the three jails...,Inclusion Criteria:\n\nCorrection Officers;\no...,All,21 Years,,Columbia University,New York,New York,10027,United States,Recruitment of the study sample took place in ...,Correction officers (COs) were recruited at ea...,Interventional
NCT05489146,t-RNS After Hand Recovery in Chronic Stroke,Combining Transcranial Random Noise Stimulatio...,Completed,Fugl Meyer Upper Extremity Assessment (FMUE) m...,Upper extremity (UE) paresis or weakness is on...,The primary purpose of this study was to inves...,Inclusion Criteria:\n\nAge 18 or older\nEpisod...,All,18 Years,80 Years,Neuromotor Recovery and Rehabilitation Lab,Pittsburgh,Pennsylvania,15260,United States,,,Interventional
NCT05502081,Clinical Study to Compare Efficacy and Safety ...,Clinical Study to Evaluate the Possible Effica...,Completed,0. Uninfected\n\nAmbulatory mild disease\n\nAs...,Introduction:\n\nCorona Virus induced disease ...,I. INTRODUCTION\n\n1.1. COVID-19 overview and ...,Inclusion Criteria:\n\nage more than 12 years ...,All,12 Years,,El-gomhoria St,Mansoura,El-dkhalia,050,Egypt,from 1/11/2021 to 29/5/2022 at isolation hospi...,assignment is applied after admission of parti...,Interventional
NCT05594173,Chewing and Oral Processing of Solid Food,Chewing and Oral Processing of Solid Food in H...,Completed,,Food texture modification is commonly used as ...,Aim: To explore chewing and oral processing be...,Inclusion Criteria:\n\nHealthy adults under ag...,All,18 Years,60 Years,Toronto Rehabilitation Institute - University ...,Toronto,Ontario,M5G 2A2,Canada,,,Observational


In [17]:
df["concat"] = df.apply(lambda x: ' '.join(
    x.astype(str) if x is not None else "None. "), axis=1)
df["tokens"] = df["concat"].apply(lambda x: encoder.encode(x))
df["num_tokens"] = df["tokens"].apply(lambda x: len(x))
total_tokens = df['num_tokens'].sum()
# $0.0004 per 1000 tokens
total_price = total_tokens / 1000 * 0.0004
print(f"Price for {total_tokens} tokens is ${total_price}")


Price for 163015532 tokens is $65.2062128
