In [8]:
import requests
import json
import os

def get_study_data(page_size=1000):
    base_url = 'https://clinicaltrials.gov/api/v2/studies'
    
    params = {
        'format': 'json',       
        'pageSize': page_size,  
    }
    
    output_dir = "clinical_trials_data"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    all_studies_path = os.path.join(".", 'all_studies.jsonl')
    
    with open(all_studies_path, 'w', encoding='utf-8') as all_file:
        all_file.write('[\n')
    
        page_token = None
        page_number = 1
        first_study = True
        
        while True:
            if page_token:
                params['pageToken'] = page_token
            
            try:
                response = requests.get(base_url, params=params)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"API request failed: {e}")
                break

            data = response.json()
            studies = data.get('studies', [])

            if not studies:
                print("No more studies found.")
                break

            page_filename = os.path.join(output_dir, f"studies_page_{page_number}.json")
            with open(page_filename, 'w', encoding='utf-8') as page_file:
                json.dump(studies, page_file, indent=4)
            print(f"Saved page {page_number} with {len(studies)} studies to {page_filename}")

            for study in studies:
                if not first_study:
                    all_file.write(',\n')
                else:
                    first_study = False
                json.dump(study, all_file)
            
            page_token = data.get('nextPageToken')
            if not page_token:
                print("No more pages.")
                break

            page_number += 1
        
        all_file.write('\n]')
    
    print(f"All studies have been saved successfully in '{output_dir}' directory.")

if __name__ == "__main__":
    get_study_data()


Saved page 1 with 1000 studies to clinical_trials_data/studies_page_1.json
Saved page 2 with 1000 studies to clinical_trials_data/studies_page_2.json
Saved page 3 with 1000 studies to clinical_trials_data/studies_page_3.json
Saved page 4 with 1000 studies to clinical_trials_data/studies_page_4.json
Saved page 5 with 1000 studies to clinical_trials_data/studies_page_5.json
Saved page 6 with 1000 studies to clinical_trials_data/studies_page_6.json
Saved page 7 with 1000 studies to clinical_trials_data/studies_page_7.json
Saved page 8 with 1000 studies to clinical_trials_data/studies_page_8.json
Saved page 9 with 1000 studies to clinical_trials_data/studies_page_9.json
Saved page 10 with 1000 studies to clinical_trials_data/studies_page_10.json
Saved page 11 with 1000 studies to clinical_trials_data/studies_page_11.json
Saved page 12 with 1000 studies to clinical_trials_data/studies_page_12.json
Saved page 13 with 1000 studies to clinical_trials_data/studies_page_13.json
Saved page 14 wit

In [3]:
import chromadb
client = chromadb.PersistentClient(path="./clinical_trials_chroma")

In [9]:
import os
import json
from sentence_transformers import SentenceTransformer
from unidecode import unidecode

model = SentenceTransformer("malteos/scincl")
collection = client.get_or_create_collection("clinical_trials_studies")

def embed_studies_from_json(json_folder, batch_size=32):
    batch_texts = []       # To store concatenated officialTitle and detailedDescriptionฃ
    batch_metadata = []    # To store metadata
    batch_documents = []   # To store documents
    batch_ids = []         # To store study IDs
    index = 0
    length = len(os.listdir(json_folder))
    for file_name in os.listdir(json_folder):
        if file_name.endswith('.json'):
            index += 1
            file_path = os.path.join(json_folder, file_name)
            print(f"Processing file: {file_name}")
            with open(file_path, 'r', encoding='utf-8') as f:
                studies = json.load(f)

            for study in studies:
                try:
                    official_title = study.get('protocolSection', {}).get('identificationModule', {}).get('officialTitle', '')
                    detailed_description = study.get('protocolSection', {}).get('descriptionModule', {}).get('detailedDescription', "")

                    # Skip if no valid officialTitle or detailedDescription
                    if not official_title or not detailed_description:
                        continue
                    
                    concatenated_text = unidecode(f"{official_title} [SEP] {detailed_description}")
                    batch_texts.append(concatenated_text)
                    batch_metadata.append({
                        "nctId": study.get("protocolSection", {}).get("identificationModule", {}).get("nctId", "unknown"),
                        "officialTitle": official_title,
                        "detailed_description": detailed_description
                    })
                    batch_documents.append(json.dumps(study, ensure_ascii=True))
                    batch_ids.append(study.get("protocolSection", {}).get("identificationModule", {}).get("nctId", "unknown"))

                    # When batch size is reached, process the batch
                    if len(batch_texts) == batch_size:
                        process_batch(batch_texts, batch_documents, batch_ids, batch_metadata)
                        print(f"Processed {len(batch_texts)} studies. {index}/{length}")
                        # Clear the batches
                        batch_texts.clear()
                        batch_documents.clear()
                        batch_metadata.clear()
                        batch_ids.clear()

                except Exception as e:
                    print(f"Failed to process study: {e}")
                    continue
                
    if batch_texts:
        process_batch(batch_texts, batch_documents, batch_ids, batch_metadata)

def process_batch(texts, documents, ids, metadatas):
    embeddings = model.encode(texts, batch_size=len(texts))
    collection.add(
        embeddings=embeddings,
        documents=documents,
        metadatas=metadatas,
        ids=ids
    )
    print(f"Processed and added batch of {len(texts)} studies.")

json_folder = "clinical_trials_data"
embed_studies_from_json(json_folder, batch_size=750)
print("Embedding and storing complete!")

Processing file: studies_page_122.json
Processing file: studies_page_292.json
Processing batch of 750 studies...
Processing batch of 750 documents...
Processing batch of 750 ids...
Processing batch of 750 metadatas...
Processed and added batch of 750 studies.
Processed 750 studies. 2/512
Processing file: studies_page_493.json
Processing batch of 750 studies...
Processing batch of 750 documents...
Processing batch of 750 ids...
Processing batch of 750 metadatas...


KeyboardInterrupt: 

: 