In [59]:
import pandas as pd
import numpy as np
import os
import glob
from tqdm import tqdm
tqdm.pandas()


In [60]:
# Snapshot identifier; the same name is reused further down as the on-disk
# directory of the saved Hugging Face dataset.
dataname = "ctgov_437713_20230321"
# The raw CSV export carries the snapshot name plus a ".csv" suffix.
filename = f"{dataname}.csv"
# Key the rows by trial identifier, then print the column / dtype summary.
df = pd.read_csv(filename, index_col="nct_id")
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 437713 entries, NCT02421263 to NCT02467868
Data columns (total 6 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   brief_title              437713 non-null  object
 1   study_type               436883 non-null  object
 2   source                   437713 non-null  object
 3   start_date               432681 non-null  object
 4   verification_date        436883 non-null  object
 5   primary_completion_date  415903 non-null  object
dtypes: object(6)
memory usage: 23.4+ MB


In [61]:
import torch
from transformers import AutoTokenizer, AutoModel
from datasets import Dataset

In [62]:
def _generate_embedding(
        text=None, 
        model=None,
        modelname=None,
        tokenizer=None, 
        size=256
    ):
    # print(modelname)
    # Split the text into smaller chunks to fit the BERT model_name's input size
    chunks = [text[i:i+size] for i in range(0, len(text), size)] # type: ignore
    # Generate BERT embeddings for each chunk and concatenate them
    embeddings = []
    for chunk in chunks:
        if "openai" in modelname: # type: ignore
            chunk_embedding = openai.Embedding.create(
                input=[chunk],
                model="text-embedding-ada-002"
            )['data'][0]['embedding']  # type: ignore
        else:
            # Tokenize the text
            tokens = tokenizer.encode(chunk, add_special_tokens=True)
            device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
            tokens = torch.tensor([tokens]).to(device)

            # Generate the BERT/GPT embeddings
            chunk_outputs = model(tokens)
            # Extract the tensor containing the embeddings
            chunk_embeddings = chunk_outputs.last_hidden_state
            # Average the embeddings over the sequence length to get a single vector for the chunk
            chunk_embedding = torch.mean(chunk_embeddings, dim=1).tolist()[0]
        embeddings.append(chunk_embedding)
    row_embedding = np.array([sum(x) for x in zip(*embeddings)])
    return row_embedding


In [63]:
# --- Embedding configuration ---
cache_dir = os.path.join(os.path.curdir, 'cache')  # download cache for tokenizers / models
size = 256                                         # characters per text chunk
modelnames = [
    # 'openai/text-embedding-ada-002',
    'emilyalsentzer/Bio_ClinicalBERT',
    'microsoft/biogpt'
]
column = 'brief_title'                             # text column that gets embedded
from_scratch = False                               # True: recompute embeddings; False: load saved dataset

if not from_scratch:
    # Fast path: reuse the dataset previously written under `dataname`.
    dataset = Dataset.load_from_disk(dataname)
else:
    for modelname in modelnames:
        print(modelname)

        if 'openai' in modelname:  # type: ignore
            # Placeholder only: no embeddings are produced for OpenAI models.
            # A disabled prototype used to live here; it cached one
            # `temp/<nct_id>.npy` file per trial, resumed from the number of
            # files already present, and assembled the column once every
            # trial had a file.
            tokenizer = None
            model = 'openai'
            continue

        tokenizer = AutoTokenizer.from_pretrained(modelname, cache_dir=cache_dir)
        model = AutoModel.from_pretrained(modelname, cache_dir=cache_dir)

        # Run on the GPU when one is visible, otherwise stay on the CPU.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)

        # One embedding per title, stored in a column named after the model.
        df[modelname] = df[column].progress_apply(
            lambda title: _generate_embedding(
                text=title,
                model=model,
                modelname=modelname,
                tokenizer=tokenizer,
                size=size,
            ),  # type: ignore
        )

    dataset = Dataset.from_pandas(df)


In [64]:
# Show the dataset summary: feature (column) names and the number of rows.
print(dataset)

Dataset({
    features: ['brief_title', 'study_type', 'source', 'start_date', 'verification_date', 'primary_completion_date', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/biogpt', 'nct_id'],
    num_rows: 437713
})


In [65]:
# Dump the DatasetInfo metadata, which includes the per-column feature types.
print(dataset.info)

DatasetInfo(description='', citation='', homepage='', license='', features={'brief_title': Value(dtype='string', id=None), 'study_type': Value(dtype='string', id=None), 'source': Value(dtype='string', id=None), 'start_date': Value(dtype='string', id=None), 'verification_date': Value(dtype='string', id=None), 'primary_completion_date': Value(dtype='string', id=None), 'emilyalsentzer/Bio_ClinicalBERT': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'microsoft/biogpt': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'nct_id': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, task_templates=None, builder_name=None, config_name=None, version=None, splits=None, download_checksums=None, download_size=None, post_processing_size=None, dataset_size=None, size_in_bytes=None)


In [66]:
# Attach one FAISS index per embedding column. The later similarity query
# looks each index up by the model name, so the column name doubles as the
# index name.
for modelname in modelnames:
    print(modelname)
    # dataset.add_faiss_index(column='embeddings')
    dataset.add_faiss_index(column=modelname)

emilyalsentzer/Bio_ClinicalBERT


  0%|          | 0/438 [00:00<?, ?it/s]

microsoft/biogpt


  0%|          | 0/438 [00:00<?, ?it/s]

In [67]:
# Rebind `df` to the dataset's pandas view. Unlike the frame read from the
# CSV, this one carries `nct_id` as a regular column (the query cell filters
# on it) alongside the embedding columns.
df = dataset.to_pandas()
print(df)

                                              brief_title  \
0       The Effects of Psilocybin-Facilitated Experien...   
1       Study to Explore the Safety, Tolerability and ...   
2       Phase II Study of DC Versus 5-FU/CF as Chemoth...   
3       Small Intestinal Bacterial Overgrowth: A Prosp...   
4       Prevalence and Clinical Severity of Cutaneous ...   
...                                                   ...   
437708  Hyper-synchronicity in Hypertrophic Cardiomyop...   
437709  Anti-OX40 Antibody (MEDI6469) in Patients With...   
437710  Pharmacokinetic / Pharmacodynamic Study Compar...   
437711  Study of Efficacy and Safety of Myl1401O + Tax...   
437712  Efficacy and Safety Study With MYL-1401H and N...   

                              study_type                         source  \
0                         Interventional             NYU Langone Health   
1                         Interventional             Incyte Corporation   
2                         Interventional  

In [68]:
# Trial whose nearest neighbours we want to inspect.
nct_id = "NCT02421263"
for modelname in modelnames:
    # Select the query trial's embedding by POSITION. Indexing the filtered
    # Series with `[0]` is a label lookup, which only works when the matching
    # row happens to carry index label 0 (true for the first row only).
    matches = df.loc[df["nct_id"] == nct_id, modelname]
    if matches.empty:
        raise KeyError(f"nct_id {nct_id!r} not found in df")
    # FAISS operates on float32 vectors, so hand it a float32 query directly
    # instead of relying on an implicit conversion of the float64 column.
    nct_id_embedding = np.asarray(matches.iloc[0], dtype=np.float32)

    # Look up the 10 closest rows in the index registered under `modelname`.
    scores, retrieved_examples = dataset.get_nearest_examples(
        modelname, nct_id_embedding, k=10)

    # One line per hit: model, query id, score, neighbour id, neighbour title.
    for score, index, title in zip(scores, retrieved_examples["nct_id"], retrieved_examples["brief_title"]):
        print(modelname, nct_id, score, index, title)

emilyalsentzer/Bio_ClinicalBERT NCT02421263 0.0 NCT02421263 The Effects of Psilocybin-Facilitated Experience on the Psychology and Effectiveness of Religious Professionals
emilyalsentzer/Bio_ClinicalBERT NCT02421263 5.5929375 NCT02243813 Effects of Psilocybin-facilitated Experience on the Psychology and Effectiveness of Professional Leaders in Religion
emilyalsentzer/Bio_ClinicalBERT NCT02421263 11.584749 NCT05652803 Bibliotherapy-based Psychoeducation Program the Effect of Elderly Individuals on Depression and Hopelessness Levels
emilyalsentzer/Bio_ClinicalBERT NCT02421263 12.550905 NCT03389568 The Effects of Singing-based Music Therapy Program on the Level of Psychoemotional Benefits in Caregivers of ICU Patients
emilyalsentzer/Bio_ClinicalBERT NCT02421263 12.657162 NCT05508048 The Effects of Logotherapy Based Psychosocial Support Program on Nursing Students on the Meaning of Life and Life Satisfaction
emilyalsentzer/Bio_ClinicalBERT NCT02421263 12.846097 NCT03232541 The Effects of A

In [69]:
# from huggingface_hub import notebook_login
# token = os.environ["HF_TOKEN"]  # access token redacted: never commit credentials; revoke the token that was stored here
# notebook_login()

In [70]:
# import requests

# url = "https://huggingface.co/api/datasets/tmquan/ctgov-studies-embeddings/commit/main"
# params = {"create_pr": 1}

# response = requests.post(url, params=params)

# if response.status_code == 200:
#     print("Pull Request created successfully.")
# else:
#     print("Error:", response.status_code, response.reason)


In [71]:
# dataset.push_to_hub("ctgov-studies-embeddings", private=True)


In [72]:
if from_scratch:
    # `Dataset.save_to_disk` raises a ValueError while indexes are attached,
    # and the FAISS indexes were added in an earlier cell. Detach them first.
    # Re-run the indexing cell afterwards if similarity queries are needed.
    for index_name in dataset.list_indexes():
        dataset.drop_index(index_name)
    # Persist the freshly computed embeddings under the snapshot name.
    dataset.save_to_disk(dataname)


In [73]:
# Export the frame, embeddings included, to CSV.
#
# The embedding cells hold NumPy arrays. `to_csv` writes such cells via
# `str()`, which NumPy prints space-separated across several lines and
# abbreviates with "..." once an array exceeds its print threshold (1000
# elements by default). Converting each vector to a plain Python list first
# keeps every value in a bracketed, comma-separated form that can be parsed
# back.
# NOTE(review): whether a given model's vectors exceed the print threshold
# depends on its hidden size -- confirm per model.
#
# Rows are converted and appended in chunks so that only one chunk of
# list-converted vectors is held in memory at a time.
csv_path = "ctgov-studies-embeddings.csv"
rows_per_chunk = 10_000
embedding_columns = [name for name in modelnames if name in df.columns]

# max(..., 1) makes an empty frame still produce a header-only file.
for start in range(0, max(len(df), 1), rows_per_chunk):
    part = df.iloc[start:start + rows_per_chunk].copy()
    for embedding_column in embedding_columns:
        part[embedding_column] = part[embedding_column].map(
            lambda values: np.asarray(values).tolist()
        )
    # The first chunk creates the file with the header; later chunks append.
    part.to_csv(csv_path, mode="w" if start == 0 else "a", header=(start == 0))


In [None]:
# Display the frame as this cell's output.
df

In [None]:
# Load the dataset back from the Hugging Face Hub repository.
# NOTE(review): this rebinds `dataset`. `load_dataset` without a `split`
# presumably returns a split-keyed DatasetDict rather than the single Dataset
# used by the earlier cells -- confirm before re-running them afterwards.
from datasets import load_dataset
dataset = load_dataset("tmquan/ctgov-studies-embeddings")