In [1]:
import pandas as pd
import numpy as np
import os
import glob
from tqdm import tqdm
tqdm.pandas()


In [2]:
# Snapshot identifier: the CSV file name is derived from it, and the same
# name is used further down as the on-disk location of the cached dataset.
# Previous snapshot: "ctgov_437713_20230321"
dataname = "ctgov_20230417"
filename = f"{dataname}.csv"

# Load the trials table keyed by NCT identifier and summarise its columns.
df = pd.read_csv(filename, index_col="nct_id")
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 34983 entries, NCT00000143 to NCT05736861
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   brief_title             34983 non-null  object
 1   official_title          34897 non-null  object
 2   baseline_measurements   7582 non-null   object
 3   brief_summaries         34983 non-null  object
 4   detailed_descriptions   34983 non-null  object
 5   criteria                34982 non-null  object
 6   gender                  34968 non-null  object
 7   minimum_age             33303 non-null  object
 8   maximum_age             17129 non-null  object
 9   facilities              33586 non-null  object
 10  city                    34983 non-null  object
 11  state                   31170 non-null  object
 12  zip                     32401 non-null  object
 13  country                 34983 non-null  object
 14  recruitment_details     14607 non-null  obj

In [3]:
import torch
from transformers import AutoTokenizer, AutoModel
from datasets import Dataset

In [4]:
def _generate_embedding(
        text=None, 
        model=None,
        modelname=None,
        tokenizer=None, 
        size=256
    ):
    """Embed ``text`` as the element-wise SUM of its per-chunk embeddings.

    The text is cut into consecutive slices of ``size`` characters (not
    tokens). Each slice is embedded separately and the resulting vectors are
    added together, so the magnitude of the result grows with the number of
    chunks.

    Args:
        text: String to embed. Must support ``len`` and slicing.
        model: Callable taking a ``(1, seq_len)`` token-id tensor and
            returning an object with a ``last_hidden_state`` attribute.
            Unused on the OpenAI path.
        modelname: Model identifier. If it contains ``"openai"`` the OpenAI
            embedding endpoint is used; otherwise (including ``None``) the
            local ``model``/``tokenizer`` pair is used.
        tokenizer: Object exposing ``encode(chunk, add_special_tokens=True)``
            that returns a list of token ids. Unused on the OpenAI path.
        size: Chunk length in characters.

    Returns:
        A 1-D ``numpy.ndarray`` holding the summed embedding, or an empty
        array when ``text`` is empty.
    """
    # Character-based chunking keeps every piece short enough for the model.
    chunks = [text[i:i + size] for i in range(0, len(text), size)]  # type: ignore

    # Decide the backend once; a missing modelname means "local model"
    # instead of raising TypeError on the `in` check.
    use_openai = modelname is not None and "openai" in modelname
    if not use_openai:
        # Loop-invariant: pick the device a single time, not per chunk.
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    embeddings = []
    for chunk in chunks:
        if use_openai:
            # NOTE(review): `openai` is not imported in the visible notebook
            # cells; this branch needs `import openai` (and credentials)
            # before it can run.
            chunk_embedding = openai.Embedding.create(
                input=[chunk],
                model="text-embedding-ada-002"
            )['data'][0]['embedding']  # type: ignore
        else:
            # Tokenize the chunk and add a batch dimension of 1.
            tokens = tokenizer.encode(chunk, add_special_tokens=True)
            tokens = torch.tensor([tokens]).to(device)

            # Inference only: skip autograd bookkeeping so no graph is kept
            # alive for each of the (many) chunks.
            with torch.no_grad():
                chunk_outputs = model(tokens)
            # Mean over the sequence axis gives one vector per chunk.
            chunk_embedding = torch.mean(chunk_outputs.last_hidden_state, dim=1).tolist()[0]
        embeddings.append(chunk_embedding)

    # Element-wise sum across chunks; with no chunks this is an empty array.
    row_embedding = np.array([sum(x) for x in zip(*embeddings)])
    return row_embedding


In [5]:
column = 'info'
# Concatenate every field of a trial into one space-separated string; this is
# the text that gets embedded later. Missing values are stringified by
# `astype(str)` (NaN becomes "nan").
# Any existing `info` column is left out of the join so that re-running this
# cell does not fold the previous result into itself.
df[column] = (
    df.drop(columns=[column], errors="ignore")
      .apply(lambda row: ' '.join(row.astype(str)), axis=1)
)
df.head(2)

Unnamed: 0_level_0,brief_title,official_title,baseline_measurements,brief_summaries,detailed_descriptions,criteria,gender,minimum_age,maximum_age,facilities,city,state,zip,country,recruitment_details,pre_assignment_details,study_type,info
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
NCT00000143,Studies of Ocular Complications of AIDS (SOCA)...,Studies of Ocular Complications of AIDS (SOCA)...,,"To compare the newest CMV retinitis drug, cido...",Cytomegalovirus (CMV) is among the most freque...,Inclusion criteria:\n\nAge 13 years or older\n...,All,13 Years,,"University of South Florida, MDC Box 21",Tampa,Texas,94143,United States,June 1997,,Interventional,Studies of Ocular Complications of AIDS (SOCA)...
NCT00000378,Antidepressant Treatment of Melancholia in Lat...,Antidepressant Treatment of Melancholia in Lat...,,The purpose of this study is to compare the sa...,To compare the efficacy and safety of a select...,Inclusion Criteria:\n\n-\n\nPatients must have...,All,60 Years,95 Years,1051 Riverside Drive,New York,New York,10032,United States,,,Interventional,Antidepressant Treatment of Melancholia in Lat...


In [6]:
# Where Hugging Face model/tokenizer downloads are cached.
cache_dir = os.path.join(os.path.curdir, 'cache')
# Chunk length (in characters) passed to _generate_embedding.
size = 256
# Each model name doubles as the name of the DataFrame/dataset column that
# holds its embeddings.
modelnames = [
    # 'openai/text-embedding-ada-002',
    'emilyalsentzer/Bio_ClinicalBERT', 
    'microsoft/biogpt'
]

# True: recompute every embedding from `df` (slow).
# False: load the dataset previously saved under `dataname`.
from_scratch = False
if from_scratch:
    for modelname in modelnames:
        print(modelname)
        if 'openai' in modelname:  # type: ignore
            # The OpenAI path is currently disabled; the resumable
            # implementation is kept below for reference.
            tokenizer = None
            model = 'openai'

            # # First, check the number to trials in temp folder
            # np_files = glob.glob(os.path.join('temp', '*.npy'))
            # print(f'{len(np_files)} out of {len(df)}')
            # st_index = 0
            # if len(np_files)>=2:
            #     st_index = len(np_files)-2

            # for index, row in tqdm(df[column][st_index:].iteritems()):
            #     embedding = _generate_embedding(row,
            #                                     tokenizer=tokenizer,
            #                                     modelname=modelname,
            #                                     model=model,
            #                                     size=size)
            #     np.save(f'temp/{index}.npy', embedding) # type: ignore
            # if len(np_files) == len(df):
            #     embeddings = []
            #     for index, row in df[column].iteritems():
            #         embedding = np.load(f'temp/{index}.npy')
            #         print(f'temp/{index}.npy', (embedding.shape))
            #         # df.loc[index, 'embedding'] = embedding
            #         embeddings.append(np.array2string(embedding, separator=",", threshold=np.inf)) # type: ignore
            #     df[[modelname]] = embeddings

        else:
            tokenizer = AutoTokenizer.from_pretrained(modelname, cache_dir=cache_dir)
            model = AutoModel.from_pretrained(modelname, cache_dir=cache_dir)

            # Run on GPU when one is available.
            device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
            model.to(device)

            # One embedding vector per trial, stored in a column named after
            # the model.
            df[modelname] = df[column].progress_apply(
                lambda row: _generate_embedding(
                    row,
                    tokenizer=tokenizer,
                    modelname=modelname,
                    model=model,
                    size=size
                ), # type: ignore
            )
    # `df` is already indexed by nct_id (see read_csv(index_col="nct_id")),
    # so no set_index call is needed here: there is no `nct_id` column to
    # promote, and calling it inside the loop raised KeyError.
    dataset = Dataset.from_pandas(df)
else:
    dataset = Dataset.load_from_disk(dataname)


In [7]:
# Summarise the dataset: its feature (column) names and the number of rows.
print(dataset)

Dataset({
    features: ['brief_title', 'official_title', 'baseline_measurements', 'brief_summaries', 'detailed_descriptions', 'criteria', 'gender', 'minimum_age', 'maximum_age', 'facilities', 'city', 'state', 'zip', 'country', 'recruitment_details', 'pre_assignment_details', 'study_type', 'info', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/biogpt', 'nct_id'],
    num_rows: 34983
})


In [8]:
# Show the dataset metadata, including the declared type of every feature.
print(dataset.info)

DatasetInfo(description='', citation='', homepage='', license='', features={'brief_title': Value(dtype='string', id=None), 'official_title': Value(dtype='string', id=None), 'baseline_measurements': Value(dtype='string', id=None), 'brief_summaries': Value(dtype='string', id=None), 'detailed_descriptions': Value(dtype='string', id=None), 'criteria': Value(dtype='string', id=None), 'gender': Value(dtype='string', id=None), 'minimum_age': Value(dtype='string', id=None), 'maximum_age': Value(dtype='string', id=None), 'facilities': Value(dtype='string', id=None), 'city': Value(dtype='string', id=None), 'state': Value(dtype='string', id=None), 'zip': Value(dtype='string', id=None), 'country': Value(dtype='string', id=None), 'recruitment_details': Value(dtype='string', id=None), 'pre_assignment_details': Value(dtype='string', id=None), 'study_type': Value(dtype='string', id=None), 'info': Value(dtype='string', id=None), 'emilyalsentzer/Bio_ClinicalBERT': Sequence(feature=Value(dtype='float64',

In [9]:
# Attach one FAISS index per embedding column so nearest-neighbour lookups
# can be run against either model's vectors.
for modelname in modelnames:
    print(modelname)
    # The embedding column is named after the model, so the model name is the
    # column to index.
    dataset.add_faiss_index(column=modelname)

emilyalsentzer/Bio_ClinicalBERT


  0%|          | 0/35 [00:00<?, ?it/s]

microsoft/biogpt


  0%|          | 0/35 [00:00<?, ?it/s]

In [10]:
# Back to pandas, keyed by trial identifier, then preview the first two rows.
df = dataset.to_pandas().set_index('nct_id')
df.head(2)

Unnamed: 0_level_0,brief_title,official_title,baseline_measurements,brief_summaries,detailed_descriptions,criteria,gender,minimum_age,maximum_age,facilities,city,state,zip,country,recruitment_details,pre_assignment_details,study_type,info,emilyalsentzer/Bio_ClinicalBERT,microsoft/biogpt
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
NCT00000143,Studies of Ocular Complications of AIDS (SOCA)...,Studies of Ocular Complications of AIDS (SOCA)...,,"To compare the newest CMV retinitis drug, cido...",Cytomegalovirus (CMV) is among the most freque...,Inclusion criteria:\n\nAge 13 years or older\n...,All,13 Years,,"University of South Florida, MDC Box 21",Tampa,Texas,94143,United States,June 1997,,Interventional,Studies of Ocular Complications of AIDS (SOCA)...,"[0.020412593614310026, -0.1898606214672327, -1...","[-5.695895534008741, 5.35822943970561, -1.9499..."
NCT00000378,Antidepressant Treatment of Melancholia in Lat...,Antidepressant Treatment of Melancholia in Lat...,,The purpose of this study is to compare the sa...,To compare the efficacy and safety of a select...,Inclusion Criteria:\n\n-\n\nPatients must have...,All,60 Years,95 Years,1051 Riverside Drive,New York,New York,10032,United States,,,Interventional,Antidepressant Treatment of Melancholia in Lat...,"[0.4295310452580452, 0.0076399907702580094, -1...","[-6.805308409035206, -2.983658343553543, -11.8..."


In [11]:
# Disabled example: look up the 10 nearest trials to a given trial, once per
# embedding model, and print score / NCT id / brief title for each hit.
# NOTE(review): `df` is indexed by nct_id at this point, so the
# `df["nct_id"] == nct_id` filter below cannot work as written; something
# like `df.loc[nct_id, modelname]` is presumably what is needed. Confirm
# before re-enabling.
# nct_id = "NCT02421263" 
# for modelname in modelnames:
#     nct_id_embedding = np.array(df[df["nct_id"] == nct_id][modelname][0])
#     # print(nct_id_embedding)
#     scores, retrieved_examples = dataset.get_nearest_examples(
#         modelname, nct_id_embedding, k=10)
#     # print(scores, retrieved_examples[column])
#     for score, index, title in zip(scores, retrieved_examples["nct_id"], retrieved_examples["brief_title"]):
#         print(modelname, nct_id, score, index, title)

In [12]:
# Preview the first two trials, including their embedding columns.
df.head(2)

Unnamed: 0_level_0,brief_title,official_title,baseline_measurements,brief_summaries,detailed_descriptions,criteria,gender,minimum_age,maximum_age,facilities,city,state,zip,country,recruitment_details,pre_assignment_details,study_type,info,emilyalsentzer/Bio_ClinicalBERT,microsoft/biogpt
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
NCT00000143,Studies of Ocular Complications of AIDS (SOCA)...,Studies of Ocular Complications of AIDS (SOCA)...,,"To compare the newest CMV retinitis drug, cido...",Cytomegalovirus (CMV) is among the most freque...,Inclusion criteria:\n\nAge 13 years or older\n...,All,13 Years,,"University of South Florida, MDC Box 21",Tampa,Texas,94143,United States,June 1997,,Interventional,Studies of Ocular Complications of AIDS (SOCA)...,"[0.020412593614310026, -0.1898606214672327, -1...","[-5.695895534008741, 5.35822943970561, -1.9499..."
NCT00000378,Antidepressant Treatment of Melancholia in Lat...,Antidepressant Treatment of Melancholia in Lat...,,The purpose of this study is to compare the sa...,To compare the efficacy and safety of a select...,Inclusion Criteria:\n\n-\n\nPatients must have...,All,60 Years,95 Years,1051 Riverside Drive,New York,New York,10032,United States,,,Interventional,Antidepressant Treatment of Melancholia in Lat...,"[0.4295310452580452, 0.0076399907702580094, -1...","[-6.805308409035206, -2.983658343553543, -11.8..."


In [13]:
# Persist freshly computed embeddings so later runs can load them instead of
# recomputing.
if from_scratch:
    # save_to_disk refuses a dataset that still has indexes attached, and the
    # FAISS indexes were added earlier in the notebook. Detach them, save,
    # then rebuild them so the in-memory dataset stays searchable.
    # Assumes each index is named after the column it was built on, which is
    # the default when add_faiss_index is called with only `column`.
    attached_indexes = list(dataset.list_indexes())
    for index_name in attached_indexes:
        dataset.drop_index(index_name)
    dataset.save_to_disk(dataname)
    for index_name in attached_indexes:
        dataset.add_faiss_index(column=index_name, index_name=index_name)


: 