# Generate embeddings

We generate embeddings using only the training data; the test data are held out.
This way, when we later evaluate our method in full, the testing data will never have been seen by the word embedding method.
We split train/test using NCT IDs from the manual annotations by Harry and Undina as the test data.
All other NCT IDs represent training data.

In [1]:
import csv
import lzma
import pathlib

import gensim
import pandas as pd

In [2]:
# Load the two manual annotation sheets (read in the same order as before).
uog_df, hrn_df = (
    pd.read_csv(annotation_path)
    for annotation_path in (
        '../../data/annotations/annotate_notes_uog.csv',
        '../../data/annotations/annotate_notes_hr2479.csv',
    )
)

# Unique NCT IDs mentioned in each sheet; rows with a missing ID are dropped.
uog_nct_id, hrn_nct_id = (
    set(annotation_frame['NCT_id'].dropna())
    for annotation_frame in (uog_df, hrn_df)
)

# Every trial that appears in either annotation sheet.
annotated_nct_id = uog_nct_id | hrn_nct_id

In [3]:
# Directory holding computed outputs, and the compressed criteria table in it.
computed_data_root = pathlib.Path('../../data/outputs/')
criteria_file = computed_data_root / 'trial_eligibility_criteria.tsv.xz'

# Parallel lists: entry i of *_nct is the trial ID for entry i of *_criteria.
train_nct, train_criteria = [], []
test_nct, test_criteria = [], []

# NOTE(review): no explicit encoding is passed, so the text is decoded with
# the platform default encoding.
with lzma.open(criteria_file, 'rt', newline='\n') as handle:
    rows = csv.reader(handle, delimiter='\t')

    # Consume the header row and show it as a quick sanity check.
    header = next(rows)
    print(header)

    for nct_id, criteria_string in rows:
        # Trials with manual annotations form the test split; all other
        # trials form the training split.
        held_out = nct_id in annotated_nct_id
        nct_bucket = test_nct if held_out else train_nct
        criteria_bucket = test_criteria if held_out else train_criteria
        nct_bucket.append(nct_id)
        criteria_bucket.append(criteria_string)

['NCT_ID', 'eligibility_criteria']


In [4]:
def _write_lines(path, lines):
    """Write each string in *lines* to *path*, one per line.

    The file is always written as UTF-8 rather than the platform's locale
    encoding: the criteria files are later passed to gensim as
    ``corpus_file``, so their encoding must not depend on the machine the
    notebook runs on.
    """
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in lines)


# Save the training and testing splits. Line i of a *_nct.txt file is the
# trial ID for line i of the matching *_criteria.txt file.
# NOTE(review): this alignment assumes no criteria string contains an embedded
# newline -- confirm against the step that produced the criteria table.
for filename, lines in (
    ('train_nct.txt', train_nct),
    ('train_criteria.txt', train_criteria),
    ('test_nct.txt', test_nct),
    ('test_criteria.txt', test_criteria),
):
    _write_lines(computed_data_root.joinpath(filename), lines)

In [5]:
# Both models are trained on the training criteria only, read from the
# one-document-per-line text file written in the previous cell.
train_corpus_path = computed_data_root.joinpath('train_criteria.txt').as_posix()

# Word2Vec: skip-gram (sg=1) with hierarchical softmax (hs=1), 200 dimensions.
# NOTE(review): `size` is the keyword used by gensim releases before 4.0
# (later renamed `vector_size`); confirm the installed gensim version before
# changing it.
w2v_model = gensim.models.Word2Vec(
    corpus_file=train_corpus_path,
    sg=1,
    hs=1,
    size=200,
    window=5,
    min_count=5,
    max_vocab_size=None,
    max_final_vocab=None,
    seed=100,
    workers=6,
)
w2v_model.save('../../data/models/word2vec.model')

# Doc2Vec: distributed bag of words (dm=0), 100 dimensions.
d2v_model = gensim.models.Doc2Vec(
    corpus_file=train_corpus_path,
    dm=0,
    vector_size=100,
    min_count=5,
    max_vocab_size=None,
    seed=100,
    workers=6,
)
d2v_model.save('../../data/models/doc2vec.model')