In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import os
import sys

# Add the project's code directories to the import path; presumably this is where the
# local modules imported below (create_model, calculate_sim, counting_table, ROC_curve)
# live — confirm against the repository layout.
sys.path.append(os.path.expanduser("~/hybrid-dictionary-ner-doc2vec-doc-relevance/code/embeddings/"))
sys.path.append(os.path.expanduser("~/hybrid-dictionary-ner-doc2vec-doc-relevance/code/tendency_analysis/"))

import create_model as cm
import calculate_sim as cs
import counting_table as ct
import ROC_curve as ROC

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Work from the repository root: every relative path used later in this notebook
# (input data paths and the "testing/..." output directory) is resolved against it.
os.chdir(os.path.expanduser('~/hybrid-dictionary-ner-doc2vec-doc-relevance/'))

# Helper functions

In [2]:
def generate_hyperparameters(params):
    """Expand a hyperparameter grid into one row per combination.

    Parameters
    ----------
    params : dict
        Maps each hyperparameter name to an iterable of candidate values,
        e.g. ``{"window": [5, 6], "epochs": [5]}``.

    Returns
    -------
    pd.DataFrame
        One column per hyperparameter (sorted by name) and one row per
        combination of candidate values. The result has the same type and
        ordering whether or not scikit-learn is installed.
    """
    try:
        from sklearn.model_selection import ParameterGrid
    except ImportError:
        # scikit-learn is optional: build the same grid with the standard
        # library. Keys are sorted to mirror ParameterGrid's ordering.
        import itertools

        keys = sorted(params)
        value_lists = [params[key] for key in keys]
        combinations = [dict(zip(keys, values)) for values in itertools.product(*value_lists)]
    else:
        # Only the import is guarded; errors raised for a malformed grid are
        # allowed to surface instead of being silently swallowed.
        combinations = list(ParameterGrid(params))

    return pd.DataFrame(combinations)

# Steps

## Step 1:

Generate the hyperparameters to test:

In [12]:
# Doc2Vec hyperparameter grid: each name maps to the list of candidate values to try.
params_d2v = dict(
    dm=[1],
    vector_size=[256],
    window=[5, 6],
    min_count=[5],
    epochs=[5],
    workers=[8],
)

# Expand the grid into one entry per combination and display it.
hp_df = generate_hyperparameters(params_d2v)
hp_df

Unnamed: 0,dm,epochs,min_count,vector_size,window,workers
0,1,5,5,256,5,8
1,1,5,5,256,6,8


## Step 2:

Load the data:

1. Tokens: a .TSV file generated after the preprocessing step.

2. Relevance matrix: for RELISH, three columns are required: "PMID1", "PMID2" and "relevance". For TREC, three columns are required: "PMID1", "PMID2" and "Group".

In [13]:
# Input paths. TREC is the active dataset; the RELISH alternative is kept commented out
# so the dataset can be switched by swapping which pair of lines is active.
# NOTE(review): these paths are relative to the working directory set in the first cell
# (the repository root), so "../data_full" resolves to a sibling of the repository —
# confirm this is the intended location.
#input_tokens = "../data_full/RELISH/RELISH_tokens.tsv"
#input_relevance_matrix = "../data_full/RELISH/RELISH_relevance_matrix.tsv"
input_tokens = "../data_full/TREC/TREC_tokens.tsv"
input_relevance_matrix = "../data_full/TREC/TREC_relevance_matrix.tsv"

# Load the relevance matrix and the tokens.
# presumably `pmid` holds the document identifiers and `join_text` their token text —
# confirm in create_model.load_tokens.
relevance_matrix = cs.load_relevance_matrix(input_relevance_matrix)
pmid, join_text = cm.load_tokens(input_tokens)

# Generate the training documents used to build and train every Doc2Vec model below.
tagged_data = cm.generate_TaggedDocument(pmid, join_text)

## Step 3:

Execute the pipeline. It is divided into the following steps:

1. Generate a folder to contain all the variables or figures for a given hyperparameter search.

2. Loop through every row of the hyperparameter DataFrame. For each combination of hyperparameters, execute:

    2.1. Generate the doc2vec model. Optionally, store it in the output directory or load it.

    2.2. Fill the relevance matrix with the trained model. It is recommended to use the multiprocessing function in order to accelerate the process. Optionally, store the filled relevance matrix or load it.

    2.3. Generate the counting table from the filled relevance matrix. Optionally, store the filled counting table or load it.

    2.4. Generate the ROC curve and calculate the area under the curve (AUC). The AUC value is written to the hyperparameter DataFrame, which is then saved to the output directory.

In [14]:
def execute_optimization(hp_df: pd.DataFrame,
        tagged_data: TaggedDocument,
        relevance_matrix: pd.DataFrame,
        dataset: str,
        output_dir: str,
        load_model: bool = False,
        save_model: bool = False,
        load_rel_matrix: bool = False,
        save_rel_matrix: bool = False,
        load_counting_table: bool = False,
        save_counting_table: bool = False):
    """Train and score one Doc2Vec model per hyperparameter combination.

    For every row of ``hp_df`` the function builds (or loads) a model, fills
    (or loads) a copy of the relevance matrix, builds (or loads) the counting
    table, draws the ROC curve and records its AUC.

    Parameters
    ----------
    hp_df : pd.DataFrame
        One row per hyperparameter combination. It is modified in place: an
        "AUC" column is added (or reset) and filled row by row.
    tagged_data : TaggedDocument
        Training documents forwarded to ``cm.generate_doc2vec_model`` and
        ``cm.train_doc2vec_model``.
    relevance_matrix : pd.DataFrame
        Unfilled relevance matrix. It is never modified; each row works on a
        deep copy.
    dataset : str
        Dataset name forwarded to ``ct.hp_create_counting_table`` and
        ``ROC.generate_roc_values``.
    output_dir : str
        Name of the run; artefacts are written to
        ``testing/hyperparameter_optimization/runs/<output_dir>`` relative to
        the current working directory.
    load_model, load_rel_matrix, load_counting_table : bool
        Reuse the corresponding artefact of a previous run when its file
        exists instead of recomputing it.
    save_model, save_rel_matrix, save_counting_table : bool
        Persist the corresponding artefact after computing it. Saving every
        model can take a lot of disk space.

    Returns
    -------
    pd.DataFrame
        The same ``hp_df`` object, with the "AUC" column filled in.
    """
    output_dir = f"testing/hyperparameter_optimization/runs/{output_dir}"
    # makedirs also creates missing parent directories and tolerates an
    # already existing run directory.
    os.makedirs(output_dir, exist_ok=True)

    hp_df_path = output_dir + "/hp_df.tsv"
    hp_df.to_csv(hp_df_path, sep="\t")

    # Extract the hyperparameters column by column before touching "AUC":
    # building them from iterrows() would upcast every value in a row to a
    # common dtype, turning integer hyperparameters into floats as soon as one
    # column holds floats.
    param_columns = [column for column in hp_df.columns if column != "AUC"]
    param_records = hp_df[param_columns].to_dict(orient="records")

    # Float placeholder so that assigning the computed AUC keeps the dtype.
    hp_df["AUC"] = 0.0
    for i, params in zip(hp_df.index, param_records):
        print(f"Starting row {i}:")

        # Models
        print("\tGenerate and train the model.")
        model_path = output_dir + f"/model_{i}.model"
        if load_model and os.path.exists(model_path):
            model = cs.load_d2v_model(model_path)
        else:
            model = cm.generate_doc2vec_model(tagged_data, params)
            cm.train_doc2vec_model(model, tagged_data, time_train=True)

            # We shouldn't save each model, it would take too much disk space
            if save_model:
                print("\tSave the model.")
                cm.save_doc2vec_model(model, model_path)

        # Relevance Matrix
        print("\tFill the relevance matrix.")
        rel_matrix_path = output_dir + f"/relevance_matrix_{i}.tsv"
        if load_rel_matrix and os.path.exists(rel_matrix_path):
            # Use the calculate_sim (cs) loader, the same one that loads the
            # input relevance matrix elsewhere in this notebook.
            relevance_matrix_row = cs.load_relevance_matrix(rel_matrix_path)
        else:
            # Work on a private copy so the caller's matrix stays unfilled and
            # can be reused for the next hyperparameter combination.
            relevance_matrix_row = relevance_matrix.copy(deep=True)
            cs.fill_relevance_matrix_multiprocess(relevance_matrix_row, model)
            if save_rel_matrix:
                print("\tSaving the relevance matrix.")
                cs.save_rel_matrix(relevance_matrix_row, rel_matrix_path)

        # Counting Table
        print("\tGenerating the counting table.")
        counting_table_path = output_dir + f"/counting_table_{i}.tsv"
        if load_counting_table and os.path.exists(counting_table_path):
            counting_table = pd.read_csv(counting_table_path, sep="\t")
        else:
            counting_table = ct.hp_create_counting_table(relevance_matrix_row, dataset=dataset)
            if save_counting_table:
                print("\tSaving the counting table.")
                ct.save_table(counting_table, counting_table_path)

        # Measure and save ROC
        print("\tPlotting the ROC curve.")
        ROC_path = output_dir + f"/ROC_{i}.png"
        ROC.generate_roc_values(counting_table, dataset=dataset)
        ROC.draw_roc_curve(counting_table, draw_auc=True, show_figure=False, output_path=ROC_path)

        hp_df.at[i, "AUC"] = ROC.calculate_auc(counting_table)
        # Rewrite the table after every row so partial results survive an
        # interrupted run.
        hp_df.to_csv(hp_df_path, sep="\t")

    return hp_df

# Run the search on TREC under the "Last_TREC_test" run directory: a saved model is reused
# when its file exists, otherwise it is trained and saved; counting tables are saved, and
# relevance matrices are neither loaded nor saved.
execute_optimization(hp_df, tagged_data, relevance_matrix, dataset = "TREC", output_dir = "Last_TREC_test", save_model=True, load_model=True, save_counting_table=True)

Starting row 0:
	Generate and train the model.
Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end
--- Time to train: 35.64 seconds
	Save the model.
	Fill the relevance matrix.
	Generating the counting table.
	Saving the counting table.
	Plotting the ROC curve.
Starting row 1:
	Generate and train the model.
Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end
--- Time to train: 34.00 seconds
	Save the model.
	Fill the relevance matrix.
	Generating the counting table.
	Saving the counting table.
	Plotting the ROC curve.


Unnamed: 0,dm,epochs,min_count,vector_size,window,workers,AUC
0,1,5,5,256,5,8,0.601351
1,1,5,5,256,6,8,0.598169


# Additional Information

This pipeline is still under development. 