In [1]:
import os
import shutil
import sys
from tqdm import tqdm

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

# Raise the TensorFlow logger threshold to ERROR to keep cell output free of
# lower-severity log messages.
tf.get_logger().setLevel('ERROR')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

#for BERT
import transformers

In [3]:
from tensorflow.python.client import device_lib
# Environment check: print the devices TensorFlow reports on this machine,
# followed by the installed TensorFlow version.
print(device_lib.list_local_devices())
print(tf.__version__)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 11765252691709507486
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 22723493888
locality {
  bus_id: 1
  links {
  }
}
incarnation: 18059970968359081302
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:01:00.0, compute capability: 8.6"
]
2.5.0


In [4]:
# GPU options to limit OOM errors.
# Memory growth makes TensorFlow allocate GPU memory as it is needed rather than
# reserving it up front. It has to be configured before the GPUs are initialised
# and must be the same on every visible GPU, so it is applied to all of them.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)
if not physical_devices:
    # Previously physical_devices[0] raised IndexError on a machine without a GPU.
    print("No GPU detected; running on CPU.")

In [18]:
# Show full cell contents and every column when a DataFrame is displayed
# (None removes the respective display limit).
for display_option in ('display.max_colwidth', 'display.max_columns'):
    pd.set_option(display_option, None)

## Reload model

In [5]:
#RELOAD MODEL

# Directory of the exported SavedModel, relative to the notebook's working directory.
saved_model_path = 'models/multilabel_characteristics_bert'

# Load with the low-level SavedModel loader; the result is called directly on a
# batch of inputs further down (see predict_labels).
reloaded_model = tf.saved_model.load(saved_model_path)

In [6]:
# Display the loaded object's repr as a quick check that the load succeeded.
reloaded_model

<tensorflow.python.saved_model.load.Loader._recreate_base_user_object.<locals>._UserObject at 0x22698bd97f0>

## Label data with characteristics

In [7]:
# Load the records to be labelled; the first CSV column becomes the DataFrame index.
complete = pd.read_csv('data/mature_labelled.csv', index_col=0)

In [10]:
def predict_labels(model, df, label_name: list, logit_model=True, feature_column='feature', chunk_size=100):
    """Predict rounded labels for every row of ``df``, one batch at a time.

    Parameters
    ----------
    model : callable
        Called as ``model(tf.constant(batch))`` on a batch of
        ``df[feature_column]``; must return one score per label for each row.
    df : pandas.DataFrame
        Frame containing the input column. It is read only, never modified.
    label_name : list of str
        Names of the output columns, in the same order as the model's outputs
        (predictions are assigned to them positionally). A single string is
        accepted and treated as a one-element list.
    logit_model : bool, default True
        If True the model outputs are passed through ``tf.sigmoid`` before
        rounding; if False they are rounded exactly as returned.
    feature_column : str, default 'feature'
        Column of ``df`` that is fed to the model.
    chunk_size : int, default 100
        Number of rows sent to the model per call.

    Returns
    -------
    pandas.DataFrame
        Same index and row order as ``df``; the feature column followed by one
        float column per entry of ``label_name`` holding the rounded scores.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1, or if the model returns a number
        of scores per row that differs from ``len(label_name)``.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    label_columns = [label_name] if isinstance(label_name, str) else list(label_name)

    features = df[feature_column]
    n_rows = len(features)
    # Rows stay NaN until their batch has been predicted.
    predictions = np.full((n_rows, len(label_columns)), np.nan, dtype=float)

    with tqdm(total=n_rows, file=sys.stdout) as pbar:
        for start in range(0, n_rows, chunk_size):
            # Slice by position, not by index label, so a non-unique index
            # cannot pull extra rows into a batch.
            batch = features.iloc[start:start + chunk_size]

            scores = model(tf.constant(batch))
            if logit_model:
                scores = tf.sigmoid(scores)

            rounded = np.asarray(np.round(scores), dtype=float).reshape(len(batch), -1)
            if rounded.shape[1] != len(label_columns):
                raise ValueError(
                    f"model returned {rounded.shape[1]} scores per row, "
                    f"expected {len(label_columns)}"
                )

            predictions[start:start + len(batch)] = rounded
            pbar.update(len(batch))

    labelled_df = pd.DataFrame(predictions, index=features.index, columns=label_columns)
    labelled_df.insert(0, feature_column, features.to_numpy())
    return labelled_df

In [11]:
# Output columns for the characteristics model. The order matters: predict_labels
# assigns the model's output columns to these names positionally.
char_labels = [
    # algo_* columns
    'algo_neural_net', 'algo_support_vector', 'algo_regression', 'algo_decision_tree',
    # feat_* columns
    'feat_xr', 'feat_ct', 'feat_mri', 'feat_eeg', 'feat_ecg', 'feat_emg',
    'feat_us', 'feat_echo', 'feat_histo', 'feat_oct', 'feat_mamm', 'feat_endoscop',
    'feat_gene', 'feat_bio', 'feat_nlp', 'feat_ehr',
    # spec_* / subspec_* columns
    'subspec_icu', 'subspec_ed',
    'spec_paeds',
    'spec_id', 'subspec_sepsis', 'subspec_cov19', 'subspec_dermca',
    'spec_onc', 'subspec_lungca', 'subspec_brainca', 'subspec_gica', 'subspec_hepca',
    'subspec_prosca', 'subspec_gynonc', 'subspec_haemonc', 'subspec_breastca',
    'subspec_urology',
    'spec_psych',
    'spec_msk',
    'spec_gi',
    'spec_hep',
    'spec_resp', 'subspec_pneum',
    'spec_neuro', 'subspec_epilep', 'subspec_cva', 'subspec_alzh',
    'spec_cvs', 'subspec_ihd', 'subspec_hf', 'subspec_arrhyt',
    'spec_dm', 'subspec_retina',
    'spec_haem',
    'spec_obs',
    'spec_renal',
]

In [12]:
def label_unlabelled(tolabeldf):
    """Add the ``char_labels`` prediction columns to ``tolabeldf``.

    Runs ``predict_labels`` with the module-level ``reloaded_model`` and writes
    the predicted ``char_labels`` columns into ``tolabeldf``.

    Parameters
    ----------
    tolabeldf : pandas.DataFrame
        Frame to label. It is updated IN PLACE; it must contain the default
        feature column expected by ``predict_labels``.

    Returns
    -------
    pandas.DataFrame
        The same ``tolabeldf`` object, now carrying the ``char_labels`` columns,
        so the result can be bound by the caller.
    """
    # NOTE(review): logit_model=False rounds the raw model output without a
    # sigmoid. That presumes the saved model already emits probabilities;
    # confirm against the notebook that trained and exported it.
    characteristics_labelled = predict_labels(model=reloaded_model,
                                              df=tolabeldf,
                                              label_name=char_labels,
                                              logit_model=False)
    tolabeldf.loc[characteristics_labelled.index, char_labels] = characteristics_labelled[char_labels]
    print("Characteristic labelling complete...")
    # Previously nothing was returned, so `x = label_unlabelled(df)` bound None.
    return tolabeldf

In [13]:
# Run the characteristics model over every row of `complete`. label_unlabelled
# writes the char_labels columns into `complete` itself, which is the frame the
# later cells inspect and save.
labelled = label_unlabelled(complete)

100%|████████████████████████████████████████████████████████████████████████████| 42307/42307 [10:42<00:00, 65.86it/s]
Characteristic labelling complete...


In [22]:
# Summarise the columns, non-null counts and dtypes of `complete`.
complete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42307 entries, 1 to 192947
Data columns (total 76 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   pmid                 42307 non-null  int64  
 1   doi                  37409 non-null  object 
 2   title                42306 non-null  object 
 3   abstract             42307 non-null  object 
 4   article_date         32669 non-null  object 
 5   pubmed_date          42307 non-null  object 
 6   article_type         42307 non-null  object 
 7   lang                 42307 non-null  object 
 8   journal              42307 non-null  object 
 9   journal_short        42307 non-null  object 
 10  journal_country      42307 non-null  object 
 11  authors              41281 non-null  object 
 12  author_affils        31156 non-null  object 
 13  keywords             24364 non-null  object 
 14  mesh_terms           32535 non-null  object 
 15  references_pmids     21512 non-null

In [14]:
# Save `complete`, including the char_labels columns that label_unlabelled wrote
# into it in place, as CSV.
complete.to_csv("data/characteristics_labelled.csv")