In [1]:
import os
import shutil
import sys
from tqdm import tqdm

import tensorflow as tf
import tensorflow_hub as hub
# NOTE(review): `text` is never referenced by name in the visible notebook;
# presumably it is imported for its side effect of registering the custom ops
# the saved BERT model needs - confirm before removing it as "unused".
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

# Only let TensorFlow log messages at ERROR level or above.
tf.get_logger().setLevel('ERROR')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

#for BERT
import transformers

In [3]:
# Print every device TensorFlow can see, followed by the TensorFlow version.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
print(tf.__version__)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 13232044890594622560
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 10200748032
locality {
  bus_id: 1
  links {
  }
}
incarnation: 11713221639265085995
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:01:00.0, compute capability: 8.6"
]
2.5.0


In [4]:
# GPU options to limit OOM errors: ask TensorFlow to grow GPU memory on demand
# rather than reserving it up front.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        # Memory growth has to be the same on every visible GPU, so set them all.
        for gpu_device in physical_devices:
            tf.config.experimental.set_memory_growth(gpu_device, True)
    except RuntimeError as err:
        # Raised when the GPUs were already initialised; the setting can only
        # be changed before that, so report it and carry on with defaults.
        print(err)
else:
    # Previously this cell crashed with IndexError on a machine without a GPU.
    print("No GPU detected; memory growth not configured.")

## Reload model

In [5]:
#RELOAD MODEL
# Load the SavedModel stored under `saved_model_path` (relative to the
# notebook's working directory). `reloaded_model` is later used as the default
# `model` argument of `label_unlabelled`, so this cell must run before that
# function is defined.

saved_model_path = 'models/maturity_bert'

reloaded_model = tf.saved_model.load(saved_model_path)

In [6]:
# Display the loaded object as a quick check that the load returned something.
reloaded_model

<tensorflow.python.saved_model.load.Loader._recreate_base_user_object.<locals>._UserObject at 0x1ba178ede80>

## Label data subset for a further round of active learning

In [8]:
# Read the inclusions table; the first CSV column becomes the row index.
complete = pd.read_csv('data/inclusions.csv', index_col=0)

In [9]:
# Take an independent copy of just the id and text columns so later edits to
# `labeldf` cannot touch `complete`, then preview the last ten rows.
labeldf = complete.loc[:, ['pmid', 'feature']].copy()
labeldf.tail(10)

Unnamed: 0,pmid,feature
161490,9456211,Neural network assessment of perioperative car...
161496,9453525,TACHY: an expert system for the management of ...
161502,9450258,Detection of ECG waveforms by neural networks....
161506,9445150,Neural network analysis of breast cancer from ...
161507,9440819,Automated classification of patients with chro...
161510,9438272,On the use of neural network techniques to ana...
161511,9438271,Analysis of quantitative EEG with artificial n...
161513,9436967,Neural networks as a prognostic tool for patie...
161517,9430460,Automated interpretation of myocardial SPECT p...
161524,9423655,Acute pulmonary embolism: cost-effectiveness a...


In [10]:
# Show any rows whose text feature is missing.
labeldf[labeldf['feature'].isna()]

Unnamed: 0,pmid,feature
31873,32729810,


In [11]:
# Remove rows with no text feature (this modifies `labeldf` in place), then
# repeat the missing-value check, which should now display an empty frame.
labeldf.dropna(subset=['feature'], inplace=True)
labeldf[labeldf['feature'].isna()]

Unnamed: 0,pmid,feature


In [12]:
# Summarise row count, dtypes and non-null counts after dropping missing text.
labeldf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32472 entries, 0 to 161524
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   pmid     32472 non-null  int64 
 1   feature  32472 non-null  object
dtypes: int64(1), object(1)
memory usage: 761.1+ KB


In [13]:
def label_unlabelled(df, feature_column = 'feature', model=reloaded_model, number_to_label='all', cpu_labelling=False, chunk_size=100, random_state=None):
    """Score text rows with ``model`` and return them with an 'include' score.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the text to score.
    feature_column : str
        Name of the column in ``df`` whose values are passed to the model.
    model : callable
        Called once per chunk with a string tensor; its output is passed
        through a sigmoid. The default is the module-level ``reloaded_model``
        as it existed when this function was defined.
    number_to_label : 'all', list or int
        'all' scores every row, a list is used as index labels for ``df.loc``,
        and an int draws that many rows at random.
    cpu_labelling : bool
        Run the model under '/cpu:0' when True, otherwise under '/gpu:0'.
    chunk_size : int
        Number of rows sent to the model per call; must be at least 1.
    random_state : optional
        Forwarded to ``Series.sample`` when ``number_to_label`` is an int so
        that the random subset can be reproduced. ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The selected rows with two columns: ``feature_column`` and 'include'
        (the sigmoid of the model output). If a chunk fails, the error is
        printed and the frame is returned as-is, with 'include' left as NaN
        for every row that was not reached.

    Raises
    ------
    AssertionError
        If ``number_to_label`` is not 'all', a list, or an int smaller than
        ``len(df)``.
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A negative step would make the loop below a silent no-op.
        raise ValueError("chunk_size must be a positive integer")

    # Either use the whole df, a list of index labels, or a random sample of the given size
    if number_to_label == 'all':
        labelled_df = df[feature_column].copy()

    elif isinstance(number_to_label, list):
        labelled_df = df.loc[number_to_label, feature_column].copy()

    else:
        assert isinstance(number_to_label, int), "Number to label must be 'all' or an integer subset to label"
        assert number_to_label < len(df), "When specifying a subset to label, must be less than the total number of samples"
        labelled_df = df[feature_column].sample(number_to_label, random_state=random_state).copy()

    # Add a column to the DF for labels
    labelled_df = labelled_df.to_frame()
    labelled_df['include'] = np.nan

    # Decide what device we want TF to use
    if cpu_labelling:
        device = '/cpu:0'
        print("Labelling with CPU...")
    else:
        device = '/gpu:0'
        print("Labelling with GPU...")

    total_rows = len(labelled_df)
    include_pos = labelled_df.columns.get_loc('include')
    # Read from the caller-supplied column; the frame is not guaranteed to have
    # a column literally called 'feature'.
    texts = labelled_df[feature_column]

    # Label by specified chunk size
    with tqdm(total=total_rows, file=sys.stdout) as pbar:
        for start in range(0, total_rows, chunk_size):
            stop = min(start + chunk_size, total_rows)

            try:
                with tf.device(device):
                    scores = tf.sigmoid(model(tf.constant(texts.iloc[start:stop].tolist())))
                # Slice by position so duplicate index labels cannot select
                # extra rows, and flatten the model output to one value per row.
                labelled_df.iloc[start:stop, include_pos] = np.asarray(scores).reshape(-1)
                pbar.update(stop - start)
            except Exception as e:
                # Best effort: keep whatever has been labelled so far.
                print(e)
                print("Returning (possibly) partially labelled dataset...")
                return labelled_df

    return labelled_df

In [14]:
# Score every row of `labeldf` on the GPU, sending 50 texts to the model per call.
labelled = label_unlabelled(labeldf, number_to_label='all', cpu_labelling=False, chunk_size=50)

Labelling with GPU...
100%|████████████████████████████████████████████████████████████████████████████| 32472/32472 [06:43<00:00, 80.51it/s]


In [15]:
# Rows the model is unsure about: score strictly between 0.1 and 0.9.
uncertain = labelled.loc[labelled['include'].gt(0.1) & labelled['include'].lt(0.9)]
uncertain.head(20)

Unnamed: 0,feature,include
323,Automated Left Ventricle Ischemic Scar Detecti...,0.615293
548,Deep Learning for Basal Cell Carcinoma Detecti...,0.869935
833,Deep learning segmentation of transverse muscu...,0.15516
968,Pregnancy prediction performance of an annotat...,0.61472
977,"Comparison of deep learning, radiomics and sub...",0.172651
1044,"Do AI models recognise rare, aggressive skin c...",0.304076
1440,Artificial intelligence-assisted colonic endoc...,0.133151
1760,Deep Learning-Based Post-Processing of Real-Ti...,0.823535
2762,A Novel Hierarchical Deep Learning Framework f...,0.828656
2802,AI Pinpoints Origin of Unidentified Cancers. A...,0.163015


In [16]:
# Turn the score into a hard 0/1 label by rounding to the nearest integer.
labelled['include_rounded'] = labelled['include'].round()

In [17]:
# Count how many rows fall into each rounded label.
labelled.include_rounded.value_counts()

0.0    31407
1.0     1065
Name: include_rounded, dtype: int64

In [18]:
# Preview the last five scored rows.
labelled.tail(5)

Unnamed: 0,feature,include,include_rounded
161510,On the use of neural network techniques to ana...,2.2e-05,0.0
161511,Analysis of quantitative EEG with artificial n...,0.000115,0.0
161513,Neural networks as a prognostic tool for patie...,3.1e-05,0.0
161517,Automated interpretation of myocardial SPECT p...,0.000384,0.0
161524,Acute pulmonary embolism: cost-effectiveness a...,0.005829,0.0


In [19]:
# Check that every row received a score (non-null counts should match the row count).
labelled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32472 entries, 0 to 161524
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   feature          32472 non-null  object 
 1   include          32472 non-null  float64
 2   include_rounded  32472 non-null  float64
dtypes: float64(2), object(1)
memory usage: 2.0+ MB


In [20]:
# Apply the same missing-text filter to the full table (in place) so its rows
# line up with the scored frame, then preview the last five rows.
complete.dropna(subset=['feature'], inplace=True)
complete.tail(5)

Unnamed: 0,pmid,doi,title,abstract,article_date,pubmed_date,article_type,lang,journal,journal_short,journal_country,authors,author_affils,keywords,mesh_terms,references_pmids,feature,include
161510,9438272,,On the use of neural network techniques to ana...,This is the third communication on the use of ...,,23/01/1998,Clinical Trial,eng,Neuropsychobiology,Neuropsychobiology,Switzerland,"['Baumgart-Schmitt R', 'Herrmann W M', 'Eilers...",,,"['Algorithms', 'Electroencephalography', 'Elec...",,On the use of neural network techniques to ana...,1.0
161511,9438271,,Analysis of quantitative EEG with artificial n...,Artificial neural networks (ANN) are widely us...,,23/01/1998,Clinical Trial,eng,Neuropsychobiology,Neuropsychobiology,Switzerland,"['Winterer G', 'Ziller M', 'Klöppel B', 'Heinz...",,,"['Alcoholism', 'Algorithms', 'Discriminant Ana...",,Analysis of quantitative EEG with artificial n...,1.0
161513,9436967,,Neural networks as a prognostic tool for patie...,Patients with non-small cell carcinoma of the ...,,22/01/1998,Journal Article,eng,Modern pathology : an official journal of the ...,Mod Pathol,United States,"['Bellotti M', 'Elsner B', 'Paez De Lima A', '...",,,"['Adenocarcinoma', 'Antigens, Nuclear', 'Bioma...",,Neural networks as a prognostic tool for patie...,1.0
161517,9430460,,Automated interpretation of myocardial SPECT p...,The purpose of this study was to develop a com...,,16/01/1998,Comparative Study,eng,Journal of nuclear medicine : official publica...,J Nucl Med,United States,"['Lindahl D', 'Palmer J', 'Ohlsson M', 'Peters...",,,"['Coronary Angiography', 'Coronary Disease', '...",,Automated interpretation of myocardial SPECT p...,1.0
161524,9423655,,Acute pulmonary embolism: cost-effectiveness a...,To evaluate the cost-effectiveness of artifici...,,10/01/1998,Journal Article,eng,Radiology,Radiology,United States,"['Tourassi G D', 'Floyd C E', 'Coleman R E']",,,"['Acute Disease', 'Angiography', 'Cost-Benefit...",,Acute pulmonary embolism: cost-effectiveness a...,1.0


In [21]:
# Row counts of the full table and the scored frame, one per line; they are
# expected to match before the scores are copied across.
print(len(complete), len(labelled), sep='\n')

32472
32472


In [22]:
# Copy the scores into the full table. pandas aligns these assignments on the
# row index, so any row of `complete` absent from `labelled` would get NaN.
# 'include_fuzzy' keeps the raw score; the existing 'include' column is
# overwritten with the rounded 0/1 label.
complete['include_fuzzy'] = labelled['include']
complete['include'] = labelled['include_rounded']

In [23]:
# SAVE FINAL FILES

In [24]:
#uncertain.to_csv("final_outputs/comparative_uncertain.csv")

In [25]:
# Write the full table, including the new score and label columns and the row
# index, to CSV. An existing file at this path is overwritten.
complete.to_csv("data/maturity_labelled.csv")

In [26]:
# Final label distribution in the saved table.
complete['include'].value_counts()

0.0    31407
1.0     1065
Name: include, dtype: int64