In [1]:
import os
import shutil
import sys
from tqdm import tqdm

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

tf.get_logger().setLevel('ERROR')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

#for BERT
import transformers

In [3]:
# Print every device TensorFlow can see, then the installed TensorFlow version.
# NOTE(review): device_lib is imported from the private tensorflow.python
# namespace; tf.config.list_physical_devices() looks like the public
# alternative — confirm before switching.
# NOTE(review): this call may initialise the GPU before memory growth is
# configured in the following cell — confirm the ordering is intended.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
print(tf.__version__)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 11667513408113877868
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 10161326080
locality {
  bus_id: 1
  links {
  }
}
incarnation: 5468096745878897231
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:01:00.0, compute capability: 8.6"
]
2.5.0


In [4]:
# GPU options to limit OOM errors: let TensorFlow grow its GPU allocation on
# demand instead of reserving memory up front.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if not physical_devices:
    # Previously physical_devices[0] raised IndexError on CPU-only machines.
    print("No GPU detected; skipping memory-growth configuration.")
# Memory growth has to be configured consistently on every visible GPU,
# so apply it to all of them rather than only the first.
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

## Reload model

In [5]:
#RELOAD MODEL
# Load the SavedModel stored at saved_model_path (a path relative to the
# notebook's working directory) and keep it in reloaded_model for later cells.

saved_model_path = 'models/maturity_bert'

reloaded_model = tf.saved_model.load(saved_model_path)

In [6]:
# Display the loaded object's repr as a quick check that loading returned something.
reloaded_model

<tensorflow.python.saved_model.load.Loader._recreate_base_user_object.<locals>._UserObject at 0x253992638e0>

## Label data subset for a further round of active learning

In [8]:
# Read the training data; the first CSV column becomes the DataFrame index.
complete = pd.read_csv('training_data/training_maturity.csv', index_col=0)

In [9]:
# Work on an independent single-column frame holding only the text to score,
# then preview its last ten rows.
labeldf = complete.loc[:, ['feature']].copy()
labeldf.tail(n=10)

Unnamed: 0,feature
1000001566,We developed and evaluated the accuracy/perfor...
1000001567,"The drug addicts are classified into mild, mod..."
1000001568,Neural network performs better than clinical j...
1000001569,We compare results from our machine learning c...
1000001570,Automatic measurement using automated segmenta...
1000001571,The results of the model built from machine le...
1000001572,Accuracy was compared to most recent iteration...
1000001573,The output from the automated procedure was co...
1000001574,When compared to a risk screening algorithm th...
1000001575,The results from the imaging risk scoring mode...


In [10]:
# List the rows whose 'feature' value is missing.
labeldf[labeldf['feature'].isna()]

Unnamed: 0,feature


In [11]:
# Discard rows that have no text, then repeat the missing-value check to
# confirm nothing is left.
labeldf = labeldf.dropna(subset=['feature'])
labeldf.loc[labeldf['feature'].isna()]

Unnamed: 0,feature


In [12]:
# Summarise row count, dtypes and non-null counts of the frame to be labelled.
labeldf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4285 entries, 37 to 1000001575
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   feature  4285 non-null   object
dtypes: object(1)
memory usage: 67.0+ KB


In [13]:
def label_unlabelled(df, feature_column = 'feature', model=reloaded_model, number_to_label='all', cpu_labelling=False, chunk_size=100):
    """Score text rows with ``model`` and return them with an ``include`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to score.
    feature_column : str
        Name of the column in ``df`` whose values are passed to the model.
    model : callable
        Called as ``model(tf.constant(texts))``; its output is passed through
        ``tf.sigmoid`` before being stored. The default is bound to
        ``reloaded_model`` when this cell is executed, so re-run the cell
        after loading a different model.
    number_to_label : 'all', list or int
        ``'all'`` scores every row; a list is used as index labels for
        ``df.loc``; an int draws that many rows at random with ``sample``.
    cpu_labelling : bool
        Run the model on ``/cpu:0`` when True, otherwise on ``/gpu:0``.
    chunk_size : int
        Number of rows sent to the model per call; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        The selected ``feature_column`` values plus an ``include`` column of
        scores. If a chunk fails, the exception is printed and the frame is
        returned as-is: rows not yet scored keep NaN in ``include``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    AssertionError
        If ``number_to_label`` is neither ``'all'``, a list, nor an int
        smaller than ``len(df)``.
    """
    # A negative step would make the loop below a silent no-op.
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Either use the whole df, random sample of size specified or a list of indices
    if number_to_label == 'all':
        labelled_df = df[feature_column].copy()

    elif isinstance(number_to_label, list):
        labelled_df = df.loc[number_to_label, feature_column].copy()

    else:
        assert isinstance(number_to_label, int), "Number to label must be 'all' or an integer subset to label"
        assert number_to_label < len(df), "When specifying a subset to label, must be less than the total number of samples"
        labelled_df = df[feature_column].sample(number_to_label).copy()

    # Add a column to the DF for labels; NaN marks "not scored yet".
    labelled_df = labelled_df.to_frame()
    labelled_df['include'] = np.nan

    # Decide what device we want TF to use
    if cpu_labelling:
        device = '/cpu:0'
        print("Labelling with CPU...")
    else:
        device = '/gpu:0'
        print("Labelling with GPU...")

    # Label by specified chunk size
    with tqdm(total=len(labelled_df), file=sys.stdout) as pbar:
        for chunk_i in range(0, len(labelled_df.index), chunk_size):

            # Chunks are addressed by index label, so duplicate labels in the
            # index would select extra rows here.
            chunk = labelled_df.index[chunk_i:chunk_i + chunk_size]

            try:
                with tf.device(device):
                    # Read from feature_column (the frame's text column is named
                    # after it), not a hard-coded 'feature'.
                    labels = tf.sigmoid(model(tf.constant(labelled_df.loc[chunk, feature_column])))
                labelled_df.loc[chunk, 'include'] = labels
                pbar.update(len(chunk))
            except Exception as e:
                # Best effort: report the failure and hand back what was scored so far.
                print(e)
                print("Returning (possibly) partially labelled dataset...")
                break

    return labelled_df

In [None]:
# Score every row of labeldf on the GPU, sending 50 rows to the model per call.
labelled = label_unlabelled(labeldf, number_to_label='all', cpu_labelling=False, chunk_size=50)

Labelling with GPU...
 85%|██████████████████████████████████████████████████████████████████▍           | 3650/4285 [00:49<00:07, 79.55it/s]

In [14]:
# Keep the rows the model is unsure about: scores strictly between 0.1 and 0.9.
is_uncertain = labelled['include'].gt(0.1) & labelled['include'].lt(0.9)
uncertain = labelled.loc[is_uncertain]
uncertain.head(20)

Unnamed: 0,feature,include
168,A Clinical Study to Evaluate Autofluorescence ...,0.12768
909,COVID-view: Diagnosis of COVID-19 using Chest ...,0.109709
975,Deep learning-based virtual cytokeratin staini...,0.198843
1026,Development of a semi-automated method for tum...,0.860473
1244,The Reproducibility of Deep Learning-Based Seg...,0.753388
1255,Automatized Detection and Categorization of Fi...,0.655719
1485,Do plaque-related factors affect the diagnosti...,0.675015
1561,Robust whole slide image analysis for cervical...,0.325925
1774,Implementation of artificial intelligence algo...,0.138411
1832,Predicting Survived Events in Nontraumatic Out...,0.289305


In [15]:
# Turn the sigmoid scores into hard 0/1 labels using a 0.5 decision threshold.
# label_unlabelled can return a partially scored frame (NaN in 'include');
# NaN > 0.5 evaluates to False, so unscored rows would silently become 0.
# Fail loudly instead of writing false negatives into the final labels.
unscored_rows = int(labelled['include'].isna().sum())
assert unscored_rows == 0, f"{unscored_rows} rows have no score; re-run labelling before thresholding"

labelled['include_rounded'] = np.where(labelled.include > 0.5, 1, 0)

In [16]:
# Count how many rows fall into each thresholded class.
labelled.include_rounded.value_counts()

0    7660
1     395
Name: include_rounded, dtype: int64

In [17]:
# Preview the last five scored rows.
labelled.tail(5)

Unnamed: 0,feature,include,include_rounded
27388,Stress detection using deep neural networks. O...,0.000143,0
27411,The Prediction of Hepatitis E through Ensemble...,0.000115,0
27412,A Machine Learning Approach to Identify Predic...,8.1e-05,0
27414,Interpretable Machine Learning Model for Locor...,0.000116,0
27426,Using high-dimensional features for high-accur...,4.2e-05,0


In [18]:
# Summarise row count, dtypes and non-null counts after scoring.
labelled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8055 entries, 0 to 27426
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   feature          8055 non-null   object 
 1   include          8055 non-null   float64
 2   include_rounded  8055 non-null   int32  
dtypes: float64(1), int32(1), object(1)
memory usage: 478.3+ KB


In [19]:
# Drop rows with no 'feature' text from complete, mirroring the dropna already
# applied to labeldf (which was copied from complete's 'feature' column), so the
# two frames cover the same rows. This mutates complete in place.
complete.dropna(subset=['feature'], inplace=True)
complete.tail(5)

Unnamed: 0,feature,include,include_rounded
27388,Stress detection using deep neural networks. O...,0.999779,1
27411,The Prediction of Hepatitis E through Ensemble...,0.996813,1
27412,A Machine Learning Approach to Identify Predic...,0.999936,1
27414,Interpretable Machine Learning Model for Locor...,0.999928,1
27426,Using high-dimensional features for high-accur...,0.999877,1


In [20]:
# Manual sanity check: the two row counts should match before labels are
# copied from labelled into complete.
print(len(complete))
print(len(labelled))

8055
8055


In [21]:
# Column assignment from another frame aligns on the index, so any mismatch
# would silently write NaN instead of a label. Verify the indexes agree first.
assert complete.index.equals(labelled.index), "complete and labelled must share the same index before merging labels"

# Keep the raw model score alongside the thresholded label.
complete['include_fuzzy'] = labelled['include']
# NOTE: this replaces any 'include' column already present in complete with
# the model's 0/1 label.
complete['include'] = labelled['include_rounded']

In [22]:
# SAVE FINAL FILES

In [23]:
#uncertain.to_csv("final_outputs/comparative_uncertain.csv")

In [24]:
# Write the labelled table (index included) to the notebook's working directory.
complete.to_csv("maturity_labelled.csv")

In [25]:
# Class balance of the labels that were just saved.
complete['include'].value_counts()

0    7660
1     395
Name: include, dtype: int64