In [1]:
import pandas as pd
import time as time

import numpy as np
from collections import Counter

In [2]:
import matplotlib.pyplot as plt #visualisation
import seaborn as sns

In [3]:
import tensorflow as tf

In [4]:
pd.set_option('display.max_colwidth', None)
pd.set_option("display.max_rows", None, "display.max_columns", None)

## Load Dataset

In [5]:
s = time.time()
labelled = pd.read_csv('data/mature_labelled.csv', index_col=0, dtype='string')
e = time.time()
print("Multicore Loading Time = {}".format(e-s))

print(len(labelled))

Multicore Loading Time = 1.496028184890747
34179


In [6]:
labelled[labelled.isnull().all(1)]

Unnamed: 0,pmid,doi,title,abstract,article_date,pubmed_date,article_type,lang,journal,journal_short,journal_country,authors,author_affils,keywords,mesh_terms,references_pmids,feature,include,mature


In [7]:
len(labelled)

34179

In [8]:
selected = labelled[['pmid', 'feature']].copy()

In [9]:
selected = selected.rename(columns={"feature":"text"})
selected.info()

<class 'pandas.core.frame.DataFrame'>
Index: 34179 entries, 1 to 172538
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   pmid    34179 non-null  string
 1   text    34179 non-null  string
dtypes: string(2)
memory usage: 801.1+ KB


## Pre-process Text

In [10]:
groups = selected.fillna("") #handle NaN values to allow regex over all cells

groups_1 = groups.applymap(lambda x:x.lower() if type(x) == str else x) #reduce all to lowercase

groups_2 = groups_1.replace(r"[\([{})\]]", "", regex=True) #remove brackets

groups_3 = groups_2.replace("' ", "'", regex=True) #remove quote+space in front of word

groups_4 = groups_3.replace("""[\.'!?]""", "", regex=True) #remove punctuation

groups = groups_4.replace('"', "", regex=True) #remove double quote

## Tag Algorithms

In [11]:
algo = groups[['text']].copy()

In [12]:
######################
## CLASSES
######################
# NEURAL NETWORK / nn
# SUPPORT VECTOR MACHINE / svm
# STANDARD REGRESSIONS /reg
# DECISION TREES / dt
# DISCRIMINANT ANALYSIS / da
# NAIVE BAYES / nb
# K-NEAREST NEIGHBOUR / knn
# 
# TRANSFER LEARNING / tl
# FEDERATED LEARNING / fl
# UNSUPERVISED LEARNING / unsup

In [13]:
## NEURAL NETWORK

## text
text = ['neural net', 'deep learning', 'convolutional', 'back propagation', 'lstm', ' cnn']

algo['nn_text'] = np.where(groups['text'].str.contains("neural net"), "1", "0")

for x in text:
    algo['nn_text'] = np.where(groups['text'].str.contains(x), "1", algo['nn_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(algo['nn_text']))

text counts:
Counter({'0': 20276, '1': 13903})


In [14]:
## SUPPORT VECTOR MACHINE

## text
text = ['vector machine', 'support vector', 'svm', 'vector regression']

algo['svm_text'] = np.where(groups['text'].str.contains("support vector machine"), "1", "0")

for x in text:
    algo['svm_text'] = np.where(groups['text'].str.contains(x), "1", algo['svm_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(algo['svm_text']))

text counts:
Counter({'0': 29687, '1': 4492})


In [15]:
## MULTIVARIABLE REGRESSION

## text
text = ['logistic regression', 'linear regression', 'multivariable regression', 'multivariate regression',
       'simple regression', 'univariate logistic', 'multivariate linear', 'multivariable linear', 'linear model', 'logistic model',
        'glm', 'regularized regression', 'ridge regression', 'sparse regression', 'stepwise regression', 'kernel regression',
       'process regression']

algo['reg_text'] = np.where(groups['text'].str.contains("univariate regression"), "1", "0")

for x in text:
    algo['reg_text'] = np.where(groups['text'].str.contains(x), "1", algo['reg_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(algo['reg_text']))

text counts:
Counter({'0': 32171, '1': 2008})


In [16]:
## DECISION TREE

## text
text = ['regression tree', 'random forest', 'ensemble tree', 'adaboost', 'xgboost', 'gradient boost']

algo['dt_text'] = np.where(groups['text'].str.contains("decision tree"), "1", "0")

for x in text:
    algo['dt_text'] = np.where(groups['text'].str.contains(x), "1", algo['dt_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(algo['dt_text']))

text counts:
Counter({'0': 31365, '1': 2814})


In [17]:
## DISCRIMINANT ANALYSIS

## text
text = ['discriminant analysis', 'linear discriminant', 'linear discrimination']

algo['da_text'] = np.where(groups['text'].str.contains("discrimination analysis"), "1", "0")

for x in text:
    algo['da_text'] = np.where(groups['text'].str.contains(x), "1", algo['da_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(algo['da_text']))

text counts:
Counter({'0': 33575, '1': 604})


In [18]:
## NAIVE BAYES

## text
text = ['probabilistic classif']

algo['nb_text'] = np.where(groups['text'].str.contains("naive bayes"), "1", "0")

for x in text:
    algo['nb_text'] = np.where(groups['text'].str.contains(x), "1", algo['nb_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(algo['nb_text']))

text counts:
Counter({'0': 33965, '1': 214})


In [19]:
## TRANSFER LEARNING

## text
algo['tl_text'] = np.where(groups['text'].str.contains("transfer learning"), "1", "0")

##output
print('text counts:')
print(Counter(algo['tl_text']))

text counts:
Counter({'0': 33372, '1': 807})


In [20]:
## FEDERATED LEARNING

## text
algo['fl_text'] = np.where(groups['text'].str.contains("federated learning"), "1", "0")

##output
print('text counts:')
print(Counter(algo['fl_text']))

text counts:
Counter({'0': 34163, '1': 16})


In [21]:
## K-NEAREST NEIGHBOUR

## text
algo['knn_text'] = np.where(groups['text'].str.contains("k-nearest"), "1", "0")
algo['knn_text'] = np.where(groups['text'].str.contains("k nearest neighbour"), "1", algo['knn_text'])

##output
print('text counts:')
print(Counter(algo['knn_text']))

text counts:
Counter({'0': 33629, '1': 550})


In [22]:
## UNSUPERVISED LEARNING

## text
text = ['k-means', 'means cluster', 'hierarchical cluster', 'unsupervised learning', 'unsupervised algorithm',
       'unsupervised model', 'unsupervised method', 'latent class analysis']

algo['unsup_text'] = np.where(groups['text'].str.contains("clustering algorithm"), "1", "0")

for x in text:
    algo['unsup_text'] = np.where(groups['text'].str.contains(x), "1", algo['unsup_text']) #if yes then 1, if no, keep current
    
##output
print('text counts:')
print(Counter(algo['unsup_text']))

text counts:
Counter({'0': 33576, '1': 603})


In [23]:
## COMBINE
labelled['algo_neural_net'] = np.where(algo['nn_text'].str.contains("1"), "1", "0")

labelled['algo_support_vector'] = np.where(algo['svm_text'].str.contains("1"), "1", "0")

labelled['algo_regression'] = np.where(algo['reg_text'].str.contains("1"), "1", "0")

labelled['algo_decision_tree'] = np.where(algo['dt_text'].str.contains("1"), "1", "0")

labelled['algo_discriminant'] = np.where(algo['da_text'].str.contains("1"), "1", "0")

labelled['algo_naive_bayes'] = np.where(algo['nb_text'].str.contains("1"), "1", "0")

labelled['algo_transfer'] = np.where(algo['tl_text'].str.contains("1"), "1", "0")

labelled['algo_federated'] = np.where(algo['fl_text'].str.contains("1"), "1", "0")

labelled['algo_k_nearest'] = np.where(algo['knn_text'].str.contains("1"), "1", "0")

labelled['algo_unsupervised'] = np.where(algo['unsup_text'].str.contains("1"), "1", "0")

#algo.to_csv('output/algo_tagged.csv')

In [24]:
labelled.head(5)

Unnamed: 0,pmid,doi,title,abstract,article_date,pubmed_date,article_type,lang,journal,journal_short,journal_country,authors,author_affils,keywords,mesh_terms,references_pmids,feature,include,mature,algo_neural_net,algo_support_vector,algo_regression,algo_decision_tree,algo_discriminant,algo_naive_bayes,algo_transfer,algo_federated,algo_k_nearest,algo_unsupervised
1,34688173,10.1016/j.compbiomed.2021.104924,A convolutional neural network trained with dermoscopic images of psoriasis performed on par with 230 dermatologists.,Psoriasis is a common chronic inflammatory skin disease that causes physical and psychological burden to patients. A Convolutional Neural Network (CNN) focused on dermoscopic images would substantially aid the classification and increase the accuracy of diagnosis of psoriasis.,2021-10-06,2021-10-24,Journal Article,eng,Computers in biology and medicine,Comput Biol Med,United States,"['Yang Yiguang', 'Wang Juncheng', 'Xie Fengying', 'Liu Jie', 'Shu Chang', 'Wang Yukun', 'Zheng Yushan', 'Zhang Haopeng']","['Image Processing Center, School of Astronautics, Beihang University, Beijing, 100191, China; Beijing Advanced Innovation Center for Biomedical Engineering, Beihang University, Beijing, 100191, China.', 'Department of Dermatology, State Key Laboratory of Complex Severe and Rare Diseases, Peking Union Medical College Hospital, Chinese Academy of Medical Science and Peking Union Medical College, National Clinical Research Center for Dermatologic and Immunologic Diseases, Beijing, 100730, China.', 'Image Processing Center, School of Astronautics, Beihang University, Beijing, 100191, China; Beijing Advanced Innovation Center for Biomedical Engineering, Beihang University, Beijing, 100191, China. Electronic address: xfy_73@buaa.edu.cn.', 'Department of Dermatology, State Key Laboratory of Complex Severe and Rare Diseases, Peking Union Medical College Hospital, Chinese Academy of Medical Science and Peking Union Medical College, National Clinical Research Center for Dermatologic and Immunologic Diseases, Beijing, 100730, China. Electronic address: Liujie04672@pumch.cn.', 'Department of Dermatology, State Key Laboratory of Complex Severe and Rare Diseases, Peking Union Medical College Hospital, Chinese Academy of Medical Science and Peking Union Medical College, National Clinical Research Center for Dermatologic and Immunologic Diseases, Beijing, 100730, China.', 'Department of Dermatology, State Key Laboratory of Complex Severe and Rare Diseases, Peking Union Medical College Hospital, Chinese Academy of Medical Science and Peking Union Medical College, National Clinical Research Center for Dermatologic and Immunologic Diseases, Beijing, 100730, China.', 'Image Processing Center, School of Astronautics, Beihang University, Beijing, 100191, China; Beijing Advanced Innovation Center for Biomedical Engineering, Beihang University, Beijing, 100191, China.', 'Image Processing Center, School of Astronautics, Beihang University, Beijing, 100191, China; Beijing Advanced Innovation Center for Biomedical Engineering, Beihang University, Beijing, 100191, China.']","['Convolutional neural networks', 'Deep-learning', 'Dermoscopic images', 'Papulosquamous skin diseases', 'Psoriasis']",,,A convolutional neural network trained with dermoscopic images of psoriasis performed on par with 230 dermatologists. Psoriasis is a common chronic inflammatory skin disease that causes physical and psychological burden to patients. A Convolutional Neural Network (CNN) focused on dermoscopic images would substantially aid the classification and increase the accuracy of diagnosis of psoriasis.,1.0,1.0,1,0,0,0,0,0,0,0,0,0
2,34688172,10.1016/j.compbiomed.2021.104927,A large margin piecewise linear classifier with fusion of deep features in the diagnosis of COVID-19.,"The world has experienced epidemics of coronavirus infections several times over the last two decades. Recent studies have shown that using medical imaging techniques can be useful in developing an automatic computer-aided diagnosis system to detect pandemic diseases with high accuracy at an early stage. In this study, a large margin piecewise linear classifier was developed to diagnose COVID-19 compared to a wide range of viral pneumonia, including SARS and MERS, using chest x-ray images. In the proposed method, a preprocessing pipeline was employed. Moreover, deep pre- and post-rectified linear unit (ReLU) features were extracted using the well-known VGG-Net19, which was fine-tuned to optimize transfer learning. Afterward, the canonical correlation analysis was performed for feature fusion, and fused deep features were passed into the LMPL classifier. The introduced method reached the highest performance in comparison with related state-of-the-art methods for two different schemes (normal, COVID-19, and typical viral pneumonia) and (COVID-19, SARS, and MERS pneumonia) with 99.39% and 98.86% classification accuracy, respectively.",2021-10-11,2021-10-24,Journal Article,eng,Computers in biology and medicine,Comput Biol Med,United States,"['Azouji Neda', 'Sami Ashkan', 'Taheri Mohammad', 'Müller Henning']","['Department of Computer Science and Engineering and IT, School of Electrical and Computer Engineering, Shiraz University, Shiraz, Iran. Electronic address: azouji@shirazu.ac.ir.', 'Department of Computer Science and Engineering and IT, School of Electrical and Computer Engineering, Shiraz University, Shiraz, Iran. Electronic address: sami@shirazu.ac.ir.', 'Department of Computer Science and Engineering and IT, School of Electrical and Computer Engineering, Shiraz University, Shiraz, Iran. Electronic address: motaheri@shirazu.ac.ir.', 'Department of Business Information Systems University of Applied Sciences Western Switzerland, Sierre (HES SO), Switzerland. Electronic address: henning.mueller@hevs.ch.']","['COVID-19', 'Computer-aided diagnosis (CAD)', 'Deep feature extraction', 'Large margin classifier', 'MERS', 'SARS', 'X-ray']",,,"A large margin piecewise linear classifier with fusion of deep features in the diagnosis of COVID-19. The world has experienced epidemics of coronavirus infections several times over the last two decades. Recent studies have shown that using medical imaging techniques can be useful in developing an automatic computer-aided diagnosis system to detect pandemic diseases with high accuracy at an early stage. In this study, a large margin piecewise linear classifier was developed to diagnose COVID-19 compared to a wide range of viral pneumonia, including SARS and MERS, using chest x-ray images. In the proposed method, a preprocessing pipeline was employed. Moreover, deep pre- and post-rectified linear unit (ReLU) features were extracted using the well-known VGG-Net19, which was fine-tuned to optimize transfer learning. Afterward, the canonical correlation analysis was performed for feature fusion, and fused deep features were passed into the LMPL classifier. The introduced method reached the highest performance in comparison with related state-of-the-art methods for two different schemes (normal, COVID-19, and typical viral pneumonia) and (COVID-19, SARS, and MERS pneumonia) with 99.39% and 98.86% classification accuracy, respectively.",1.0,0.0,0,0,0,0,0,0,1,0,0,0
8,34687858,10.1016/j.neuroimage.2021.118652,Causal Decoding of Individual Cortical Excitability States.,"Brain responsiveness to stimulation fluctuates with rapidly shifting cortical excitability state, as reflected by oscillations in the electroencephalogram (EEG). For example, the amplitude of motor-evoked potentials (MEPs) elicited by transcranial magnetic stimulation (TMS) of motor cortex changes from trial to trial. To date, individual estimation of the cortical processes leading to this excitability fluctuation has not been possible. Here, we propose a data-driven method to derive individually optimized EEG classifiers in healthy humans using a supervised learning approach that relates pre-TMS EEG activity dynamics to MEP amplitude. Our approach enables considering multiple brain regions and frequency bands, without defining them a priori, whose compound phase-pattern information determines the excitability. The individualized classifier leads to an increased classification accuracy of cortical excitability states from 57% to 67% when compared to μ-oscillation phase extracted by standard fixed spatial filters. Results show that, for the used TMS protocol, excitability fluctuates predominantly in the μ-oscillation range, and relevant cortical areas cluster around the stimulated motor cortex, but between subjects there is variability in relevant power spectra, phases, and cortical regions. This novel decoding method allows causal investigation of the cortical excitability state, which is critical also for individualizing therapeutic brain stimulation.",2021-10-20,2021-10-24,Journal Article,eng,NeuroImage,Neuroimage,United States,"['Metsomaa J', 'Belardinelli P', 'Ermolova M', 'Ziemann U', 'Zrenner C']","['Department of Neurology & Stroke, University of Tübingen, Tübingen, Germany; Hertie Institute for Clinical Brain Research, University of Tübingen.', 'Department of Neurology & Stroke, University of Tübingen, Tübingen, Germany; Hertie Institute for Clinical Brain Research, University of Tübingen; CIMeC, Center for Mind-Brain Sciences, University of Trento, Italy.', 'Department of Neurology & Stroke, University of Tübingen, Tübingen, Germany; Hertie Institute for Clinical Brain Research, University of Tübingen.', 'Department of Neurology & Stroke, University of Tübingen, Tübingen, Germany; Hertie Institute for Clinical Brain Research, University of Tübingen. Electronic address: ulf.ziemann@uni-tuebingen.de.', 'Department of Neurology & Stroke, University of Tübingen, Tübingen, Germany; Hertie Institute for Clinical Brain Research, University of Tübingen; Temerty Centre for Therapeutic Brain Intervention, Centre for Addiction and Mental Health, and Department of Psychiatry, University of Toronto, Toronto, ON, Canada.']","['EEG', 'TMS', 'brain state', 'classification', 'excitability', 'machine learning']",,,"Causal Decoding of Individual Cortical Excitability States. Brain responsiveness to stimulation fluctuates with rapidly shifting cortical excitability state, as reflected by oscillations in the electroencephalogram (EEG). For example, the amplitude of motor-evoked potentials (MEPs) elicited by transcranial magnetic stimulation (TMS) of motor cortex changes from trial to trial. To date, individual estimation of the cortical processes leading to this excitability fluctuation has not been possible. Here, we propose a data-driven method to derive individually optimized EEG classifiers in healthy humans using a supervised learning approach that relates pre-TMS EEG activity dynamics to MEP amplitude. Our approach enables considering multiple brain regions and frequency bands, without defining them a priori, whose compound phase-pattern information determines the excitability. The individualized classifier leads to an increased classification accuracy of cortical excitability states from 57% to 67% when compared to μ-oscillation phase extracted by standard fixed spatial filters. Results show that, for the used TMS protocol, excitability fluctuates predominantly in the μ-oscillation range, and relevant cortical areas cluster around the stimulated motor cortex, but between subjects there is variability in relevant power spectra, phases, and cortical regions. This novel decoding method allows causal investigation of the cortical excitability state, which is critical also for individualizing therapeutic brain stimulation.",1.0,0.0,0,0,0,0,0,0,0,0,0,0
9,34687853,10.1016/j.mri.2021.10.024,Radiomic machine learning for pretreatment assessment of prognostic risk factors for endometrial cancer and its effects on radiologists' decisions of deep myometrial invasion.,To evaluate radiomic machine learning (ML) classifiers based on multiparametric magnetic resonance images (MRI) in pretreatment assessment of endometrial cancer (EC) risk factors and to examine effects on radiologists' interpretation of deep myometrial invasion (dMI).,2021-10-20,2021-10-24,Journal Article,eng,Magnetic resonance imaging,Magn Reson Imaging,Netherlands,"['Otani Satoshi', 'Himoto Yuki', 'Nishio Mizuho', 'Fujimoto Koji', 'Moribata Yusaku', 'Yakami Masahiro', 'Kurata Yasuhisa', 'Hamanishi Junzo', 'Ueda Akihiko', 'Minamiguchi Sachiko', 'Mandai Masaki', 'Kido Aki']","['Department of Diagnostic Imaging and Nuclear Medicine, Graduate School of Medicine, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Diagnostic Radiology and Nuclear Medicine, Kyoto University Hospital, Kyoto 606-8507, Japan. Electronic address: yhimoto@kuhp.kyoto-u.ac.jp.', 'Department of Diagnostic Imaging and Nuclear Medicine, Graduate School of Medicine, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Real World Data Research and Developmentx, Graduate School of Medicine, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Diagnostic Radiology and Nuclear Medicine, Kyoto University Hospital, Kyoto 606-8507, Japan; Preemptive Medicine and Lifestyle-related Disease Research Center, Kyoto University Hospital, Kyoto 606-8507, Japan.', 'Preemptive Medicine and Lifestyle-related Disease Research Center, Kyoto University Hospital, Kyoto 606-8507, Japan.', 'Department of Diagnostic Radiology and Nuclear Medicine, Kyoto University Hospital, Kyoto 606-8507, Japan.', 'Department of Gynecology and Obstetrics, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Gynecology and Obstetrics, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Diagnostic Pathology, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Gynecology and Obstetrics, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Diagnostic Radiology and Nuclear Medicine, Kyoto University Hospital, Kyoto 606-8507, Japan.']","['Endometrial cancer', 'Radiomic machine learning']",,,Radiomic machine learning for pretreatment assessment of prognostic risk factors for endometrial cancer and its effects on radiologists' decisions of deep myometrial invasion. To evaluate radiomic machine learning (ML) classifiers based on multiparametric magnetic resonance images (MRI) in pretreatment assessment of endometrial cancer (EC) risk factors and to examine effects on radiologists' interpretation of deep myometrial invasion (dMI).,1.0,1.0,0,0,0,0,0,0,0,0,0,0
10,34687850,10.1016/j.mri.2021.10.023,MRI-based machine learning for determining quantitative and qualitative characteristics affecting the survival of glioblastoma multiforme.,Our current study aims to consider the image biomarkers extracted from the MRI images for exploring their effects on glioblastoma multiforme (GBM) patients' survival. Determining its biomarker helps better manage the disease and evaluate treatments. It has been proven that imaging features could be used as a biomarker. The purpose of this study is to investigate the features in MRI and clinical features as the biomarker association of survival of GBM.,2021-10-20,2021-10-24,Journal Article,eng,Magnetic resonance imaging,Magn Reson Imaging,Netherlands,"['Jajroudi Mahdie', 'Enferadi Milad', 'Homayoun Amir Azar', 'Reiazi Reza']","['Pharmaceutical Research Center, Mashhad University of Medical Sciences, Mashhad, Iran. Electronic address: Jajroudimh991@mums.ac.ir.', 'Research Center for Nuclear Medicine, Shariati Hospital, Tehran University of Medical Sciences, Tehran, Iran.', 'Sina Trauma Research Center, Tehran University of Medical Sciences, Tehran, Iran.', 'Radiation Medicine Program, Princess Margaret Cancer Centre, University Health Network, Toronto, Ontario, Canada. Electronic address: reza.reiazi@uhnresearch.ca.']","['Biomarker', 'Clinical features', 'Glioblastoma multiforme', 'MRI features', 'Machine learning']",,,MRI-based machine learning for determining quantitative and qualitative characteristics affecting the survival of glioblastoma multiforme. Our current study aims to consider the image biomarkers extracted from the MRI images for exploring their effects on glioblastoma multiforme (GBM) patients' survival. Determining its biomarker helps better manage the disease and evaluate treatments. It has been proven that imaging features could be used as a biomarker. The purpose of this study is to investigate the features in MRI and clinical features as the biomarker association of survival of GBM.,1.0,0.0,0,0,0,0,0,0,0,0,0,0


## Tag Features

In [25]:
feat = groups[['text']].copy()

In [26]:
######################
## CLASSES
######################
# BIO_MARKER / bio
# GENOMIC / gene
# IMAGING / imaging
    ### XR / xr
    ### CT / ct
    ### MRI / mri
# ECHO / echo
# US / us
# MAMMOGRAM / mamm
# OCT / oct
# EEG / eeg
# ECG / ecg
# EMG / emg
# DERMASCOPY / derm
# CELLULAR_PATH / histo
# ENDOSCOPY / endo
#
# NATURAL_LANGUAGE / nlp
# EHR RECORDS / ehr
#
# WEARABLE_SENSORS / sensor
# SMARTPHONE / phone
# PATIENT REPORTED / prom
# DIGITAL STETH / sound

In [27]:
## XR

## text
text = ['xr', 'x-ray', 'radiograph']

feat['xr_text'] = np.where(groups['text'].str.contains("cxr"), "1", "0")

for x in text:
    feat['xr_text'] = np.where(groups['text'].str.contains(x), "1", feat['xr_text']) #if yes then 1, if no, keep current
    
##output
print('text counts:')
print(Counter(feat['xr_text']))

text counts:
Counter({'0': 32828, '1': 1351})


In [28]:
## CT

## text
text = ['computed tomograph', 'axial tomograph', 'ct scan', 'ct image', 'ct slice', ' ct ', ' ct-',
       'tomography scan', 'computerised tomograph', 'computerized tomograph', 'assisted tomograph']

feat['ct_text'] = np.where(groups['text'].str.contains("cat scan"), "1", "0")

for x in text:
    feat['ct_text'] = np.where(groups['text'].str.contains(x), "1", feat['ct_text']) #if yes then 1, if no, keep current

##exclude

feat['ct_text'] = np.where(groups['text'].str.contains("optical coherence"), "0", feat['ct_text']) #exclude oct
feat['ct_text'] = np.where(groups['text'].str.contains("coherence tomograph"), "0", feat['ct_text']) #exclude oct
        
##output
print('text counts:')
print(Counter(feat['ct_text']))

text counts:
Counter({'0': 31198, '1': 2981})


In [29]:
## MRI

## text
text = ['magnetic resonance']

feat['mri_text'] = np.where(groups['text'].str.contains(" mri"), "1", "0")

for x in text:
    feat['mri_text'] = np.where(groups['text'].str.contains(x), "1", feat['mri_text'] ) #if yes then 1, if no, keep current

##output
print('text counts:')
print(Counter(feat['mri_text']))

text counts:
Counter({'0': 30726, '1': 3453})


In [30]:
## ECHO

## text
text = ['echo-cardio', 'echokardio', 'cardiac echo']

feat['echo_text'] = np.where(groups['text'].str.contains("echocardio"), "1", "0")

for x in text:
    feat['echo_text'] = np.where(groups['text'].str.contains(x), "1", feat['echo_text']) #if yes then 1, if no, keep current

##output
print('text counts:')
print(Counter(feat['echo_text']))

text counts:
Counter({'0': 33988, '1': 191})


In [31]:
## US

## text
text = ['sonography', 'ultra-sound', 'ultrasonograph', 'doppler']

feat['us_text'] = np.where(groups['text'].str.contains("ultrasound"), "1", "0")

for x in text:
    feat['us_text'] = np.where(groups['text'].str.contains(x), "1", feat['us_text']) #if yes then 1, if no, keep current

##output
print('text counts:')
print(Counter(feat['us_text']))

text counts:
Counter({'0': 33003, '1': 1176})


In [32]:
## ECG

## text
text = [' ecg', ' ekg', 'electrokardio', 'electro-cardio', 'holter monitor', 'cardiac monitor']

feat['ecg_text'] = np.where(groups['text'].str.contains("electrocardio"), "1", "0")

for x in text:
    feat['ecg_text'] = np.where(groups['text'].str.contains(x), "1", feat['ecg_text']) #if yes then 1, if no, keep current

##output
print('text counts:')
print(Counter(feat['ecg_text']))

text counts:
Counter({'0': 33072, '1': 1107})


In [33]:
## EEG

## text
text = [' eeg']

feat['eeg_text'] = np.where(groups['text'].str.contains("electroenc"), "1", "0")

for x in text:
    feat['eeg_text'] = np.where(groups['text'].str.contains(x), "1", feat['eeg_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(feat['eeg_text']))

text counts:
Counter({'0': 32309, '1': 1870})


In [34]:
## EMG

## text
text = ['myoelectric', 'electro-myo']

feat['emg_text'] = np.where(groups['text'].str.contains("electromyo"), "1", "0")

for x in text:
    feat['emg_text'] = np.where(groups['text'].str.contains(x), "1", feat['emg_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(feat['emg_text']))

text counts:
Counter({'0': 33656, '1': 523})


In [35]:
feat[feat['emg_text']=="1"].sample(5)

Unnamed: 0,text,xr_text,ct_text,mri_text,echo_text,us_text,ecg_text,eeg_text,emg_text
129872,"hand motion classification using a multi-channel surface electromyography sensor the human hand has multiple degrees of freedom dof for achieving high-dexterity motions identifying and replicating human hand motions are necessary to perform precise and delicate operations in many applications, such as haptic applications surface electromyography semg sensors are a low-cost method for identifying hand motions, in addition to the conventional methods that use data gloves and vision detection the identification of multiple hand motions is challenging because the error rate typically increases significantly with the addition of more hand motions thus, the current study proposes two new methods for feature extraction to solve the problem above the first method is the extraction of the energy ratio features in the time-domain, which are robust and invariant to motion forces and speeds for the same gesture the second method is the extraction of the concordance correlation features that describe the relationship between every two channels of the multi-channel semg sensor system the concordance correlation features of a multi-channel semg sensor system were shown to provide a vast amount of useful information for identification furthermore, a new cascaded-structure classifier is also proposed, in which 11 types of hand gestures can be identified accurately using the newly defined features experimental results show that the success rate for the identification of the 11 gestures is significantly high",0,0,0,0,0,0,0,1
105758,"a robust myoelectric pattern recognition using online sequential extreme learning machine for finger movement classification a robust myoelectric pattern-recognition-system requires a system that should work in the real application as good as in the laboratory however, this demand should be handled properly and rigorously to achieve a robust myoelectric system electrode shift is an issue that usually emerges when dealing with robustness issue in daily life, the placement of electrodes becomes a significant issue that can downgrade the performance of the system this paper proposed a new way to overcome the robustness issue by conducting an update to the system to anticipate changes in the future such as electrode shift, improvement in muscle strength or any other issue such update will be used to generate an adaptation the adaptation is done according to the users need by employing an online sequential extreme learning os-elm to learn the training data chunk by chunk os-elm enables the myoelectric system to learn from a small number of data to avoid cumbersome training process the day-to-day experiment shows that the proposed system can maintain its performance on average accuracy around 85% whereas the non-adaptive system could not",0,0,0,0,0,0,0,1
113264,"characterization of a benchmark database for myoelectric movement classification in this paper, we characterize the ninapro database and its use as a benchmark for hand prosthesis evaluation the database is a publicly available resource that aims to support research on advanced myoelectric hand prostheses the database is obtained by jointly recording surface electromyography signals from the forearm and kinematics of the hand and wrist while subjects perform a predefined set of actions and postures besides describing the acquisition protocol, overall features of the datasets and the processing procedures in detail, we present benchmark classification results using a variety of feature representations and classifiers our comparison shows that simple feature representations such as mean absolute value and waveform length can achieve similar performance to the computationally more demanding marginal discrete wavelet transform with respect to classification methods, the nonlinear support vector machine was found to be the only method consistently achieving high performance regardless of the type of feature representation furthermore, statistical analysis of these results shows that classification accuracy is negatively correlated with the subjects body mass index the analysis and the results described in this paper aim to be a strong baseline for the ninapro database thanks to the ninapro database and the characterization described in this paper, the scientific community has the opportunity to converge to a common position on hand movement recognition by surface electromyography, a field capable to strongly affect hand prosthesis capabilities",0,0,0,0,0,0,0,1
141421,"the application of machine learning algorithms to the analysis of electromyographic patterns from arthritic patients the main aim of our study was to investigate the possibility of applying machine learning techniques to the analysis of electromyographic patterns emg collected from arthritic patients during gait the emg recordings were collected from the lower limbs of patients with arthritis and compared with those of healthy subjects co with no musculoskeletal disorder the study involved subjects suffering from two forms of arthritis, viz, rheumatoid arthritis ra and hip osteoarthritis oa the analysis of the data was plagued by two problems which frequently render the analysis of this type of data extremely difficult one was the small number of human subjects that could be included in the investigation based on the terms specified in the inclusion and exclusion criteria for the study the other was the high intra- and inter-subject variability present in emg data we identified some of the muscles differently employed by the arthritic patients by using machine learning techniques to classify the two groups and then identified the muscles that were critical for the classification for the classification we employed least-squares kernel lsk algorithms, neural network algorithms like the kohonen self organizing map, learning vector quantification and the multilayer perceptron finally we also tested the more classical technique of linear discriminant analysis lda the performance of the different algorithms was compared the lsk algorithm showed the highest capacity for classification our study demonstrates that the newly developed lsk algorithm is adept for the treatment of biological data the muscles that were most important for distinguishing the ra from the co subjects were the soleus and biceps femoris for separating the oa and co subjects however, it was the gluteus medialis muscle our study demonstrates how classification with emg data can be used in the clinical setting while such procedures are unnecessary for the diagnosis of the type of arthritis present, an understanding of the muscles which are responsible for the classification can help to better identify targets for rehabilitative measures",0,0,0,0,0,0,0,1
169300,"on automatic identification of upper-limb movements using small-sized training sets of emg signals we evaluate the performance of a variety of neural and fuzzy networks for discrimination among three planar arm-pointing movements by means of electromyographic emg signals, when learning is based on small-sized training sets the aim of this work is to underline the importance that the sparse data problem has in designing pattern classifiers with good generalisation properties the results indicate that one of the proposed fuzzy networks is more robust than the other classifiers when working with small training sets",0,0,0,0,0,0,0,1


In [36]:
## CELLULAR PATHOLOGY

## text
text = ['histopath', 'histology', 'histochem', 'immunohist', 'cytolog', 'cytochem', 'cellular path', 'microscopy',
       'smear', 'cytometry', 'hematoxylin', 'specimens', 'stain', 'tissue sample', 'tissue section', 'brushing']

feat['histo_text'] = np.where(groups['text'].str.contains("histologic"), "1", "0")

for x in text:
    feat['histo_text'] = np.where(groups['text'].str.contains(x), "1", feat['histo_text']) #if yes then 1, if no, keep current
    
## output
print('text counts:')
print(Counter(feat['histo_text']))

text counts:
Counter({'0': 32009, '1': 2170})


In [37]:
## OCT / retinal

## text
text = ['coherence tomog', ' oct ', 'retinal photo', 'retinal imag', 'retinal tomograph',
        'laser ophth', 'fundus imag', 'fundus phot', 'fundal imag', 'fundal phot']

feat['oct_text'] = np.where(groups['text'].str.contains("optical coherence"), "1", "0")

for x in text:
    feat['oct_text'] = np.where(groups['text'].str.contains(x), "1", feat['oct_text']) #if yes then 1, if no, keep current

##output    
print('text counts:')
print(Counter(feat['oct_text']))

text counts:
Counter({'0': 33245, '1': 934})


In [38]:
feat[feat['oct_text']=='1'].sample(20)

Unnamed: 0,text,xr_text,ct_text,mri_text,echo_text,us_text,ecg_text,eeg_text,emg_text,histo_text,oct_text
128682,"sensitivity and specificity of machine learning classifiers and spectral domain oct for the diagnosis of glaucoma purpose to investigate the sensitivity and specificity of machine learning classifiers mlc and spectral domain optical coherence tomography sd-oct for the diagnosis of glaucoma methods sixty-two patients with early to moderate glaucomatous visual field damage and 48 healthy individuals were included all subjects underwent a complete ophthalmologic examination, achromatic standard automated perimetry, and rnfl imaging with sd-oct cirrus hd-oct; carl zeiss meditec, inc, dublin, california, usa receiver operating characteristic roc curves were obtained for all sd-oct parameters subsequently, the following mlcs were tested: classification tree ctree, random forest ran, bagging bag, adaboost m1 ada, ensemble selection ens, multilayer perceptron mlp, radial basis function rbf, naive-bayes nb, and support vector machine svm areas under the roc curves arocs obtained for each parameter and each mlc were compared results the mean age was 570±92 years for healthy individuals and 599±90 years for glaucoma patients p=0103 mean deviation values were -41±24 db for glaucoma patients and -15±16 db for healthy individuals p<0001 the sd-oct parameters with the greater arocs were inferior quadrant 0813, average thickness 0807, 7 oclock position 0765, and 6 oclock position 0754 the arocs from classifiers varied from 0785 ada to 0818 bag the aroc obtained with bag was not significantly different from the aroc obtained with the best single sd-oct parameter p=093 conclusions the sd-oct showed good diagnostic accuracy in a group of patients with early glaucoma in this series, mlcs did not improve the sensitivity and specificity of sd-oct for the diagnosis of glaucoma",0,0,0,0,0,0,0,0,0,1
82803,"comparison of machine-learning classification models for glaucoma management this study develops an objective machine-learning classification model for classifying glaucomatous optic discs and reveals the classificatory criteria to assist in clinical glaucoma management in this study, 163 glaucoma eyes were labelled with four optic disc types by three glaucoma specialists and then randomly separated into training and test data all the images of these eyes were captured using optical coherence tomography and laser speckle flowgraphy to quantify the ocular structure and blood-flow-related parameters a total of 91 parameters were extracted from each eye along with the patientsbackground information machine-learning classifiers, including the neural network nn, naïve bayes nb, support vector machine svm, and gradient boosted decision trees gbdt, were trained to build the classification models, and a hybrid feature selection method that combines minimum redundancy maximum relevance and genetic-algorithm-based feature selection was applied to find the most valid and relevant features for nn, nb, and svm a comparison of the performance of the three machine-learning classification models showed that the nn had the best classification performance with a validated accuracy of 878% using only nine ocular parameters these selected quantified parameters enabled the trained nn to classify glaucomatous optic discs with relatively high performance without requiring color fundus images",0,0,0,0,0,0,0,0,0,1
89683,"leveraging uncertainty information from deep neural networks for disease detection deep learning dl has revolutionized the field of computer vision and image processing in medical imaging, algorithmic solutions based on dl have been shown to achieve high performance on tasks that previously required medical experts however, dl-based solutions for disease detection have been proposed without methods to quantify and control their uncertainty in a decision in contrast, a physician knows whether she is uncertain about a case and will consult more experienced colleagues if needed here we evaluate drop-out based bayesian uncertainty measures for dl in diagnosing diabetic retinopathy dr from fundus images and show that it captures uncertainty better than straightforward alternatives furthermore, we show that uncertainty informed decision referral can improve diagnostic performance experiments across different networks, tasks and datasets show robust generalization depending on network capacity and task/dataset difficulty, we surpass 85% sensitivity and 80% specificity as recommended by the nhs when referring 0-20% of the most uncertain decisions for further inspection we analyse causes of uncertainty by relating intuitions from 2d visualizations to the high-dimensional image space while uncertainty is sensitive to clinically relevant cases, sensitivity to unfamiliar data samples is task dependent, but can be rendered more robust",0,0,0,0,0,0,0,0,0,1
61024,"clinical interpretable deep learning model for glaucoma diagnosis despite the potential to revolutionise disease diagnosis by performing data-driven classification, clinical interpretability of convnet remains challenging in this paper, a novel clinical interpretable convnet architecture is proposed not only for accurate glaucoma diagnosis but also for the more transparent interpretation by highlighting the distinct regions recognised by the network to the best of our knowledge, this is the first work of providing the interpretable diagnosis of glaucoma with the popular deep learning model we propose a novel scheme for aggregating features from different scales to promote the performance of glaucoma diagnosis, which we refer to as m-lap moreover, by modelling the correspondence from binary diagnosis information to the spatial pixels, the proposed scheme generates glaucoma activations, which bridge the gap between global semantical diagnosis and precise location in contrast to previous works, it can discover the distinguish local regions in fundus images as evidence for clinical interpretable glaucoma diagnosis experimental results, performed on the challenging origa datasets, show that our method on glaucoma diagnosis outperforms state-of-the-art methods with the highest auc 088 remarkably, the extensive results, optic disc segmentation dice of 09 and local disease focus localization based on the evidence map, demonstrate the effectiveness of our methods on clinical interpretability",0,0,0,0,0,0,0,0,0,1
41599,automated identification of retinopathy of prematurity by image-based deep learning retinopathy of prematurity rop is a leading cause of childhood blindness worldwide but can be a treatable retinal disease with appropriate and timely diagnosis this study was performed to develop a robust intelligent system based on deep learning to automatically classify the severity of rop from fundus images and detect the stage of rop and presence of plus disease to enable automated diagnosis and further treatment,0,0,0,0,0,0,0,0,0,1
29265,automated quantitative assessment of retinal fluid volumes as important biomarkers in neovascular age-related macular degeneration to evaluate retinal fluid volume data extracted from optical coherence tomography oct scans by artificial intelligence algorithms in the treatment of neovascular age-related macular degeneration nv-amd,0,0,0,0,0,0,0,0,0,1
9743,"multicolor image classification using the multimodal information bottleneck network mmib-net for detecting diabetic retinopathy multicolor mc imaging is an imaging modality that records confocal scanning laser ophthalmoscope cslo fundus images, which can be used for the diabetic retinopathy dr detection by utilizing this imaging technique, multiple modal images can be obtained in a single case additional symptomatic features can be obtained if these images are considered during the diagnosis of dr however, few studies have been carried out to classify mc images using deep learning methods, let alone using multi modal features for analysis in this work, we propose a novel model which uses the multimodal information bottleneck network mmib-net to classify the mc images for the detection of dr our model can extract the features of multiple modalities simultaneously while finding concise feature representations of each modality using the information bottleneck theory mc images classification can be achieved by picking up the combined representations and features of all modalities in our experiments, it is shown that the proposed method can achieve an accurate classification of mc images comparative experiments also demonstrate that the use of multimodality and information bottleneck improves the performance of mc images classification to the best of our knowledge, this is the first report of dr identification utilizing the multimodal information bottleneck convolutional neural network in mc images",0,0,0,0,0,0,0,0,0,1
51496,"deep learning segmentation for optical coherence tomography measurements of the lower tear meniscus the tear meniscus contains most of the tear fluid and therefore is a good indicator for the state of the tear film previously, we used a custom-built optical coherence tomography oct system to study the lower tear meniscus by automatically segmenting the image data with a thresholding-based segmentation algorithm tbsa in this report, we investigate whether the results of this image segmentation algorithm are suitable to train a neural network in order to obtain similar or better segmentation results with shorter processing times considering the class imbalance problem, we compare two approaches, one directly segmenting the tear meniscus dsa, the other first localizing the region of interest and then segmenting within the higher resolution image section lsa a total of 6658 images labeled by the tbsa were used to train deep convolutional neural networks with supervised learning five-fold cross-validation reveals a sensitivity of 9636% and 9643%, a specificity of 9998% and 9986% and a jaccard index of 9324% and 9316% for the dsa and lsa, respectively average segmentation times are up to 228 times faster than the tbsa additionally, we report the behavior of the dsa and lsa in cases challenging for the tbsa and further test the applicability to measurements acquired with a commercially available oct system the application of deep learning for the segmentation of the tear meniscus provides a powerful tool for the assessment of the tear film, supporting studies for the investigation of the pathophysiology of dry eye-related diseases",0,0,0,0,0,0,0,0,0,1
71646,"weakly supervised lesion localization for age-related macular degeneration detection using optical coherence tomography images age-related macular degeneration amd is the main cause of irreversible blindness among the elderly and require early diagnosis to prevent vision loss, and careful treatment is essential optical coherence tomography oct, the most commonly used imaging method in the retinal area for the diagnosis of amd, is usually interpreted by a clinician, and oct can help diagnose disease on the basis of the relevant diagnostic criteria, but these judgments can be somewhat subjective we propose an algorithm for the detection of amd based on a weakly supervised convolutional neural network cnn model to support computer-aided diagnosis cad system our main contributions are the following three things 1 we propose a concise cnn model for oct images, which outperforms the existing large cnn models using vgg16 and googlenet architectures 2 we propose an algorithm called expressive gradients eg that extends the existing integrated gradients ig algorithm so as to exploit not only the input-level attribution map, but also the high-level attribution maps due to enriched gradients, eg can highlight suspicious regions for diagnosis of amd better than the guided-backpropagation method and ig 3 our method provides two visualization options: overlay and top-k bounding boxes, which would be useful for cad through experimental evaluation using 10,100 clinical oct images from amd patients, we demonstrate that our eg algorithm outperforms the ig algorithm in terms of localization accuracy and also outperforms the existing object detection methods in terms of class accuracy",0,0,0,0,0,0,0,0,0,1
125116,"superpixel classification based optic disc and optic cup segmentation for glaucoma screening glaucoma is a chronic eye disease that leads to vision loss as it cannot be cured, detecting the disease in time is important current tests using intraocular pressure iop are not sensitive enough for population based glaucoma screening optic nerve head assessment in retinal fundus images is both more promising and superior this paper proposes optic disc and optic cup segmentation using superpixel classification for glaucoma screening in optic disc segmentation, histograms, and center surround statistics are used to classify each superpixel as disc or non-disc a self-assessment reliability score is computed to evaluate the quality of the automated optic disc segmentation for optic cup segmentation, in addition to the histograms and center surround statistics, the location information is also included into the feature space to boost the performance the proposed segmentation methods have been evaluated in a database of 650 images with optic disc and optic cup boundaries manually marked by trained professionals experimental results show an average overlapping error of 95% and 241% in optic disc and optic cup segmentation, respectively the results also show an increase in overlapping error as the reliability score is reduced, which justifies the effectiveness of the self-assessment the segmented optic disc and optic cup are then used to compute the cup to disc ratio for glaucoma screening our proposed method achieves areas under curve of 0800 and 0822 in two data sets, which is higher than other methods the methods can be used for segmentation and glaucoma screening the self-assessment will be used as an indicator of cases with large errors and enhance the clinical deployment of the automatic segmentation and screening",0,0,0,0,0,0,0,0,0,1


In [39]:
## MAMMOGRAM

## text
feat['mamm_text'] = np.where(groups['text'].str.contains("mammog"), "1", "0")

##output
print('text counts:')
print(Counter(feat['mamm_text']))

text counts:
Counter({'0': 33611, '1': 568})


In [40]:
## FIBREOPTIC ENDOSCOPY

## text
text = ['colonoscop', 'endoscop', 'bronchoscop', 'fiberoptic', 'fiber-optic', 'fiberscop', 'fibrescop',
       'cystoscop', 'enteroscop', 'hysteroscop']

feat['endo_text'] = np.where(groups['text'].str.contains('endoscopy'), "1", "0")

for x in text:
    feat['endo_text'] = np.where(groups['text'].str.contains(x), "1", feat['endo_text']) #if yes then 1, if no, keep current

##output
print('text counts:')
print(Counter(feat['endo_text']))

text counts:
Counter({'0': 33576, '1': 603})


In [41]:
## DERMATOLOGY IMAGES

## text
feat['derm_text'] = np.where(groups['text'].str.contains("dermoscop"), "1", "0")
feat['derm_text'] = np.where(groups['text'].str.contains("dermascop"), "1", feat['derm_text'])
feat['derm_text'] = np.where((groups['text'].str.contains("image")) &
                             (groups['text'].str.contains("skin cancer")) , "1", feat['derm_text'])
feat['derm_text'] = np.where((groups['text'].str.contains("photo")) &
                             (groups['text'].str.contains("skin cancer")) , "1", feat['derm_text'])
feat['derm_text'] = np.where((groups['text'].str.contains("image")) &
                             (groups['text'].str.contains("dermat")) , "1", feat['derm_text'])
feat['derm_text'] = np.where((groups['text'].str.contains("photo")) &
                             (groups['text'].str.contains("dermat")) , "1", feat['derm_text'])
feat['derm_text'] = np.where((groups['text'].str.contains("image")) &
                             (groups['text'].str.contains("melanoma")) , "1", feat['derm_text'])
feat['derm_text'] = np.where((groups['text'].str.contains("photo")) &
                             (groups['text'].str.contains("melanoma")) , "1", feat['derm_text'])
feat['derm_text'] = np.where((groups['text'].str.contains("image")) &
                             (groups['text'].str.contains("skin lesion")) , "1", feat['derm_text'])
feat['derm_text'] = np.where((groups['text'].str.contains("photo")) &
                             (groups['text'].str.contains("skin lesion")) , "1", feat['derm_text'])
feat['derm_text'] = np.where((groups['text'].str.contains("image")) &
                             (groups['text'].str.contains("rash")) , "1", feat['derm_text'])
feat['derm_text'] = np.where((groups['text'].str.contains("photo")) &
                             (groups['text'].str.contains("rash")) , "1", feat['derm_text'])

feat['derm_text'] = np.where(groups['text'].str.contains("histo"), "0", feat['derm_text']) # exclude histological studies
feat['derm_text'] = np.where(groups['text'].str.contains("microsc"), "0", feat['derm_text']) # exclude microscopy

##output
print('text counts:')
print(Counter(feat['derm_text']))

text counts:
Counter({'0': 33876, '1': 303})


In [42]:
feat[feat['derm_text']=='1'].sample(20)

Unnamed: 0,text,xr_text,ct_text,mri_text,echo_text,us_text,ecg_text,eeg_text,emg_text,histo_text,oct_text,mamm_text,endo_text,derm_text
45995,"past and present of computer-assisted dermoscopic diagnosis: performance of a conventional image analyser versus a convolutional neural network in a prospective data set of 1,981 skin lesions convolutional neural networks cnns have shown a dermatologist-level performance in the classification of skin lesions we aimed to deliver a head-to-head comparison of a conventional image analyser cia, which depends on segmentation and weighting of handcrafted features, to a cnn trained by deep learning",0,0,0,0,0,0,0,0,0,0,0,0,1
129875,"distribution quantification on dermoscopy images for computer-assisted diagnosis of cutaneous melanomas computerised analysis on skin lesion images has been reported to be helpful in achieving objective and reproducible diagnosis of melanoma in particular, asymmetry in shape, colour and structure reflects the irregular growth of melanin under the skin and is of great importance for diagnosing the malignancy of skin lesions this paper proposes a novel asymmetry analysis based on a newly developed pigmentation elevation model and the global point signatures gpss specifically, the pigmentation elevation model was first constructed by computer-based analysis of dermoscopy images, for the identification of melanin and haemoglobin asymmetry of skin lesions was then assessed through quantifying distributions of the pigmentation elevation model using the gpss, derived from a laplace-beltrami operator this new approach allows quantifying the shape and pigmentation distributions of cutaneous lesions simultaneously algorithm performance was tested on 351 dermoscopy images, including 88 malignant melanomas and 263 benign naevi, employing a support vector machine svm with tenfold cross-validation strategy competitive diagnostic results were achieved using the proposed asymmetry descriptor only, presenting 8636 % sensitivity, 8213 % specificity and overall 8343 % accuracy, respectively in addition, the proposed gps-based asymmetry analysis enables working on dermoscopy images from different databases and is approved to be inherently robust to the external imaging variations these advantages suggested that the proposed method has good potential for follow-up treatment",0,0,0,0,0,0,0,0,0,0,0,0,1
29294,a new deep learning approach integrated with clinical data for the dermoscopic differentiation of early melanomas from atypical nevi timely recognition of malignant melanoma mm is challenging for dermatologists worldwide and represents the main determinant for mortality dermoscopic examination is influenced by dermatologistsexperience and fails to achieve adequate accuracy and reproducibility in discriminating atypical nevi an from early melanomas em,0,0,0,0,0,0,0,0,0,0,0,0,1
77077,"radiomics of brain mri: utility in prediction of metastatic tumor type purpose to investigate the feasibility of tumor type prediction with mri radiomic image features of different brain metastases in a multiclass machine learning approach for patients with unknown primary lesion at the time of diagnosis materials and methods this single-center retrospective analysis included radiomic features of 658 brain metastases from t1-weighted contrast material-enhanced, t1-weighted nonenhanced, and fluid-attenuated inversion recovery flair images in 189 patients 101 women, 88 men; mean age, 61 years; age range, 32-85 years images were acquired over a 9-year period from september 2007 through december 2016 with different mri units, reflecting heterogeneous image data included metastases originated from breast cancer n = 143, small cell lung cancer n = 151, non-small cell lung cancer n = 225, gastrointestinal cancer n = 50, and melanoma n = 89 a total of 1423 quantitative image features and basic clinical data were evaluated by using random forest machine learning algorithms validation was performed with model-external fivefold cross validation comparative analysis of 10 randomly drawn cross-validation sets verified the stability of the results the classifier performance was compared with predictions from a respective conventional reading by two radiologists results areas under the receiver operating characteristic curve of the five-class problem ranged between 064 for non-small cell lung cancer and 082 for melanoma; all p values were less than 01 prediction performance of the classifier was superior to the radiologistsreadings highest differences were observed for melanoma, with a 17-percentage-point gain in sensitivity compared with the sensitivity of both readers; p values were less than 02 conclusion quantitative features of routine brain mr images used in a machine learning classifier provided high discriminatory accuracy in predicting the tumor type of brain metastases © rsna, 2018 online supplemental material is available for this article",0,0,1,0,0,0,0,0,0,0,0,0,1
171279,"computer-supported diagnosis of melanoma in profilometry laser profilometry offers new possibilities to improve non-invasive tumor diagnostics in dermatology in this paper, a new approach to computer-supported analysis and interpretation of high-resolution skin-surface profiles of melanomas and nevocellular nevi is presented image analysis methods are used to describe the profiles structures by texture parameters based on co-occurrence matrices, features extracted from the fourier power spectrum, and fractal features different feature selection strategies, including genetic algorithms, are applied to determine the best possible subsets of features for the classification task several architectures of multilayer perceptrons with error back-propagation as learning paradigm are trained for the automatic recognition of melanomas and nevi furthermore, network-pruning algorithms are applied to optimize the network topology in the study, the best neural classifier showed an error rate of 45% and was obtained after network pruning the smallest error rate in all, of 23%, was achieved with nearest neighbor classification",0,0,0,0,0,0,0,0,0,0,0,0,1
17300,"skin lesion classification using additional patient information in this paper, we describe our method for skin lesion classification the goal is to classify skin lesions based on dermoscopic images to several diagnosesclasses presented in the ham human against machine dataset: melanoma mel, melanocytic nevus nv, basal cell carcinoma bcc, actinic keratosis ak, benign keratosis bkl, dermatofibroma df, and vascular lesion vasc we propose a simplified solution which has a better accuracy than previous methods, but only predicted on a single model that is practical for a real-world scenario our results show that using a network with additional metadata as input achieves a better classification performance this metadata includes both the patient information and the extra information during the data augmentation process on the international skin imaging collaboration isic 2018 skin lesion classification challenge test set, our algorithm yields a balanced multiclass accuracy of 887% on a single model and 895% for the embedding solution, which makes it the currently first ranked algorithm on the live leaderboard to improve the inference accuracy test time augmentation tta is applied we also demonstrate how grad-cam is applied in tta therefore, tta and grad-cam can be integrated in heat map generation, which can be very helpful to assist the clinician for diagnosis",0,0,0,0,0,0,0,0,0,0,0,0,1
125522,"an incremental approach to pigmented skin lesion segmentation with classification refinements in uncertain regions skin lesion segmentation in dermatoscopic images is difficult because there are large inter variations in shape, size, color, and texture between lesions and skin types hence, computational features learned from a training set of lesion images may not be applicable to other lesion images in this paper, we propose an incremental method for lesion segmentation it leverages the expectation-maximization algorithm to find an initial segmentation a new adaptive method is proposed to define two types of segmented regions: the high-confident and the low-confident we train a support vector machine, using computational features from the high-confident regions, to further refine segmentation and, hence, achieve improved results for the low-confident regions validation experiments of our proposed method are performed on 319 dermatoscopy images and we have achieved good results with precision and recall to be 0864 and 0875 respectively",0,0,0,0,0,0,0,0,0,0,0,0,1
105212,"computer-aided diagnosis of psoriasis skin images with hos, texture and color features: a first comparative study of its kind psoriasis is an autoimmune skin disease with red and scaly plaques on skin and affecting about 125 million people worldwide currently, dermatologist use visual and haptic methods for diagnosis the disease severity this does not help them in stratification and risk assessment of the lesion stage and grade further, current methods add complexity during monitoring and follow-up phase the current diagnostic tools lead to subjectivity in decision making and are unreliable and laborious this paper presents a first comparative performance study of its kind using principal component analysis pca based cadx system for psoriasis risk stratification and image classification utilizing: i 11 higher order spectra hos features, ii 60 texture features, and iii 86 color feature sets and their seven combinations aggregate 540 image samples 270 healthy and 270 diseased from 30 psoriasis patients of indian ethnic origin are used in our database machine learning using pca is used for dominant feature selection which is then fed to support vector machine classifier svm to obtain optimized performance three different protocols are implemented using three kinds of feature sets reliability index of the cadx is computed among all feature combinations, the cadx system shows optimal performance of 100% accuracy, 100% sensitivity and specificity, when all three sets of feature are combined further, our experimental result with increasing data size shows that all feature combinations yield high reliability index throughout the pca-cutoffs except color feature set and combination of color and texture feature sets hos features are powerful in psoriasis disease classification and stratification even though, independently, all three set of features hos, texture, and color perform competitively, but when combined, the machine learning system performs the best the system is fully automated, reliable and accurate",0,0,0,0,0,0,0,0,0,0,0,0,1
94859,"dermoscopic image segmentation via multistage fully convolutional networks segmentation of skin lesions is an important step in the automated computer aided diagnosis of melanoma however, existing segmentation methods have a tendency to over- or under-segment the lesions and perform poorly when the lesions have fuzzy boundaries, low contrast with the background, inhomogeneous textures, or contain artifacts furthermore, the performance of these methods are heavily reliant on the appropriate tuning of a large number of parameters as well as the use of effective preprocessing techniques, such as illumination correction and hair removal",0,0,0,0,0,0,0,0,0,0,0,0,1
88075,"skin lesion analysis towards melanoma detection using deep learning network skin lesions are a severe disease globally early detection of melanoma in dermoscopy images significantly increases the survival rate however, the accurate recognition of melanoma is extremely challenging due to the following reasons: low contrast between lesions and skin, visual similarity between melanoma and non-melanoma lesions, etc hence, reliable automatic detection of skin tumors is very useful to increase the accuracy and efficiency of pathologists in this paper, we proposed two deep learning methods to address three main tasks emerging in the area of skin lesion image processing, ie, lesion segmentation task 1, lesion dermoscopic feature extraction task 2 and lesion classification task 3 a deep learning framework consisting of two fully convolutional residual networks fcrn is proposed to simultaneously produce the segmentation result and the coarse classification result a lesion index calculation unit licu is developed to refine the coarse classification results by calculating the distance heat-map a straight-forward cnn is proposed for the dermoscopic feature extraction task the proposed deep learning frameworks were evaluated on the isic 2017 dataset experimental results show the promising accuracies of our frameworks, ie, 0753 for task 1, 0848 for task 2 and 0912 for task 3 were achieved",0,0,0,0,0,0,0,0,0,0,0,0,1


In [43]:
## GENOMIC

## text
text = ['candidate gene', 'prognostic gene', ' gene ', ' genes ', ' dna ', ' rna ']

feat['gene_text'] = np.where(groups['text'].str.contains('genomic'), "1", "0")

for x in text:
    feat['gene_text'] = np.where(groups['text'].str.contains(x), "1", feat['gene_text']) #if yes then 1, if no, keep current

##output
print('text counts:')
print(Counter(feat['gene_text']))

text counts:
Counter({'0': 32439, '1': 1740})


In [44]:
feat[feat['gene_text']=='1'].sample(20)

Unnamed: 0,text,xr_text,ct_text,mri_text,echo_text,us_text,ecg_text,eeg_text,emg_text,histo_text,oct_text,mamm_text,endo_text,derm_text,gene_text
25404,"machine learning-driven and smartphone-based fluorescence detection for crispr diagnostic of sars-cov-2 rapid, accurate, and low-cost detection of sars-cov-2 is crucial to contain the transmission of covid-19 here, we present a cost-effective smartphone-based device coupled with machine learning-driven software that evaluates the fluorescence signals of the crispr diagnostic of sars-cov-2 the device consists of a three-dimensional 3d-printed housing and low-cost optic components that allow excitation of fluorescent reporters and selective transmission of the fluorescence emission to a smartphone custom software equipped with a binary classification model has been developed to quantify the acquired fluorescence images and determine the presence of the virus our detection system has a limit of detection lod of 625 rna copies/μl on laboratory samples and produces a test accuracy of 95% and sensitivity of 97% on 96 nasopharyngeal swab samples with transmissible viral loads our quantitative fluorescence score shows a strong correlation with the quantitative reverse transcription polymerase chain reaction rt-qpcr ct values, offering valuable information of the viral load and, therefore, presenting an important advantage over nonquantitative readouts",0,1,0,0,0,0,0,0,0,0,0,0,0,1
86153,"predicting clinical outcomes in colorectal cancer using machine learning using gene markers and other patient features to predict clinical outcomes plays a vital role in enhancing clinical decision making and improving prognostic accuracy this work uses a large set of colorectal cancer patient data to train predictive models using machine learning methods such as random forest, general linear model, and neural network for clinically relevant outcomes including disease free survival, survival, radio-chemotherapy response rct-r and relapse the most successful predictive models were created for dichotomous outcomes like relapse and rct-r with accuracies of 071 and 070 on blinded test data respectively the best prediction models regarding overall survival and disease-free survival had c-index scores of 086 and 076 respectively these models could be used in the future to aid a decision for or against chemotherapy and improve survival prognosis we propose that future work should focus on creating reusable frameworks and infrastructure for training and delivering predictive models to physicians, so that they could be readily applied to other diseases in practice and be continuously developed integrating new data",0,0,0,0,0,0,0,0,0,0,0,0,0,1
145485,"recursive fuzzy granulation for gene subsets extraction and cancer classification a typical microarray gene expression dataset is usually both extremely sparse and imbalanced to select multiple highly informative gene subsets for cancer classification and diagnosis, a new fuzzy granular support vector machine---recursive feature elimination algorithm fgsvm-rfe is designed in this paper as a hybrid algorithm of statistical learning, fuzzy clustering, and granular computing, the fgsvm-rfe separately eliminates irrelevant, redundant, or noisy genes in different granules at different stages and selects highly informative genes with potentially different biological functions in balance empirical studies on three public datasets demonstrate that the fgsvm-rfe outperforms state-of-the-art approaches moreover, the fgsvm-rfe can extract multiple gene subsets on each of which a classifier can be modeled with 100% accuracy specifically, the independent testing accuracy for the prostate cancer dataset is significantly improved the previous best result is 86% with 16 genes and our best result is 100% with only eight genes the identified genes are annotated by onto-express to be biologically meaningful",0,0,0,0,0,0,0,0,0,0,0,0,0,1
20257,"classification of molecular subtypes of high-grade serous ovarian cancer by maldi-imaging despite the correlation of clinical outcome and molecular subtypes of high-grade serous ovarian cancer hgsoc, contemporary gene expression signatures have not been implemented in clinical practice to stratify patients for targeted therapy hence, we aimed to examine the potential of unsupervised matrix-assisted laser desorption/ionization imaging mass spectrometry maldi-ims to stratify patients who might benefit from targeted therapeutic strategies molecular subtyping of paraffin-embedded tissue samples from 279 hgsoc patients was performed by nanostring analysis ground truth labeling next, we applied maldi-ims paired with machine-learning algorithms to identify distinct mass profiles on the same paraffin-embedded tissue sections and distinguish hgsoc subtypes by proteomic signature finally, we devised a novel approach to annotate spectra of stromal origin we elucidated a maldi-derived proteomic signature 135 peptides able to classify hgsoc subtypes random forest classifiers achieved an area under the curve auc of 0983 furthermore, we demonstrated that the exclusion of stroma-associated spectra provides tangible improvements to classification quality auc = 0988 moreover, novel maldi-based stroma annotation achieved near-perfect classifications auc = 0999 here, we present a concept integrating maldi-ims with machine-learning algorithms to classify patients according to distinct molecular subtypes of hgsoc this has great potential to assign patients for personalized treatment",0,0,0,0,0,0,0,0,1,0,0,0,0,1
83442,"early diagnosis of alzheimers disease based on resting-state brain networks and deep learning computerized healthcare has undergone rapid development thanks to the advances in medical imaging and machine learning technologies especially, recent progress on deep learning opens a new era for multimedia based clinical decision support in this paper, we use deep learning with brain network and clinical relevant text information to make early diagnosis of alzheimers disease ad the clinical relevant text information includes age, gender, and apoe gene of the subject the brain network is constructed by computing the functional connectivity of brain regions using resting-state functional magnetic resonance imaging r-fmri data a targeted autoencoder network is built to distinguish normal aging from mild cognitive impairment, an early stage of ad the proposed method reveals discriminative brain network features effectively and provides a reliable classifier for ad detection compared to traditional classifiers based on r-fmri time series data, about 3121 percent improvement of the prediction accuracy is achieved by the proposed deep learning method, and the standard deviation reduces by 5123 percent in the best case that means our prediction model is more stable and reliable compared to the traditional methods our work excavates deep learnings advantages of classifying high-dimensional multimedia data in medical services, and could help predict and prevent ad at an early stage",0,0,1,0,0,0,0,0,0,0,0,0,0,1
110352,"multi-class bcga-elm based classifier that identifies biomarkers associated with hallmarks of cancer traditional cancer treatments have centered on cytotoxic drugs and general purpose chemotherapy that may not be tailored to treat specific cancers identification of molecular markers that are related to different types of cancers might lead to discovery of drugs that are patient and disease specific this study aims to use microarray gene expression cancer data to identify biomarkers that are indicative of different types of cancers our aim is to provide a multi-class cancer classifier that can simultaneously differentiate between cancers and identify type-specific biomarkers, through the application of the binary coded genetic algorithm bcga and a neural network based extreme learning machine elm algorithm",0,0,0,0,0,0,0,0,0,0,0,0,0,1
13461,the application of artificial intelligence methods to gene expression data for differentiation of uncomplicated and complicated appendicitis in children and adolescents - a proof of concept study genome wide gene expression analysis has revealed hints for independent immunological pathways underlying the pathophysiologies of phlegmonous pa and gangrenous appendicitis ga methods of artificial intelligence ai have successfully been applied to routine laboratory and sonographic parameters for differentiation of the inflammatory manifestations in this study we aimed to apply ai methods to gene expression data to provide evidence for feasibility,0,0,0,0,0,0,0,0,0,0,0,0,0,1
165383,"assessing optimal neural network architecture for identifying disease-associated multi-marker genotypes using a permutation test, and application to calpain 10 polymorphisms associated with diabetes biallelic markers, such as single nucleotide polymorphisms snps, provide greater information for localising disease loci when treated as multilocus haplotypes, but often haplotypes are not immediately available from multilocus genotypes in case-control studies an artificial neural network allows investigation of association between disease phenotype and tightly linked markers without requiring haplotype phase and without modelling any evolutionary history for the disease-related haplotypes the network assesses whether marker haplotypes differ between cases and controls to the extent that classification of disease status based on multi-marker genotypes is achievable the network is trained to recognise affection status based on supplied marker genotypes, and then for each multi-marker genotype it produces outputs which aim to approximate the associated affection status next, the genotypes are permuted relative to affection status to produce many random datasets and the process of training and recording of outputs is repeated the extent to which the ability to predict affection for the real dataset exceeds that for the random datasets measures the statistical significance of the association between multi-marker genotype and affection this permutation test performs well with simulated case-control datasets, particularly when major gene effects are present we have explored the effects of systematically varying different network parameters in order to identify their optimal values we have applied the permutation test to 4 snps of the calpain 10 capn10 gene typed in a case-control sample of subjects with type 2 diabetes, impaired glucose tolerance, and controls we show that the neural network produces more highly significant evidence for association than do single marker tests corrected for the number of markers genotyped the use of a permutation test could potentially allow conditional analyses which could incorporate known risk factors alongside marker genotypes permuting only the marker genotypes relative to affection status and these risk factors would allow the contribution of the markers to disease risk to be independently assessed",0,0,0,0,0,0,0,0,0,0,0,0,0,1
2149,"identification of prognostic biomarker candidates associated with melanoma using high-dimensional genomic data survival of patients with metastatic melanoma varies widely melanoma is a highly proliferative, chemo-resistant disease with the recent availability of immunotherapies such as checkpoint inhibitors, durable response rates have improved but are often still limited to 2-3 years response rates to treatment range from 30 to 45% with combination therapy however no improvement in overall survival is frequently observed of the available therapies, many have targeted the brafv600e mutation that results in abnormal mapk pathway activation which is important for regulating cell proliferation immune checkpoint inhibitors such as anti-pd-1 and anti-pd-l1 offer better success but response rates are still low identifying biomarkers to better target those who will respond and identify the right combination of treatment is the best approach in this study, we utilize data from the cancer cell line encyclopedia ccle, including 62 samples, to examine features of gene expression 19k+ and copy number 20k+ in the melanoma cell lines we perform a clustering analysis on the feature set to assess genetically similarity among the cell lines we then discover which specific genes and combinations thereof maximize cluster density we design a feature selection approach for high-dimensional datasets that integrates multiple disparate machine learning techniques into one cohesive pipeline our approach provides a small subset of genes that can accurately distinguish between the clusters of melanoma cell lines across multiple types of classifiers in particular, we find only the 15 highest ranked genes among the original 19 k are necessary to achieve perfect or near-perfect test split classification performance of these 15 genes, some are known to be linked to melanoma or other cancer progressions, while others have not previously been linked to melanoma and are of interest for further examination",0,0,0,0,0,0,0,0,0,0,0,0,0,1
37927,"development of prognostic indicator based on autophagy-related lncrna analysis in colon adenocarcinoma there were no systematic researches about autophagy-related long noncoding rna lncrna signatures to predict the survival of patients with colon adenocarcinoma it was necessary to set up corresponding autophagy-related lncrna signatures the expression profiles of lncrnas which contained 480 colon adenocarcinoma samples were obtained from the cancer genome atlas tcga database the coexpression network of lncrnas and autophagy-related genes was utilized to select autophagy-related lncrnas the lncrnas were further screened using univariate cox regression in addition, lasso regression and multivariate cox regression were used to develop an autophagy-related lncrna signature a risk score based on the signature was established, and cox regression was used to test whether it was an independent prognostic factor the functional enrichment of autophagy-related lncrnas was visualized using gene ontology and kyoto encyclopedia of genes and genomes ten prognostic autophagy-related lncrnas ac0273072, ac0685803, al1387561, cd27-as1, eif3j-dt, linc01011, linc01063, linc02381, ac0738963, and snhg16 were identified to be significantly different, which made up an autophagy-related lncrna signature the signature divided patients with colon adenocarcinoma into the low-risk group and the high-risk group a risk score based on the signature was a significantly independent factor for the patients with colon adenocarcinoma hr = 1088, 95%ci = 1057 - 1120; <i>p</i> < 0001 additionally, the ten lncrnas were significantly enriched in autophagy process, metabolism, and tumor classical pathways in conclusion, the ten autophagy-related lncrnas and their signature might be molecular biomarkers and therapeutic targets for the patients with colon adenocarcinoma",0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [45]:
## PROTEINOMICS/BIOMARKERS

## text
text = ['proteinomic', 'immunoglob', 'cytokine', 'biomarker', 'tumor marker', 'tumour marker', 'inflammatory marker',
       'peptide', 'interferon', 'laboratory test', 'blood test']

feat['bio_text'] = np.where(groups['text'].str.contains('serum marker'), "1", "0")

for x in text:
    feat['bio_text'] = np.where(groups['text'].str.contains(x), "1", feat['bio_text']) #if yes then 1, if no, keep current

##output
print('text counts:')
print(Counter(feat['bio_text']))

text counts:
Counter({'0': 31589, '1': 2590})


In [46]:
## NATURAL LANGUAGE PROCESSING

## text
feat['nlp_text'] = np.where(groups['text'].str.contains("natural language"), "1", "0")

## output
print('text counts:')
print(Counter(feat['nlp_text']))

text counts:
Counter({'0': 33668, '1': 511})


In [47]:
## EHR RECORDS

## text
text = ['electronic health', 'health record', 'electronic record', 'patient record', 'medical record',
        'care record', 'patient registry', 'research registr', 'clinical note', 'patient note', 'patient data',
        'care data', 'care note', 'medical data', 'clinical data', 'hospital data', 'hospital note', 'admission note',
        'physiological data', 'observational data', 'patient features', 'patient observations', 'patient history',
        'medical history', 'care history']

feat['ehr_text'] = np.where(groups['text'].str.contains('snomed'), "1", "0")

for x in text:
    feat['ehr_text'] = np.where(groups['text'].str.contains(x), "1", feat['ehr_text']) #if yes then 1, if no, keep current

##output
print('text counts:')
print(Counter(feat['ehr_text']))

text counts:
Counter({'0': 31583, '1': 2596})


In [48]:
## WEARABLE_SENSORS

## text
text = ['wearable sensor', 'smartwatch', 'internet of thing', 'sensor device', 'smart sensor', 'fitbit', 'fitness band',
       'activity tracker', 'fitness tracker']

feat['sensor_text'] = np.where(groups['text'].str.contains('smart watch'), "1", "0")

for x in text:
    feat['sensor_text'] = np.where(groups['text'].str.contains(x), "1", feat['sensor_text']) #if yes then 1, if no, keep current

##output
print('text counts:')
print(Counter(feat['sensor_text']))

text counts:
Counter({'0': 33731, '1': 448})


In [49]:
## PROM

## text
feat['prom_text'] = np.where(groups['text'].str.contains("patient reported outcome"), "1", "0")
feat['prom_text'] = np.where(groups['text'].str.contains("patient-reported outcome"), "1", feat['prom_text'])

##output
print('text counts:')
print(Counter(feat['prom_text']))

text counts:
Counter({'0': 34139, '1': 40})


In [50]:
## SMARTPHONE

## text
feat['phone_text'] = np.where(groups['text'].str.contains("smartphone"), "1", "0")
feat['phone_text'] = np.where(groups['text'].str.contains("iphone"), "1", feat['phone_text'])

##output
print('text counts:')
print(Counter(feat['phone_text']))

text counts:
Counter({'0': 33877, '1': 302})


In [51]:
#### DIGITAL STETH / sound

## text
text = ['heart sound', 'heart murmur', 'breath sound', 'auscultat', 'phonocardio', 'digital steth']

feat['sound_text'] = np.where(groups['text'].str.contains('electronic steth'), "1", "0")

for x in text:
    feat['sound_text'] = np.where(groups['text'].str.contains(x), "1", feat['sound_text']) #if yes then 1, if no, keep current

feat['sound_text'] = np.where((groups['text'].str.contains("heart")) &
                             (groups['text'].str.contains("stethoscope")) , "1", feat['sound_text'])
feat['sound_text'] = np.where((groups['text'].str.contains("valve")) &
                             (groups['text'].str.contains("stethoscope")) , "1", feat['sound_text'])
feat['sound_text'] = np.where((groups['text'].str.contains("murmur")) &
                             (groups['text'].str.contains("stethoscope")) , "1", feat['sound_text'])
feat['sound_text'] = np.where((groups['text'].str.contains("lung")) &
                             (groups['text'].str.contains("stethoscope")) , "1", feat['sound_text'])
feat['sound_text'] = np.where((groups['text'].str.contains("resp")) &
                             (groups['text'].str.contains("stethoscope")) , "1", feat['sound_text'])
feat['sound_text'] = np.where((groups['text'].str.contains("breath")) &
                             (groups['text'].str.contains("stethoscope")) , "1", feat['sound_text'])

## output
print('text counts:')
print(Counter(feat['sound_text']))

text counts:
Counter({'0': 34000, '1': 179})


In [52]:
feat[feat['sound_text']=='1'].sample(20)

Unnamed: 0,text,xr_text,ct_text,mri_text,echo_text,us_text,ecg_text,eeg_text,emg_text,histo_text,oct_text,mamm_text,endo_text,derm_text,gene_text,bio_text,nlp_text,ehr_text,sensor_text,prom_text,phone_text,sound_text
112843,"automatic wheezing detection based on signal processing of spectrogram and back-propagation neural network wheezing is a common clinical symptom in patients with obstructive pulmonary diseases such as asthma automatic wheezing detection offers an objective and accurate means for identifying wheezing lung sounds, helping physicians in the diagnosis, long-term auscultation, and analysis of a patient with obstructive pulmonary disease this paper describes the design of a fast and high-performance wheeze recognition system a wheezing detection algorithm based on the order truncate average method and a back-propagation neural network bpnn is proposed some features are extracted from processed spectra to train a bpnn, and subsequently, test samples are analyzed by the trained bpnn to determine whether they are wheezing sounds the respiratory sounds of 58 volunteers 32 asthmatic and 26 healthy adults were recorded for training and testing experimental results of a qualitative analysis of wheeze recognition showed a high sensitivity of 0946 and a high specificity of 10",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
890,"rheumatic heart disease screening based on phonocardiogram rheumatic heart disease rhd is one of the most common causes of cardiovascular complications in developing countries it is a heart valve disease that typically affects children impaired heart valves stop functioning properly, resulting in a turbulent blood flow within the heart known as a murmur this murmur can be detected by cardiac auscultation however, the specificity and sensitivity of manual auscultation were reported to be low the other alternative is echocardiography, which is costly and requires a highly qualified physician given the diseases current high prevalence rate the latest reported rate in the study area ethiopia was 565%, there is a pressing need for early detection of the disease through mass screening programs this paper proposes an automated rhd screening approach using machine learning that can be used by non-medically trained persons outside of a clinical setting heart sound data was collected from 124 persons with rhd pwrhd and 46 healthy controls hc in ethiopia with an additional 81 hc records from an open-access dataset thirty-one distinct features were extracted to correctly represent rhd a support vector machine svm classifier was evaluated using two nested cross-validation approaches to quantitatively assess the generalization of the system to previously unseen subjects for regular nested 10-fold cross-validation, an f1-score of 960 ± 09%, recall 958 ± 15%, precision 962 ± 06% and a specificity of 960 ± 06% were achieved in the imbalanced nested cross-validation at a prevalence rate of 5%, it achieved an f1-score of 722 ± 08%, recall 923 ± 04%, precision 592 ± 36%, and a specificity of 948 ± 06% in screening tasks where the prevalence of the disease is small, recall is more important than precision the findings are encouraging, and the proposed screening tool can be inexpensive, easy to deploy, and has an excellent detection rate as a result, it has the potential for mass screening and early detection of rhd in developing countries",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
76550,"detection of osteoporosis from percussion responses using an electronic stethoscope and machine learning osteoporosis is an asymptomatic bone condition that affects a large proportion of the elderly population around the world, resulting in increased bone fragility and increased risk of fracture previous studies had shown that the vibroacoustic response of bone can indicate the quality of the bone condition therefore, the aim of the authorsproject is to develop a new method to exploit this phenomenon to improve detection of osteoporosis in individuals in this paper a method is described that uses a reflex hammer to exert testing stimuli on a patients tibia and an electronic stethoscope to acquire the impulse responses the signals are processed as mel frequency cepstrum coefficients and passed through an artificial neural network to determine the likelihood of osteoporosis from the tibias impulse responses following some discussions of the mechanism and procedure, this paper details the signal acquisition using the stethoscope and the subsequent signal processing and the statistical machine learning algorithm pilot testing with 12 patients achieved over 80% sensitivity with a false positive rate below 30% and accuracies in the region of 70% an extended dataset of 110 patients achieved an error rate of 30% with some room for improvement in the algorithm by using common clinical apparatus and strategic machine learning, this method might be suitable as a large population screening test for the early diagnosis of osteoporosis, thus avoiding secondary complications",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
143756,"support vectors machine-based identification of heart valve diseases using heart sounds taking into account that heart auscultation remains the dominant method for heart examination in the small health centers of the rural areas and generally in primary healthcare set-ups, the enhancement of this technique would aid significantly in the diagnosis of heart diseases in this context, the present paper initially surveys the research that has been conducted concerning the exploitation of heart sound signals for automated and semi-automated detection of pathological heart conditions then it proposes an automated diagnosis system for the identification of heart valve diseases based on the support vector machines svm classification of heart sounds this system performs a highly difficult diagnostic task even for experienced physicians, much more difficult than the basic diagnosis of the existence or not of a heart valve disease ie the classification of a heart sound as healthyor having a heart valve disease: it identifies the particular heart valve disease the system was applied in a representative global dataset of 198 heart sound signals, which come both from healthy medical cases and from cases suffering from the four most usual heart valve diseases: aortic stenosis as, aortic regurgitation ar, mitral stenosis ms and mitral regurgitation mr initially the heart sounds were successfully categorized using a svm classifier as normal or disease-related and then the corresponding murmurs in the unhealthy cases were classified as systolic or diastolic for the heart sounds diagnosed as having systolic murmur we used a svm classifier for performing a more detailed classification of them as having aortic stenosis or mitral regurgitation similarly for the heart sounds diagnosed as having diastolic murmur we used a svm classifier for classifying them as having aortic regurgitation or mitral stenosis alternative classifiers have been applied to the same data for comparison ie back-propagation neural networks, k-nearest-neighbour and naïve bayes classifiers, however their performance for the same diagnostic problems was lower than the svm classifiers proposed in this work",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
98317,s1 and s2 heart sound recognition using deep neural networks this study focuses on the first s1 and second s2 heart sound recognition based only on acoustic characteristics; the assumptions of the individual durations of s1 and s2 and time intervals of s1-s2 and s2-s1 are not involved in the recognition process the main objective is to investigate whether reliable s1 and s2 recognition performance can still be attained under situations where the duration and interval information might not be accessible,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
97085,"a mobile platform for automated screening of asthma and chronic obstructive pulmonary disease chronic obstructive pulmonary disease copd and asthma each represent a large proportion of the global disease burden; copd is the third leading cause of death worldwide and asthma is one of the most prevalent chronic diseases, afflicting over 300 million people much of this burden is concentrated in the developing world, where patients lack access to physicians trained in the diagnosis of pulmonary disease as a result, these patients experience high rates of underdiagnosis and misdiagnosis to address this need, we present a mobile platform capable of screening for asthma and copd our solution is based on a mobile smart phone and consists of an electronic stethoscope, a peak flow meter application, and a patient questionnaire this data is combined with a machine learning algorithm to identify patients with asthma and copd to test and validate the design, we collected data from 119 healthy and sick participants using our custom mobile application and ran the analysis on a pc computer for comparison, all subjects were examined by an experienced pulmonologist using a full pulmonary testing laboratory employing a two-stage logistic regression model, our algorithms were first able to identify patients with either asthma or copd from the general population, yielding an roc curve with an auc of 095 then, after identifying these patients, our algorithm was able to distinguish between patients with asthma and patients with copd, yielding an roc curve with auc of 097 this work represents an important milestone towards creating a self-contained mobile phone-based platform that can be used for screening and diagnosis of pulmonary disease in many parts of the world",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
18232,"deep learning algorithm for automated cardiac murmur detection via a digital stethoscope platform background clinicians vary markedly in their ability to detect murmurs during cardiac auscultation and identify the underlying pathological features deep learning approaches have shown promise in medicine by transforming collected data into clinically significant information the objective of this research is to assess the performance of a deep learning algorithm to detect murmurs and clinically significant valvular heart disease using recordings from a commercial digital stethoscope platform methods and results using >34 hours of previously acquired and annotated heart sound recordings, we trained a deep neural network to detect murmurs to test the algorithm, we enrolled 962 patients in a clinical study and collected recordings at the 4 primary auscultation locations ground truth was established using patient echocardiograms and annotations by 3 expert cardiologists algorithm performance for detecting murmurs has sensitivity and specificity of 763% and 914%, respectively by omitting softer murmurs, those with grade 1 intensity, sensitivity increased to 900% application of the algorithm at the appropriate anatomic auscultation location detected moderate-to-severe or greater aortic stenosis, with sensitivity of 932% and specificity of 860%, and moderate-to-severe or greater mitral regurgitation, with sensitivity of 662% and specificity of 946% conclusions the deep learning algorithms ability to detect murmurs and clinically significant aortic stenosis and mitral regurgitation is comparable to expert cardiologists based on the annotated subset of our database the findings suggest that such algorithms would have utility as front-line clinical support tools to aid clinicians in screening for cardiac murmurs caused by valvular heart disease registration url: https://clinicaltrialsgov; unique identifier: nct03458806",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
144410,"feature extraction for murmur detection based on support vector regression of time-frequency representations this paper presents a nonlinear approach for time-frequency representations tfr data analysis, based on a statistical learning methodology - support vector regression svr, that being a nonlinear framework, matches recent findings on the underlying dynamics of cardiac mechanic activity and phonocardiographic pcg recordings the proposed methodology aims to model the estimated tfrs, and extract relevant features to perform classification between normal and pathologic pcg recordings with murmur modeling of tfr is done by means of svr, and the distance between regressions is calculated through dissimilarity measures based on dot product finally, a k-nn classifier is used for the classification stage, obtaining a validation performance of 9785%",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3375,"characterizing effortful swallows from healthy community dwelling adults across the lifespan using high-resolution cervical auscultation signals and mbsimp scores: a preliminary study there is growing enthusiasm to develop inexpensive, non-invasive, and portable methods that accurately assess swallowing and provide biofeedback during dysphagia treatment high-resolution cervical auscultation hrca, which uses acoustic and vibratory signals from non-invasive sensors attached to the anterior laryngeal framework during swallowing, is a novel method for quantifying swallowing physiology via advanced signal processing and machine learning techniques hrca has demonstrated potential as a dysphagia screening method and diagnostic adjunct to vfsss by determining swallowing safety, annotating swallow kinematic events, and classifying swallows between healthy participants and patients with a high degree of accuracy however, its feasibility as a non-invasive biofeedback system has not been explored this study investigated 1 whether hrca can accurately differentiate between non-effortful and effortful swallows; 2 whether differences exist in modified barium swallow impairment profile mbsimp scores #9, #11, #14 between non-effortful and effortful swallows we hypothesized that hrca would accurately classify non-effortful and effortful swallows and that differences in mbsimp scores would exist between the types of swallows we analyzed 247 thin liquid 3 ml command swallows 71 effortful to minimize variation from 36 healthy adults who underwent standardized vfsss with concurrent hrca results revealed differences p < 005 in 9 hrca signal features between non-effortful and effortful swallows using hrca signal features as input, decision trees classified swallows with 76% accuracy, 76% sensitivity, and 77% specificity there were no differences in mbsimp component scores between non-effortful and effortful swallows while preliminary in nature, this study demonstrates the feasibility/promise of hrca as a biofeedback method for dysphagia treatment",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
49539,"classifying dysphagic swallowing sounds with support vector machines swallowing sounds from cervical auscultation include information related to the swallowing function several studies have been conducted on the screening tests of dysphagia the literature shows a significant difference between the characteristics of swallowing sounds obtained from different subjects eg, healthy and dysphagic subjects; young and old adults these studies demonstrate the usefulness of swallowing sounds during dysphagic screening however, the degree of classification for dysphagia based on swallowing sounds has not been thoroughly studied in this study, we investigate the use of machine learning for classifying swallowing sounds into various types, such as normal swallowing or mild, moderate, and severe dysphagia in particular, swallowing sounds were recorded from patients with dysphagia support vector machines svms were trained using some features extracted from the obtained swallowing sounds moreover, the accuracy of the classification of swallowing sounds using the trained svms was evaluated via cross-validation techniques in the two-class scenario, wherein the swallowing sounds were divided into two categories viz normal and dysphagic subjects, the maximum f-measure was 789% in the four-class scenario, where the swallowing sounds were divided into four categories viz normal subject, and mild, moderate, and severe dysphagic subjects, the f-measure values for the classes were 656%, 531%, 511%, and 371%, respectively",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [53]:
## COMBINE
labelled['feat_xr'] = np.where(feat['xr_text'].str.contains("1"), "1", "0")

labelled['feat_ct'] = np.where(feat['ct_text'].str.contains("1"), "1", "0")

labelled['feat_mri'] = np.where(feat['mri_text'].str.contains("1"), "1", "0")

labelled['feat_eeg'] = np.where(feat['eeg_text'].str.contains("1"), "1", "0")

labelled['feat_ecg'] = np.where(feat['ecg_text'].str.contains("1"), "1", "0")

labelled['feat_emg'] = np.where(feat['emg_text'].str.contains("1"), "1", "0")

labelled['feat_us'] = np.where(feat['us_text'].str.contains("1"), "1", "0")

labelled['feat_echo'] = np.where(feat['echo_text'].str.contains("1"), "1", "0")

labelled['feat_histo'] = np.where(feat['histo_text'].str.contains("1"), "1", "0")

labelled['feat_oct'] = np.where(feat['oct_text'].str.contains("1"), "1", "0")

labelled['feat_mamm'] = np.where(feat['mamm_text'].str.contains("1"), "1", "0")

labelled['feat_endoscop'] = np.where(feat['endo_text'].str.contains("1"), "1", "0")

labelled['feat_derm'] = np.where(feat['derm_text'].str.contains("1"), "1", "0")

labelled['feat_gene'] = np.where(feat['gene_text'].str.contains("1"), "1", "0")

labelled['feat_bio'] = np.where(feat['bio_text'].str.contains("1"), "1", "0")

labelled['feat_nlp'] = np.where(feat['nlp_text'].str.contains("1"), "1", "0")

labelled['feat_ehr'] = np.where(feat['ehr_text'].str.contains("1"), "1", "0")

labelled['feat_sensor'] = np.where(feat['sensor_text'].str.contains("1"), "1", "0")

labelled['feat_phone'] = np.where(feat['phone_text'].str.contains("1"), "1", "0")

labelled['feat_prom'] = np.where(feat['prom_text'].str.contains("1"), "1", "0")

labelled['feat_sound'] = np.where(feat['sound_text'].str.contains("1"), "1", "0")

#feat.to_csv('output/feat_tagged.csv')

## Tag Specialties / Use-Cases

In [54]:
######################
## CLASS TAGS - by mesh for disease type
######################


######################
## CLASS TAGS - by specialty, not mutually exclusive
######################
## INTENSIVE CARE MEDICINE / icu

## EMERGENCY MEDICINE / ed

## INFECTIONS [C01] / id
    #### SEPSIS / sepsis
    #### COVID-19 / cov19
    #### MALARIA / malaria
    #### HIV / hiv
    #### TB / tb
    #### TROPICAL DISEASE / tropic
    
## DERMATOLOGY [C17] / derm
    ####SKIN CANCERS / dermca

## NEOPLASMS [C04] / onc
    #### RADIOTHERAPY / rx
    #### LUNG / lungca
    #### NEURO / neuroca
    #### GI / gica
    #### HPB / hepca
    #### GYNAE / gynonc
    #### PROSTATE / prosca
    #### RENAL / renalca
    #### HAEM / haemonc
    
## BREAST / breast (<- almost entirely onc)
    #### BREAST CA / breastca
    
## PSYCHIATRY / psych
    #### SUICIDE / suicide
    
## MUSCULOSKELETAL [C05] / msk
    #### FRACTURE / frac

## CONNECTIVE TISSUE [C17] / rheum

## GASTROINTESTINAL [C06] / gi

## HEPATOLOGY & BILIARY [C06] / hep

## RESPIRATORY [C08] / resp
    #### PNEUMONIA / pneum
    #### OBSTRUCTIVE SLEEP / osa
    #### PULMONARY EMBOLISM / pe
    
## NERVOUS SYSTEM [C10] / neuro
    #### STROKE / cva
    #### SEIZURE / epilep
    #### DEMENTIA / alzh

## CARDIOVASCULAR [C14] / cvs
    #### ISCHAEMIC HEART DISEASE / ihd
    #### CARDIAC FAILURE / hf
    #### ARRHYTHMIA / arrhyt
    
## ENDOCRINE [C19] (no dm) / endo

## DIABETES / dm
    #### INSULIN / insulin
    #### RETINOPATHY / retina
        
## OPHTHALMOLOGY [C11] / eye

## HAEMATOLOGIC [C15] / haem

## GYNAE/OBSTETRIC [C13] / obs

## NEPHROLOGY [C12] / renal
    #### ACUTE & CHRONIC KIDNEY / ackd
    
## PAEDIATRICS / paeds

## STOMATOGNATHIC [C07] / dental

## AUDIOLOGY [C09] / ent

## PUBLIC HEALTH / pubh

########exclude?############# 

## ALCOHOL & SUBSTANCES [C25] / etoh
## WOUNDS AND INJURIES [C26] -> TRAUMA
## ENVIRONMENTAL [C21] / env


######################
## SPECIAL
######################
## BCI
## CONTROL
#### PROSTHESIS CONTROL
#### WHEELCHAIR CONTROL



## vitals monitoring / deterioration
## trauma?
## sleep
## pulmonary embolism

In [55]:
spec = groups[['text']].copy()

In [56]:
## INTENSIVE CARE MEDICINE / icu

## text
text = ['intensive care', 'critical care', 'mechanical ventilation', 'invasive ventilation', 'ventilator', 'pressure ventilation', 
       'acute respiratory distress syndrome', 'organ failure', 'tracheal intubation', 'vasopressor', 'inotrope',
       'hemofiltration', 'membrane oxygenation', 'ecmo', ' ett ', 'layngoscope', 'endotracheal tube']

spec['icu_text'] = np.where(groups['text'].str.contains('intensive therapy unit'), "1", "0")

for x in text:
    spec['icu_text'] = np.where(groups['text'].str.contains(x), "1", spec['icu_text']) #if yes then 1, if no, keep current

##output    
print('text counts:')
print(Counter(spec['icu_text']))

text counts:
Counter({'0': 33431, '1': 748})


In [57]:
## EMERGENCY MEDICINE / ed

## text
text = ['emergency department', 'emergency room', 'emergency physician', 'emergency doctor', 'emergency medicine',
       'emergency care', 'accident and emergency', 'a&e', 'accident & emergency', 'prehospital', 'pre-hospital',
       'casualty room', 'emergency ward']

spec['ed_text'] = np.where(groups['text'].str.contains('casualty department'), "1", "0")

for x in text:
    spec['ed_text'] = np.where(groups['text'].str.contains(x), "1", spec['ed_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['ed_text']))

text counts:
Counter({'0': 33768, '1': 411})


In [58]:
## INFECTIONS / id + bacteriology/virology/parasitology

## text
text = ['bacter', 'microbiol', 'sepsis', 'septic', 'toxic shock', 'microbe', 'tuberculosis',
       'cholera', 'shigella', 'bubonic', 'plague', 'anthrax', 'gonorrhea', 'syphilis', 'diphtheria', 'legionell',
       'leptospirosis', 'listeriosis', 'tetanus', 'pertussis', 'staph', 'strep', 'escherichia', 'leprosy', 
        'mycobacter', 'blood culture',
       
       'fungus', 'fungal', 'fungaemia', 'fungemia', 'candida ', 'aspergill',
       
       'virolog', 'virus', 'viral', 'virulen', 'influenza', 'hepatitis', 'herpes', 'varicella',
       'measles', 'covid', 'sars-cov', 'coronavirus', 'severe acute respiratory syndrome', 'yellow fever', 'dengue',
       'rabies', 'zika', 'ebola', 'polio', 'hemorrhagic fever', 'haemorrhagic fever', 'rabies',
       
       'transmitted disease', 'sexually transmit', 'sexual transmis',
       
       ' lyme', 'malaria', 'falciparum', 'anopheles', 'parasit', 'helminth', 'protozoa', 
        'leishmaniasis', 'trypanosom', 'chagas', 'schistosomiasis', 'filariasis', 'toxoplasm' 'tropical disease',
       
       ' hiv ', 'human immunodeficiency virus', 'acquired immune deficiency syndrome']

spec['id_text'] = np.where(groups['text'].str.contains('infectio'), "1", "0")

for x in text:
    spec['id_text'] = np.where(groups['text'].str.contains(x), "1", spec['id_text']) #if yes then 1, if no, keep current
    
## output
print('text counts:')
print(Counter(spec['id_text']))


text counts:
Counter({'0': 31602, '1': 2577})


In [59]:
#### SEPSIS / sepsis

## text
text = ['sepsis', 'septic', 'bacteraem', 'bacterem', 'toxic shock syndrome', 'pyaemia']

spec['sepsis_text'] = np.where(groups['text'].str.contains('pyemia'), "1", "0")

for x in text:
    spec['sepsis_text'] = np.where(groups['text'].str.contains(x), "1", spec['sepsis_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['sepsis_text']))

text counts:
Counter({'0': 33928, '1': 251})


In [60]:
#### COVID-19 / cov19

## text
text = ['sars-cov', 'coronavirus disease 2019', 'novel coronavirus', 'coronavirus disease 19', 'sars cov']

spec['cov19_text'] = np.where(groups['text'].str.contains('covid'), "1", "0")

for x in text:
    spec['cov19_text'] = np.where(groups['text'].str.contains(x), "1", spec['cov19_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['cov19_text']))

text counts:
Counter({'0': 32994, '1': 1185})


In [61]:
#### HIV / hiv

## text
text = ['human immunodeficiency virus', 'acquired immune deficiency syndrome', ' aids ']

spec['hiv_text'] = np.where(groups['text'].str.contains(' hiv '), "1", "0")

for x in text:
    spec['hiv_text'] = np.where(groups['text'].str.contains(x), "1", spec['hiv_text']) #if yes then 1, if no, keep current

    
## output
print('text counts:')
print(Counter(spec['hiv_text']))

text counts:
Counter({'0': 33996, '1': 183})


In [62]:
#### TUBERCULOSIS / tb

## text
text = ['tuberculosis', 'mycobacterium tuberc']

spec['tb_text'] = np.where(groups['text'].str.contains('tubercu'), "1", "0")

for x in text:
    spec['tb_text'] = np.where(groups['text'].str.contains(x), "1", spec['tb_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['tb_text']))

text counts:
Counter({'0': 34011, '1': 168})


In [63]:
#### TROPICAL DISEASE / tropic

## text
text = ['malaria', 'falciparum', 'anopheles', 'parasit', 'helminth', 'protozoa', 
        'leishmaniasis', 'trypanosom', 'chagas', 'schistosomiasis', 'filariasis', 'toxoplasm',
       'yellow fever', 'dengue', 'rabies', 'cholera', 'zika', 'ebola', 'hemorrhagic fever', 'haemorrhagic fever',
        'tropical disease', 'tropical medicine', 'filariasis']

spec['tropic_text'] = np.where(groups['text'].str.contains('falciparum'), "1", "0")

for x in text:
    spec['tropic_text'] = np.where(groups['text'].str.contains(x), "1", spec['tropic_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['tropic_text']))

text counts:
Counter({'0': 34053, '1': 126})


In [64]:
#### MALARIA / malaria

## text
text = ['malaria', 'anopheles']

spec['malaria_text'] = np.where(groups['text'].str.contains('falciparum'), "1", "0")

for x in text:
    spec['malaria_text'] = np.where(groups['text'].str.contains(x), "1", spec['malaria_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['malaria_text']))

text counts:
Counter({'0': 34120, '1': 59})


In [65]:
## DERMATOLOGY / derm

## text
text = ['dermato', 'dermatitis', 'erythema', 'cutaneous', 'eczema', 'psoriasis', 'rosacea', 'vitiligo', 'urticaria',
       'pruritus', 'impetigo', 'pemphigoid', 'pityriasis', 'melanoma', 'basal cell ca', 'merkel cell',
       'skin cancer', 'skin lesion', 'skin rash', 'nevus', 'naevus', 'dermal cancer', 'dermal lesion']

spec['derm_text'] = np.where(groups['text'].str.contains('emollient'), "1", "0")

for x in text:
    spec['derm_text'] = np.where(groups['text'].str.contains(x), "1", spec['derm_text']) #if yes then 1, if no, keep current

spec['derm_text'] = np.where((groups['text'].str.contains("skin")) &
                             (groups['text'].str.contains("squamous cell")) , "1", spec['derm_text'])
spec['derm_text'] = np.where((groups['text'].str.contains("dermal")) &
                             (groups['text'].str.contains("squamous cell")) , "1", spec['derm_text'])
spec['derm_text'] = np.where((groups['text'].str.contains("skin")) &
                             (groups['text'].str.contains(" scc ")) , "1", spec['derm_text'])
spec['derm_text'] = np.where((groups['text'].str.contains("dermal")) &
                             (groups['text'].str.contains(" scc ")) , "1", spec['derm_text'])
                             
## output
print('text counts:')
print(Counter(spec['derm_text']))

text counts:
Counter({'0': 33397, '1': 782})


In [66]:
#### SKIN CANCERS / dermca

## text
text = ['melanoma', 'melanocytic', 'casal cell ca', 'skin cancer', 'dysplastic nevus', 'dysplastic naevus',
       'merkel cell', 'atypical nevus', 'atypical naevus']

spec['dermca_text'] = np.where(groups['text'].str.contains('skin cancer'), "1", "0")

for x in text:
    spec['dermca_text'] = np.where(groups['text'].str.contains(x), "1", spec['dermca_text']) #if yes then 1, if no, keep current

spec['dermca_text'] = np.where((groups['text'].str.contains("skin")) &
                             (groups['text'].str.contains("squamous cell")) , "1", spec['dermca_text'])
spec['dermca_text'] = np.where((groups['text'].str.contains("dermal")) &
                             (groups['text'].str.contains("squamous cell")) , "1", spec['dermca_text'])
spec['dermca_text'] = np.where((groups['text'].str.contains("skin")) &
                             (groups['text'].str.contains(" scc ")) , "1", spec['dermca_text'])
spec['dermca_text'] = np.where((groups['text'].str.contains("dermal")) &
                             (groups['text'].str.contains(" scc ")) , "1", spec['dermca_text'])
                          
## output
print('text counts:')
print(Counter(spec['dermca_text']))

text counts:
Counter({'0': 33787, '1': 392})


In [67]:
## ONCOLOGY / onc

## text
text = [ 'cancer', 'carcinoma', 'oncolog', 'neoplasm', 'neoplastic',
        'radiotherapy', 'radiation therapy', 'mammog', 'breast ca', 'breast tum', 'invasive lobular carcinoma', 
        ' dcis ', 'ductal carcinoma in situ', 'lung cancer', 'lung malignancy', 'lung carcinoma', 'lung nodule',
        'pulmonary nodule', 'mesothelioma', 'nsclc',
       'neuroonc', 'neuro onc', 'neuro-onc', 'brain cancer', 'brain tumor', 'brain tumour', 'brain malignancy',
       'glioma', 'glioblastoma', 'astrocytoma', 'pituitary adenoma', 'acoustic neuroma', 'meningioma',
       'cns lymphoma', 'oligodendroglioma', 'meningeal cancer', 'meningeal carcinomatosis',
       'melanoma', 'melanocytic', 'casal cell ca', 'skin cancer', 'dysplastic nevus', 'dysplastic naevus',
       'merkel cell', 'atypical nevus', 'atypical naevus',
       'gi cancer', 'gastrointestinal cancer', 'colon cancer', 'colon carcinoma', 'colon polyp', 'colon adeno', 'colon tumo',
       'colonic cancer', 'colonic carcinoma', 'colonic adeno', 'colonic polyp', 'colonic tumo', 'colonic neoplasm',
        'rectal cancer', 'rectal carcinoma', 'rectal polyp', 'rectal tumo', 'rectal neoplasm', 'bowel cancer', 'bowel neoplasm',
       'bowel tumo', 'stomach cancer', 'gastric cancer', 'gastric carcinoma', 'gastric neoplasm', 'gastric tumo',
       'esophageal cancer', 'esophageal tumo', 'esophageal neoplasm',
       'hepatocellular cancer', 'hepatocellular carcinoma', 'hepatic cancer', 'hepatic carcinoma', 'hepatic tumo',
       'hepatic neoplasm', 'liver cancer', 'liver carcinoma', 'liver tumo', 'cholangioca', 'pancreatic cancer',
       'pancreatic neoplasm', 'pancreatic tumo', 'biliary cancer', 'bile duct cancer',
       'prostate cancer', 'prostate specific antigen', 'prostate carcinoma', 'prostate neoplasm', 'prostate tumo',
       'prostate adeno', 'prostatic cancer', 'prostatic neoplasm', 'prostatic tumo', 'prostatic adeno', 'prostatectomy',
       ' psa ', 'kidney cancer', 'kidney tumo', 'renal cell carcinoma', 'renal call cancer', 'renal tumo', 'renal cancer',
       'wilms tumo', 'bladder cancer', 'bladder carcinoma', 'transitional cell ca', 'urothelial cancer', 'urothelial carcinoma',
        'gynecologic cancer', 'gynecological cancer', 'gynaecologic cancer', 'gynaecological cancer', 'ovarian cancer',
       'ovarian carcinoma', 'uterine cancer', 'uterine carcinoma', 'cervical cancer', 'cervical carcinoma', 'colposcop',
       'haematological cancer', 'hematological cancer', 'haematological malig', 'hematological malig', 'myelodysplas',
       'myeloprolif', 'lymphoprolif', 'leukaemoa', 'leukemia', 'myelofibro', 'thrombocythemia', 'polycythemia vera',
       'polycythemia rubra vera', 'thrombocythaemia', 'polycythaemia vera', 'polycythaemia rubra vera', 'lymphoma',
       'myeloma', ' gvhd', 'stem cell transpl', 'bone marrow aspirate']

spec['onc_text'] = np.where(groups['text'].str.contains('metasta'), "1", "0")

for x in text:
    spec['onc_text'] = np.where(groups['text'].str.contains(x), "1", spec['onc_text']) #if yes then 1, if no, keep current


## output
print('text counts:')
print(Counter(spec['onc_text']))

text counts:
Counter({'0': 25235, '1': 8944})


In [68]:
#### RADIOTHERAPY / rx

## text
spec['rx_text'] = np.where(groups['text'].str.contains("radiotherapy"), "1", "0")
spec['rx_text'] = np.where(groups['text'].str.contains("radiation therapy"), "1", "0")

##output
print('text counts:')
print(Counter(spec['rx_text']))

text counts:
Counter({'0': 33920, '1': 259})


In [69]:
#### BREAST / breast

## text
text = ['mammog', 'breast ca', 'breast tum', 'invasive lobular carcinoma', ' dcis ', 'ductal carcinoma in situ']

spec['breast_text'] = np.where(groups['text'].str.contains(' breast '), "1", "0")

for x in text:
    spec['breast_text'] = np.where(groups['text'].str.contains(x), "1", spec['breast_text']) #if yes then 1, if no, keep current

    
## output
print('text counts:')
print(Counter(spec['breast_text']))

text counts:
Counter({'0': 32173, '1': 2006})


In [70]:
#### BREAST CANCER / breastca

## text
text = ['mammog', 'breast ca', 'breast tum', 'invasive lobular carcinoma', ' dcis ', 'ductal carcinoma in situ']

spec['breastca_text'] = np.where(groups['text'].str.contains('breast cancer'), "1", "0")

for x in text:
    spec['breastca_text'] = np.where(groups['text'].str.contains(x), "1", spec['breastca_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['breastca_text']))

text counts:
Counter({'0': 32422, '1': 1757})


In [71]:
#### LUNG CA / lungca

## text
text = ['lung cancer', 'lung malignancy', 'lung carcinoma', 'lung nodule', 'pulmonary nodule', 'mesothelioma', 'nsclc']

spec['lungca_text'] = np.where(groups['text'].str.contains('lung cancer'), "1", "0")

for x in text:
    spec['lungca_text'] = np.where(groups['text'].str.contains(x), "1", spec['lungca_text']) #if yes then 1, if no, keep current

spec['lungca_text'] = np.where((groups['text'].str.contains("lung")) &
                             (groups['text'].str.contains("adenoca")) , "1", spec['lungca_text'])
spec['lungca_text'] = np.where((groups['text'].str.contains("lung")) &
                             (groups['text'].str.contains("small cell")) , "1", spec['lungca_text'])
spec['lungca_text'] = np.where((groups['text'].str.contains("lung")) &
                             (groups['text'].str.contains("squamous")) , "1", spec['lungca_text'])
spec['lungca_text'] = np.where((groups['text'].str.contains("lung")) &
                             (groups['text'].str.contains("small-cell")) , "1", spec['lungca_text'])

## output
print('text counts:')
print(Counter(spec['lungca_text']))

text counts:
Counter({'0': 33001, '1': 1178})


In [72]:
#### NEURO ONC / neuroca

## text
text = ['neuroonc', 'neuro onc', 'neuro-onc', 'brain cancer', 'brain tumor', 'brain tumour', 'brain malignancy',
       'glioma', 'glioblastoma', 'astrocytoma', 'pituitary adenoma', 'acoustic neuroma', 'meningioma',
       'cns lymphoma', 'oligodendroglioma', 'meningeal cancer', 'meningeal carcinomatosis']

spec['brainca_text'] = np.where(groups['text'].str.contains('brain cancer'), "1", "0")

for x in text:
    spec['brainca_text'] = np.where(groups['text'].str.contains(x), "1", spec['brainca_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['brainca_text']))

text counts:
Counter({'0': 33275, '1': 904})


In [73]:
#### GI ONC / gica

## text
text = ['gi cancer', 'gastrointestinal cancer', 'colon cancer', 'colon carcinoma', 'colon polyp', 'colon adeno', 'colon tumo',
       'colonic cancer', 'colonic carcinoma', 'colonic adeno', 'colonic polyp', 'colonic tumo', 'colonic neoplasm',
        'rectal cancer', 'rectal carcinoma', 'rectal polyp', 'rectal tumo', 'rectal neoplasm', 'bowel cancer', 'bowel neoplasm',
       'bowel tumo', 'stomach cancer', 'gastric cancer', 'gastric carcinoma', 'gastric neoplasm', 'gastric tumo',
       'esophageal cancer', 'esophageal tumo', 'esophageal neoplasm']

spec['gica_text'] = np.where(groups['text'].str.contains('luminal cancer'), "1", "0")

for x in text:
    spec['gica_text'] = np.where(groups['text'].str.contains(x), "1", spec['gica_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['gica_text']))

text counts:
Counter({'0': 33331, '1': 848})


In [74]:
#### HPB ONC / hepca

## text
text = ['hepatocellular cancer', 'hepatocellular carcinoma', 'hepatic cancer', 'hepatic carcinoma', 'hepatic tumo',
       'hepatic neoplasm', 'liver cancer', 'liver carcinoma', 'liver tumo', 'cholangioca', 'pancreatic cancer',
       'pancreatic neoplasm', 'pancreatic tumo', 'biliary cancer', 'bile duct cancer']

spec['hepca_text'] = np.where(groups['text'].str.contains('cancer of the pancreas'), "1", "0")

for x in text:
    spec['hepca_text'] = np.where(groups['text'].str.contains(x), "1", spec['hepca_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['hepca_text']))

text counts:
Counter({'0': 33682, '1': 497})


In [75]:
#### UROLOGY / urology

## text
text = ['prostate', 'prostatic', 'prostatectomy', ' psa ', 'urolog', 'urethra', 'bladder']

spec['urology_text'] = np.where(groups['text'].str.contains('prostatectomy'), "1", "0")

for x in text:
    spec['urology_text'] = np.where(groups['text'].str.contains(x), "1", spec['urology_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['urology_text']))

text counts:
Counter({'0': 32548, '1': 1631})


In [76]:
#### PROSTATE ONC / prosca

## text
text = ['prostate cancer', 'prostate specific antigen', 'prostate carcinoma', 'prostate neoplasm', 'prostate tumo',
       'prostate adeno', 'prostatic cancer', 'prostatic neoplasm', 'prostatic tumo', 'prostatic adeno', 'prostatectomy',
       ' psa ']

spec['prosca_text'] = np.where(groups['text'].str.contains('prostatectomy'), "1", "0")

for x in text:
    spec['prosca_text'] = np.where(groups['text'].str.contains(x), "1", spec['prosca_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['prosca_text']))

text counts:
Counter({'0': 33506, '1': 673})


In [77]:
#### RENAL & BLADDER / renalca

## text
text = ['kidney cancer', 'kidney tumo', 'renal cell carcinoma', 'renal call cancer', 'renal tumo', 'renal cancer',
       'wilms tumo', 'bladder cancer', 'bladder carcinoma', 'transitional cell ca', 'urothelial cancer', 'urothelial carcinoma']

spec['renalca_text'] = np.where(groups['text'].str.contains('renal carcinoma'), "1", "0")

for x in text:
    spec['renalca_text'] = np.where(groups['text'].str.contains(x), "1", spec['renalca_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['renalca_text']))

text counts:
Counter({'0': 33920, '1': 259})


In [78]:
#### GYNAE / gynonc

## text
text = ['gynecologic cancer', 'gynecological cancer', 'gynaecologic cancer', 'gynaecological cancer', 'ovarian cancer',
       'ovarian carcinoma', 'uterine cancer', 'uterine carcinoma', 'cervical cancer', 'cervical carcinoma', 'colposcop',
       'endometrial cancer']

spec['gynonc_text'] = np.where(groups['text'].str.contains('pap smear'), "1", "0")

for x in text:
    spec['gynonc_text'] = np.where(groups['text'].str.contains(x), "1", spec['gynonc_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['gynonc_text']))

text counts:
Counter({'0': 33770, '1': 409})


In [79]:
#### HAEM / haemonc

## text
text = ['haematological cancer', 'hematological cancer', 'haematological malig', 'hematological malig', 'myelodysplas',
       'myeloprolif', 'lymphoprolif', 'leukaemia', 'leukemia', 'myelofibro', 'thrombocythemia', 'polycythemia vera',
       'polycythemia rubra vera', 'thrombocythaemia', 'polycythaemia vera', 'polycythaemia rubra vera', 'lymphoma',
       'myeloma', ' gvhd', 'stem cell transpl', 'bone marrow aspirate']

spec['haemonc_text'] = np.where(groups['text'].str.contains('bone marrow biopsy'), "1", "0")

for x in text:
    spec['haemonc_text'] = np.where(groups['text'].str.contains(x), "1", spec['haemonc_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['haemonc_text']))

text counts:
Counter({'0': 33775, '1': 404})


In [80]:
## PSYCHIATRY / psych

## text
text = ['psych', 'schizo', 'depressive disorder', 'anxiety disorder', 'stress disorder', 'suicide', 'suicidal', 'mood disorder',
        'self harm', 'self-harm', 'self injury', 'self-injury',
        'mental disorder', 'hyperactivity disorder', 'hyperactive disorder', 'psychological distress', 'bipolar', 
       'addiction disorder', 'autism', 'autistic']

spec['psych_text'] = np.where(groups['text'].str.contains('mental health'), "1", "0")

for x in text:
    spec['psych_text'] = np.where(groups['text'].str.contains(x), "1", spec['psych_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['psych_text']))

text counts:
Counter({'0': 32061, '1': 2118})


In [81]:
## SUICIDE / suicide

## text
text = ['suicide', 'suicidal', 'self harm', 'self-harm', 'self injury', 'self-injury', 'depressive disorder']

spec['suicide_text'] = np.where(groups['text'].str.contains('low mood'), "1", "0")

for x in text:
    spec['suicide_text'] = np.where(groups['text'].str.contains(x), "1", spec['suicide_text']) #if yes then 1, if no, keep current

spec['suicide_text'] = np.where((groups['text'].str.contains("psych")) &
                             (groups['text'].str.contains("depression")) , "1", spec['suicide_text'])
spec['suicide_text'] = np.where((groups['text'].str.contains("mental")) &
                             (groups['text'].str.contains("depression")) , "1", spec['suicide_text'])

## output
print('text counts:')
print(Counter(spec['suicide_text']))

text counts:
Counter({'0': 33660, '1': 519})


In [82]:
## MUSCULOSKELETAL / msk

## text
text = ['musculoskeletal', 'bone disease', 'bone cyst', 'chondritis', 'fasciitis', 'ankylos', 'osteoarth', 'orthoped',
       'orthopaed', 'bursitis', 'synovitis', 'congenital hip', 'joint instability', 'joint stability', 'myositis',
       'polymyalgia', 'fibromyalgia', ' gout', 'tendinopath', 'arthro', 'ligament', 'fracture', 'hip surgery',
       'hip replacement', 'acetabul', 'cruciate', 'joint space', 'dysplatic hip', 'hip dysplas', 'vertebral', 'discectomy',
       'lumbar spine', 'thoracic spine', 'cervical spine', 'whole spine', 'osteoporosis', 'bone mineral density']

spec['msk_text'] = np.where(groups['text'].str.contains('broken bone'), "1", "0")

for x in text:
    spec['msk_text'] = np.where(groups['text'].str.contains(x), "1", spec['msk_text']) #if yes then 1, if no, keep current
         
## output
print('text counts:')
print(Counter(spec['msk_text']))

text counts:
Counter({'0': 33237, '1': 942})


In [83]:
#### FRACTURE / frac

##text
spec['frac_text'] = np.where(groups['text'].str.contains("fracture"), "1", "0")

print('text counts:')
print(Counter(spec['frac_text']))

text counts:
Counter({'0': 33942, '1': 237})


In [84]:
## CONNECTIVE TISSUE [C17] / rheum

## text
text = ['rheumatoid', 'scleroderma', 'wegener', 'polyangiitis', 'churg-strauss', 'lupus', 'connective tissue disease',
        'mixed connective tissue', 'polymyositis', 'dermatomyositis', 'sjogren', 'vasculitis', 'vasculitide', 'marfan',
       'ehlers-danlos', 'osteogenesis imperfecta']

spec['rheum_text'] = np.where(groups['text'].str.contains('rheumatolog'), "1", "0")

for x in text:
    spec['rheum_text'] = np.where(groups['text'].str.contains(x), "1", spec['rheum_text']) #if yes then 1, if no, keep current
         
## output
print('text counts:')
print(Counter(spec['rheum_text']))

text counts:
Counter({'0': 34015, '1': 164})


In [85]:
## LUMINAL GI / gi

## text
text = ['gastro', 'gastri', 'intestin', 'duoden', 'colonic', 'colonoscop', 'colitis', 'rectal', 'ileus', 'ileitis',
       'crohn', 'esophag', 'proctitis', 'proctolog', 'bowel disease', 'bowel cancer', 'bowel neoplasm' ,'bowel tumo',
       'celiac', 'coeliac', 'diverticulitis', 'diverticulosis', 'stomach', 'small bowel', 'large bowel']

spec['gi_text'] = np.where(groups['text'].str.contains('gi tract'), "1", "0")

for x in text:
    spec['gi_text'] = np.where(groups['text'].str.contains(x), "1", spec['gi_text']) #if yes then 1, if no, keep current
         
## output
print('text counts:')
print(Counter(spec['gi_text']))


text counts:
Counter({'0': 32628, '1': 1551})


In [86]:
## HEPATOLOGY (and pancreatobiliary) / hep

## text
text = ['hepato', 'hepati', 'cholang', 'gallbladder', 'gall bladder', 'biliary' , 'pancreas', 'pancreat', 'wilson disease',
       'wilsons disease', 'liver fibrosis' ,'liver cirrhosis', 'nafld', 'hemochromatosis', 'haemochromatosis']

spec['hep_text'] = np.where(groups['text'].str.contains(' liver '), "1", "0")

for x in text:
    spec['hep_text'] = np.where(groups['text'].str.contains(x), "1", spec['hep_text']) #if yes then 1, if no, keep current
         
## output
print('text counts:')
print(Counter(spec['hep_text']))


text counts:
Counter({'0': 32964, '1': 1215})


In [87]:
## RESPIRATORY / resp

## text
text = ['respiratory', 'pneumonia', 'lung cancer', 'lung disease', 'lung nodule', 'pulmonary', 'asthma', 'obstructive sleep ap',
       'copd', 'pleura', 'mesothelioma', 'lung fibrosis', 'lung adeno', 'nsclc', 'interstitial lung', 'occupational lung', 'tuberculosis',
       'bronch']

spec['resp_text'] = np.where(groups['text'].str.contains(' lung '), "1", "0")

for x in text:
    spec['resp_text'] = np.where(groups['text'].str.contains(x), "1", spec['resp_text']) #if yes then 1, if no, keep current
         
## output
print('text counts:')
print(Counter(spec['resp_text']))

text counts:
Counter({'0': 30722, '1': 3457})


In [88]:
#### PNEUMONIA / pneum

## text
text = ['respiratory infection', 'pulmonary infection', 'pneumonia', 'alveolar consolidation', 'lung consolidation', 'lung infection',
       'pulmonary consolidation']

spec['pneum_text'] = np.where(groups['text'].str.contains('lower respiratory tract infection'), "1", "0")

for x in text:
    spec['pneum_text'] = np.where(groups['text'].str.contains(x), "1", spec['pneum_text']) #if yes then 1, if no, keep current
         
## output
print('text counts:')
print(Counter(spec['pneum_text']))

text counts:
Counter({'0': 33598, '1': 581})


In [89]:
#### OBSTRUCTIVE SLEEP / osa

## text
text = ['obstructive sleep ap', 'sleep apnoea']

spec['osa_text'] = np.where(groups['text'].str.contains('sleep apnea'), "1", "0")

for x in text:
    spec['osa_text'] = np.where(groups['text'].str.contains(x), "1", spec['osa_text']) #if yes then 1, if no, keep current
         
## output
print('text counts:')
print(Counter(spec['osa_text']))

text counts:
Counter({'0': 33909, '1': 270})


In [90]:
#### PULMONARY EMBOLISM / pe

## text
text = ['saddle embol', 'pulmonary angiogr']

spec['pe_text'] = np.where(groups['text'].str.contains('pulmonary embol'), "1", "0")

for x in text:
    spec['pe_text'] = np.where(groups['text'].str.contains(x), "1", spec['pe_text']) #if yes then 1, if no, keep current
         
## output
print('text counts:')
print(Counter(spec['pe_text']))

text counts:
Counter({'0': 34126, '1': 53})


In [91]:
#### PUBLIC HEALTH / pubh

## text
spec['pubh_text'] = np.where(groups['text'].str.contains("public health"), "1", "0")
spec['pubh_text'] = np.where(groups['text'].str.contains("population health"), "1", spec['pubh_text'])
spec['pubh_text'] = np.where(groups['text'].str.contains("health protection"), "1", spec['pubh_text'])

print('text counts:')
print(Counter(spec['pubh_text']))

text counts:
Counter({'0': 33865, '1': 314})


In [92]:
## NERVOUS SYSTEM / neuro

## text
text = ['neuro', 'brain', 'nervous system', 'multiple sclerosis', 'amyotrophic', 'motor neuron disease',
       'dementia', 'cognitive impairment', 'alzheimer', 'epilepsy', 'parkinson', 'dyskinesia', 'cerebellar', 'cerebral',
       'guillain', 'myelin', 'migraine', 'headache', 'meningeal', 'meningitis', 'encephalitis', 'ischemic stroke', 'ischaemic stroke',
       'hemorrhagic stroke', 'haemorrhagic stroke', 'embolic stroke', 'thrombotic stroke', 'myasthenia', 'movement disorder',
       'subdural', 'extradural', 'arachnoid', 'glioma', 'astrocytoma', 'glioblast', ' mci ', 'cerebrovascular']

spec['neuro_text'] = np.where(groups['text'].str.contains('white matter'), "1", "0")

for x in text:
    spec['neuro_text'] = np.where(groups['text'].str.contains(x), "1", spec['neuro_text']) #if yes then 1, if no, keep current

spec['neuro_text'] = np.where((groups['text'].str.contains("brain")) &
                             (groups['text'].str.contains("aneurysm")) , "1", spec['neuro_text'])
spec['neuro_text'] = np.where((groups['text'].str.contains("cereb")) &
                             (groups['text'].str.contains("aneurysm")) , "1", spec['neuro_text'])   
spec['neuro_text'] = np.where((groups['text'].str.contains("cranial")) &
                             (groups['text'].str.contains("aneurysm")) , "1", spec['neuro_text'])    
    
## output
print('text counts:')
print(Counter(spec['neuro_text']))

text counts:
Counter({'0': 26382, '1': 7797})


In [94]:
#### STROKE/bleed / cva

## text
text = ['cerebrovascular', 'ischemic stroke', 'ischaemic stroke', 'hemorrhagic stroke', 'haemorrhagic stroke', 
        'embolic stroke', 'thrombotic stroke', 'subarachnoid hemorrhage', 'subarachnoid haemorrhage', 'cerebral artery stroke',
       'cerebral artery infarct', 'malignant middle cerebral', 'malignant mca']

spec['cva_text'] = np.where(groups['text'].str.contains(' ich '), "1", "0")

for x in text:
    spec['cva_text'] = np.where(groups['text'].str.contains(x), "1", spec['cva_text']) #if yes then 1, if no, keep current


spec['cva_text'] = np.where((groups['text'].str.contains("brain")) &
                             (groups['text'].str.contains("infarct")) , "1", spec['cva_text'])
spec['cva_text'] = np.where((groups['text'].str.contains("cereb")) &
                             (groups['text'].str.contains("infarct")) , "1", spec['cva_text'])
spec['cva_text'] = np.where((groups['text'].str.contains("brain")) &
                             (groups['text'].str.contains("stroke")) , "1", spec['cva_text'])
spec['cva_text'] = np.where((groups['text'].str.contains("cereb")) &
                             (groups['text'].str.contains("stroke")) , "1", spec['cva_text'])    
spec['cva_text'] = np.where((groups['text'].str.contains("brain")) &
                             (groups['text'].str.contains("vessel occlusion")) , "1", spec['cva_text'])
spec['cva_text'] = np.where((groups['text'].str.contains("cereb")) &
                             (groups['text'].str.contains("vessel occlusion")) , "1", spec['cva_text'])   
spec['cva_text'] = np.where((groups['text'].str.contains("brain")) &
                             (groups['text'].str.contains("bleed")) , "1", spec['cva_text'])
spec['cva_text'] = np.where((groups['text'].str.contains("cereb")) &
                             (groups['text'].str.contains("bleed")) , "1", spec['cva_text'])   
spec['cva_text'] = np.where((groups['text'].str.contains("brain")) &
                             (groups['text'].str.contains("haemorrhage")) , "1", spec['cva_text'])
spec['cva_text'] = np.where((groups['text'].str.contains("cereb")) &
                             (groups['text'].str.contains("haemorrhage")) , "1", spec['cva_text'])
spec['cva_text'] = np.where((groups['text'].str.contains("cranial")) &
                             (groups['text'].str.contains("haemorrhage")) , "1", spec['cva_text'])
spec['cva_text'] = np.where((groups['text'].str.contains("brain")) &
                             (groups['text'].str.contains("hemorrhage")) , "1", spec['cva_text'])
spec['cva_text'] = np.where((groups['text'].str.contains("cereb")) &
                             (groups['text'].str.contains("hemorrhage")) , "1", spec['cva_text'])   
spec['cva_text'] = np.where((groups['text'].str.contains("cranial")) &
                             (groups['text'].str.contains("hemorrhage")) , "1", spec['cva_text'])
spec['cva_text'] = np.where((groups['text'].str.contains("brain")) &
                             (groups['text'].str.contains("aneurysm")) , "1", spec['cva_text'])
spec['cva_text'] = np.where((groups['text'].str.contains("cereb")) &
                             (groups['text'].str.contains("aneurysm")) , "1", spec['cva_text'])   
spec['cva_text'] = np.where((groups['text'].str.contains("cranial")) &
                             (groups['text'].str.contains("aneurysm")) , "1", spec['cva_text'])

## output
print('text counts:')
print(Counter(spec['cva_text']))

text counts:
Counter({'0': 33473, '1': 706})


In [95]:
#### EPILEPSY / epilep

## text
spec['epilep_text'] = np.where(groups['text'].str.contains("epilep"), "1", "0")
spec['epilep_text'] = np.where(groups['text'].str.contains("seizure"), "1", spec['epilep_text'])

print('text counts:')
print(Counter(spec['epilep_text']))

text counts:
Counter({'0': 33357, '1': 822})


In [96]:
#### DEMENTIA / alzh

## text
text = ['dementia', 'cognitive impairment', 'alzheimer', 'cognitive dysfunction', 'cognitive decline', 'lewy body',
       'huntington', 'progressive supranuclear', 'corticobasal degen']

spec['alzh_text'] = np.where(groups['text'].str.contains(' mci '), "1", "0")

for x in text:
    spec['alzh_text'] = np.where(groups['text'].str.contains(x), "1", spec['alzh_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['alzh_text']))

text counts:
Counter({'0': 32660, '1': 1519})


In [97]:
## CARDIOVASCULAR / cvs

## text
text = ['cardiac', 'cardiovascular', 'cardial', 'cardiol', 'carditis', 'cardium', 'atherosclerosis', 'coronary', 'heart disease',
       'cardiomegaly', 'cardiomyopathy', 'valve disease', 'mitral', 'tricuspid', 'pulmonary valve', 'aortic', 'atrial', 'heart failure',
       'ventricular failure', 'right heart', 'left heart', 'cor pulm', 'hypertension', 'vascular disease', 'arrhythmia', 
       'vena cava', 'venous insuff', 'echocard', 'electrocard', 'sinus node', 'sinoatrial node', ' ecg', ' ekg', 'ventricular tachy', 'ventricular fibrillation',
       'ischemic heart', 'ischaemic heart', 'peripheral vascular']

spec['cvs_text'] = np.where(groups['text'].str.contains('cardiac'), "1", "0")

for x in text:
    spec['cvs_text'] = np.where(groups['text'].str.contains(x), "1", spec['cvs_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['cvs_text']))


text counts:
Counter({'0': 30436, '1': 3743})


In [98]:
#### ISCHAEMIC HEART DISEASE / ihd

## text
text = ['coronary', 'cardiac risk', 'cardiovascular risk', 'cardiac stent',
       'ischemic heart', 'ischaemic heart', 'cardial infarction']

spec['ihd_text'] = np.where(groups['text'].str.contains('heart attack'), "1", "0")

for x in text:
    spec['ihd_text'] = np.where(groups['text'].str.contains(x), "1", spec['ihd_text']) #if yes then 1, if no, keep current
    

spec['ihd_text'] = np.where((groups['text'].str.contains("cardia")) &
                             (groups['text'].str.contains("ischemi")) , "1", spec['ihd_text'])
spec['ihd_text'] = np.where((groups['text'].str.contains("cardia")) &
                             (groups['text'].str.contains("ischaemi")) , "1", spec['ihd_text'])    
spec['ihd_text'] = np.where((groups['text'].str.contains("cardia")) &
                             (groups['text'].str.contains("infarction")) , "1", spec['ihd_text'])
spec['ihd_text'] = np.where((groups['text'].str.contains("heart")) &
                             (groups['text'].str.contains("infarction")) , "1", spec['ihd_text'])
spec['ihd_text'] = np.where((groups['text'].str.contains("cardia")) &
                             (groups['text'].str.contains("vessel occlusion")) , "1", spec['ihd_text'])
spec['ihd_text'] = np.where((groups['text'].str.contains("heart")) &
                             (groups['text'].str.contains("vessel occlusion")) , "1", spec['ihd_text'])
spec['ihd_text'] = np.where((groups['text'].str.contains("cardiac")) &
                             (groups['text'].str.contains("angio")) , "1", spec['ihd_text'])
spec['ihd_text'] = np.where((groups['text'].str.contains("heart")) &
                             (groups['text'].str.contains("angio")) , "1", spec['ihd_text'])
spec['ihd_text'] = np.where((groups['text'].str.contains("cardiac")) &
                             (groups['text'].str.contains("atherosclero")) , "1", spec['ihd_text'])
spec['ihd_text'] = np.where((groups['text'].str.contains("heart")) &
                             (groups['text'].str.contains("atherosclero")) , "1", spec['ihd_text'])

## output
print('text counts:')
print(Counter(spec['ihd_text']))

text counts:
Counter({'0': 33229, '1': 950})


In [99]:
spec[spec['ihd_text']=='1'].sample(20)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,urology_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text
36978,development of novel artificial intelligence to detect the presence of clinically meaningful coronary atherosclerotic stenosis in major branch from coronary angiography video the clinically meaningful coronary stenosis is diagnosed by trained interventional cardiologists whether artificial intelligence ai could detect coronary stenosis from cag video is unclear,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
85013,"heart rate monitoring and therapeutic devices: a wavelet transform based approach for the modeling and classification of congestive heart failure heart rate monitoring and therapeutic devices include real-time sensing capabilities reflecting the state of the heart current circuitry can be interpreted as a cardiac electrical signal compression algorithm representing the time signal information into a single event description of the cardiac activity it is observed that some detection techniques developed for ecg signal detection like artificial neural network, genetic algorithm, hilbert transform, hidden markov model are some sophisticated algorithms which provide suitable results but their implementation on a silicon chip is very complicated due to less complexity and high performance, wavelet transform based approaches are widely used in this paper, after a thorough analysis of various wavelet transforms, it is found that biorthogonal wavelet transform is best suited to detect ecg signals qrs complex the main steps involved in ecg detection process consist of de-noising and locating different ecg peaks using adaptive slope prediction thresholding furthermore, the significant challenges involved in the wireless transmission of ecg data are data conversion and power consumption as medical regulatory boards demand a lossless compression technique, lossless compression technique with a high bit compression ratio is highly required furthermore, in this work, lzma based ecg data compression technique is proposed the proposed methodology achieves the highest signal to noise ratio, and lowest root mean square error also, the proposed ecg detection technique is capable of distinguishing accurately between healthy, myocardial infarction, congestive heart failure and coronary artery disease patients with a detection accuracy, sensitivity, specificity, and error of 9992%, 9994%, 9992% and 00013, respectively the use of lzma data compression of ecg data achieves a high compression ratio of 1884 the advantages and effectiveness of the proposed algorithm are verified by comparing with the existing methods",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
104830,"intelligence system for diagnosis level of coronary heart disease with k-star algorithm coronary heart disease is the leading cause of death worldwide, and it is important to diagnose the level of the disease intelligence systems for diagnosis proved can be used to support diagnosis of the disease unfortunately, most of the data available between the level/type of coronary heart disease is unbalanced as a result system performance is low",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
124787,"use of genetic programming, logistic regression, and artificial neural nets to predict readmission after coronary artery bypass surgery as many as 14 % of patients undergoing coronary artery bypass surgery are readmitted within 30 days readmission is usually the result of morbidity and may lead to death the purpose of this study is to develop and compare statistical and genetic programming models to predict readmission patients were divided into separate construction and validation populations using 88 variables, logistic regression, genetic programs, and artificial neural nets were used to develop predictive models models were first constructed and tested on the construction populations, then validated on the validation population areas under the receiver operator characteristic curves au roc were used to compare the models two hundred and two patients 76 % in the 2,644 patient construction group and 216 80 % of the 2,711 patient validation group were re-admitted within 30 days of cabg surgery logistic regression predicted readmission with au roc = 675 ± 021 in the construction group genetic programs significantly improved the accuracy, au roc = 767 ± 001, p < 001 artificial neural nets were less accurate with au roc = 0597 ± 001 in the construction group predictive accuracy of all three techniques fell in the validation group however, the accuracy of genetic programming au roc = 654 ± 001 was still trivially but statistically non-significantly better than that of the logistic regression au roc = 644 ± 020, p = 61 genetic programming and logistic regression provide alternative methods to predict readmission that are similarly accurate",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
34005,"deep learning-based detection of early renal function impairment using retinal fundus images: model development and validation retinal imaging has been applied for detecting eye diseases and cardiovascular risks using deep learning-based methods furthermore, retinal microvascular and structural changes were found in renal function impairments however, a deep learning-based method using retinal images for detecting early renal function impairment has not yet been well studied",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
61640,"automated plaque classification using computed tomography angiography and gabor transformations cardiovascular diseases are the primary cause of death globally these are often associated with atherosclerosis this inflammation process triggers important variations in the coronary arteries ca and can lead to coronary artery disease cad the presence of ca calcification cac has recently been shown to be a strong predictor of cad in this clinical setting, computed tomography angiography cta has begun to play a crucial role as a non-intrusive imaging method to characterize and study ca plaques herein, we describe an automated algorithm to classify plaque as either normal, calcified, or non-calcified using 2646 cta images acquired from 73 patients the automated technique is based on various features that are extracted from the gabor transform of the acquired cta images specifically, seven features are extracted from the gabor coefficients : energy, and kapur, max, rényi, shannon, vajda, and yager entropies the features were then ordered based on the f-value and input to numerous classification methods to achieve the best classification accuracy with the least number of features moreover, two well-known feature reduction techniques were employed, and the features acquired were also ranked according to f-value and input to several classifiers the best classification results were obtained using all computed features without the employment of feature reduction, using a probabilistic neural network an accuracy, positive predictive value, sensitivity, and specificity of 8909%, 9170%, 9183% and 8370% was obtained, respectively based on these results, it is evident that the technique can be helpful in the automated classification of plaques present in cta images, and may become an important tool to reduce procedural costs and patient radiation dose this could also aid clinicians in plaque diagnostics",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
169598,"models to predict cardiovascular risk: comparison of cart, multilayer perceptron and logistic regression the estimate of a multivariate risk is now required in guidelines for cardiovascular prevention limitations of existing statistical risk models lead to explore machine-learning methods this study evaluates the implementation and performance of a decision tree cart and a multilayer perceptron mlp to predict cardiovascular risk from real data the study population was randomly splitted in a learning set n = 10,296 and a test set n = 5,148 cart and the mlp were implemented at their best performance on the learning set and applied on the test set and compared to a logistic model implementation, explicative and discriminative performance criteria are considered, based on roc analysis areas under roc curves and their 95% confidence interval are 078 075-081, 078 075-080 and 076 073-079 respectively for logistic regression, mlp and cart given their implementation and explicative characteristics, these methods can complement existing statistical models and contribute to the interpretation of risk",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
43066,"using intravascular ultrasound image-based fluid-structure interaction models and machine learning methods to predict human coronary plaque vulnerability change plaque vulnerability prediction is of great importance in cardiovascular research in vivo follow-up intravascular ultrasound ivus coronary plaque data were acquired from nine patients to construct fluid-structure interaction models to obtain plaque biomechanical conditions morphological plaque vulnerability index mpvi was defined to measure plaque vulnerability the generalized linear mixed regression model glmm, support vector machine svm and random forest rf were introduced to predict mpvi change δmpvi = mpvi<sub>follow-up</sub>‒mpvi<sub>baseline</sub> using ten risk factors at baseline the combination of mean wall thickness, lumen area, plaque area, critical plaque wall stress, and mpvi was the best predictor using rf with the highest prediction accuracy 9147%, compared to 9078% from svm, and 8556% from glmm machine learning method rf improved the prediction accuracy by 591% over that from glmm mpvi was the best single risk factor using both glmm 8209% and rf 7853% while plaque area was the best using svm 8129%",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
20746,"machine learning for patient risk stratification: standing on, or looking over, the shoulders of clinicians machine learning can help clinicians to make individualized patient predictions only if researchers demonstrate models that contribute novel insights, rather than learning the most likely next step in a set of actions a clinician will take we trained deep learning models using only clinician-initiated, administrative data for 429 million admissions using three subsets of data: demographic data only, demographic data and information available at admission, and the previous data plus charges recorded during the first day of admission models trained on charges during the first day of admission achieve performance close to published full emr-based benchmarks for inpatient outcomes: inhospital mortality 089 auc, prolonged length of stay 082 auc, and 30-day readmission rate 071 auc similar performance between models trained with only clinician-initiated data and those trained with full emr data purporting to include information about patient state and physiology should raise concern in the deployment of these models furthermore, these models exhibited significant declines in performance when evaluated over only myocardial infarction mi patients relative to models trained over mi patients alone, highlighting the importance of physician diagnosis in the prognostic performance of these models these results provide a benchmark for predictive accuracy trained only on prior clinical actions and indicate that models with similar performance may derive their signal by looking over clinicians shoulders-using clinical behavior as the expression of preexisting intuition and suspicion to generate a prediction for models to guide clinicians in individual decisions, performance exceeding these benchmarks is necessary",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
91858,"risk prediction model for in-hospital mortality in women with st-elevation myocardial infarction: a machine learning approach studies had shown that mortality due to st-elevation myocardial infarction stemi is higher in women compared with men the purpose of this study is to develop and validate prediction models for all-cause in-hospital mortality in women admitted with stemi using logistic regression and random forest, and to compare the performance and validity of the different models",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1


In [100]:
#### HEART FAILURE or VENTRICULAR FUNCTION / hf

## text
text = ['heart failure', 'cardiac failure', 'ejection fraction', 'ventricular dysfunction', 'cardiac dysfunction']

spec['hf_text'] = np.where(groups['text'].str.contains(' lvf '), "1", "0")

for x in text:
    spec['hf_text'] = np.where(groups['text'].str.contains(x), "1", spec['hf_text']) #if yes then 1, if no, keep current
    
spec['hf_text'] = np.where((groups['text'].str.contains("left ventric")) &
                             (groups['text'].str.contains("function")) , "1", spec['hf_text'])
spec['hf_text'] = np.where((groups['text'].str.contains("right ventric")) &
                             (groups['text'].str.contains("function")) , "1", spec['hf_text'])
spec['hf_text'] = np.where((groups['text'].str.contains("left ventric")) &
                             (groups['text'].str.contains("failure")) , "1", spec['hf_text'])
spec['hf_text'] = np.where((groups['text'].str.contains("right ventric")) &
                             (groups['text'].str.contains("failure")) , "1", spec['hf_text'])
spec['hf_text'] = np.where((groups['text'].str.contains("diastolic")) &
                             (groups['text'].str.contains("failure")) , "1", spec['hf_text'])
spec['hf_text'] = np.where((groups['text'].str.contains("diastolic")) &
                             (groups['text'].str.contains("function")) , "1", spec['hf_text'])
spec['hf_text'] = np.where((groups['text'].str.contains("systolic")) &
                             (groups['text'].str.contains("function")) , "1", spec['hf_text'])
spec['hf_text'] = np.where((groups['text'].str.contains("systolic")) &
                             (groups['text'].str.contains("failure")) , "1", spec['hf_text'])
##output
print('text counts:')
print(Counter(spec['hf_text']))

text counts:
Counter({'0': 33731, '1': 448})


In [101]:
spec[spec['hf_text']=='1'].sample(20)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,urology_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text
138429,"a neuro-fuzzy decision support system for the diagnosis of heart failure a neuro-fuzzy decision support system is proposed for the diagnosis of heart failure the system comprises; knowledge base database, neural networks and fuzzy logic of both the quantitative and qualitative knowledge of the diagnosis of heart failure, neuro-fuzzy inference engine and decision support engine the neural networks employ a multi-layers perception back propagation learning process while the fuzzy logic uses the root sum square inference procedure the neuro-fuzzy inference engine uses a weighted average of the premise and consequent parameters with the fuzzy rules serving as the nodes and the fuzzy sets representing the weights of the nodes the decision support engine carries out the cognitive and emotional filtering of the objective and subjective feelings of the medical practitioner an experimental study of the decision support system was carried out using cases of some patients from three hospitals in nigeria with the assistance of their medical personnel who collected patientsdata over a period of six months the results of the study show that the neuro-fuzzy system provides a highly reliable diagnosis, while the emotional and cognitive filters further refine the diagnosis results by taking care of the contextual elements of medical diagnosis",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1
36341,"baseline and dynamic risk predictors of appropriate implantable cardioverter defibrillator therapy background current approaches fail to separate patients at high versus low risk for ventricular arrhythmias owing to overreliance on a snapshot left ventricular ejection fraction measure we used statistical machine learning to identify important cardiac imaging and time-varying risk predictors methods and results three hundred eighty-two cardiomyopathy patients left ventricular ejection fraction ≤35% underwent cardiac magnetic resonance before primary prevention implantable cardioverter defibrillator insertion the primary end point was appropriate implantable cardioverter defibrillator discharge or sudden death patient characteristics; serum biomarkers of inflammation, neurohormonal status, and injury; and cardiac magnetic resonance-measured left ventricle and left atrial indices and myocardial scar burden were assessed at baseline time-varying covariates comprised interval heart failure hospitalizations and left ventricular ejection fractions a random forest statistical method for survival, longitudinal, and multivariable outcomes incorporating baseline and time-varying variables was compared with 1 seattle heart failure model scores and 2 random forest survival and cox regression models incorporating baseline characteristics with and without imaging variables age averaged 57±13 years with 28% women, 66% white, 51% ischemic, and follow-up time of 59±23 years the primary end point n=75 occurred at 33±24 years random forest statistical method for survival, longitudinal, and multivariable outcomes with baseline and time-varying predictors had the highest area under the receiver operating curve, median 088 95% ci, 075-096 top predictors comprised heart failure hospitalization, left ventricle scar, left ventricle and left atrial volumes, left atrial function, and interleukin-6 level; heart failure accounted for 67% of the variation explained by the prediction, imaging 27%, and interleukin-6 2% serial left ventricular ejection fraction was not a significant predictor conclusions hospitalization for heart failure and baseline cardiac metrics substantially improve ventricular arrhythmic risk prediction",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1
1670,"coronary ct fractional flow reserve before transcatheter aortic valve replacement: clinical outcomes background the role of ct angiography-derived fractional flow reserve ct-ffr in pre-transcatheter aortic valve replacement tavr assessment is uncertain purpose to evaluate the predictive value of on-site machine learning-based ct-ffr for adverse clinical outcomes in candidates for tavr materials and methods this observational retrospective study included patients with severe aortic stenosis referred to tavr after coronary ct angiography ccta between september 2014 and december 2019 clinical end points comprised major adverse cardiac events mace nonfatal myocardial infarction, unstable angina, cardiac death, or heart failure admission and all-cause mortality ct-ffr was obtained semiautomatically using an on-site machine learning algorithm the ability of ct-ffr abnormal if ≤075 to predict outcomes and improve the predictive value of the current noninvasive work-up was assessed survival analysis was performed, and the c-index was used to assess the performance of each predictive model to compare nested models, the likelihood ratio χ<sup>2</sup> test was performed results a total of 196 patients mean age ± standard deviation, 75 years ± 11; 110 women 56% were included; the median time of follow-up was 18 months mace occurred in 16% 31 of 196 patients and all-cause mortality in 19% 38 of 196 patients univariable analysis revealed ct-ffr was predictive of mace hazard ratio hr, 41; 95% ci: 16, 108; <i>p</i> = 01 but not all-cause mortality hr, 12; 95% ci: 06, 22; <i>p</i> = 63 ct-ffr was independently associated with mace hr, 40; 95% ci: 15, 105; <i>p</i> = 01 when adjusting for potential confounders adding ct-ffr as a predictor to models that include ccta and clinical data improved their predictive value for mace <i>p</i> = 002 but not all-cause mortality <i>p</i> = 67, and it showed good discriminative ability for mace c-index, 071 conclusion ct angiography-derived fractional flow reserve was associated with major adverse cardiac events in candidates for transcatheter aortic valve replacement and improved the predictive value of coronary ct angiography assessment © rsna, 2021 <i>online supplemental material is available for this article</i> see also the editorial by choe in this issue",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1
65797,"readmission prediction using deep learning on electronic health records unscheduled 30-day readmissions are a hallmark of congestive heart failure chf patients that pose significant health risks and escalate care cost in order to reduce readmissions and curb the cost of care, it is important to initiate targeted intervention programs for patients at risk of readmission this requires identifying high-risk patients at the time of discharge from hospital here, using real data from over 7500 chf patients hospitalized between 2012 and 2016 in sweden, we built and tested a deep learning framework to predict 30-day unscheduled readmission we present a cost-sensitive formulation of long short-term memory lstm neural network using expert features and contextual embedding of clinical concepts this study targets key elements of an electronic health record ehr driven prediction model in a single framework: using both expert and machine derived features, incorporating sequential patterns and addressing the class imbalance problem we evaluate the contribution of each element towards prediction performance roc-auc, f1-measure and cost-savings we show that the model with all key elements achieves higher discrimination ability auc: 077; f1: 051; cost: 22% of maximum possible savings outperforming the reduced models in at least two evaluation metrics additionally, we present a simple financial analysis to estimate annual savings if targeted interventions are offered to high risk patients",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
15921,"a model-agnostic approach for understanding heart failure risk factors understanding the risk factors for developing heart failure among patients with type 2 diabetes can contribute to preventing deterioration of quality of life for those persons electronic health records ehr provide an opportunity to use sophisticated machine learning models to understand and compare the effect of different risk factors for developing hf as the complexity of the model increases, however, the transparency of the model often decreases to interpret the results, we aimed to develop a model-agnostic approach to shed light on complex models and interpret the effect of features on developing heart failure using the healthfacts ehr database of the cerner ehr, we extracted the records of 723 patients with at least 6 yeas of follow up of type 2 diabetes, of whom 134 developed heart failure using age and comorbidities as features and heart failure as the outcome, we trained logistic regression, random forest, xgboost, neural network, and then applied our proposed approach to rank the effect of each factor on developing heart failure",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
20254,"blood pressure monitoring system using a two-channel ballistocardiogram and convolutional neural networks hypertension is a chronic disease that kills 76 million people worldwide annually a continuous blood pressure monitoring system is required to accurately diagnose hypertension here, a chair-shaped ballistocardiogram bcg-based blood pressure estimation system was developed with no sensors attached to users two experimental sessions were conducted with 30 subjects in the first session, two-channel bcg and blood pressure data were recorded for each subject in the second session, the two-channel bcg and blood pressure data were recorded after running on a treadmill and then resting on the newly developed system the empirical mode decomposition algorithm was used to remove noise in the two-channel bcg, and the instantaneous phase was calculated by applying a hilbert transform to the first intrinsic mode functions after training a convolutional neural network regression model that predicts the systolic and diastolic blood pressures sbp and dbp from the two-channel bcg phase, the results of the first session rest and second session recovery were compared the results confirmed that the proposed model accurately estimates the rapidly rising blood pressure in the recovery state results from the rest sessions satisfied the association for the advancement of medical instrumentation aami international standards the standard deviation of the sbp results in the recovery session exceeded 07",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
25997,"machine learning based congestive heart failure detection using feature importance ranking of multimodal features in this study, we ranked the multimodal features extracted from congestive heart failure chf and normal sinus rhythm nsr subjects we categorized the ranked features into 1 to 5 categories based on empirical receiver operating characteristics eroc values instead of using all multimodal features, we use high ranking features for detection of chf and normal subjects we employed powerful machine learning techniques such as decision tree dt, naïve bayes nb, svm gaussian, svm rbf and svm polynomial the performance was measured in terms of sensitivity, specificity, positive predictive value ppv, negative predictive value npv, accuracy, false positive rate fpr, and area under the receiver operating characteristic curve auc the highest detection performance in terms of accuracy and auc was obtained with all multimodal features using svm gaussian with sensitivity 9306%, specificity 8182%, accuracy 8879% and auc 095 using the top five ranked features, the highest performance was obtained with svm gaussian yields accuracy 8448%, auc 086; top nine ranked features using decision tree and naïve bayes got accuracy 8448%, auc 088; last thirteen ranked features using svm polynomial obtained accuracy 8017%, auc 084 the findings indicate that proposed approach with feature ranking can be very useful for automatic detection of congestive heart failure patients and can be very helpful for further decision making by the clinicians and physicians in order to decrease the mortality rate",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
80151,"an interactive assistant for patients with cardiac implantable electronic devices: a study protocol of the lucy trial patients with chronic heart failure chf and reduced left ventricle ejection fraction benefit from cardiac resynchronization therapy crt and implantable cardioverter defibrillator icd however, increasing numbers of patient with crt and icd devices produce overload of cardiology centers where patients are admitted to ambulatory visits this study aims to find multivariate model predicting the requirement for ambulatory follow-up of cardiac implantable electronic devices ciedsthe lucy study is an observational, cohort, prospective, 2-stage trial as equal number of patients 300 will be included in the first and the second part of the study, finally, 600 patients will be included in the study the inclusion criteria will be: age between 18 and 90 years, chf new york heart association classes i-iii and implanted icd or crt at least 30 days before study inclusion the exclusion criteria will be dementia and other conditions impeding cooperation during the study all patients included in the study will undergo standard ambulatory visit primary endpoint will be defined as any ambulatory visit qualified as necessary due to patients condition or device malfunction diagnose by the cardiologist: any change in pharmacotherapy related to patients clinical status assessed during the visit, any change in tachyarrythmia counter or discriminator status, any change in tachyarrythmia threshold, presence of ventricular undersensing or oversensing, presence of atrial or ventricular ineffective pacing, or devices pocket infection secondary endpoint will be defined as any ambulatory visit qualified as necessary due to the alarm identified via medtronic carelink express mcle: sustained or treated ventricular tachyarrythmia, any not previously diagnosed supraventricular tachyarrythmia, or elective replacement indicatorour study is the first attempt of implementation of the machine learning and elements artificial intelligence in health care optimization of patients with cied the lucy will be an open product, available for additional testing and improvement with supplementary functionalities: quality of life assessment, teleconsultation, video-streaming, automated imagine recognizing",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1
64369,"combining structured and unstructured data for predicting risk of readmission for heart failure patients researchers have studied many models for predicting the risk of readmission for heart failure over the last decade most models have used a parametric statistical approach while a few have ventured into using machine learning methods such as statistical natural language processing we created three predictive models by combining these two techniques for the cohort of 1,629 patients from six hosptials using structured data along with their 136,963 clinical notes till their index admission, stored in the emr system over five years the aucs for structured and combined models were very close 06494 and 06447 and that for the unstructured model was 05219 the clinical impact of the models using decision curve analysis showed that, at a threshold predicted probability of 020, the combined model offered 15%, 30%, and 70% net benefit over its individual counterparts, treat-all, and treat-none strategy respectively",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
44614,"biventricular imaging markers to predict outcomes in non-compaction cardiomyopathy: a machine learning study left ventricular non-compaction cardiomyopathy lvnc is a genetic heart disease, with heart failure, arrhythmias, and embolic events as main clinical manifestations the goal of this study was to analyse a large set of echocardiographic echo and cardiac magnetic resonance imaging cmri parameters using machine learning ml techniques to find imaging predictors of clinical outcomes in a long-term follow-up of lvnc patients",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1


In [102]:
#### ARRHYTHMIA / arrhyt

## text
text = ['sinus node', 'sinoatrial', 'atrial tachy', 'atrial flutter', 'accessory pathway', 'long qt', 'holter',
        'pacemaker', 'ventricular tachy', 'atrial fibrill', 'ventricular fibrill', 'supraventricular tachy',
        'cardiover', 'defibrillat', 'heart block', 'degree block', 'av block', 'ventricular block', ' p-wave', ' p wave', 'pr interval',
       'p-r interval', 'pr-interval', 'corrected qt', ' qtc ', ' qrs complex ', 'brugada', 'short qt', 'qt syndrome', 'long qt']

spec['arrhyt_text'] = np.where(groups['text'].str.contains('arrhythmi'), "1", "0")

for x in text:
    spec['arrhyt_text'] = np.where(groups['text'].str.contains(x), "1", spec['arrhyt_text']) #if yes then 1, if no, keep current

spec['arrhyt_text'] = np.where((groups['text'].str.contains("heart")) &
                             (groups['text'].str.contains("ablation")) , "1", spec['arrhyt_text'])
spec['arrhyt_text'] = np.where((groups['text'].str.contains("cardiac")) &
                             (groups['text'].str.contains("ablation")) , "1", spec['arrhyt_text'])
spec['arrhyt_text'] = np.where((groups['text'].str.contains("heart")) &
                             (groups['text'].str.contains("bradycardia")) , "1", spec['arrhyt_text'])
spec['arrhyt_text'] = np.where((groups['text'].str.contains("cardiac")) &
                             (groups['text'].str.contains("bradycardia")) , "1", spec['arrhyt_text'])
spec['arrhyt_text'] = np.where((groups['text'].str.contains("heart")) &
                             (groups['text'].str.contains("electrophys")) , "1", spec['arrhyt_text'])
spec['arrhyt_text'] = np.where((groups['text'].str.contains("cardiac")) &
                             (groups['text'].str.contains("electrophys")) , "1", spec['arrhyt_text'])
spec['arrhyt_text'] = np.where((groups['text'].str.contains("heart")) &
                             (groups['text'].str.contains("rhythm")) , "1", spec['arrhyt_text'])
spec['arrhyt_text'] = np.where((groups['text'].str.contains("cardiac")) &
                             (groups['text'].str.contains("rhythm")) , "1", spec['arrhyt_text'])


## outputs
print('text counts:')
print(Counter(spec['arrhyt_text']))

text counts:
Counter({'0': 33397, '1': 782})


In [103]:
spec[spec['arrhyt_text']=='1'].sample(20)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,urology_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text
9533,"automated localization of focal ventricular tachycardia from simulated implanted device electrograms: a combined physics-ai approach <b>background:</b> focal ventricular tachycardia vt is a life-threating arrhythmia, responsible for high morbidity rates and sudden cardiac death scd radiofrequency ablation is the only curative therapy against incessant vt; however, its success is dependent on accurate localization of its source, which is highly invasive and time-consuming <b>objective:</b> the goal of our study is, as a proof of concept, to demonstrate the possibility of utilizing electrogram egm recordings from cardiac implantable electronic devices cieds to achieve this, we utilize fast and accurate whole torso electrophysiological ep simulations in conjunction with convolutional neural networks cnns to automate the localization of focal vts using simulated egms <b>materials and methods:</b> a highly detailed 3d torso model was used to simulate ∼4000 focal vts, evenly distributed across the left ventricle lv, utilizing a rapid reaction-eikonal environment solutions were subsequently combined with lead field computations on the torso to derive accurate electrocardiograms ecgs and egm traces, which were used as inputs to cnns to localize focal sources we compared the localization performance of a previously developed cnn architecture cartesian probability-based with our novel cnn algorithm utilizing universal ventricular coordinates uvcs <b>results:</b> implanted device egms successfully localized vt sources with localization error 874 mm comparable to ecg-based localization 669 mm our novel uvc cnn architecture outperformed the existing cartesian probability-based algorithm errors = 406 mm and 807 mm for ecgs and egms, respectively overall, localization was relatively insensitive to noise and changes in body compositions; however, displacements in ecg electrodes and cied leads caused performance to decrease errors 16-25 mm <b>conclusion:</b> egm recordings from implanted devices may be used to successfully, and robustly, localize focal vt sources, and aid ablation planning",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
137756,"field programmable gate array based fuzzy neural signal processing system for differential diagnosis of qrs complex tachycardia and tachyarrhythmia in noisy ecg signals the paper reports of a field programmable gate array fpga based embedded system for detection of qrs complex in a noisy electrocardiogram ecg signal and thereafter differential diagnosis of tachycardia and tachyarrhythmia the qrs complex has been detected after application of entropy measure of fuzziness to build a detection function of ecg signal, which has been previously filtered to remove power line interference and base line wander using the detected qrs complexes, differential diagnosis of tachycardia and tachyarrhythmia has been performed the entire algorithm has been realized in hardware on an fpga using the standard cse ecg database, the algorithm performed highly effectively the performance of the algorithm in respect of qrs detection with sensitivity se of 9974% and accuracy of 995% is achieved when tested using single channel ecg with entropy criteria the performance of the qrs detection system has been compared and found to be better than most of the qrs detection systems available in literature using the system, 200 patients have been diagnosed with an accuracy of 985%",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
111202,"symmetrical compression distance for arrhythmia discrimination in cloud-based big-data services the current development of cloud computing is completely changing the paradigm of data knowledge extraction in huge databases an example of this technology in the cardiac arrhythmia field is the scoop platform, a national-level scientific cloud-based big data service for implantable cardioverter defibrillators in this scenario, we here propose a new methodology for automatic classification of intracardiac electrograms egms in a cloud computing system, designed for minimal signal preprocessing a new compression-based similarity measure csm is created for low computational burden, so-called weighted fast compression distance, which provides better performance when compared with other csms in the literature using simple machine learning techniques, a set of 6848 egms extracted from scoop platform were classified into seven cardiac arrhythmia classes and one noise class, reaching near to 90% accuracy when previous patient arrhythmia information was available and 63% otherwise, hence overcoming in all cases the classification provided by the majority class results show that this methodology can be used as a high-quality service of cloud computing, providing support to physicians for improving the knowledge on patient diagnosis",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
113277,"variational bayesian electrophysiological imaging of myocardial infarction the presence, size, and distribution of ischemic tissue bear significant prognostic and therapeutic implication for ventricular arrhythmias while many approaches to 3d infarct detection have been developed via electrophysiological ep imaging from noninvasive electrocardiographic data, this ill-posed inverse problem remains challenging especially for septal infarcts that are hidden from body-surface data we propose a variational bayesian framework for ep imaging of 3d infarct using a total-variation prior the posterior distribution of intramural action potential and all regularization parameters are estimated from body-surface data by minimizing the kullback-leibler divergence because of the uncertainty introduced in prior models, we hypothesize that the solution uncertainty plays as important a role as the point estimate in interpreting the reconstruction this is verified in a set of phantom and real-data experiments, where regions of low confidence help to eliminate false-positives and to accurately identify infarcts of various locations including septum and distributions owing to the ability of total-variation prior in extracting the boundary between smooth regions, the presented method also has the potential to outline infarct border that is the most critical region responsible for ventricular arrhvthmias",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1
138791,"determination of a new vlf band in hrv for ventricular tachyarrhythmia patients this study presents a new very low frequency vlf band range in ventricular tachyarrhythmia patients and involves an approach for estimation of effect of vlf band on ventricular tachyarrhythmia patients a model based on wavelet packets wp and multilayer perceptron neural network mlpnn is used for determination of effective vlf band in heart rate variability hrv signals hrv is decomposed into sub-bands including very low frequency parts and variations of energy are analyzed domination test is done using mlpnn and dominant band is determined as a result, a new vlf band was described in 00039063-003125 hz frequency range this method can be used for other bands or other arrhythmia patients especially, estimation of dominant band energy using this method can be helped to diagnose for applications where have important effect of characteristic band",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
4339,"multi-task deep learning for cardiac rhythm detection in wearable devices wearable devices enable theoretically continuous, longitudinal monitoring of physiological measurements such as step count, energy expenditure, and heart rate although the classification of abnormal cardiac rhythms such as atrial fibrillation from wearable devices has great potential, commercial algorithms remain proprietary and tend to focus on heart rate variability derived from green spectrum led sensors placed on the wrist, where noise remains an unsolved problem here we develop deepbeat, a multitask deep learning method to jointly assess signal quality and arrhythmia event detection in wearable photoplethysmography devices for real-time detection of atrial fibrillation the model is trained on approximately one million simulated unlabeled physiological signals and fine-tuned on a curated dataset of over 500 k labeled signals from over 100 individuals from 3 different wearable devices we demonstrate that, in comparison with a single-task model, our architecture using unsupervised transfer learning through convolutional denoising autoencoders dramatically improves the performance of atrial fibrillation detection from a f1 score of 054 to 096 we also include in our evaluation a prospectively derived replication cohort of ambulatory participants where the algorithm performed with high sensitivity 098, specificity 099, and f1 score 093 we show that two-stage training can help address the unbalanced data problem common to biomedical applications, where large-scale well-annotated datasets are hard to generate due to the expense of manual annotation, data acquisition, and participant privacy",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
58171,"new artificial intelligence prediction model using serial prothrombin time international normalized ratio measurements in atrial fibrillation patients on vitamin k antagonists: garfield-af most clinical risk stratification models are based on measurement at a single time-point rather than serial measurements artificial intelligence ai is able to predict one-dimensional outcomes from multi-dimensional datasets using data from global anticoagulant registry in the field garfield-af registry, a new ai model was developed for predicting clinical outcomes in atrial fibrillation af patients up to 1 year based on sequential measures of prothrombin time international normalized ratio pt-inr within 30 days of enrolment",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
58654,"a 1334 μw event-driven patient-specific ann cardiac arrhythmia classifier for wearable ecg sensors artificial neural network ann and its variants are favored algorithm in designing cardiac arrhythmia classifier cac for its high accuracy however, the implementation of ultralow power ann-cac is challenging due to the intensive computations moreover, the imbalanced mit-bih database limits the ann-cac performance several novel techniques are proposed to address the challenges in the low power implementation firstly, continuous-in-time discrete-in-amplitude ctda signal flow is adopted to reduce the multiplication operations secondly, conditional grouping scheme cgs in combination with biased training bt is proposed to handle the imbalanced training samples for better training convergency and evaluation accuracy thirdly, arithmetic unit sharing with customized high-performance multiplier improves the power efficiency verified in fpga and synthesized in 018 μm cmos process, the proposed ctda ann-cac can classify an arrhythmia within 252 μs at 25 mhz clock frequency with average power of 1334 μw for 75bpm heart rate evaluated on mit-bih database, it shows over 98% classification accuracy, 97% sensitivity, and 94% positive predictivity",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
60082,"evaluation of risk prediction models of atrial fibrillation from the multi-ethnic study of atherosclerosis mesa atrial fibrillation af is prevalent and strongly associated with higher cardiovascular disease cvd risk machine learning is increasingly used to identify novel predictors of cvd risk, but prediction improvements beyond established risk scores are uncertain we evaluated improvements in predicting 5-year af risk when adding novel candidate variables identified by machine learning to the charge-af enriched score, which includes age, race/ethnicity, height, weight, systolic and diastolic blood pressure, current smoking, use of antihypertensive medication, diabetes, and nt-probnp we included 3,534 participants mean age, 613 years; 520% female with complete data from the prospective multi-ethnic study of atherosclerosis incident af was defined based on study electrocardiograms and hospital discharge diagnosis icd-9 codes, supplemented by medicare claims prediction performance was evaluated using cox regression and a parsimonious model was selected using lasso within 5 years of baseline, 124 participants had incident af compared with the charge-af enriched model c-statistic, 0804, variables identified by machine learning, including biomarkers, cardiac magnetic resonance imaging variables, electrocardiogram variables, and subclinical cvd variables, did not significantly improve prediction a 23-item score derived by machine learning achieved a c-statistic of 0806, whereas a parsimonious model including the clinical risk factors age, weight, current smoking, nt-probnp, coronary artery calcium score, and cardiac troponin-t achieved a c-statistic of 0802 this analysis confirms that the charge-af enriched model and a parsimonious 6-item model performed similarly to a more extensive model derived by machine learning in conclusion, these simple models remain the gold standard for risk prediction of af, although addition of the coronary artery calcium score should be considered",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1
71650,"a supervised approach to robust photoplethysmography quality assessment early detection of atrial fibrillation afib is crucial to prevent stroke recurrence new tools for monitoring cardiac rhythm are important for risk stratification and stroke prevention as many of new approaches to long-term afib detection are now based on photoplethysmogram ppg recordings from wearable devices, ensuring high ppg signal-to-noise ratios is a fundamental requirement for a robust detection of afib episodes traditionally, signal quality assessment is often based on the evaluation of similarity between pulses to derive signal quality indices there are limitations to using this approach for accurate assessment of ppg quality in the presence of arrhythmia, as in the case of afib, mainly due to substantial changes in pulse morphology in this paper, we first tested the performance of algorithms selected from a body of studies on ppg quality assessment using a dataset of ppg recordings from patients with afib we then propose machine learning approaches for ppg quality assessment in 30-s segments of ppg recording from 13 stroke patients admitted to the university of california san francisco ucsf neuro intensive care unit and another dataset of 3764 patients from one of the five ucsf general intensive care units we used data acquired from two systems, fingertip ppg fppg from a bedside monitor system, and radial ppg rppg measured using a wearable commercial wristband we compared various supervised machine learning techniques including k-nearest neighbors, decisions trees, and a two-class support vector machine svm svm provided the best performance fppg signals were used to build the model and achieved 09477 accuracy when tested on the data from the fppg exclusive to the test set, and 09589 accuracy when tested on the rppg data",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1


In [104]:
## ENDOCRINE [C19] - no dm / endo

## text
text = ['acromegaly', 'adrenal', 'addisons', 'conns syn', 'cushings synd', 'cushings disease', 'thyroid', 'graves disease',
       'hashimoto', 'polycystic ovary', 'prolactin', 'pituitar', 'androgen', 'testosterone', 'gonadism', 'gonadal']

spec['endo_text'] = np.where(groups['text'].str.contains('endocrin'), "1", "0")

for x in text:
    spec['endo_text'] = np.where(groups['text'].str.contains(x), "1", spec['endo_text']) #if yes then 1, if no, keep current


## outputs
print('text counts:')
print(Counter(spec['endo_text']))

text counts:
Counter({'0': 33695, '1': 484})


In [105]:
#### DIABETES - all / dm

## text
text = ['diabet', 'mellitus', 'hypoglycemia', 'hypoglycaemi', 'hyperglycemi', 'hyperglycaemi', 'insulin', 'glucagon',
        'islet cell'
       ]

spec['dm_text'] = np.where(groups['text'].str.contains('diabetes'), "1", "0")

for x in text:
    spec['dm_text'] = np.where(groups['text'].str.contains(x), "1", spec['dm_text']) #if yes then 1, if no, keep current

spec['dm_text'] = np.where(groups['text'].str.contains("insipidus"), "0", spec['dm_text'])
spec['dm_text'] = np.where(groups['text'].str.contains('growth factor'), "0", spec['dm_text'])
spec['dm_text'] = np.where(groups['text'].str.contains(' igf'), "1", spec['dm_text'])

## output
print('text counts:')
print(Counter(spec['dm_text']))

text counts:
Counter({'0': 32897, '1': 1282})


In [106]:
#### DIABETES - insulin / insulin

spec['insulin_text'] = np.where(groups['text'].str.contains('insulin'), "1", "0")


spec['insulin_text'] = np.where(groups['text'].str.contains('growth factor'), "0", spec['insulin_text'])
spec['insulin_text'] = np.where(groups['text'].str.contains(' igf'), "0", spec['insulin_text'])

## output
print('text counts:')
print(Counter(spec['insulin_text']))

text counts:
Counter({'0': 34015, '1': 164})


In [107]:
spec[spec['insulin_text']=='1'].sample(20)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,urology_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text,endo_text,dm_text,insulin_text
77216,"predicting and understanding the response to short-term intensive insulin therapy in people with early type 2 diabetes short-term intensive insulin therapy iit early in the course of type 2 diabetes acutely improves beta-cell function with long-lasting effects on glycemic control however, conventional measures cannot determine which patients are better suited for iit, and little is known about the molecular mechanisms determining response therefore, this study aimed to develop a model that could accurately predict the response to iit and provide insight into molecular mechanisms driving such response in humans",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
7760,"hippocampal volume reduction is associated with direct measure of insulin resistance in adults hippocampal integrity is highly susceptible to metabolic dysfunction, yet its mechanisms are not well defined we studied 126 healthy individuals aged 23-61 years insulin resistance ir was quantified by measuring steady-state plasma glucose sspg concentration during the insulin suppression test body mass index bmi, adiposity, fasting insulin, glucose, leptin as well as structural neuroimaing with automatic hippocampal subfield segmentation were performed data analysis using unsupervised machine learning k-means clustering identified two subgroups reflecting a pattern of more pronounced hippocampal volume reduction being concurrently associated with greater adiposity and insulin resistance; the hippocampal volume reductions were uniform across subfields individuals in the most deviant subgroup were predominantly women 79 versus 42 % with higher bmi 279 25 versus 305 46 kg/m<sup>2</sup>, ir sspg concentration, 156 61 versus 123 70 mg/dl and leptinemia 217 170 versus 445 304 μg/l the use of person-based modeling in healthy individuals suggests that adiposity, insulin resistance and compromised structural hippocampal integrity behave as a composite phenotype; female sex emerged as risk factor for this phenotype",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1
32600,"a novel cgm metric-gradient and combining mean sensor glucose enable to improve the prediction of nocturnal hypoglycemic events in patients with diabetes nocturnal hypoglycemia is a serious complication of insulin-treated diabetes, and it is often asymptomatic a novel cgm metric-gradient was proposed in this paper, and a method of combining mean sensor glucose msg and gradient was presented for the prediction of nocturnal hypoglycemia for this purpose, the data from continuous glucose monitoring cgm encompassing 1,921 patients with diabetes were analyzed, and a total of 302 nocturnal hypoglycemic events were recorded the msg and gradient values were calculated, respectively, and then combined as a new metric <i>ie</i>, msg+gradient in addition, the prediction was conducted by four algorithms, namely, logistic regression, support vector machine, random forest, and long short-term memory the results revealed that the gradient of cgm showed a downward trend before hypoglycemic events happened additionally, the results indicated that the specificity and sensitivity based on the proposed method were better than the conventional metrics of low blood glucose index lbgi, coefficient of variation cv, mean absolute glucose mag, lability index li, <i>etc</i>, and the complex metrics of msg+lbgi, msg+cv, msg+mag, and msg+li, <i>etc</i> specifically, the specificity and sensitivity were greater than 9607% and 9603% at the prediction horizon of 15 minutes and greater than 8779% and 9007% at the prediction horizon of 30 minutes when the proposed method was adopted to predict nocturnal hypoglycemic events in the aforementioned four algorithms therefore, the proposed method of combining msg and gradient may enable to improve the prediction of nocturnal hypoglycemic events future studies are warranted to confirm the validity of this metric",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
168634,"a simple prediction rule and a neural network model to predict pancreatic beta-cell reserve in young adults with diabetes mellitus in the present study we developed and assessed the performance of a simple prediction rule and a neural network model to predict beta-cell reserve in young adults with diabetes eighty three young adults with diabetes were included in the study all were less than 40 years old and without apparent secondary causes of diabetes the subjects were randomly allocated to 2 groups; group 1 n = 59 for developing a prediction rule and training a neural network, group 2 n = 24 for validation purpose the prediction rule was developed by using stepwise logistic regression using stepwise logistic regression and modification of the derived equation, the patient would be insulin deficient if 3waist circumference in cm + 4age at diagnosis < 340 in the absence of previous diabetic ketoacidosis dka or < 400 in the presence of previous dka when tested in the validation set, the prediction rule had positive and negative predictive values of 867 per cent and 778 per cent respectively with 833 per cent accuracy while the ann model had a positive predictive value of 882 per cent and a negative predictive value of 100 per cent with 917 per cent accuracy when testing the performance of the prediction rule and the ann model compared to the assessment of 23 internists in a subgroup of 9 diabetics whose age at onset was less than 30 years and without a history of dka, the ann had the highest ability to predict beta-cell reserve accuracy = 889, followed by the prediction rule accuracy = 778% and assessments by internists accuracy = 609% we concluded that beta-cell reserve in young adults with diabetes mellitus could be predicted by a simple prediction rule or a neural network model the prediction rule and the neural network model can be helpful clinically in patients with mixed clinical features of type 1 and type 2 diabetes",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
65522,"glunet: a deep learning framework for accurate glucose forecasting for people with type 1 diabetes t1d, forecasting of blood glucose bg can be used to effectively avoid hyperglycemia, hypoglycemia and associated complications the latest continuous glucose monitoring cgm technology allows people to observe glucose in real-time however, an accurate glucose forecast remains a challenge in this work, we introduce glunet, a framework that leverages on a personalized deep neural network to predict the probabilistic distribution of short-term 30-60 minutes future cgm measurements for subjects with t1d based on their historical data including glucose measurements, meal information, insulin doses, and other factors it adopts the latest deep learning techniques consisting of four components: data pre-processing, label transform/recover, multi-layers of dilated convolution neural network cnn, and post-processing the method is evaluated in-silico for both adult and adolescent subjects the results show significant improvements over existing methods in the literature through a comprehensive comparison in terms of root mean square error rmse formula: see text mg/dl with short time lag formula: see text minutes for prediction horizons ph = 30 mins minutes, and rmse formula: see text mg/dl with time lag formula: see text mins for ph = 60 mins for virtual adult subjects in addition, glunet is also tested on two clinical data sets results show that it achieves an rmse formula: see text mg/dl with time lag formula: see text mins for ph = 30 mins and an rmse formula: see text mg/dl with time lag formula: see text mins for ph = 60 mins these are the best reported results for glucose forecasting when compared with other methods including the neural network for predicting glucose nnpg, the support vector regression svr, the latent variable with exogenous input lvx, and the auto regression with exogenous input arx algorithm",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
150895,"neural network based glucose - insulin metabolism models for children with type 1 diabetes in this paper two models for the simulation of glucose-insulin metabolism of children with type 1 diabetes are presented the models are based on the combined use of compartmental models cms and artificial neural networks nns data from children with type 1 diabetes, stored in a database, have been used as input to the models the data are taken from four children with type 1 diabetes and contain information about glucose levels taken from continuous glucose monitoring system, insulin intake and food intake, along with corresponding time the influences of taken insulin on plasma insulin concentration, as well as the effect of food intake on glucose input into the blood from the gut, are estimated from the cms the outputs of cms, along with previous glucose measurements, are fed to a nn, which provides short-term prediction of glucose values for comparative reasons two different nn architectures have been tested: a feed-forward nn ffnn trained with the back-propagation algorithm with adaptive learning rate and momentum, and a recurrent nn rnn, trained with the real time recurrent learning rtrl algorithm the results indicate that the best prediction performance can be achieved by the use of rnn",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
68229,"prediction and prevention of hypoglycaemic events in type-1 diabetic patients using machine learning tight blood glucose control reduces the risk of microvascular and macrovascular complications in patients with type 1 diabetes however, this is very difficult due to the large intra-individual variability and other factors that affect glycaemic control the main limiting factor to achieve strict control of glucose levels in patients on intensive insulin therapy is the risk of severe hypoglycaemia therefore, hypoglycaemia is the main safety problem in the treatment of type 1 diabetes, negatively affecting the quality of life of patients suffering from this disease decision support tools based on machine learning methods have become a viable way to enhance patient safety by anticipating adverse glycaemic events this study proposes the application of four machine learning algorithms to tackle the problem of safety in diabetes management: 1 grammatical evolution for the mid-term continuous prediction of blood glucose levels, 2 support vector machines to predict hypoglycaemic events during postprandial periods, 3 artificial neural networks to predict hypoglycaemic episodes overnight, and 4 data mining to profile diabetes management scenarios the proposal consists of the combination of prediction and classification capabilities of the implemented approaches the resulting system significantly reduces the number of episodes of hypoglycaemia, improving safety and providing patients with greater confidence in decision-making",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
128784,natural occurrence of nocturnal hypoglycemia detection using hybrid particle swarm optimized fuzzy reasoning model low blood glucose hypoglycemia is a common and serious side effect of insulin therapy in patients with diabetes this paper will make a contribution to knowledge in the modeling and design of a non-invasive hypoglycemia monitor for patients with type 1 diabetes mellitus t1dm using a fuzzy-reasoning system,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
136417,"evolved fuzzy reasoning model for hypoglycaemic detection hypoglycaemia is a serious side effect of insulin therapy in patients with diabetes we measure physiological parameters heart rate, corrected qt interval of the electrocardiogram ecg signal continuously to provide early detection of hypoglycemic episodes in type 1 diabetes mellitus t1dm patients based on the physiological parameters, an evolved fuzzy reasoning model frm to recognize the presence of hypoglycaemic episodes is developed to optimize the fuzzy rules and the fuzzy membership functions of frm, an evolutionary algorithm called hybrid particle swarm optimization with wavelet mutation operation is investigated all data sets are collected from department of health, government of western australia for a clinical study the results show that the proposed algorithm performs well in terms of the clinical sensitivity and specificity",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1
66252,"classification of postprandial glycemic status with application to insulin dosing in type 1 diabetes-an in silico proof-of-concept in the daily management of type 1 diabetes t1d, determining the correct insulin dose to be injected at meal-time is fundamental to achieve optimal glycemic control wearable sensors, such as continuous glucose monitoring cgm devices, are instrumental to achieve this purpose in this paper, we show how cgm data, together with commonly recorded inputs carbohydrate intake and bolus insulin, can be used to develop an algorithm that allows classifying, at meal-time, the post-prandial glycemic status ie, blood glucose concentration being too low, too high, or within target range such an outcome can then be used to improve the efficacy of insulin therapy by reducing or increasing the corresponding meal bolus dose a state-of-the-art t1d simulation environment, including intraday variability and a behavioral model, was used to generate a rich in silico dataset corresponding to 100 subjects over a two-month scenario then, an extreme gradient-boosted tree xgb algorithm was employed to classify the post-prandial glycemic status finally, we demonstrate how the xgb algorithm outcome can be exploited to improve glycemic control in t1d through real-time adjustment of the meal insulin bolus the proposed xgb algorithm obtained good accuracy at classifying post-prandial glycemic status auroc = 084 078, 087 consequently, when used to adjust, in real-time, meal insulin boluses obtained with a bolus calculator, the proposed approach improves glycemic control when compared to the baseline bolus calculator in particular, percentage time in target 70, 180 mg/dl was improved from 6198 ± 1389 to 6700 ± 1154; p < 001 without increasing hypoglycemia",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1


In [108]:
#### DM RETINOPATHIES / retina

## text
spec['retina_text'] = np.where(groups['text'].str.contains('diabetic retin'), "1", "0")

spec['retina_text'] = np.where((groups['text'].str.contains("diabet")) &
                             (groups['text'].str.contains("retina")) , "1", spec['retina_text'])
spec['retina_text'] = np.where((groups['text'].str.contains("diabet")) &
                             (groups['text'].str.contains("retino")) , "1", spec['retina_text'])
spec['retina_text'] = np.where((groups['text'].str.contains("diabet")) &
                             (groups['text'].str.contains("eye")) , "1", spec['retina_text'])

print('text counts:')
print(Counter(spec['retina_text']))

text counts:
Counter({'0': 33794, '1': 385})


In [109]:
spec[spec['retina_text']=='1'].sample(20)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,urology_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text,endo_text,dm_text,insulin_text,retina_text
80853,"feature selection and parameters optimization of support vector machines based on hybrid glowworm swarm optimization for classification of diabetic retinopathy diabetic retinopathy dr has been a leading cause of blindness in case of human beings falling between the ages of 20 and 74 years this will have a major influence on both the patient and the society as it can normally influence the humans in their gainful years an early dr detection is quite challenging as it may not be detected by humans there are several techniques and algorithms that have been established for detecting the dr these techniques have been facing problems to achieve effective sensitivity, accuracy, and specificity in order to overcome all these problems, the work has proposed one more such effective algorithm for image processing in order to increase the efficiency and also identify easily the dr diseases a major challenge in the task is the automatic detection of the microaneurysms in this work, the support vector machine svm parameters optimized with glowworm swarm optimization gso and genetic algorithm ga is used to classify the dr because the svm parameter c and γ to control the performance of the classifier for this work, the svms get fused with the hybrid gso-ga along with the feature chromosomes that are generated that will thereby direct the ga search to a straight line of the error of optimal generalization in their super parameter space this gso algorithm will not have memory and the glow worms will not retain any information in memory the results of the experiment prove that this method had achieved a better performance",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
92430,"an automated system for the detection and classification of retinal changes due to red lesions in longitudinal fundus images people with diabetes mellitus need annual screening to check for the development of diabetic retinopathy dr tracking small retinal changes due to early diabetic retinopathy lesions in longitudinal fundus image sets is challenging due to intra- and intervisit variability in illumination and image quality, the required high registration accuracy, and the subtle appearance of retinal lesions compared to other retinal features this paper presents a robust and flexible approach for automated detection of longitudinal retinal changes due to small red lesions by exploiting normalized fundus images that significantly reduce illumination variations and improve the contrast of small retinal features to detect spatio-temporal retinal changes, the absolute difference between the extremes of the multiscale blobness responses of fundus images from two time points is proposed as a simple and effective blobness measure dr related changes are then identified based on several intensity and shape features by a support vector machine classifier the proposed approach was evaluated in the context of a regular diabetic retinopathy screening program involving subjects ranging from healthy no retinal lesion to moderate with clinically relevant retinal lesions dr levels evaluation shows that the system is able to detect retinal changes due to small red lesions with a sensitivity of at an average false positive rate of 1 and 25 lesions per eye on small and large fields-of-view of the retina, respectively",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
29064,"using artificial intelligence as an initial triage strategy in diabetic retinopathy screening program in china <b>objective:</b> to investigate the diagnostic accuracy and efficiency of an artificial intelligence ai triaging model in a diabetic retinopathy dr screening program <b>methods:</b> a dr screening program was conducted in kashi city and kizilsu kirghiz autonomous prefecture of the xinjiang uyur autonomous region from may to july 2018, and 8 005 patients with diabetes mellitus were included fundus images, one centered at optic disc and one centered at macula, were taken for both eyes a previously validated ai algorithm was applied as the first step to identify the patients with all 4 images if the images were classified as gradable and negative dr, an ai-generated report was immediately provided without sending to manual grading, and 1/3 of these patients were randomly sampled for manual grading and quality control group a for the patients with at least one image classified as ungradable or positive for any dr, all images were sent for manual grading group b finally, 300 patients were randomly selected from group a and group b respectively for accuracy assessment, where the patients and their images were classified by a specialist panel for referral dr pre-proliferative dr, or proliferative dr, and/or diabetic macular edema <b>results:</b> among 8 005 patients for dr screening including 3 220 males and 4 785 females, aged 583±106 years, after ai triaging, 5 267 658% potentially received reports from ai system and 2 738 342% required manual grading in group a, the accuracy and specificity of ai classification and manual grading on referral dr were all 100% in group b, the accuracy of ai and manual grading were 758% and 903%, respectively, while the sensitivity of ai and manual grading was 100% and 791%, respectively <b>conclusion:</b> ai alleviates 60% of the workload of manual grading without missing any referral patients with the aid of the current ai triaging model",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
34564,"hybrid model structure for diabetic retinopathy classification diabetic retinopathy dr is one of the most common complications of diabetes and the main cause of blindness the progression of the disease can be prevented by early diagnosis of dr due to differences in the distribution of medical conditions and low labor efficiency, the best time for diagnosis and treatment was missed, which results in impaired vision using neural network models to classify and diagnose dr can improve efficiency and reduce costs in this work, an improved loss function and three hybrid model structures hybrid-a, hybrid-f, and hybrid-c were proposed to improve the performance of dr classification models efficientnetb4, efficientnetb5, nasnetlarge, xception, and inceptionresnetv2 cnns were chosen as the basic models these basic models were trained using enhance cross-entropy loss and cross-entropy loss, respectively the output of the basic models was used to train the hybrid model structures experiments showed that enhance cross-entropy loss can effectively accelerate the training process of the basic models and improve the performance of the models under various evaluation metrics the proposed hybrid model structures can also improve dr classification performance compared with the best-performing results in the basic models, the accuracy of dr classification was improved from 8544% to 8634%, the sensitivity was improved from 9848% to 9877%, the specificity was improved from 7182% to 7476%, the precision was improved from 9027% to 9137%, and the f1 score was improved from 9362% to 939% by using hybrid model structures",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
90440,"deep convolutional neural network-based early automated detection of diabetic retinopathy using fundus image the automatic detection of diabetic retinopathy is of vital importance, as it is the main cause of irreversible vision loss in the working-age population in the developed world the early detection of diabetic retinopathy occurrence can be very helpful for clinical treatment; although several different feature extraction approaches have been proposed, the classification task for retinal images is still tedious even for those trained clinicians recently, deep convolutional neural networks have manifested superior performance in image classification compared to previous handcrafted feature-based image classification methods thus, in this paper, we explored the use of deep convolutional neural network methodology for the automatic classification of diabetic retinopathy using color fundus image, and obtained an accuracy of 945% on our dataset, outperforming the results obtained by using classical approaches",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
84398,"automated detection of diabetic retinopathy using deep learning diabetic retinopathy is a leading cause of blindness among working-age adults early detection of this condition is critical for good prognosis in this paper, we demonstrate the use of convolutional neural networks cnns on color fundus images for the recognition task of diabetic retinopathy staging our network models achieved test metric performance comparable to baseline literature results, with validation sensitivity of 95% we additionally explored multinomial classification models, and demonstrate that errors primarily occur in the misclassification of mild disease as normal due to the cnns inability to detect subtle disease features we discovered that preprocessing with contrast limited adaptive histogram equalization and ensuring dataset fidelity by expert verification of class labels improves recognition of subtle features transfer learning on pretrained googlenet and alexnet models from imagenet improved peak test set accuracies to 745%, 688%, and 572% on 2-ary, 3-ary, and 4-ary classification models, respectively",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
74763,"artificial intelligence for the detection of diabetic retinopathy in primary care: protocol for algorithm development diabetic retinopathy dr is one of the most important causes of blindness worldwide, especially in developed countries in diabetic patients, periodic examination of the back of the eye using a nonmydriatic camera has been widely demonstrated to be an effective system to control and prevent the onset of dr convolutional neural networks have been used to detect dr, achieving very high sensitivities and specificities",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
156702,"robust detection and classification of longitudinal changes in color retinal fundus images for monitoring diabetic retinopathy a fully automated approach is presented for robust detection and classification of changes in longitudinal time-series of color retinal fundus images of diabetic retinopathy the method is robust to: 1 spatial variations in illumination resulting from instrument limitations and changes both within, and between patient visits; 2 imaging artifacts such as dust particles; 3 outliers in the training data; 4 segmentation and alignment errors robustness to illumination variation is achieved by a novel iterative algorithm to estimate the reflectance of the retina exploiting automatically extracted segmentations of the retinal vasculature, optic disk, fovea, and pathologies robustness to dust artifacts is achieved by exploiting their spectral characteristics, enabling application to film-based, as well as digital imaging systems false changes from alignment errors are minimized by subpixel accuracy registration using a 12-parameter transformation that accounts for unknown retinal curvature and camera parameters bayesian detection and classification algorithms are used to generate a color-coded output that is readily inspected a multiobserver validation on 43 image pairs from 22 eyes involving nonproliferative and proliferative diabetic retinopathies, showed a 97% change detection rate, a 3% miss rate, and a 10% false alarm rate the performance in correctly classifying the changes was 993% a self-consistency metric, and an error factor were developed to measure performance over more than two periods the average self consistency was 94% and the error factor was 006% although this study focuses on diabetic changes, the proposed techniques have broader applicability in ophthalmology",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
37508,"dcardnet: diabetic retinopathy classification at multiple levels based on structural and angiographic optical coherence tomography optical coherence tomography oct and its angiography octa have several advantages for the early detection and diagnosis of diabetic retinopathy dr however, automated, complete dr classification frameworks based on both oct and octa data have not been proposed in this study, a convolutional neural network cnn based method is proposed to fulfill a dr classification framework using en face oct and octa",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
51248,"automated quantification of photoreceptor alteration in macular disease using optical coherence tomography and deep learning diabetic macular edema dme and retina vein occlusion rvo are macular diseases in which central photoreceptors are affected due to pathological accumulation of fluid optical coherence tomography allows to visually assess and evaluate photoreceptor integrity, whose alteration has been observed as an important biomarker of both diseases however, the manual quantification of this layered structure is challenging, tedious and time-consuming in this paper we introduce a deep learning approach for automatically segmenting and characterising photoreceptor alteration the photoreceptor layer is segmented using an ensemble of four different convolutional neural networks en-face representations of the layer thickness are produced to characterize the photoreceptors the pixel-wise standard deviation of the score maps produced by the individual models is also taken to indicate areas of photoreceptor abnormality or ambiguous results experimental results showed that our ensemble is able to produce results in pair with a human expert, outperforming each of its constitutive models no statistically significant differences were observed between mean thickness estimates obtained from automated and manually generated annotations therefore, our model is able to reliable quantify photoreceptors, which can be used to improve prognosis and managment of macular diseases",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1


In [110]:
## OPHTHALMOLOGY [C11] / eye

## text
text = ['ophth', 'retina', 'retino', 'retinitis', 'eye disease', 'uveitis', 'iritis', 'conjunctiv', 'cornea', 'blephar',
       'optic nerve', 'optic atrophy', 'optic disk', 'optic disc', 'optic neuropathy', 'choroid', 'blindness', 'macular',
       'strabismus', 'ocular', 'glaucoma', 'keratoconus']

spec['eye_text'] = np.where(groups['text'].str.contains('eye disease'), "1", "0")

for x in text:
    spec['eye_text'] = np.where(groups['text'].str.contains(x), "1", spec['eye_text']) #if yes then 1, if no, keep current

spec['eye_text'] = np.where((groups['text'].str.contains("eye")) &
                             (groups['text'].str.contains("optic")) , "1", spec['eye_text'])
spec['eye_text'] = np.where((groups['text'].str.contains("eye")) &
                             (groups['text'].str.contains("fundus")) , "1", spec['eye_text'])
spec['eye_text'] = np.where((groups['text'].str.contains("eye")) &
                             (groups['text'].str.contains("fundal")) , "1", spec['eye_text'])
    
## output
print('text counts:')
print(Counter(spec['eye_text']))

text counts:
Counter({'0': 32742, '1': 1437})


In [111]:
spec[spec['eye_text']=='1'].sample(5)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,urology_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text,endo_text,dm_text,insulin_text,retina_text,eye_text
55449,"medios- an offline, smartphone-based artificial intelligence algorithm for the diagnosis of diabetic retinopathy an observational study to assess the sensitivity and specificity of the medios smartphone-based offline deep learning artificial intelligence ai software to detect diabetic retinopathy dr compared with the image diagnosis of ophthalmologists",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
55944,"deep learning based sub-retinal fluid segmentation in central serous chorioretinopathy optical coherence tomography scans development of an automated sub-retinal fluid segmentation technique from optical coherence tomography oct scans is faced with challenges such as noise and motion artifacts present in oct images, variation in size, shape and location of fluid pockets within the retina the ability of a fully convolutional neural network to automatically learn significant low level features to differentiate subtle spatial variations makes it suitable for retinal fluid segmentation task hence, a fully convolutional neural network has been proposed in this work for the automatic segmentation of sub-retinal fluid in oct scans of central serous chorioretinopathy csc pathology the proposed method has been evaluated on a dataset of 15 oct volumes and an average dice rate, precision and recall of 091, 093 and 089 respectively has been achieved over the test set",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
87247,microaneurysm detection using fully convolutional neural networks diabetic retinopathy is a microvascular complication of diabetes that can lead to sight loss if treated not early enough microaneurysms are the earliest clinical signs of diabetic retinopathy this paper presents an automatic method for detecting microaneurysms in fundus photographies,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
159000,optical coherence tomography machine learning classifiers for glaucoma detection: a preliminary study machine-learning classifiers are trained computerized systems with the ability to detect the relationship between multiple input parameters and a diagnosis the present study investigated whether the use of machine-learning classifiers improves optical coherence tomography oct glaucoma detection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
159396,"a mixture of experts network structure for modelling doppler ultrasound blood flow signals mixture of experts me is a modular neural network architecture for supervised learning this paper illustrates the use of me network structure to guide modelling doppler ultrasound blood flow signals expectation-maximization em algorithm was used for training the me so that the learning process is decoupled in a manner that fits well with the modular structure the ophthalmic and internal carotid arterial doppler signals were decomposed into time-frequency representations using discrete wavelet transform and statistical features were calculated to depict their distribution the me network structures were implemented for diagnosis of ophthalmic and internal carotid arterial disorders using the statistical features as inputs to improve diagnostic accuracy, the outputs of expert networks were combined by a gating network simultaneously trained in order to stochastically select the expert that is performing the best at solving the problem the me network structure achieved accuracy rates which were higher than that of the stand-alone neural network models",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [112]:
## HAEMATOLOGIC [C15] / haem

## text
text = ['haematological cancer', 'hematological cancer', 'haematological malig', 'hematological malig', 'myelodysplas',
       'myeloprolif', 'lymphoprolif', 'leukaemia', 'leukemia', 'myelofibro', 'thrombocythemia', 'polycythemia vera',
       'polycythemia rubra vera', 'thrombocythaemia', 'polycythaemia vera', 'polycythaemia rubra vera', 'lymphoma',
       'myeloma', ' gvhd', 'stem cell transpl', 'bone marrow aspirate',
       'haematolog', 'anemia', 'anaemia', 'hemoglobin', 'haemoglobin', 'sickle cell', 'thalassemia', 'thalassaemia',
       'sickle crisis', 'clotting disorder', 'coagulation disorder', 'coagulopathy', 'hemophilia', 'haemophilia',
       'von willebrand', 'disseminated intrasvascular', 'thrombocytopeni', 'hemoly', 'haemoly', 'cryoglob', 'thrombim',
       'bone marrow', 'coagulation']

spec['haem_text'] = np.where(groups['text'].str.contains('hematolog'), "1", "0")

for x in text:
    spec['haem_text'] = np.where(groups['text'].str.contains(x), "1", spec['haem_text']) #if yes then 1, if no, keep current


## output
print('text counts:')
print(Counter(spec['haem_text']))

text counts:
Counter({'0': 33435, '1': 744})


In [113]:
## GYNAE/OBSTETRIC [C13] / obs

## text
text = ['obstetric', 'fetal', 'foetal', 'foetus', 'fetus', 'gestation', 'pregnan', 'endometriosis', 'ovarian', 'gynecolog', 'uterine', 'uterus'
       'cervix', 'pap smear', 'cervical cancer', 'cervical carcinoma', ' vagina ', 'vaginal', 'vaginosis', 'macrosomia', 'colposcop',
       'gynaecolog', 'menopaus', 'eclamp', ' iugr ', 'caesarean', 'endometrial']

spec['obs_text'] = np.where(groups['text'].str.contains('cesarean'), "1", "0")

for x in text:
    spec['obs_text'] = np.where(groups['text'].str.contains(x), "1", spec['obs_text']) #if yes then 1, if no, keep current


## output
print('text counts:')
print(Counter(spec['obs_text']))

text counts:
Counter({'0': 33097, '1': 1082})


In [114]:
## NEPHROLOGY [C12] / renal

## text
text = [' renal ', 'kidney', 'hemodialysis', 'haemodialysis', 'hemofilt', 'haemofilt', 'nephro', 'nephrit', 'glomerulus']

spec['renal_text'] = np.where(groups['text'].str.contains('renovasc'), "1", "0")

for x in text:
    spec['renal_text'] = np.where(groups['text'].str.contains(x), "1", spec['renal_text']) #if yes then 1, if no, keep current


## output
print('text counts:')
print(Counter(spec['renal_text']))

text counts:
Counter({'0': 33427, '1': 752})


In [115]:
spec[spec['renal_text']=='1'].sample(5)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,urology_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text,endo_text,dm_text,insulin_text,retina_text,eye_text,haem_text,obs_text,renal_text
46837,"predictive modeling of blood pressure during hemodialysis: a comparison of linear model, random forest, support vector regression, xgboost, lasso regression and ensemble method intradialytic hypotension idh is commonly occurred and links to higher mortality among patients undergoing hemodialysis hd its early prediction and prevention will dramatically improve the quality of life however, predicting the occurrence of idh clinically is not simple the aims of this study are to develop an intelligent system with capability of predicting blood pressure bp during hd, and to further compare different machine learning algorithms for next systolic bp sbp prediction",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
82297,"modeling the covariates effects on the hazard function by piecewise exponential artificial neural networks: an application to a controlled clinical trial on renal carcinoma in exploring the time course of a disease to support or generate biological hypotheses, the shape of the hazard function provides relevant information for long follow-ups the shape of hazard function may be complex, with the presence of multiple peaks in this paper we present the use of a neural network extension of the piecewise exponential model to study the shape of the hazard function in time in dependence of covariates the technique is applied to a dataset of 247 renal cell carcinoma patients from a randomized clinical trial",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
26362,"a phenotypic risk score for predicting mortality in sickle cell disease risk assessment for patients with sickle cell disease scd remains challenging as it depends on an individual physicians experience and ability to integrate a variety of test results we aimed to provide a new risk score that combines clinical, laboratory, and imaging data in a prospective cohort of 600 adult patients with scd, we assessed the relationship of 70 baseline covariates to all-cause mortality random survival forest and regularised cox regression machine learning ml methods were used to select top predictors multivariable models and a risk score were developed and internally validated over a median follow-up of 4·3 years, 131 deaths were recorded multivariable models were developed using nine independent predictors of mortality: tricuspid regurgitant velocity, estimated right atrial pressure, mitral e velocity, left ventricular septal thickness, body mass index, blood urea nitrogen, alkaline phosphatase, heart rate and age our prognostic risk score had superior performance with a bias-corrected c-statistic of 0·763 our model stratified patients into four groups with significantly different 4-year mortality rates 3%, 11%, 35% and 75% respectively using readily available variables from patients with scd, we applied ml techniques to develop and validate a mortality risk scoring method that reflects the summation of cardiopulmonary, renal and liver end-organ damage trial registration: clinicaltrialsgov identifier: nct#00011648",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1
101239,"electronic medical record-based predictive model for acute kidney injury in an acute care hospital patients with acute kidney injury aki are at risk for increased morbidity and mortality lack of specific treatment has meant that efforts have focused on early diagnosis and timely treatment advanced algorithms for clinical assistance including aki prediction models have potential to provide accurate risk estimates in this project, we aim to provide a clinical decision supporting system cdss based on a self-learning predictive model for aki in patients of an acute care hospital data of all in-patient episodes in adults admitted will be analysed using data mining techniques to build a prediction model the subsequent machine-learning process including two algorithms for data stream and concept drift will refine the predictive ability of the model simulation studies on the model will be used to quantify the expected impact of several scenarios of change in factors that influence aki incidence the proposed dynamic cdss will apply to future in-hospital aki surveillance in clinical practice",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
23028,"predicting the appearance of hypotension during hemodialysis sessions using machine learning classifiers a patient suffering from advanced chronic renal disease undergoes several dialysis sessions on different dates several clinical parameters are monitored during the different hours of any of these sessions these parameters, together with the information provided by other parameters of analytical nature, can be very useful to determine the probability that a patient may suffer from hypotension during the session, which should be specially watched since it represents a proven factor of possible mortality however, the analytical information is not always available to the healthcare personnel, or it is far in time, so the clinical parameters monitored during the session become key to the prevention of hypotension this article presents an investigation to predict the appearance of hypotension during a dialysis session, using predictive models trained from a large dialysis database, which contains the clinical information of 98,015 sessions corresponding to 758 patients the prediction model takes into account up to 22 clinical parameters measured five times during the session, as well as the gender and age of the patient this model was trained by means of machine learning classifiers, providing a success in the prediction higher than 80%",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [116]:
## ACUTE & CHRONIC KIDNEY DISEASE / ACKD

## text
spec['ackd_text'] = np.where(groups['text'].str.contains("acute kidney"), "1", "0")
spec['ackd_text'] = np.where(groups['text'].str.contains("acute renal"), "1", spec['ackd_text'])
spec['ackd_text'] = np.where(groups['text'].str.contains("kidney failure"), "1", spec['ackd_text'])
spec['ackd_text'] = np.where(groups['text'].str.contains("renal failure"), "1", spec['ackd_text'])
spec['ackd_text'] = np.where(groups['text'].str.contains("chronic kidney disease"), "1", spec['ackd_text'])
spec['ackd_text'] = np.where(groups['text'].str.contains("chronic renal disease"), "1", spec['ackd_text'])
spec['ackd_text'] = np.where(groups['text'].str.contains("stage kidney"), "1", spec['ackd_text'])
spec['ackd_text'] = np.where(groups['text'].str.contains("stage renal"), "1", spec['ackd_text'])

print('text counts:')
print(Counter(spec['ackd_text']))

text counts:
Counter({'0': 33917, '1': 262})


In [117]:
## PAEDIATRICS / paeds

## text
text = ['paedia', 'pedia', 'neonate', 'neonatal', 'teenage', 'youth', 'children', 'childhood', 'infant', 
       'newborn', 'baby', 'babies', 'toddler']

spec['paeds_text'] = np.where(groups['text'].str.contains(' child '), "1", "0")

for x in text:
    spec['paeds_text'] = np.where(groups['text'].str.contains(x), "1", spec['paeds_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['paeds_text']))

text counts:
Counter({'0': 32370, '1': 1809})


In [118]:
spec[spec['paeds_text']=='1'].sample(5)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,urology_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text,endo_text,dm_text,insulin_text,retina_text,eye_text,haem_text,obs_text,renal_text,ackd_text,paeds_text
70919,"leveraging human microbiome features to diagnose and stratify children with irritable bowel syndrome accurate diagnosis and stratification of children with irritable bowel syndrome ibs remain challenging given the central role of recurrent abdominal pain in ibs, we evaluated the relationships of pediatric ibs and abdominal pain with intestinal microbes and fecal metabolites using a comprehensive clinical characterization and multiomics strategy using rigorous clinical phenotyping, we identified preadolescent children aged 7 to 12 years with rome iii ibs n = 23 and healthy controls n = 22 and characterized their fecal microbial communities using whole-genome shotgun metagenomics and global unbiased fecal metabolomic profiling correlation-based approaches and machine learning algorithms identified associations between microbes, metabolites, and abdominal pain ibs cases differed from controls with respect to key bacterial taxa eg, flavonifractor plautii and lachnospiraceae bacterium 7_1_58faa, metagenomic functions eg, carbohydrate metabolism and amino acid metabolism, and higher-order metabolites eg, secondary bile acids, sterols, and steroid-like compounds significant associations between abdominal pain frequency and severity and intestinal microbial features were identified a random forest classifier built on metagenomic and metabolic markers successfully distinguished ibs cases from controls area under the curve, 093 leveraging multiple lines of evidence, intestinal microbes, genes/pathways, and metabolites were associated with ibs, and these features were capable of distinguishing children with ibs from healthy children these multi-omics features, and their links to childhood ibs coupled with nutritional interventions, may lead to new microbiome-guided diagnostic and therapeutic strategies",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
57127,"predicting the serum digoxin concentrations of infants in the neonatal intensive care unit through an artificial neural network given its narrow therapeutic range, digoxins pharmacokinetic parameters in infants are difficult to predict due to variation in birth weight and gestational age, especially for critically ill newborns there is limited evidence to support the safety and dosage requirements of digoxin, let alone to predict its concentrations in infants this study aimed to compare the concentrations of digoxin predicted by traditional regression modeling and artificial neural network ann modeling for newborn infants given digoxin for clinically significant patent ductus arteriosus pda",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
80810,"a machine learning approach to estimating preterm infants survival: development of the preterm infants survival assessment pisa predictor estimation of mortality risk of very preterm neonates is carried out in clinical and research settings we aimed at elaborating a prediction tool using machine learning methods we developed models on a cohort of 23747 neonates <30 weeks gestational age, or <1501 g birth weight, enrolled in the italian neonatal network in 2008-2014 development set, using 12 easily collected perinatal variables we used a cohort from 2015-2016 n = 5810 as a test set among several machine learning methods we chose artificial neural networks nn the resulting predictor was compared with logistic regression models in the test cohort, nn had a slightly better discrimination than logistic regression p < 0002 the differences were greater in subgroups of neonates at various gestational age or birth weight intervals, singletons using a cutoff of death probability of 05, logistic regression misclassified 67/5810 neonates 12 percent more than nn in conclusion our study - the largest published so far - shows that even in this very simplified scenario, using only limited information available up to 5 minutes after birth, a nn approach had a small but significant advantage over current approaches the software implementing the predictor is made freely available to the community",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
132500,"voxelwise multivariate statistics and brain-wide machine learning using the full diffusion tensor in this paper, we propose to use the full diffusion tensor to perform brain-wide score prediction on diffusion tensor imaging dti using the log-euclidean framework, rather than the commonly used fractional anisotropy fa indeed, scalar values such as the fa do not capture all the information contained in the diffusion tensor additionally, full tensor information is included in every step of the pre-processing pipeline: registration, smoothing and feature selection using voxelwise multivariate regression analysis this approach was tested on data obtained from 30 children and adolescents with autism spectrum disorder and showed some improvement over the fa-only analysis",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
77612,"validation of a salivary rna test for childhood autism spectrum disorder <b>background:</b> the diagnosis of autism spectrum disorder asd relies on behavioral assessment efforts to define biomarkers of asd have not resulted in an objective, reliable test studies of rna levels in asd have demonstrated potential utility, but have been limited by a focus on single rna types, small sample sizes, and lack of developmental delay controls we hypothesized that a saliva-based poly-omic rna panel could objectively distinguish children with asd from their neurotypical peers and children with non-asd developmental delay <b>methods:</b> this multi-center cross-sectional study included 456 children, ages 19-83 months children were either neurotypical <i>n</i> = 134 or had a diagnosis of asd <i>n</i> = 238, or non-asd developmental delay <i>n</i> = 84 comprehensive human and microbial rna abundance was measured in the saliva of all participants using unbiased next generation sequencing prior to analysis, the sample was randomly divided into a training set 82% of subjects and an independent validation test set 18% of subjects the training set was used to develop an rna-based algorithm that distinguished asd and non-asd children the validation set was not used in model development feature selection or training but served only to validate empirical accuracy <b>results:</b> in the training set <i>n</i> = 372; mean age 51 months; 75% male; 51% asd, a set of 32 rna features controlled for demographic and medical characteristics, identified asd status with a cross-validated area under the curve auc of 087 95% ci: 086-088 in the completely separate validation test set <i>n</i> = 84; mean age 50 months; 85% male; 60% asd, the algorithm maintained an auc of 088 82% sensitivity and 88% specificity notably, the rna features were implicated in physiologic processes related to asd axon guidance, neurotrophic signaling <b>conclusion:</b> salivary poly-omic rna measurement represents a novel, non-invasive approach that can accurately identify children with asd this technology could improve the specificity of referrals for asd evaluation or provide objective support for asd diagnoses",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [119]:
## STOMATOGNATHIC, DENTAL [C07]  / dent

## text
text = [' dental', 'dentist', 'dentition', 'teeth', 'tooth', 'canine', 'incisor', 'molars', 'maxilla', 'mandibul', 'mandible',
       'stomatognathic', 'gingiva', 'buccal', 'peridont']

spec['dent_text'] = np.where(groups['text'].str.contains('maxillofacial'), "1", "0")

for x in text:
    spec['dent_text'] = np.where(groups['text'].str.contains(x), "1", spec['dent_text']) #if yes then 1, if no, keep current

## output
print('text counts:')
print(Counter(spec['dent_text']))

text counts:
Counter({'0': 33852, '1': 327})


In [120]:
spec[spec['dent_text']=='1'].sample(5)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,urology_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text,endo_text,dm_text,insulin_text,retina_text,eye_text,haem_text,obs_text,renal_text,ackd_text,paeds_text,dent_text
34462,"prediction of 30-day hospital readmissions for all-cause dental conditions using machine learning it is unknown whether patients admitted for all-cause dental conditions acdc are at high risk for hospital readmission, or what are the risk factors for dental hospital readmission",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
76894,"deep-learning classification using convolutional neural network for evaluation of maxillary sinusitis on panoramic radiography to apply a deep-learning system for diagnosis of maxillary sinusitis on panoramic radiography, and to clarify its diagnostic performance",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5792,"automated adenoid hypertrophy assessment with lateral cephalometry in children based on artificial intelligence adenoid hypertrophy may lead to pediatric obstructive sleep apnea and mouth breathing the routine screening of adenoid hypertrophy in dental practice is helpful for preventing relevant craniofacial and systemic consequences the purpose of this study was to develop an automated assessment tool for adenoid hypertrophy based on artificial intelligence a clinical dataset containing 581 lateral cephalograms was used to train the convolutional neural network cnn according to fujiokas method for adenoid hypertrophy assessment, the regions of interest were defined with four keypoint landmarks the adenoid ratio based on the four landmarks was used for adenoid hypertrophy assessment another dataset consisting of 160 patientslateral cephalograms were used for evaluating the performance of the network diagnostic performance was evaluated with statistical analysis the developed system exhibited high sensitivity 0906, 95% confidence interval ci: 0750-0980, specificity 0938, 95% ci: 0881-0973 and accuracy 0919, 95% ci: 0877-0961 for adenoid hypertrophy assessment the area under the receiver operating characteristic curve was 0987 95% ci: 0974-1000 these results indicated the proposed assessment system is able to assess ah accurately the cnn-incorporated system showed high accuracy and stability in the detection of adenoid hypertrophy from childrenlateral cephalograms, implying the feasibility of automated adenoid hypertrophy screening utilizing a deep neural network model",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
44421,performance of deep learning object detection technology in the detection and diagnosis of maxillary sinus lesions on panoramic radiographs the first aim of this study was to determine the performance of a deep learning object detection technique in the detection of maxillary sinuses on panoramic radiographs the second aim was to clarify the performance in the classification of maxillary sinus lesions compared with healthy maxillary sinuses,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
76289,a pilot study using machine learning methods about factors influencing prognosis of dental implants this study tried to find the most significant factors predicting implant prognosis using machine learning methods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [121]:
## AUDIOLOGY [C09] / audio

## text
text = ['audiology', ' ear disease', 'earache', 'labyrinth', 'otitis', 'otosclerosis', 'cochlear', 'tympanic memb',
       'otoscop', 'acoustic neuroma', 'meniere', 'hearing loss', 'hearing impairment', 'cholesteatoma', 'otoacoustic', 'deafness', ' deaf ',
       'middle ear', 'outer ear', 'inner ear', 'otolog', 'paroxysmal positional vertigo']

spec['audio_text'] = np.where(groups['text'].str.contains('hearing aid'), "1", "0")

for x in text:
    spec['audio_text'] = np.where(groups['text'].str.contains(x), "1", spec['audio_text']) #if yes then 1, if no, keep current

print('text counts:')
print(Counter(spec['audio_text']))

text counts:
Counter({'0': 34020, '1': 159})


In [122]:
spec[spec['audio_text']=='1'].sample(5)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,urology_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text,endo_text,dm_text,insulin_text,retina_text,eye_text,haem_text,obs_text,renal_text,ackd_text,paeds_text,dent_text,audio_text
113605,"prediction of hearing loss among the noise-exposed workers in a steel factory using artificial intelligence approach prediction of hearing loss in noisy workplaces is considered to be an important aspect of hearing conservation program artificial intelligence, as a new approach, can be used to predict the complex phenomenon such as hearing loss using artificial neural networks, this study aims to present an empirical model for the prediction of the hearing loss threshold among noise-exposed workers",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
59051,"building an otoscopic screening prototype tool using deep learning otologic diseases are often difficult to diagnose accurately for primary care providers deep learning methods have been applied with great success in many areas of medicine, often outperforming well trained human observers the aim of this work was to develop and evaluate an automatic software prototype to identify otologic abnormalities using a deep convolutional neural network",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
12146,"machine learning-based genetic diagnosis models for hereditary hearing loss by the gjb2, slc26a4 and mt-rnr1 variants hereditary hearing loss hhl is the most common sensory deficit, which highly afflicts humans with gene sequencing technology development, more variants will be identified and support genetic diagnoses, which is difficult for human experts to diagnose this study aims to develop a machine learning-based genetic diagnosis model of hhl-related variants of gjb2, slc26a4 and mt-rnr1",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
168917,"a novel machine learning program applied to discover otological diagnoses a novel machine learning system, galactica, has been developed for knowledge discovery from databases this system was applied to discover diagnostic rules from a patient database containing 564 cases with vestibular schwannoma, bening paroxysmal positional vertigo, ménières disease, sudden deafness, traumatic vertigo and vestibular neuritis diagnoses the rules were evaluated using an independent testing set the accuracy of rules for these diagnoses were 91%, 96%, 81%, 95%, 92% and 98%, respectively besides being accurate, the rules contained the five most important diagnostic questions identified in the earlier research the knowledge presented with rules can be easily comprehended and verified",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
70347,"development of an automatic classifier for the prediction of hearing impairment from industrial noise exposure the iso-1999 2013 international organization for standardization, geneva, switzerland standard is the most commonly used approach for estimating noise-induced hearing trauma however, its insensitivity to noise characteristics limits its practical application in this study, an automatic classification method using the support vector machine svm was developed to predict hearing impairment in workers exposed to both gaussian g and non-gaussian non-g industrial noises a recently collected human database n = 2,110 from industrial workers in china was used in the present study a statistical metric, kurtosis, was used to characterize the industrial noise in addition to using all the data as one group, the data were also broken down into the following four subgroups based on the level of kurtosis: g/quasi-g, low-kurtosis, middle-kurtosis, and high-kurtosis groups the performance of the iso-1999 and the svm models was compared over these five groups the results showed that: 1 the performance of the svm model significantly outperformed the iso-1999 model in all five groups 2 the iso-1999 model could not properly predict hearing impairment for the high-kurtosis group moreover, the iso-1999 model is likely to underestimate hearing impairment caused by both g and non-g noise exposures 3 the svm model is a potential tool to predict hearing impairment caused by diverse noise exposures",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [123]:
## BRAIN COMPUTER / bci

## text
spec['bci_text'] = np.where(groups['text'].str.contains("brain control"), "1", "0")
spec['bci_text'] = np.where(groups['text'].str.contains("brain computer"), "1", "0")

print('text counts:')
print(Counter(spec['bci_text']))

text counts:
Counter({'0': 34079, '1': 100})


In [124]:
## PROSTHESIS CONTROL / prosth

## text
spec['prosth_text'] = np.where(groups['text'].str.contains("prosthetic"), "1", "0")
spec['prosth_text'] = np.where(groups['text'].str.contains("prosthesis"), "1", spec['prosth_text'])

print('text counts:')
print(Counter(spec['prosth_text']))

text counts:
Counter({'0': 33915, '1': 264})


In [125]:
spec[spec['prosth_text']=='1'].sample(5)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,urology_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text,endo_text,dm_text,insulin_text,retina_text,eye_text,haem_text,obs_text,renal_text,ackd_text,paeds_text,dent_text,audio_text,bci_text,prosth_text
9307,"a machine learning framework to optimize optic nerve electrical stimulation for vision restoration optic nerve electrical stimulation is a promising technique to restore vision in blind subjects machine learning methods can be used to select effective stimulation protocols, but they require a model of the stimulated system to generate enough training data here, we use a convolutional neural network cnn as a model of the ventral visual stream a genetic algorithm drives the activation of the units in a layer of the cnn representing a cortical region toward a desired pattern, by refining the activation imposed at a layer representing the optic nerve to simulate the pattern of activation elicited by the sites of an electrode array, a simple point-source model was introduced and its optimization process was investigated for static and dynamic scenes psychophysical data confirm that our stimulation evolution framework produces results compatible with natural vision machine learning approaches could become a very powerful tool to optimize and personalize neuroprosthetic systems",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
46246,"elbow angle generation during activities of daily living using a submovement prediction model the present study aimed to develop a realistic model for the generation of human activities of daily living adl movements the angular profiles of the elbow joint during functional adl tasks such as eating and drinking were generated by a submovement-based closed-loop model first, the adl movements recorded from three human participants were broken down into logical phases, and each phase was decomposed into submovement components three separate artificial neural networks were trained to learn the submovement parameters and were then incorporated into a closed-loop model with error correction ability the model was able to predict angular trajectories of human adl movements with target access rate = 100%, vaf = 989%, and nrmse = 47% relative to the actual trajectories in addition, the model can be used to provide the desired target for practical trajectory planning in rehabilitation systems such as functional electrical stimulation, robot therapy, brain-computer interface, and prosthetic devices",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
105929,"influence of multiple dynamic factors on the performance of myoelectric pattern recognition hand motion classification using surface electromyogram emg signals has been widely studied for the control of powered prosthetics in laboratory conditions however, clinical applicability has been limited, as imposed by factors like electrodes shift, variations in the contraction force levels, forearm rotation angles, change of limb position and many other factors that all affect the emg pattern recognition performance while the impact of several of these factors on emg parameter estimation and pattern recognition has been considered individually in previous studies, a minimum number of experiments were reported to study the influence of multiple dynamic factors in this paper, we investigate the combined effect of varying forearm rotation angles and contraction force levels on the robustness of emg pattern recognition, while utilizing different time-and-frequency based feature extraction methods the emg pattern recognition system has been validated on a set of 11 subjects ten intact-limbed and one bilateral transradial amputee performing six classes of hand motions, each with three different force levels, each at three different forearm rotation angles, with six emg electrodes plus an accelerometer on the subjectsforearm our results suggest that the performance of the learning algorithms can be improved with the time-dependent power spectrum descriptors td-psd utilized in our experiments, with average classification accuracies of up to 90% across all subjects, force levels, and forearm rotation angles",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
166141,"classification of finger activation for use in a robotic prosthesis arm hand amputees would highly benefit from a robotic prosthesis, which would allow the movement of a number of fingers in this paper we propose using the electromyographic signals recorded by two pairs of electrodes placed over the arm for operating such prosthesis multiple features from these signals are extracted whence the most relevant features are selected by a genetic algorithm as inputs for a simple classifier this method results in a probability of error of less than 2%",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
54633,"a piezoresistive array armband with reduced number of sensors for hand gesture recognition human machine interfaces hmis are employed in a broad range of applications, spanning from assistive devices for disability to remote manipulation and gaming controllers in this study, a new piezoresistive sensors array armband is proposed for hand gesture recognition the armband encloses only three sensors targeting specific forearm muscles, with the aim to discriminate eight hand movements each sensor is made by a force-sensitive resistor fsr with a dedicated mechanical coupler and is designed to sense muscle swelling during contraction the armband is designed to be easily wearable and adjustable for any user and was tested on 10 volunteers hand gestures are classified by means of different machine learning algorithms, and classification performances are assessed applying both, the 10-fold and leave-one-out cross-validations a linear support vector machine provided 96% mean accuracy across all participants ultimately, this classifier was implemented on an arduino platform and allowed successful control for videogames in real-time the low power consumption together with the high level of accuracy suggests the potential of this device for exergames commonly employed for neuromotor rehabilitation the reduced number of sensors makes this hmi also suitable for hand-prosthesis control",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [126]:
## ASSISTIVE DEVICE CONTROL / assist

## text
spec['assist_text'] = np.where(groups['text'].str.contains("wheelchair"), "1", "0")
spec['assist_text'] = np.where(groups['text'].str.contains("scooter"), "1", "0")
spec['assist_text'] = np.where(groups['text'].str.contains("mobility device"), "1", "0")
spec['assist_text'] = np.where(groups['text'].str.contains("assistive device"), "1", "0")
spec['assist_text'] = np.where(groups['text'].str.contains("exoskeleton"), "1", "0")

print('text counts:')
print(Counter(spec['assist_text']))

text counts:
Counter({'0': 34109, '1': 70})


In [127]:
spec[spec['assist_text']=='1'].sample(15)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,urology_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text,endo_text,dm_text,insulin_text,retina_text,eye_text,haem_text,obs_text,renal_text,ackd_text,paeds_text,dent_text,audio_text,bci_text,prosth_text,assist_text
11103,"assist-as-needed exoskeleton for hand joint rehabilitation based on muscle effort detection robotic-assisted systems have gained significant traction in post-stroke therapies to support rehabilitation, since these systems can provide high-intensity and high-frequency treatment while allowing accurate motion-control over the patients progress in this paper, we tackle how to provide active support through a robotic-assisted exoskeleton by developing a novel closed-loop architecture that continually measures electromyographic signals emg, in order to adjust the assistance given by the exoskeleton we used emg signals acquired from four patients with post-stroke hand impairments for training machine learning models used to characterize muscle effort by classifying three muscular condition levels based on contraction strength, co-activation, and muscular activation measurements the proposed closed-loop system takes into account the emg muscle effort to modulate the exoskeleton velocity during the rehabilitation therapy experimental results indicate the maximum variation on velocity was 07 mm/s, while the proposed control system effectively modulated the movements of the exoskeleton based on the emg readings, keeping a reference tracking error <5%",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
101488,"role of muscle synergies in real-time classification of upper limb motions using extreme learning machines myoelectric signals offer significant insights in interpreting the motion intention and extent of effort involved in performing a movement, with application in prostheses, orthosis and exoskeletons feature extraction plays a vital role, and follows two approaches: emg and synergy features more recently, muscle synergy based features are being increasingly explored, since it simplifies dimensionality of control, and are considered to be more robust to signal variations another important aspect in a myoelectrically controlled devices is the learning capability and speed of performance for online decoding extreme learning machine elm is a relatively new neural-network based learning algorithm: its performance hasnt been explored in the context of online control, which is a more reliable measure compared to offline analysis to this purpose we aim at focusing our investigation on a myoelectric-based interface which is able to identify and online classify, upper limb motions involving shoulder and elbow the main objective is to compare the performance of the decoder trained using elm, for two different features: emg and synergy features",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
57917,"gait phase classification and assist torque prediction for a lower limb exoskeleton system using kernel recursive least-squares method the gait phase classification method is a key technique to control an exoskeleton robot different people have different gait features while wearing an exoskeleton robot due to the gap between the exoskeleton and the wearer and their operation habits, such as the correspondence between the joint angle and the moment at which the foot contacts the ground, the amplitude of the joint angle and others in order to enhance the performance of the gait phase classification in an exoskeleton robot using only the angle of hip and knee joints, a kernel recursive least-squares krls algorithm is introduced to build a gait phase classification model we also build an assist torque predictor based on the krls algorithm in this work considering the adaptation of unique gait features in this paper, we evaluate the classification performance of the krls model by comparing with two other commonly used gait recognition methods-the multi-layer perceptron neural network mlpnn method and the support vector machine svm algorithm in this experiment, the training and testing datasets for the models built by krls, mlpnn and svm were collected from 10 healthy volunteers the gait data are collected from the exoskeleton robot that we designed rather than collected from the human body these data depict the human-robot coupling gait that includes unique gait features the krls classification results are in average 3% higher than mlpnn and svm the testing average accuracy of krls is about 86% the prediction results of krls are twice as good as mlpnn in assist torque prediction experiments the krls performs in a good, stable, and robust way and shows model generalization abilities",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
39334,"continuous estimation of knee joint angle based on surface electromyography using a long short-term memory neural network and time-advanced feature continuous joint angle estimation based on a surface electromyography semg signal can be used to improve the man-machine coordination performance of the exoskeleton in this study, we proposed a time-advanced feature and utilized long short-term memory lstm with a root mean square rms feature and its time-advanced feature rmstaf; collectively referred to as rrtaf of semg to estimate the knee joint angle to evaluate the effect of joint angle estimation, we used root mean square error rmse and cross-correlation coefficient <i>ρ</i> between the estimated angle and actual angle we also compared three methods ie, lstm using rms, bpnn back propagation neural network using rrtaf, and bpnn using rms with lstm using rrtaf to highlight its good performance five healthy subjects participated in the experiment and their eight muscle ie, rectus femoris rf, biceps femoris bf, semitendinosus st, gracilis gc, semimembranosus sm, sartorius sr, medial gastrocnemius mg, and tibialis anterior ta semg signals were taken as algorithm inputs moreover, the knee joint angles were used as target values the experimental results showed that, compared with lstm using rms, bpnn using rrtaf, and bpnn using rms, the average rmse values of lstm using rrtaf were respectively reduced by 857%, 4662%, and 6869%, whereas the average <i>ρ</i> values were respectively increased by 031%, 415%, and 1835% the results demonstrated that lstm using rrtaf, which contained the time-advanced feature, had better performance for estimating the knee joint motion",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
59380,"determining user intent of partly dynamic shoulder tasks in individuals with chronic stroke using pattern recognition stroke remains the leading cause of long-term disability in the us although therapy can achieve limited improvement of paretic arm use and performance, weakness and abnormal muscle synergies-which cause unintentional elbow, wrist, and finger flexion during shoulder abduction-contribute significantly to limb disuse and compound rehabilitation efforts emerging wearable exoskeleton technology could provide powered abduction support for the paretic arm, but requires a clinically feasible, robust control scheme capable of differentiating multiple shoulder degrees-of-freedom this study examines whether pattern recognition of sensor data can accurately identify user intent for 9 combinations of 1- and 2- degree-of-freedom shoulder tasks participants with stroke n = 12 used their paretic and non-paretic arms, and healthy controls n = 12 used their dominant arm to complete tasks on a lab-based robot involving combinations of abduction, adduction, and internal and external rotation of the shoulder we examined the effect of arm paretic, non-paretic, load level 25% vs 50% maximal voluntary torque, and dataset electromyography, load cell, or combined on classifier performance results suggest that paretic arm, lower load levels, and using load cell or emg data alone reduced classifier accuracy however, this method still shows promise further work will examine classifier-user interaction during active control of a robotic device and optimization/minimization of sensors",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
55778,"prediction of plantar forces during gait using wearable sensors and deep neural networks<sup></sup> to enable on-time and high-fidelity lower-limb exoskeleton control, it is effective to predict the future human motion from the observed status in this research, we propose a novel method to predict future plantar force during the gait using imu and plantar sensors deep neural networks dnn are used to learn the non-linear relationship between the measured sensor data and the future plantar force data using the trained network, we can predict the plantar force not only during walking but also at the start and end of walking in the experiments, the performance of the proposed method is confirmed for different prediction time",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
63793,"sub-optimally solving actuator redundancy in a hybrid neuroprosthetic system with a multi-layer neural network structure functional electrical stimulation fes has recently been proposed as a supplementary torque assist in lower-limb powered exoskeletons for persons with paraplegia in the combined system, also known as a hybrid neuroprosthesis, both fes-assist and the exoskeleton act to generate lower-limb torques to achieve standing and walking functions due to this actuator redundancy, we are motivated to optimally allocate fes-assist and exoskeleton torque based on a performance index that penalizes fes overuse to minimize muscle fatigue while also minimizing regulation or tracking errors traditional optimal control approaches need a system model to optimize; however, it is often difficult to formulate a musculoskeletal model that accurately predicts muscle responses due to fes in this paper, we use a novel identification and control structure that contains a recurrent neural network rnn and several feedforward neural networks fnns the rnn is trained by supervised learning to identify the system dynamics, while the fnns are trained by a reinforcement learning method to provide sub-optimal control actions the output layer of each fnn has its unique activation functions, so that the asymmetric constraint of fes and the symmetric constraint of exoskeleton motor control input can be realized this new structure is experimentally validated on a seated human participant using a single joint hybrid neuroprosthesis",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
36564,"elbow movement estimation based on emg with narx neural networks the use of the electrical activity from the muscles may provide a natural way to control exoskeletons or other robotic devices seamlessly the major challenges to achieve this goal are human motor redundancy and surface electromyography semg variability the goal of this work is to find a feature extraction and classification procedures to estimate accurately elbow angular trajectory by means of a narx neural network the processing time-step should be small enough to make it feasible its further use for online control of an exoskeleton in order to do so we analysed the biceps and triceps brachii data from an elbow flexo-extension coincident timing task performed in the horizontal plane the semg data was pre-processed and its energy was divided in five frequency intervals that were fed to a nonlinear auto regressive with exogenous inputs narx neural network the estimated angular trajectory was compared with the measured one showing a high correlation between them and a rmse error maximum of 7 degrees the procedure presented here shows a reasonably good estimation that, after training, allows real-time implementation in addition, the results are encouraging to include more complex tasks including the shoulder joint",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
101114,"pso-svm-based online locomotion mode identification for rehabilitation robotic exoskeletons locomotion mode identification is essential for the control of a robotic rehabilitation exoskeletons this paper proposes an online support vector machine svm optimized by particle swarm optimization pso to identify different locomotion modes to realize a smooth and automatic locomotion transition a pso algorithm is used to obtain the optimal parameters of svm for a better overall performance signals measured by the foot pressure sensors integrated in the insoles of wearable shoes and the mems-based attitude and heading reference systems ahrs attached on the shoes and shanks of leg segments are fused together as the input information of svm based on the chosen window whose size is 200 ms with sampling frequency of 40 hz, a three-layer wavelet packet analysis wpa is used for feature extraction, after which, the kernel principal component analysis kpca is utilized to reduce the dimension of the feature set to reduce computation cost of the svm since the signals are from two types of different sensors, the normalization is conducted to scale the input into the interval of 0, 1 five-fold cross validation is adapted to train the classifier, which prevents the classifier over-fitting based on the svm model obtained offline in matlab, an online svm algorithm is constructed for locomotion mode identification experiments are performed for different locomotion modes and experimental results show the effectiveness of the proposed algorithm with an accuracy of 9600% ± 245% to improve its accuracy, majority vote algorithm mva is used for post-processing, with which the identification accuracy is better than 9835% ± 165% the proposed algorithm can be extended and employed in the field of robotic rehabilitation and assistance",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
65406,"assessment of an on-board classifier for activity recognition on an active back-support exoskeleton despite the growing interest, the adoption of industrial exoskeletons may still be held back by technical limitations to enhance versatility and promote adoption, one aspect of interest could be represented by the potential of active and quasi-passive devices to automatically distinguish different activities and adjust their assistive profiles accordingly this contribution focuses on an active back-support exoskeleton and extends previous work proposing the use of a support vector machine to classify walking, bending and standing thanks to the introduction of a new feature-forearm muscle activity-this study shows that it is possible to perform reliable online classification as a consequence, the authors introduce a new hierarchically-structured controller for the exoskeleton under analysis",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [128]:
## HOME ACTIVITY, REHABIILTATION / active

## text
spec['activity_text'] = np.where(groups['text'].str.contains("activity monitor"), "1", "0")
spec['activity_text'] = np.where(groups['text'].str.contains("activity detect"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("activities monitor"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("activities detect"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("home environ"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("fall detect"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("fall monitor"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("falls detect"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("falls monitor"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("daily activit"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("activity classif"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("daily living"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("fall prevent"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("falls in home"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("falls at home"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("home sensor"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("gait analysis"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("gait detection"), "1", spec['activity_text'])
spec['activity_text'] = np.where(groups['text'].str.contains("gait classification"), "1", spec['activity_text'])

print('text counts:')
print(Counter(spec['activity_text']))

text counts:
Counter({'0': 33591, '1': 588})


In [129]:
spec[spec['activity_text']=='1'].sample(15)

Unnamed: 0,text,icu_text,ed_text,id_text,sepsis_text,cov19_text,hiv_text,tb_text,tropic_text,malaria_text,derm_text,dermca_text,onc_text,rx_text,breast_text,breastca_text,lungca_text,brainca_text,gica_text,hepca_text,urology_text,prosca_text,renalca_text,gynonc_text,haemonc_text,psych_text,suicide_text,msk_text,frac_text,rheum_text,gi_text,hep_text,resp_text,pneum_text,osa_text,pe_text,pubh_text,neuro_text,cva_text,epilep_text,alzh_text,cvs_text,ihd_text,hf_text,arrhyt_text,endo_text,dm_text,insulin_text,retina_text,eye_text,haem_text,obs_text,renal_text,ackd_text,paeds_text,dent_text,audio_text,bci_text,prosth_text,assist_text,activity_text
52887,"a novel hybrid deep neural network to predict pre-impact fall for older people based on wearable inertial sensors falls in the elderly is a major public health concern due to its high prevalence, serious consequences and heavy burden on the society many falls in older people happen within a very short time, which makes it difficult to predict a fall before it occurs and then to provide protection for the person who is falling the primary objective of this study was to develop deep neural networks for predicting a fall during its initiation and descending but before the body impacts to the ground so that a safety mechanism can be enabled to prevent fall-related injuries we divided the falling process into three stages non-fall, pre-impact fall and fall and developed deep neutral networks to perform three-class classification three deep learning models, convolutional neural network cnn, long short term memory lstm, and a novel hybrid model integrating both convolution and long short term memory convlstm were proposed and evaluated on a large public dataset of various falls and activities of daily living adl acquired with wearable inertial sensors accelerometer and gyroscope fivefold cross validation results showed that the hybrid convlstm model had mean sensitivities of 9315, 9378, and 9600% for non-fall, pre-impact fall and fall, respectively, which were higher than both lstm except the fall class and cnn models convlstm model also showed higher specificities for all three classes 9659, 9449, and 9869% than lstm and cnn models in addition, latency test on a microcontroller unit showed that convlstm model had a short latency of 106 ms, which was much lower than lstm model 315 ms and comparable with cnn model 077 ms high prediction accuracy especially for pre-impact fall and low latency on the microboard indicated that the proposed hybrid convlstm model outperformed both lstm and cnn models these findings suggest that our proposed novel hybrid convlstm model has great potential to be embedded into wearable inertial sensor-based systems to predict pre-impact fall in real-time so that protective devices could be triggered in time to prevent fall-related injuries for older people",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
94882,"extracting aerobic system dynamics during unsupervised activities of daily living using wearable sensor machine learning models physical activity levels are related through algorithms to the energetic demand, with no information regarding the integrity of the multiple physiological systems involved in the energetic supply longitudinal analysis of the oxygen uptake v̇o<sub>2</sub> by wearable sensors in realistic settings might permit development of a practical tool for the study of the longitudinal aerobic system dynamics ie, v̇o<sub>2</sub> kinetics this study evaluated aerobic system dynamics based on predicted v̇o<sub>2</sub> data obtained from wearable sensors during unsupervised activities of daily living μadl thirteen healthy men performed a laboratory-controlled moderate exercise protocol and were monitored for ≈6 h/day for 4 days μadl data variables derived from hip accelerometer acc<sub>hip</sub>, heart rate monitor, and respiratory bands during μadl were extracted and processed by a validated random forest regression model to predict v̇o<sub>2</sub> the aerobic system analysis was based on the frequency-domain analysis of acc<sub>hip</sub> and predicted v̇o<sub>2</sub> data obtained during μadl optimal samples for frequency domain analysis constrained to ≤001 hz were selected when acc<sub>hip</sub> was higher than 005 g at a given frequency ie, participants were active the temporal characteristics of predicted v̇o<sub>2</sub> data during μadl correlated with the temporal characteristics of measured v̇o<sub>2</sub> data during laboratory-controlled protocol formula: see text = 082, p < 0001, n = 13 in conclusion, aerobic system dynamics can be investigated during unsupervised activities of daily living by wearable sensors although speculative, these algorithms have the potential to be incorporated into wearable systems for early detection of changes in health status in realistic environments by detecting changes in aerobic response dynamics new & noteworthy the early detection of subclinical aerobic system impairments might be indicative of impaired physiological reserves that impact the capacity for physical activity this study is the first to use wearable sensors in unsupervised activities of daily living in combination with novel machine learning algorithms to investigate the aerobic system dynamics with the potential to contribute to models of functional health status and guide future individualized health care in the normal population",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
17351,"deep chores: estimating hallmark measures of physical activity using deep learning wrist accelerometers for assessing hallmark measures of physical activity pa are rapidly growing with the advent of smartwatch technology given the growing popularity of wrist-worn accelerometers, there needs to be a rigorous evaluation for recognizing pa type and estimating energy expenditure ee across the lifespan participants 66% women, aged 20-89 yrs performed a battery of 33 daily activities in a standardized laboratory setting while a tri-axial accelerometer collected data from the right wrist a portable metabolic unit was worn to measure metabolic intensity we built deep learning networks to extract spatial and temporal representations from the time-series data, and used them to recognize pa type and estimate ee the deep learning models resulted in high performance; the f1 score was: 082, 081, and 95 for recognizing sedentary, locomotor, and lifestyle activities, respectively the root mean square error was 11 +/-013 for the estimation of ee",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
116860,"dynamic bayesian networks for context-aware fall risk assessment fall incidents among the elderly often occur in the home and can cause serious injuries affecting their independent living this paper presents an approach where data from wearable sensors integrated in a smart home environment is combined using a dynamic bayesian network the smart home environment provides contextual data, obtained from environmental sensors, and contributes to assessing a fall risk probability the evaluation of the developed system is performed through simulation each time step is represented by a single user activity and interacts with a fall sensors located on a mobile device a posterior probability is calculated for each recognized activity or contextual information the output of the system provides a total risk assessment of falling given a response from the fall sensor",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
125139,"impact of study design on development and evaluation of an activity-type classifier methods to classify activity types are often evaluated with an experimental protocol involving prescribed physical activities under confined laboratory conditions, which may not reflect real-life conditions the present study aims to evaluate how study design may impact on classifier performance in real life twenty-eight healthy participants 21-53 yr were asked to wear nine triaxial accelerometers while performing 58 activity types selected to simulate activities in real life for each sensor location, logistic classifiers were trained in subsets of up to 8 activities to distinguish between walking and nonwalking activities and were then evaluated in all 58 activities different weighting factors were used to convert the resulting confusion matrices into an estimation of the confusion matrix as would apply in the real-life setting by creating four different real-life scenarios, as well as one traditional laboratory scenario the sensitivity of a classifier estimated with a traditional laboratory protocol is within the range of estimates derived from real-life scenarios for any body location the specificity, however, was systematically overestimated by the traditional laboratory scenario walking time was systematically overestimated, except for lower back sensor data range: 7-757% in conclusion, classifier performance under confined conditions may not accurately reflect classifier performance in real life future studies that aim to evaluate activity classification methods are warranted to pay special attention to the representativeness of experimental conditions for real-life conditions",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
81852,"imu-based classification of parkinsons disease from gait: a sensitivity analysis on sensor location and feature selection inertial measurement units imus have a long-lasting popularity in a variety of industrial applications from navigation systems to guidance and robotics their use in clinical practice is now becoming more common, thanks to miniaturization and the ability to integrate on-board computational and decision-support features imu-based gait analysis is a paradigm of this evolving process, and in this study its use for the assessment of parkinsons disease pd is comprehensively analyzed data coming from 25 individuals with different levels of pd symptoms severity and an equal number of age-matched healthy individuals were included into a set of 6 different machine learning ml techniques, processing 18 different configurations of gait parameters taken from 8 imu sensors classification accuracy was calculated for each configuration and ml technique, adding two meta-classifiers based on the results obtained from all individual techniques through majority of voting, with two different weighting schemes average classification accuracy ranged between 63% and 80% among classifiers and increased up to 96% for one meta-classifier configuration configurations based on a statistical preselection process showed the highest average classification accuracy when reducing the number of sensors, features based on the joint range of motion were more accurate than those based on spatio-temporal parameters in particular, best results were obtained with the knee range of motion, calculated with four imus, placed bilaterally the obtained findings provide data-driven evidence on which combination of sensor configurations and classification methods to be used during imu-based gait analysis to grade the severity level of pd",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
36550,"skeleton data pre-processing for human pose recognition using neural network automatic monitoring of daily living activities can greatly improve the possibility of living autonomously for frail individuals pose recognition based on skeleton tracking data is promising for identifying dangerous situations and trigger external intervention or other alarms, while avoiding privacy issues and the need for patient compliance here we present the benefits of pre-processing kinect-recorded skeleton data to limit the several errors produced by the system when the subject is not in ideal tracking conditions the accuracy of our two hidden layers mlp classifier improved from about 82% to over 92% in recognizing actors in four different poses: standing, sitting, lying and dangerous sitting",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
921,"a comparative study of time frequency representation techniques for freeze of gait detection and prediction freezing of gait fog is an impairment that affects the majority of patients in the advanced stages of parkinsons disease pd fog can lead to sudden falls and injuries, negatively impacting the quality of life for the patients and their families rhythmic auditory stimulation ras can be used to help patients recover from fog and resume normal gait ras might be ineffective due to the latency between the start of a fog event, its detection and initialization of ras we propose a system capable of both fog prediction and detection using signals from tri-axial accelerometer sensors that will be useful in initializing ras with minimal latency we compared the performance of several time frequency analysis techniques, including moving windows extracted from the signals, handcrafted features, recurrence plots rp, short time fourier transform stft, discreet wavelet transform dwt and pseudo wigner ville distribution pwvd with deep learning dl based long short term memory lstm and convolutional neural networks cnn we also propose three ensemble network architectures that combine all the time frequency representations and dl architectures experimental results show that our ensemble architectures significantly improve the performance compared with existing techniques we also present the results of applying our method trained on a publicly available dataset to data collected from patients using wearable sensors in collaboration with at still university",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
9522,"hand tremor detection in videos with cluttered background using neural network based approaches with the increasing prevalence of neurodegenerative diseases, including parkinsons disease, hand tremor detection has become a popular research topic because it helps with the diagnosis and tracking of disease progression conventional hand tremor detection algorithms involved wearable sensors a non-invasive hand tremor detection algorithm using videos as input is desirable but the existing video-based algorithms are sensitive to environmental conditions an algorithm, with the capability of detecting hand tremor from videos with a cluttered background, would allow the videos recorded in a non-research environment to be used clinicians and researchers could use videos collected from patients and participants in their own home environment or standard clinical settings neural network based machine learning architectures provide high accuracy classification results in related fields including hand gesture recognition and body movement detection systems we thus investigated the accuracy of advanced neural network architectures to automatically detect hand tremor in videos with a cluttered background we examined configurations with different sets of features and neural network based classification models we compared the performance of different combinations of features and classification models and then selected the combination which provided the highest accuracy of hand tremor detection we used cross validation to test the accuracy of the trained model predictions the highest classification accuracy for automatically detecting tremor vs non tremor was 806% and this was obtained using convolutional neural network-long short-term memory and features based on measures of frequency and amplitude change",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
96342,"prediction of oxygen uptake dynamics by machine learning analysis of wearable sensors during activities of daily living currently, oxygen uptake is the most precise means of investigating aerobic fitness and level of physical activity; however, can only be directly measured in supervised conditions with the advancement of new wearable sensor technologies and data processing approaches, it is possible to accurately infer work rate and predict during activities of daily living adl the main objective of this study was to develop and verify the methods required to predict and investigate the dynamics during adl the variables derived from the wearable sensors were used to create a predictor based on a random forest method the temporal dynamics were assessed by the mean normalized gain amplitude mng obtained from frequency domain analysis the mng provides a means to assess aerobic fitness the predicted during adl was strongly correlated r = 087, p < 0001 with the measured and the prediction bias was 02 ml·min<sup>-1</sup>·kg<sup>-1</sup> the mng calculated based on predicted was strongly correlated r = 071, p < 0001 with mng calculated based on measured data this new technology provides an important advance in ambulatory and continuous assessment of aerobic fitness with potential for future applications such as the early detection of deterioration of physical health",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [130]:
## combine

labelled['subspec_icu'] = np.where(spec['icu_text'].str.contains("1"), "1", "0")

labelled['subspec_ed'] = np.where(spec['ed_text'].str.contains("1"), "1", "0")

labelled['spec_paeds'] = np.where(spec['paeds_text'].str.contains("1"), "1", "0")

labelled['spec_dent'] = np.where(spec['dent_text'].str.contains("1"), "1", "0")

labelled['spec_audio'] = np.where(spec['audio_text'].str.contains("1"), "1", "0")

labelled['spec_id'] = np.where(spec['id_text'].str.contains("1"), "1", "0")

labelled['subspec_sepsis'] = np.where(spec['sepsis_text'].str.contains("1"), "1", "0")

labelled['subspec_hiv'] = np.where(spec['hiv_text'].str.contains("1"), "1", "0")

labelled['subspec_cov19'] = np.where(spec['cov19_text'].str.contains("1"), "1", "0")

labelled['subspec_tb'] = np.where(spec['tb_text'].str.contains("1"), "1", "0")

labelled['subspec_malaria'] = np.where(spec['malaria_text'].str.contains("1"), "1", "0")

labelled['subspec_tropic'] = np.where(spec['tropic_text'].str.contains("1"), "1", "0")

labelled['spec_derm'] = np.where(spec['derm_text'].str.contains("1"), "1", "0")

labelled['subspec_dermca'] = np.where(spec['dermca_text'].str.contains("1"), "1", "0")

labelled['spec_onc'] = np.where(spec['onc_text'].str.contains("1"), "1", "0")

labelled['subspec_rx'] = np.where(spec['rx_text'].str.contains("1"), "1", "0")

labelled['subspec_lungca'] = np.where(spec['lungca_text'].str.contains("1"), "1", "0")

labelled['subspec_brainca'] = np.where(spec['brainca_text'].str.contains("1"), "1", "0")

labelled['subspec_gica'] = np.where(spec['gica_text'].str.contains("1"), "1", "0")

labelled['subspec_hepca'] = np.where(spec['hepca_text'].str.contains("1"), "1", "0")

labelled['subspec_prosca'] = np.where(spec['prosca_text'].str.contains("1"), "1", "0")

labelled['subspec_gynonc'] = np.where(spec['gynonc_text'].str.contains("1"), "1", "0")

labelled['subspec_renalca'] = np.where(spec['renalca_text'].str.contains("1"), "1", "0")

labelled['subspec_haemonc'] = np.where(spec['haemonc_text'].str.contains("1"), "1", "0")

labelled['subspec_breast'] = np.where(spec['breast_text'].str.contains("1"), "1", "0")

labelled['subspec_breastca'] = np.where(spec['breastca_text'].str.contains("1"), "1", "0")

labelled['subspec_urology'] = np.where(spec['urology_text'].str.contains("1"), "1", "0")

labelled['spec_psych'] = np.where(spec['psych_text'].str.contains("1"), "1", "0")

labelled['subspec_suicide'] = np.where(spec['suicide_text'].str.contains("1"), "1", "0")

labelled['spec_msk'] = np.where(spec['msk_text'].str.contains("1"), "1", "0")

labelled['subspec_frac'] = np.where(spec['frac_text'].str.contains("1"), "1", "0")

labelled['spec_rheum'] = np.where(spec['rheum_text'].str.contains("1"), "1", "0")

labelled['spec_gi'] = np.where(spec['gi_text'].str.contains("1"), "1", "0")

labelled['spec_hep'] = np.where(spec['hep_text'].str.contains("1"), "1", "0")

labelled['spec_resp'] = np.where(spec['resp_text'].str.contains("1"), "1", "0")

labelled['subspec_pneum'] = np.where(spec['pneum_text'].str.contains("1"), "1", "0")

labelled['subspec_osa'] = np.where(spec['osa_text'].str.contains("1"), "1", "0")

labelled['subspec_pe'] = np.where(spec['pe_text'].str.contains("1"), "1", "0")

labelled['spec_neuro'] = np.where(spec['neuro_text'].str.contains("1"), "1", "0")

labelled['subspec_epilep'] = np.where(spec['epilep_text'].str.contains("1"), "1", "0")

labelled['subspec_cva'] = np.where(spec['cva_text'].str.contains("1"), "1", "0")

labelled['subspec_alzh'] = np.where(spec['alzh_text'].str.contains("1"), "1", "0")

labelled['spec_cvs'] = np.where(spec['cvs_text'].str.contains("1"), "1", "0")

labelled['subspec_ihd'] = np.where(spec['ihd_text'].str.contains("1"), "1", "0")

labelled['subspec_hf'] = np.where(spec['hf_text'].str.contains("1"), "1", "0")

labelled['subspec_arrhyt'] =  np.where(spec['arrhyt_text'].str.contains("1"), "1", "0")

labelled['spec_endo'] = np.where(spec['endo_text'].str.contains("1"), "1", "0")

labelled['spec_dm'] = np.where(spec['dm_text'].str.contains("1"), "1", "0")

labelled['subspec_insulin'] = np.where(spec['insulin_text'].str.contains("1"), "1", "0")

labelled['spec_eye'] = np.where(spec['eye_text'].str.contains("1"), "1", "0")

labelled['subspec_retina'] = np.where(spec['retina_text'].str.contains("1"), "1", "0")

labelled['spec_haem'] = np.where(spec['haem_text'].str.contains("1"), "1", "0")

labelled['spec_obs'] = np.where(spec['obs_text'].str.contains("1"), "1", "0")

labelled['spec_renal'] = np.where(spec['renal_text'].str.contains("1"), "1", "0")

labelled['subspec_ackd'] = np.where(spec['ackd_text'].str.contains("1"), "1", "0")

labelled['spec_pubh'] = np.where(spec['pubh_text'].str.contains("1"), "1", "0")

labelled['subspec_bci'] = np.where(spec['bci_text'].str.contains("1"), "1", "0")

labelled['subspec_prosth'] = np.where(spec['prosth_text'].str.contains("1"), "1", "0")

labelled['subspec_assist'] = np.where(spec['assist_text'].str.contains("1"), "1", "0")

labelled['subspec_activity'] = np.where(spec['activity_text'].str.contains("1"), "1", "0")

#spec.to_csv('output/spec_tagged.csv')

  labelled['subspec_retina'] = np.where(spec['retina_text'].str.contains("1"), "1", "0")
  labelled['spec_haem'] = np.where(spec['haem_text'].str.contains("1"), "1", "0")
  labelled['spec_obs'] = np.where(spec['obs_text'].str.contains("1"), "1", "0")
  labelled['spec_renal'] = np.where(spec['renal_text'].str.contains("1"), "1", "0")
  labelled['subspec_ackd'] = np.where(spec['ackd_text'].str.contains("1"), "1", "0")
  labelled['spec_pubh'] = np.where(spec['pubh_text'].str.contains("1"), "1", "0")
  labelled['subspec_bci'] = np.where(spec['bci_text'].str.contains("1"), "1", "0")
  labelled['subspec_prosth'] = np.where(spec['prosth_text'].str.contains("1"), "1", "0")
  labelled['subspec_assist'] = np.where(spec['assist_text'].str.contains("1"), "1", "0")
  labelled['subspec_activity'] = np.where(spec['activity_text'].str.contains("1"), "1", "0")


In [131]:
## Why NER?
# non specific e.g. TB could be in the middle of a ward. NER recognises context
# words separate by unspecified distance -> lung and adenocarcinoma
## too many possible specific terms for subconditions e.g. lung adenocarcinoma, NSCLC -> adenocarcinoma of the lung

## Combination of general terms in main text
## NER for specific terms

## What are the most used **use-cases**
## Can we find what the prediction target is?

## Other Tags

In [132]:
#lmic_list = ['afghanistan', 'albania', 'algeria', 'angola', 'antigua', 'barbuda', 'argentina', 'armenia', 'china',
#             'azerbaijan', 'bangladesh', 'belarus', 'belize', 'benin', 'bhutan', 'bolivia', 'bosnia', 'herzegovina', 
#             'botswana', 'brazil', 'burkina', 'faso', 'burundi', 'verde', 'cambodia', 'cameroon', 'africa', 'chad', 
#             'colombia', 'comoros', 'congo', 'costa rica', 'ivoire', 'cuba', 'djibouti', 'dominica', 'dominica', 
#             'ecuador', 'egypt', 'salvador', 'guinea', 'eritrea', 'eswatini', 'ethiopia', 'fiji', 'gabon', 'gambia', 
#             'georgia', 'ghana', 'grenada', 'guatemala', 'guinea', 'guyana', 'haiti', 'honduras', 'india', 
#             'indonesia', 'iran', 'iraq', 'jamaica', 'jordan', 'kazakhstan', 'kenya', 'kiribati', 'dpr', 'north korea', 
#             'kosovo', 'kyrgyzstan', 'lao', 'lebanon', 'lesotho', 'liberia', 'libya', 'macedonia', 'madagascar', 'malawi', 
#             'malaysia', 'maldives', 'mali', 'marshall', 'mauritania', 'mauritius', 'mexico', 'micronesia', 'moldova', 
#             'mongolia', 'montenegro', 'montserrat', 'morocco', 'mozambique', 'myanmar', 'namibia', 'nauru', 'nepal', 
#             'nicaragua', 'niger', 'nigeria', 'niue', 'pakistan', 'palau', 'panama', 'papua', 'paraguay', 'peru', 
#             'philippines', 'rwanda', 'helena', 'samoa', 'príncipe', 'senegal', 'serbia', 'sierra leone', 'solomon', 
#             'somalia', 'south africa', 'sudan', 'sri lanka', 'saint lucia', 'saint vincent', 'grenadines', 'sudan', 
#             'suriname', 'syria', 'tajikistan', 'tanzania', 'thailand', 'timor', 'togo', 'tokelau', 'tonga', 'tunisia', 
#             'turkey', 'turkmenistan', 'tuvalu', 'uganda', 'ukraine', 'uzbekistan', 'vanuatu', 'venezuela', 'vietnam', 
#             'wallis', 'west bank', 'gaza', 'palestine', 'yemen', 'zambia', 'zimbabwe', 'low-income', 'middle-income', 
#             'lmic', 'scarce', 'resource limited', 'resource-limited']

## Final Tagged Dataset

In [133]:
#all_tagged = pd.concat([algo, feat, spec], axis=1)
#
print(len(labelled))

34179


In [134]:
labelled.info(verbose=1)

<class 'pandas.core.frame.DataFrame'>
Index: 34179 entries, 1 to 172538
Data columns (total 110 columns):
 #    Column               Dtype 
---   ------               ----- 
 0    pmid                 string
 1    doi                  string
 2    title                string
 3    abstract             string
 4    article_date         string
 5    pubmed_date          string
 6    article_type         string
 7    lang                 string
 8    journal              string
 9    journal_short        string
 10   journal_country      string
 11   authors              string
 12   author_affils        string
 13   keywords             string
 14   mesh_terms           string
 15   references_pmids     string
 16   feature              string
 17   include              string
 18   mature               string
 19   algo_neural_net      object
 20   algo_support_vector  object
 21   algo_regression      object
 22   algo_decision_tree   object
 23   algo_discriminant    object
 24   algo

In [135]:
labelled.head(10)

Unnamed: 0,pmid,doi,title,abstract,article_date,pubmed_date,article_type,lang,journal,journal_short,journal_country,authors,author_affils,keywords,mesh_terms,references_pmids,feature,include,mature,algo_neural_net,algo_support_vector,algo_regression,algo_decision_tree,algo_discriminant,algo_naive_bayes,algo_transfer,algo_federated,algo_k_nearest,algo_unsupervised,feat_xr,feat_ct,feat_mri,feat_eeg,feat_ecg,feat_emg,feat_us,feat_echo,feat_histo,feat_oct,feat_mamm,feat_endoscop,feat_derm,feat_gene,feat_bio,feat_nlp,feat_ehr,feat_sensor,feat_phone,feat_prom,feat_sound,subspec_icu,subspec_ed,spec_paeds,spec_dent,spec_audio,spec_id,subspec_sepsis,subspec_hiv,subspec_cov19,subspec_tb,subspec_malaria,subspec_tropic,spec_derm,subspec_dermca,spec_onc,subspec_rx,subspec_lungca,subspec_brainca,subspec_gica,subspec_hepca,subspec_prosca,subspec_gynonc,subspec_renalca,subspec_haemonc,subspec_breast,subspec_breastca,subspec_urology,spec_psych,subspec_suicide,spec_msk,subspec_frac,spec_rheum,spec_gi,spec_hep,spec_resp,subspec_pneum,subspec_osa,subspec_pe,spec_neuro,subspec_epilep,subspec_cva,subspec_alzh,spec_cvs,subspec_ihd,subspec_hf,subspec_arrhyt,spec_endo,spec_dm,subspec_insulin,spec_eye,subspec_retina,spec_haem,spec_obs,spec_renal,subspec_ackd,spec_pubh,subspec_bci,subspec_prosth,subspec_assist,subspec_activity
1,34688173,10.1016/j.compbiomed.2021.104924,A convolutional neural network trained with dermoscopic images of psoriasis performed on par with 230 dermatologists.,Psoriasis is a common chronic inflammatory skin disease that causes physical and psychological burden to patients. A Convolutional Neural Network (CNN) focused on dermoscopic images would substantially aid the classification and increase the accuracy of diagnosis of psoriasis.,2021-10-06,2021-10-24,Journal Article,eng,Computers in biology and medicine,Comput Biol Med,United States,"['Yang Yiguang', 'Wang Juncheng', 'Xie Fengying', 'Liu Jie', 'Shu Chang', 'Wang Yukun', 'Zheng Yushan', 'Zhang Haopeng']","['Image Processing Center, School of Astronautics, Beihang University, Beijing, 100191, China; Beijing Advanced Innovation Center for Biomedical Engineering, Beihang University, Beijing, 100191, China.', 'Department of Dermatology, State Key Laboratory of Complex Severe and Rare Diseases, Peking Union Medical College Hospital, Chinese Academy of Medical Science and Peking Union Medical College, National Clinical Research Center for Dermatologic and Immunologic Diseases, Beijing, 100730, China.', 'Image Processing Center, School of Astronautics, Beihang University, Beijing, 100191, China; Beijing Advanced Innovation Center for Biomedical Engineering, Beihang University, Beijing, 100191, China. Electronic address: xfy_73@buaa.edu.cn.', 'Department of Dermatology, State Key Laboratory of Complex Severe and Rare Diseases, Peking Union Medical College Hospital, Chinese Academy of Medical Science and Peking Union Medical College, National Clinical Research Center for Dermatologic and Immunologic Diseases, Beijing, 100730, China. Electronic address: Liujie04672@pumch.cn.', 'Department of Dermatology, State Key Laboratory of Complex Severe and Rare Diseases, Peking Union Medical College Hospital, Chinese Academy of Medical Science and Peking Union Medical College, National Clinical Research Center for Dermatologic and Immunologic Diseases, Beijing, 100730, China.', 'Department of Dermatology, State Key Laboratory of Complex Severe and Rare Diseases, Peking Union Medical College Hospital, Chinese Academy of Medical Science and Peking Union Medical College, National Clinical Research Center for Dermatologic and Immunologic Diseases, Beijing, 100730, China.', 'Image Processing Center, School of Astronautics, Beihang University, Beijing, 100191, China; Beijing Advanced Innovation Center for Biomedical Engineering, Beihang University, Beijing, 100191, China.', 'Image Processing Center, School of Astronautics, Beihang University, Beijing, 100191, China; Beijing Advanced Innovation Center for Biomedical Engineering, Beihang University, Beijing, 100191, China.']","['Convolutional neural networks', 'Deep-learning', 'Dermoscopic images', 'Papulosquamous skin diseases', 'Psoriasis']",,,A convolutional neural network trained with dermoscopic images of psoriasis performed on par with 230 dermatologists. Psoriasis is a common chronic inflammatory skin disease that causes physical and psychological burden to patients. A Convolutional Neural Network (CNN) focused on dermoscopic images would substantially aid the classification and increase the accuracy of diagnosis of psoriasis.,1.0,1.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,34688172,10.1016/j.compbiomed.2021.104927,A large margin piecewise linear classifier with fusion of deep features in the diagnosis of COVID-19.,"The world has experienced epidemics of coronavirus infections several times over the last two decades. Recent studies have shown that using medical imaging techniques can be useful in developing an automatic computer-aided diagnosis system to detect pandemic diseases with high accuracy at an early stage. In this study, a large margin piecewise linear classifier was developed to diagnose COVID-19 compared to a wide range of viral pneumonia, including SARS and MERS, using chest x-ray images. In the proposed method, a preprocessing pipeline was employed. Moreover, deep pre- and post-rectified linear unit (ReLU) features were extracted using the well-known VGG-Net19, which was fine-tuned to optimize transfer learning. Afterward, the canonical correlation analysis was performed for feature fusion, and fused deep features were passed into the LMPL classifier. The introduced method reached the highest performance in comparison with related state-of-the-art methods for two different schemes (normal, COVID-19, and typical viral pneumonia) and (COVID-19, SARS, and MERS pneumonia) with 99.39% and 98.86% classification accuracy, respectively.",2021-10-11,2021-10-24,Journal Article,eng,Computers in biology and medicine,Comput Biol Med,United States,"['Azouji Neda', 'Sami Ashkan', 'Taheri Mohammad', 'Müller Henning']","['Department of Computer Science and Engineering and IT, School of Electrical and Computer Engineering, Shiraz University, Shiraz, Iran. Electronic address: azouji@shirazu.ac.ir.', 'Department of Computer Science and Engineering and IT, School of Electrical and Computer Engineering, Shiraz University, Shiraz, Iran. Electronic address: sami@shirazu.ac.ir.', 'Department of Computer Science and Engineering and IT, School of Electrical and Computer Engineering, Shiraz University, Shiraz, Iran. Electronic address: motaheri@shirazu.ac.ir.', 'Department of Business Information Systems University of Applied Sciences Western Switzerland, Sierre (HES SO), Switzerland. Electronic address: henning.mueller@hevs.ch.']","['COVID-19', 'Computer-aided diagnosis (CAD)', 'Deep feature extraction', 'Large margin classifier', 'MERS', 'SARS', 'X-ray']",,,"A large margin piecewise linear classifier with fusion of deep features in the diagnosis of COVID-19. The world has experienced epidemics of coronavirus infections several times over the last two decades. Recent studies have shown that using medical imaging techniques can be useful in developing an automatic computer-aided diagnosis system to detect pandemic diseases with high accuracy at an early stage. In this study, a large margin piecewise linear classifier was developed to diagnose COVID-19 compared to a wide range of viral pneumonia, including SARS and MERS, using chest x-ray images. In the proposed method, a preprocessing pipeline was employed. Moreover, deep pre- and post-rectified linear unit (ReLU) features were extracted using the well-known VGG-Net19, which was fine-tuned to optimize transfer learning. Afterward, the canonical correlation analysis was performed for feature fusion, and fused deep features were passed into the LMPL classifier. The introduced method reached the highest performance in comparison with related state-of-the-art methods for two different schemes (normal, COVID-19, and typical viral pneumonia) and (COVID-19, SARS, and MERS pneumonia) with 99.39% and 98.86% classification accuracy, respectively.",1.0,0.0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,34687858,10.1016/j.neuroimage.2021.118652,Causal Decoding of Individual Cortical Excitability States.,"Brain responsiveness to stimulation fluctuates with rapidly shifting cortical excitability state, as reflected by oscillations in the electroencephalogram (EEG). For example, the amplitude of motor-evoked potentials (MEPs) elicited by transcranial magnetic stimulation (TMS) of motor cortex changes from trial to trial. To date, individual estimation of the cortical processes leading to this excitability fluctuation has not been possible. Here, we propose a data-driven method to derive individually optimized EEG classifiers in healthy humans using a supervised learning approach that relates pre-TMS EEG activity dynamics to MEP amplitude. Our approach enables considering multiple brain regions and frequency bands, without defining them a priori, whose compound phase-pattern information determines the excitability. The individualized classifier leads to an increased classification accuracy of cortical excitability states from 57% to 67% when compared to μ-oscillation phase extracted by standard fixed spatial filters. Results show that, for the used TMS protocol, excitability fluctuates predominantly in the μ-oscillation range, and relevant cortical areas cluster around the stimulated motor cortex, but between subjects there is variability in relevant power spectra, phases, and cortical regions. This novel decoding method allows causal investigation of the cortical excitability state, which is critical also for individualizing therapeutic brain stimulation.",2021-10-20,2021-10-24,Journal Article,eng,NeuroImage,Neuroimage,United States,"['Metsomaa J', 'Belardinelli P', 'Ermolova M', 'Ziemann U', 'Zrenner C']","['Department of Neurology & Stroke, University of Tübingen, Tübingen, Germany; Hertie Institute for Clinical Brain Research, University of Tübingen.', 'Department of Neurology & Stroke, University of Tübingen, Tübingen, Germany; Hertie Institute for Clinical Brain Research, University of Tübingen; CIMeC, Center for Mind-Brain Sciences, University of Trento, Italy.', 'Department of Neurology & Stroke, University of Tübingen, Tübingen, Germany; Hertie Institute for Clinical Brain Research, University of Tübingen.', 'Department of Neurology & Stroke, University of Tübingen, Tübingen, Germany; Hertie Institute for Clinical Brain Research, University of Tübingen. Electronic address: ulf.ziemann@uni-tuebingen.de.', 'Department of Neurology & Stroke, University of Tübingen, Tübingen, Germany; Hertie Institute for Clinical Brain Research, University of Tübingen; Temerty Centre for Therapeutic Brain Intervention, Centre for Addiction and Mental Health, and Department of Psychiatry, University of Toronto, Toronto, ON, Canada.']","['EEG', 'TMS', 'brain state', 'classification', 'excitability', 'machine learning']",,,"Causal Decoding of Individual Cortical Excitability States. Brain responsiveness to stimulation fluctuates with rapidly shifting cortical excitability state, as reflected by oscillations in the electroencephalogram (EEG). For example, the amplitude of motor-evoked potentials (MEPs) elicited by transcranial magnetic stimulation (TMS) of motor cortex changes from trial to trial. To date, individual estimation of the cortical processes leading to this excitability fluctuation has not been possible. Here, we propose a data-driven method to derive individually optimized EEG classifiers in healthy humans using a supervised learning approach that relates pre-TMS EEG activity dynamics to MEP amplitude. Our approach enables considering multiple brain regions and frequency bands, without defining them a priori, whose compound phase-pattern information determines the excitability. The individualized classifier leads to an increased classification accuracy of cortical excitability states from 57% to 67% when compared to μ-oscillation phase extracted by standard fixed spatial filters. Results show that, for the used TMS protocol, excitability fluctuates predominantly in the μ-oscillation range, and relevant cortical areas cluster around the stimulated motor cortex, but between subjects there is variability in relevant power spectra, phases, and cortical regions. This novel decoding method allows causal investigation of the cortical excitability state, which is critical also for individualizing therapeutic brain stimulation.",1.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,34687853,10.1016/j.mri.2021.10.024,Radiomic machine learning for pretreatment assessment of prognostic risk factors for endometrial cancer and its effects on radiologists' decisions of deep myometrial invasion.,To evaluate radiomic machine learning (ML) classifiers based on multiparametric magnetic resonance images (MRI) in pretreatment assessment of endometrial cancer (EC) risk factors and to examine effects on radiologists' interpretation of deep myometrial invasion (dMI).,2021-10-20,2021-10-24,Journal Article,eng,Magnetic resonance imaging,Magn Reson Imaging,Netherlands,"['Otani Satoshi', 'Himoto Yuki', 'Nishio Mizuho', 'Fujimoto Koji', 'Moribata Yusaku', 'Yakami Masahiro', 'Kurata Yasuhisa', 'Hamanishi Junzo', 'Ueda Akihiko', 'Minamiguchi Sachiko', 'Mandai Masaki', 'Kido Aki']","['Department of Diagnostic Imaging and Nuclear Medicine, Graduate School of Medicine, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Diagnostic Radiology and Nuclear Medicine, Kyoto University Hospital, Kyoto 606-8507, Japan. Electronic address: yhimoto@kuhp.kyoto-u.ac.jp.', 'Department of Diagnostic Imaging and Nuclear Medicine, Graduate School of Medicine, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Real World Data Research and Developmentx, Graduate School of Medicine, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Diagnostic Radiology and Nuclear Medicine, Kyoto University Hospital, Kyoto 606-8507, Japan; Preemptive Medicine and Lifestyle-related Disease Research Center, Kyoto University Hospital, Kyoto 606-8507, Japan.', 'Preemptive Medicine and Lifestyle-related Disease Research Center, Kyoto University Hospital, Kyoto 606-8507, Japan.', 'Department of Diagnostic Radiology and Nuclear Medicine, Kyoto University Hospital, Kyoto 606-8507, Japan.', 'Department of Gynecology and Obstetrics, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Gynecology and Obstetrics, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Diagnostic Pathology, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Gynecology and Obstetrics, Kyoto University, Kyoto 606-8507, Japan.', 'Department of Diagnostic Radiology and Nuclear Medicine, Kyoto University Hospital, Kyoto 606-8507, Japan.']","['Endometrial cancer', 'Radiomic machine learning']",,,Radiomic machine learning for pretreatment assessment of prognostic risk factors for endometrial cancer and its effects on radiologists' decisions of deep myometrial invasion. To evaluate radiomic machine learning (ML) classifiers based on multiparametric magnetic resonance images (MRI) in pretreatment assessment of endometrial cancer (EC) risk factors and to examine effects on radiologists' interpretation of deep myometrial invasion (dMI).,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
10,34687850,10.1016/j.mri.2021.10.023,MRI-based machine learning for determining quantitative and qualitative characteristics affecting the survival of glioblastoma multiforme.,Our current study aims to consider the image biomarkers extracted from the MRI images for exploring their effects on glioblastoma multiforme (GBM) patients' survival. Determining its biomarker helps better manage the disease and evaluate treatments. It has been proven that imaging features could be used as a biomarker. The purpose of this study is to investigate the features in MRI and clinical features as the biomarker association of survival of GBM.,2021-10-20,2021-10-24,Journal Article,eng,Magnetic resonance imaging,Magn Reson Imaging,Netherlands,"['Jajroudi Mahdie', 'Enferadi Milad', 'Homayoun Amir Azar', 'Reiazi Reza']","['Pharmaceutical Research Center, Mashhad University of Medical Sciences, Mashhad, Iran. Electronic address: Jajroudimh991@mums.ac.ir.', 'Research Center for Nuclear Medicine, Shariati Hospital, Tehran University of Medical Sciences, Tehran, Iran.', 'Sina Trauma Research Center, Tehran University of Medical Sciences, Tehran, Iran.', 'Radiation Medicine Program, Princess Margaret Cancer Centre, University Health Network, Toronto, Ontario, Canada. Electronic address: reza.reiazi@uhnresearch.ca.']","['Biomarker', 'Clinical features', 'Glioblastoma multiforme', 'MRI features', 'Machine learning']",,,MRI-based machine learning for determining quantitative and qualitative characteristics affecting the survival of glioblastoma multiforme. Our current study aims to consider the image biomarkers extracted from the MRI images for exploring their effects on glioblastoma multiforme (GBM) patients' survival. Determining its biomarker helps better manage the disease and evaluate treatments. It has been proven that imaging features could be used as a biomarker. The purpose of this study is to investigate the features in MRI and clinical features as the biomarker association of survival of GBM.,1.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14,34687347,10.1007/s00330-021-08284-z,Automated detection of the contrast phase in MDCT by an artificial neural network improves the accuracy of opportunistic bone mineral density measurements.,To determine the accuracy of an artificial neural network (ANN) for fully automated detection of the presence and phase of iodinated contrast agent in routine abdominal multidetector computed tomography (MDCT) scans and evaluate the effect of contrast correction for osteoporosis screening.,2021-10-23,2021-10-24,Journal Article,eng,European radiology,Eur Radiol,Germany,"['Rühling Sebastian', 'Navarro Fernando', 'Sekuboyina Anjany', 'El Husseini Malek', 'Baum Thomas', 'Menze Bjoern', 'Braren Rickmer', 'Zimmer Claus', 'Kirschke Jan S']","['Department of Neuroradiology, School of Medicine, Klinikum rechts der Isar, Technical University of Munich, Ismaninger Str 22, 81675, Munich, Germany.', 'Department of Neuroradiology, School of Medicine, Klinikum rechts der Isar, Technical University of Munich, Ismaninger Str 22, 81675, Munich, Germany.', 'Department of Neuroradiology, School of Medicine, Klinikum rechts der Isar, Technical University of Munich, Ismaninger Str 22, 81675, Munich, Germany.', 'Department of Neuroradiology, School of Medicine, Klinikum rechts der Isar, Technical University of Munich, Ismaninger Str 22, 81675, Munich, Germany.', 'Department of Neuroradiology, School of Medicine, Klinikum rechts der Isar, Technical University of Munich, Ismaninger Str 22, 81675, Munich, Germany.', 'Department of Informatics, Technical University of Munich, Munich, Germany.', 'Department of Diagnostic and Interventional Radiology, School of Medicine, Klinikum rechts der Isar, Technical University of Munich, Munich, Germany.', 'Department of Neuroradiology, School of Medicine, Klinikum rechts der Isar, Technical University of Munich, Ismaninger Str 22, 81675, Munich, Germany.', 'Department of Neuroradiology, School of Medicine, Klinikum rechts der Isar, Technical University of Munich, Ismaninger Str 22, 81675, Munich, Germany. jan.kirschke@tum.de.']","['Bone density', 'Machine learning', 'Multidetector computed tomography', 'Osteoporosis', 'Screening']",,,Automated detection of the contrast phase in MDCT by an artificial neural network improves the accuracy of opportunistic bone mineral density measurements. To determine the accuracy of an artificial neural network (ANN) for fully automated detection of the presence and phase of iodinated contrast agent in routine abdominal multidetector computed tomography (MDCT) scans and evaluate the effect of contrast correction for osteoporosis screening.,1.0,0.0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21,34686914,10.1007/s00467-021-05321-3,Posterior Urethral Valves Outcomes Prediction (PUVOP): a machine learning tool to predict clinically relevant outcomes in boys with posterior urethral valves.,Early kidney and anatomic features may be predictive of future progression and need for additional procedures in patients with posterior urethral valve (PUV). The objective of this study was to use machine learning (ML) to predict clinically relevant outcomes in these patients.,2021-10-22,2021-10-24,Journal Article,eng,"Pediatric nephrology (Berlin, Germany)",Pediatr Nephrol,Germany,"['Kwong Jethro Cc', 'Khondker Adree', 'Kim Jin Kyu', 'Chua Michael', 'Keefe Daniel T', 'Dos Santos Joana', 'Skreta Marta', 'Erdman Lauren', ""D'Souza Neeta"", 'Selman Antoine Fermin', 'Weaver John', 'Weiss Dana A', 'Long Christopher', 'Tasian Gregory', 'Teoh Chia Wei', 'Rickard Mandy', 'Lorenzo Armando J']","['Division of Urology, Department of Surgery, University of Toronto, Toronto, ON, Canada.', 'Division of Urology, Department of Surgery, Hospital for Sick Children, 555 University Avenue, Toronto, ON, M5G 1X8, Canada.', 'Division of Urology, Department of Surgery, University of Toronto, Toronto, ON, Canada.', 'Division of Urology, Department of Surgery, Hospital for Sick Children, 555 University Avenue, Toronto, ON, M5G 1X8, Canada.', 'Division of Urology, Department of Surgery, Hospital for Sick Children, 555 University Avenue, Toronto, ON, M5G 1X8, Canada.', 'Division of Urology, Department of Surgery, Hospital for Sick Children, 555 University Avenue, Toronto, ON, M5G 1X8, Canada.', 'Centre for Computational Medicine, The Hospital for Sick Children, Toronto, ON, Canada.', 'Centre for Computational Medicine, The Hospital for Sick Children, Toronto, ON, Canada.', ""Division of Urology, Children's Hospital of Philadelphia, Philadelphia, PA, USA."", ""Division of Urology, Children's Hospital of Philadelphia, Philadelphia, PA, USA."", ""Division of Urology, Children's Hospital of Philadelphia, Philadelphia, PA, USA."", ""Division of Urology, Children's Hospital of Philadelphia, Philadelphia, PA, USA."", ""Division of Urology, Children's Hospital of Philadelphia, Philadelphia, PA, USA."", ""Division of Urology, Children's Hospital of Philadelphia, Philadelphia, PA, USA."", 'Division of Nephrology, Hospital for Sick Children, Toronto, ON, Canada.', 'Division of Urology, Department of Surgery, Hospital for Sick Children, 555 University Avenue, Toronto, ON, M5G 1X8, Canada.', 'Division of Urology, Department of Surgery, University of Toronto, Toronto, ON, Canada. armando.lorenzo@sickkids.ca.']","['Catheterization', 'Chronic kidney disease', 'Dialysis', 'Machine learning', 'Posterior urethral valve', 'Transplant']",,,Posterior Urethral Valves Outcomes Prediction (PUVOP): a machine learning tool to predict clinically relevant outcomes in boys with posterior urethral valves. Early kidney and anatomic features may be predictive of future progression and need for additional procedures in patients with posterior urethral valve (PUV). The objective of this study was to use machine learning (ML) to predict clinically relevant outcomes in these patients.,1.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
25,34686815,10.1038/s41415-021-3526-6,The ADEPT study: a comparative study of dentists' ability to detect enamel-only proximal caries in bitewing radiographs with and without the use of AssistDent artificial intelligence software.,"Introduction Reversal of enamel-only proximal caries by non-invasive treatments is important in preventive dentistry. However, detecting such caries using bitewing radiography is difficult and the subtle patterns are often missed by dental practitioners.Aims To investigate whether the ability of dentists to detect enamel-only proximal caries is enhanced by the use of AssistDent artificial intelligence (AI) software.Materials and methods In the ADEPT (AssistDent Enamel-only Proximal caries assessmenT) study, 23 dentists were randomly divided into a control arm, without AI assistance, and an experimental arm, in which AI assistance provided on-screen prompts indicating potential enamel-only proximal caries. All participants analysed a set of 24 bitewings in which an expert panel had previously identified 65 enamel-only carious lesions and 241 healthy proximal surfaces.Results The control group found 44.3% of the caries, whereas the experimental group found 75.8%. The experimental group incorrectly identified caries in 14.6% of the healthy surfaces compared to 3.7% in the control group. The increase in sensitivity of 71% and decrease in specificity of 11% are statistically significant (p <0.01).Conclusions AssistDent AI software significantly improves dentists' ability to detect enamel-only proximal caries and could be considered as a tool to support preventive dentistry in general practice.",2021-10-22,2021-10-24,Journal Article,eng,British dental journal,Br Dent J,England,"['Devlin Hugh', 'Williams Tomos', 'Graham Jim', 'Ashley Martin']","['Professor of Restorative Dentistry, Division of Dentistry, School of Medical Sciences, University of Manchester, UK; Director, Manchester Imaging Ltd, UK.', 'Honorary Research Assistant, Division of Dentistry, School of Medical Sciences, University of Manchester, UK; Software Manager, Manchester Imaging Ltd, UK. tomos.williams@manchester.ac.uk.', 'Director, Manchester Imaging Ltd, UK; Honorary Reader, Division of Informatics, Imaging and Data Sciences, School of Health Sciences, University of Manchester, UK.', 'Consultant and MAHSC Honorary Professor in Restorative Dentistry and Oral Health, University Dental Hospital of Manchester, Manchester University NHS Foundation Trust, UK.']",,,,"The ADEPT study: a comparative study of dentists' ability to detect enamel-only proximal caries in bitewing radiographs with and without the use of AssistDent artificial intelligence software. Introduction Reversal of enamel-only proximal caries by non-invasive treatments is important in preventive dentistry. However, detecting such caries using bitewing radiography is difficult and the subtle patterns are often missed by dental practitioners.Aims To investigate whether the ability of dentists to detect enamel-only proximal caries is enhanced by the use of AssistDent artificial intelligence (AI) software.Materials and methods In the ADEPT (AssistDent Enamel-only Proximal caries assessmenT) study, 23 dentists were randomly divided into a control arm, without AI assistance, and an experimental arm, in which AI assistance provided on-screen prompts indicating potential enamel-only proximal caries. All participants analysed a set of 24 bitewings in which an expert panel had previously identified 65 enamel-only carious lesions and 241 healthy proximal surfaces.Results The control group found 44.3% of the caries, whereas the experimental group found 75.8%. The experimental group incorrectly identified caries in 14.6% of the healthy surfaces compared to 3.7% in the control group. The increase in sensitivity of 71% and decrease in specificity of 11% are statistically significant (p <0.01).Conclusions AssistDent AI software significantly improves dentists' ability to detect enamel-only proximal caries and could be considered as a tool to support preventive dentistry in general practice.",1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
29,34686646,10.1097/CMR.0000000000000774,Machine learning for the identification of decision boundaries during the transition from radial to vertical growth phase superficial spreading melanomas.,"To compute threshold values for the diameter of superficial spreading melanomas (SSMs) at which the radial growth phase (RGP) evolves into an invasive vertical growth phase (VGP). We examined reports from 1995 to 2019 of 834 primary SSMs. All the patients underwent complete surgical removal of the tumor and the diagnosis was confirmed after histologic examination. Machine learning was used to compute the thresholds. For invasive non-naevus-associated SSMs, a threshold for the diameter was found at 13.2 mm (n = 634). For the lower limb (n = 209) the threshold was at 9.8 mm, whereas for the upper limb (n = 117) at 14.1 mm. For the back (n = 106) and the trunk (n = 173), the threshold was at 16.2 mm and 17.1 mm, respectively. When considering non-naevus-associated and naevus-associated SSMs together (n = 834) a threshold for the diameter was found at 16.8 mm. For the lower limb (n = 248) the threshold was at 11.7 mm, whereas for the upper limb (n = 146) at 16.4 mm. For the back (n = 170) and the trunk (n = 236), the threshold was at 18.6 mm and 14.1 mm, respectively. Thresholds for various anatomic locations and for each gender were defined. They were based on the diameter of the melanoma and computed to suggest a transition from RGP to VGP. The transition from a radial to a more invasive vertical phase is detected by an increase of tumor size with a numeric cutoff. Besides the anamnestic, clinical and dermatoscopic findings, our proposed approach may have practical relevance in vivo during clinical presurgical inspections.",2021-10-21,2021-10-24,Journal Article,eng,Melanoma research,Melanoma Res,England,"['Moglia Andrea', 'Cerri Amilcare', 'Moglia Alessandra', 'Berchiolli Raffaella', 'Ferrari Mauro', 'Betti Roberto']",,,,,"Machine learning for the identification of decision boundaries during the transition from radial to vertical growth phase superficial spreading melanomas. To compute threshold values for the diameter of superficial spreading melanomas (SSMs) at which the radial growth phase (RGP) evolves into an invasive vertical growth phase (VGP). We examined reports from 1995 to 2019 of 834 primary SSMs. All the patients underwent complete surgical removal of the tumor and the diagnosis was confirmed after histologic examination. Machine learning was used to compute the thresholds. For invasive non-naevus-associated SSMs, a threshold for the diameter was found at 13.2 mm (n = 634). For the lower limb (n = 209) the threshold was at 9.8 mm, whereas for the upper limb (n = 117) at 14.1 mm. For the back (n = 106) and the trunk (n = 173), the threshold was at 16.2 mm and 17.1 mm, respectively. When considering non-naevus-associated and naevus-associated SSMs together (n = 834) a threshold for the diameter was found at 16.8 mm. For the lower limb (n = 248) the threshold was at 11.7 mm, whereas for the upper limb (n = 146) at 16.4 mm. For the back (n = 170) and the trunk (n = 236), the threshold was at 18.6 mm and 14.1 mm, respectively. Thresholds for various anatomic locations and for each gender were defined. They were based on the diameter of the melanoma and computed to suggest a transition from RGP to VGP. The transition from a radial to a more invasive vertical phase is detected by an increase of tumor size with a numeric cutoff. Besides the anamnestic, clinical and dermatoscopic findings, our proposed approach may have practical relevance in vivo during clinical presurgical inspections.",1.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
31,34686573,10.1136/neurintsurg-2021-017976,"Prediction of bleb formation in intracranial aneurysms using machine learning models based on aneurysm hemodynamics, geometry, location, and patient population.",Bleb presence in intracranial aneurysms (IAs) is a known indication of instability and vulnerability.,2021-10-22,2021-10-24,Journal Article,eng,Journal of neurointerventional surgery,J Neurointerv Surg,England,"['Salimi Ashkezari Seyedeh Fatemeh', 'Mut Fernando', 'Slawski Martin', 'Cheng Boyle', 'Yu Alexander K', 'White Tim G', 'Woo Henry H', 'Koch Matthew J', 'Amin-Hanjani Sepideh', 'Charbel Fady T', 'Rezai Jahromi Behnam', 'Niemelä Mika', 'Koivisto Timo', 'Frosen Juhana', 'Tobe Yasutaka', 'Maiti Spandan', 'Robertson Anne M', 'Cebral Juan R']","['Department of Bioengineering, George Mason University, Fairfax, Virginia, USA ssalimia@gmu.edu.', 'Department of Bioengineering, George Mason University, Fairfax, Virginia, USA.', 'Department of Statistics, George Mason University, Fairfax, Virginia, USA.', 'Department of Neurosurgery, Allegheny General Hospital, Pittsburgh, Pennsylvania, USA.', 'Department of Neurosurgery, Allegheny General Hospital, Pittsburgh, Pennsylvania, USA.', 'Department of Neurosurgery, Donald and Barbara Zucker School of Medicine at Hofstra/Northwell, Manhasset, New York, USA.', 'Department of Neurosurgery, Donald and Barbara Zucker School of Medicine at Hofstra/Northwell, Manhasset, New York, USA.', 'Department of Neurosurgery, University of Illinois at Chicago, Chicago, Illinois, USA.', 'Department of Neurosurgery, University of Illinois at Chicago, Chicago, Illinois, USA.', 'Department of Neurosurgery, University of Illinois at Chicago, Chicago, Illinois, USA.', 'Neurosurgery Research Group, Biomedicum Helsinki, University of Helsinki, Helsinki, Uusimaa, Finland.', 'Department of Neurosurgery, Töölö Hospital, University of Helsinki, Helsinki, Finland.', 'Department of Neurosurgery, Kuopio University Hospital, Kuopio, Pohjois-Savo, Finland.', 'Department of Neurosurgery, Tampere University Hospital, Tampere, Finland.', 'Department of Mechanical Engineering and Material Science, University of Pittsburgh, Pittsburgh, Pennsylvania, USA.', 'Department of Mechanical Engineering and Material Science, University of Pittsburgh, Pittsburgh, Pennsylvania, USA.', 'Department of Mechanical Engineering and Material Science, University of Pittsburgh, Pittsburgh, Pennsylvania, USA.', 'Department of Bioengineering, George Mason University, Fairfax, Virginia, USA.']","['aneurysm', 'blood flow', 'hemorrhage', 'statistics']",,,"Prediction of bleb formation in intracranial aneurysms using machine learning models based on aneurysm hemodynamics, geometry, location, and patient population. Bleb presence in intracranial aneurysms (IAs) is a known indication of instability and vulnerability.",1.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [136]:
#final_ner = all_tagged[['text', 
#                        'algo_neural_net', 'algo_support_vector', 'algo_regression', 'algo_decision_tree', 
#                       'algo_discriminant', 'algo_naive_bayes', 'algo_transfer', 'algo_federated', 'algo_k_nearest',
#                       'algo_unsupervised',
#                        'feat_imaging', 'feat_xr', 'feat_ct', 'feat_mri', 'feat_eeg', 'feat_ecg',
#                       'feat_us', 'feat_echo', 'feat_histo', 'feat_oct', 'feat_mamm', 'feat_endoscop', 'feat_derm',
#                       'feat_gene', 'feat_bio', 'feat_nlp', 'feat_ehr', 'feat_sensor', 'feat_phone', 
#                        'subspec_icu', 'subspec_ed', 'spec_id', 'subspec_sepsis', 'subspec_hiv', 'subspec_cov19', 'subspec_tb',
#                       'subspec_malaria', 'spec_derm', 'subspec_dermca', 'spec_onc', 'subspec_rx', 'subspec_gynonc', 
#                       'subspec_lungca', 'subspec_brainca', 'subspec_gica', 'subspec_hepca', 'subspec_prosca',
#                       'subspec_renalca', 'subspec_haemonc', 'subspec_breast', 'spec_psych','subspec_suicide', 'spec_msk', 
#                        'subspec_frac', 'spec_rheum', 'spec_gi', 'spec_hep', 'spec_resp', 'subspec_pneum',
#                        'spec_neuro', 'subspec_epilep', 'subspec_cva', 'subspec_alzh', 'spec_cvs', 'subspec_ihd', 'subspec_hf', 
#                       'spec_endo', 'subspec_dm', 'spec_eye', 'subspec_retina', 'spec_haem', 'spec_obs', 'spec_renal', 
#                        'subspec_ackd', 'spec_paeds', 'spec_dent',  'spec_audio', 'spec_pubh', 'subspec_bci',
#                       'subspec_prosth', 'subspec_assist','subspec_activity', 'subspec_arrhyt', 'countries', 'lmic_flag']].copy()
#
#final_ner.to_csv('output/final_ner.csv')

In [137]:
labelled.to_csv('data/char_labelled.csv')

## Evaluation

In [138]:
ner_eval = labelled.drop(['doi', 'title', 'abstract', 'article_date', 'pubmed_date', 'article_type', 'lang', 'journal', 'journal_short',
                         'journal_country', 'authors', 'author_affils', 'keywords', 'mesh_terms', 'references_pmids', 'include', 'mature'], axis=1)

In [139]:
ner_eval.head(3)

Unnamed: 0,pmid,feature,algo_neural_net,algo_support_vector,algo_regression,algo_decision_tree,algo_discriminant,algo_naive_bayes,algo_transfer,algo_federated,algo_k_nearest,algo_unsupervised,feat_xr,feat_ct,feat_mri,feat_eeg,feat_ecg,feat_emg,feat_us,feat_echo,feat_histo,feat_oct,feat_mamm,feat_endoscop,feat_derm,feat_gene,feat_bio,feat_nlp,feat_ehr,feat_sensor,feat_phone,feat_prom,feat_sound,subspec_icu,subspec_ed,spec_paeds,spec_dent,spec_audio,spec_id,subspec_sepsis,subspec_hiv,subspec_cov19,subspec_tb,subspec_malaria,subspec_tropic,spec_derm,subspec_dermca,spec_onc,subspec_rx,subspec_lungca,subspec_brainca,subspec_gica,subspec_hepca,subspec_prosca,subspec_gynonc,subspec_renalca,subspec_haemonc,subspec_breast,subspec_breastca,subspec_urology,spec_psych,subspec_suicide,spec_msk,subspec_frac,spec_rheum,spec_gi,spec_hep,spec_resp,subspec_pneum,subspec_osa,subspec_pe,spec_neuro,subspec_epilep,subspec_cva,subspec_alzh,spec_cvs,subspec_ihd,subspec_hf,subspec_arrhyt,spec_endo,spec_dm,subspec_insulin,spec_eye,subspec_retina,spec_haem,spec_obs,spec_renal,subspec_ackd,spec_pubh,subspec_bci,subspec_prosth,subspec_assist,subspec_activity
1,34688173,A convolutional neural network trained with dermoscopic images of psoriasis performed on par with 230 dermatologists. Psoriasis is a common chronic inflammatory skin disease that causes physical and psychological burden to patients. A Convolutional Neural Network (CNN) focused on dermoscopic images would substantially aid the classification and increase the accuracy of diagnosis of psoriasis.,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,34688172,"A large margin piecewise linear classifier with fusion of deep features in the diagnosis of COVID-19. The world has experienced epidemics of coronavirus infections several times over the last two decades. Recent studies have shown that using medical imaging techniques can be useful in developing an automatic computer-aided diagnosis system to detect pandemic diseases with high accuracy at an early stage. In this study, a large margin piecewise linear classifier was developed to diagnose COVID-19 compared to a wide range of viral pneumonia, including SARS and MERS, using chest x-ray images. In the proposed method, a preprocessing pipeline was employed. Moreover, deep pre- and post-rectified linear unit (ReLU) features were extracted using the well-known VGG-Net19, which was fine-tuned to optimize transfer learning. Afterward, the canonical correlation analysis was performed for feature fusion, and fused deep features were passed into the LMPL classifier. The introduced method reached the highest performance in comparison with related state-of-the-art methods for two different schemes (normal, COVID-19, and typical viral pneumonia) and (COVID-19, SARS, and MERS pneumonia) with 99.39% and 98.86% classification accuracy, respectively.",0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,34687858,"Causal Decoding of Individual Cortical Excitability States. Brain responsiveness to stimulation fluctuates with rapidly shifting cortical excitability state, as reflected by oscillations in the electroencephalogram (EEG). For example, the amplitude of motor-evoked potentials (MEPs) elicited by transcranial magnetic stimulation (TMS) of motor cortex changes from trial to trial. To date, individual estimation of the cortical processes leading to this excitability fluctuation has not been possible. Here, we propose a data-driven method to derive individually optimized EEG classifiers in healthy humans using a supervised learning approach that relates pre-TMS EEG activity dynamics to MEP amplitude. Our approach enables considering multiple brain regions and frequency bands, without defining them a priori, whose compound phase-pattern information determines the excitability. The individualized classifier leads to an increased classification accuracy of cortical excitability states from 57% to 67% when compared to μ-oscillation phase extracted by standard fixed spatial filters. Results show that, for the used TMS protocol, excitability fluctuates predominantly in the μ-oscillation range, and relevant cortical areas cluster around the stimulated motor cortex, but between subjects there is variability in relevant power spectra, phases, and cortical regions. This novel decoding method allows causal investigation of the cortical excitability state, which is critical also for individualizing therapeutic brain stimulation.",0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [140]:
nerdata = ner_eval.apply(lambda s: [s.name if v == "1" else np.nan for v in s])

In [141]:
nerdata.head(3)

Unnamed: 0,pmid,feature,algo_neural_net,algo_support_vector,algo_regression,algo_decision_tree,algo_discriminant,algo_naive_bayes,algo_transfer,algo_federated,algo_k_nearest,algo_unsupervised,feat_xr,feat_ct,feat_mri,feat_eeg,feat_ecg,feat_emg,feat_us,feat_echo,feat_histo,feat_oct,feat_mamm,feat_endoscop,feat_derm,feat_gene,feat_bio,feat_nlp,feat_ehr,feat_sensor,feat_phone,feat_prom,feat_sound,subspec_icu,subspec_ed,spec_paeds,spec_dent,spec_audio,spec_id,subspec_sepsis,subspec_hiv,subspec_cov19,subspec_tb,subspec_malaria,subspec_tropic,spec_derm,subspec_dermca,spec_onc,subspec_rx,subspec_lungca,subspec_brainca,subspec_gica,subspec_hepca,subspec_prosca,subspec_gynonc,subspec_renalca,subspec_haemonc,subspec_breast,subspec_breastca,subspec_urology,spec_psych,subspec_suicide,spec_msk,subspec_frac,spec_rheum,spec_gi,spec_hep,spec_resp,subspec_pneum,subspec_osa,subspec_pe,spec_neuro,subspec_epilep,subspec_cva,subspec_alzh,spec_cvs,subspec_ihd,subspec_hf,subspec_arrhyt,spec_endo,spec_dm,subspec_insulin,spec_eye,subspec_retina,spec_haem,spec_obs,spec_renal,subspec_ackd,spec_pubh,subspec_bci,subspec_prosth,subspec_assist,subspec_activity
1,,,algo_neural_net,,,,,,,,,,,,,,,,,,,,,,feat_derm,,,,,,,,,,,,,,,,,,,,,spec_derm,,,,,,,,,,,,,,,spec_psych,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,algo_transfer,,,,feat_xr,,,,,,,,,,,,,,,,,,,,,,,,,,spec_id,,,subspec_cov19,,,,,,,,,,,,,,,,,,,,,,,,,,spec_resp,subspec_pneum,,,,,,,,,,,,,,,,,,,,,,,,
8,,,,,,,,,,,,,,,,feat_eeg,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,spec_neuro,,,,,,,,,,,,,,,,,,,,,


In [142]:
ner_eval['result'] = nerdata.apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)

In [143]:
ner_eval.to_csv('data/char_labelled_evaluation.csv')