In [9]:
import numpy as np, pandas as pd, re
from time import sleep

In [2]:
from Bio import Entrez, Medline

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.neural_network import MLPClassifier

In [12]:
from typing import Dict, List, Tuple

In [5]:
# Load the stop-word list from a single comma-separated file.
# Each entry is stripped of surrounding whitespace (e.g. a newline at the end
# of the file or spaces after the commas): the tokens it is compared against
# are pure [a-z] runs, so an unstripped entry could never match.  Empty
# entries produced by stray commas are discarded for the same reason.
with open( 'stopset.txt' ) as file :
    stop_words = set(
        entry.strip() for entry in file.read().split( ',' ) if entry.strip()
        )

In [13]:
# type aliases used by the annotations in this notebook
Count = int                               # number of occurrences of a word
Array = List                              # generic sequence / array
Document = str                            # text of one abstract
Label = str                               # class label of one abstract
Word = str                                # one vocabulary entry
Data = Array[ Tuple[ Document, Label ] ]  # ( document, label ) rows

In [14]:
# Load the labelled abstracts from a pipe-separated file without a header row.
# Every field is read as a string and the frame is converted to a NumPy array;
# later cells read column 0 as the document text and column 1 as the label.
data : Data = pd.read_csv( 'abstracts.psv',  sep = '|', header = None, dtype = str ).to_numpy()

In [11]:
def vocabulary( data : Data ) -> List[ Word ]:
    '''
    Build the Feature Vocabulary of a Dataset

    Counts the words of the dataset with `extract`, filters the counts
    with `removeFirstQ` and returns the remaining words as a list, in
    the iteration order of the filtered dictionary.
    '''
    word_counts = extract( data )
    kept_counts = removeFirstQ( word_counts )
    return [ word for word in kept_counts ]

def extract( data : Data ) -> Dict[ Word, Count ] :
    '''
    Count Word Occurrences across a Dataset

    A word is a run of two or more lowercase ASCII letters delimited by
    word boundaries.  Words found in the global `stop_words` set are
    skipped; labels are ignored.  Returns a dictionary mapping every
    remaining word to its total number of occurrences in all documents.
    '''
    token = re.compile( '\\b[a-z]{2,}\\b' )
    counts = {}
    for document, _ in data :
        for word in token.findall( document ) :
            if word in stop_words :
                continue
            counts[ word ] = counts.get( word, 0 ) + 1
    return counts

def removeFirstQ( vocabulary : Dict[ Word, Count ] ) -> Dict[ Word, Count ] :
    '''
    Remove First Quartile in Vocabulary

    The cut-off is read at the first-quartile position of the sorted
    *distinct* count values; when that position is fractional, the mean
    of the two neighbouring values is used.  Every word whose count is
    strictly below the cut-off is deleted.

    The dictionary is modified in place and also returned.  An empty
    vocabulary, or one in which every word has the same count, is
    returned unchanged (both cases used to raise IndexError).
    '''
    distinct = sorted( set( vocabulary.values() ) )
    if len( distinct ) < 2 :
        # nothing to rank against: no word can fall below the cut-off
        return vocabulary
    position = len( distinct ) / 4
    index = int( position ) # truncate
    if position % 1 == 0 :
        limit = distinct[ index ]
    else :
        limit = ( distinct[ index ] + distinct[ index + 1 ] ) / 2
    # collect the words first so the dictionary is not resized while iterated
    for word in [ w for w, n in vocabulary.items() if n < limit ] :
        del vocabulary[ word ]
    return vocabulary

In [12]:
# feature list: the filtered vocabulary of the labelled abstracts; its order
# defines the column order of the boolean feature matrices built from it
features : List[ Word ] = vocabulary( data )

In [74]:
# training array: one row per document, one boolean column per feature word
# (the builtin `bool` is used as dtype because the `np.bool` alias does not
# exist in NumPy 1.24 - 1.26; `np.zeros` avoids uninitialised memory)
X : Array[ Array[ bool ] ] = np.zeros(
    shape = ( len( data ), len( features ) ),
    dtype = bool
    )
# target vector: True where the document is labelled 'positive'
Y : Array[ bool ] = np.zeros(
    shape = ( len( data ), ),
    dtype = bool
    )
# populating arrays
for i in range( len( data ) ) :
    # a set makes each membership test below O(1) instead of a list scan
    doc_vocab = set(
        re.findall(
            pattern = '\\b[a-z]{2,}\\b',
            string = data[ i, 0 ]
            )
        )
    # X[ i, j ] is True when feature word j occurs in document i
    X[ i ] = [ word in doc_vocab for word in features ]
    Y[ i ] = data[ i, 1 ] == 'positive'

In [14]:
# Ensemble members, keyed by a human-readable name.  Every model is trained on
# the same boolean bag-of-words matrix and later casts one vote per example.
classifier = {
    'multilayerd perceptron' : MLPClassifier(
        # one hidden layer of 100 units; written as a real one-element tuple
        # ( `( 100 )` is just the integer 100 )
        hidden_layer_sizes = ( 100, ),
        activation = 'logistic',
        max_iter = 500
        ),
    'bayes multinomial' : MultinomialNB(),
    'bayes bernoulli'   : BernoulliNB(),
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn
    # releases (1.5+) - confirm against the installed version
    'logistic regression' : LogisticRegression(
        max_iter = 500,
        multi_class = 'multinomial'
        ),
    'linear support vector' : LinearSVC( loss = 'hinge' ),
    'kernel support vector' : SVC( kernel = 'rbf' ),
    'kernel nu-support vector' : NuSVC( nu = 0.5, kernel = 'rbf' ),
    'k-nearest neighbors' : KNeighborsClassifier( n_neighbors = 10 ),
    'decision tree' : DecisionTreeClassifier(),
    'random forest' : RandomForestClassifier( n_estimators = 100 ),
    'adaboost decision tree' : AdaBoostClassifier( n_estimators = 100 )
}

In [78]:
# train models
# Features and labels are glued into one table (labels as the last column) so
# that a single in-place row shuffle keeps every label on its feature row.
input_ = np.concatenate( ( X, Y[ :, np.newaxis ] ), axis = 1 )
for model in classifier.values() :
    # new row order before every fit
    np.random.shuffle( input_ )
    X = input_[ :, : -1 ] # all columns but the last: features
    Y = input_[ :, -1 ]   # last column: labels
    model.fit( X, Y )

In [95]:
# collecting new data
# `examples` holds two parallel lists: PubMed ids and the matching abstracts.
examples = { 'pmid' : [], 'document' : [] }
# search PubMed for up to 100 records that have an abstract
search_handle = Entrez.esearch(
    db = 'pubmed',
    term = 'metabolite hasabstract',
    email = 'chiodini.zachary@epa.gov',
    retstart = 0, # offset of the first hit (was misspelled `restart`)
    rettype = 'medline',
    retmax = 100
    )
try :
    pmids = Entrez.read( search_handle ).get( 'IdList', [] )
finally :
    search_handle.close() # release the connection even if parsing fails
for pmid in pmids :
    fetch_handle = Entrez.efetch(
        db = 'pubmed',
        id = pmid,
        email = 'chiodini.zachary@epa.gov',
        retmode = 'text',
        rettype = 'medline',
        )
    try :
        record = Medline.read( fetch_handle )
    finally :
        fetch_handle.close()
    # both lists are extended only after a successful fetch, so they always
    # stay aligned; a record without an 'AB' field yields an empty abstract
    # instead of None, which the tokenising regex could not handle
    examples[ 'pmid' ].append( pmid )
    examples[ 'document' ].append( record.get( 'AB' ) or '' ) # abstract
    sleep( 1 ) # avoid PubMed ban

In [96]:
# test array: one row per fetched abstract, one boolean column per feature word
# (the builtin `bool` is used as dtype because the `np.bool` alias does not
# exist in NumPy 1.24 - 1.26; `np.zeros` avoids uninitialised memory)
X : Array[ Array[ bool ] ] = np.zeros(
    shape = ( len( examples[ 'document' ] ), len( features ) ),
    dtype = bool
    )
# populating array
for i in range( len( examples[ 'document' ] ) ) :
    # a set makes each membership test below O(1) instead of a list scan;
    # a missing abstract ( None ) is tokenised as an empty string
    doc_vocab = set(
        re.findall(
            pattern = '\\b[a-z]{2,}\\b',
            string = examples[ 'document' ][ i ] or ''
            )
        )
    # X[ i, j ] is True when feature word j occurs in abstract i
    X[ i ] = [ word in doc_vocab for word in features ]

In [152]:
# one row of boolean predictions per model, one column per example
# (builtin `bool` as dtype: the `np.bool` alias is missing in NumPy 1.24 - 1.26)
predictions = np.array(
    [ model.predict( X ) for model in classifier.values() ],
    dtype = bool
    )
# an example is positive when at least half of the models vote positive
# (with an even number of models a tie therefore counts as positive)
majority_vote = predictions.sum( axis = 0 ) >= ( len( classifier ) / 2 )

In [156]:
# attach the ensemble decision as a third column next to 'pmid' and 'document'
examples[ 'predictions' ] = majority_vote

In [158]:
# tabulate PubMed ids, abstracts and ensemble predictions side by side
result = pd.DataFrame( examples )
# bare expression: the notebook renders the frame as the cell output
result

Unnamed: 0,pmid,document,predictions
0,33516063,OBJECTIVE: Progressive myelopathy causes sever...,False
1,33515965,Azoxystrobin (AZ) is a broad-spectrum syntheti...,True
2,33515809,OBJECTIVES: Studies have shown that the consum...,False
3,33515705,The antiretroviral nevirapine (NVP) is associa...,False
4,33515571,Previous studies have demonstrated the potenti...,False
...,...,...,...
95,33498402,The tryptophan (Trp) metabolite kynurenic acid...,True
96,33498148,Reductions in crop yields brought about by abi...,False
97,33497193,Metabolomics is a promising approach to charac...,False
98,33496609,Trichoderma is a genus of filamentous fungi th...,False


In [8]:
# Rebind `result` to a table read from a tab-separated file, all fields as
# strings.  NOTE(review): presumably a saved copy of the predictions table
# built earlier - the cell that writes this file is not visible here; confirm.
result = pd.read_csv( 'textclassificationtest.tsv', sep = '\t', dtype = str )

In [5]:
# NOTE(review): presumably the PubMed ids of abstracts labelled positive by
# hand - not referenced elsewhere in the visible cells; confirm the source.
posids = {
    '20732191',  '4060166',  '6706932',  '6806905', '12386122',  
     '3804130',  '3804131', '17360832', '16702114',  '6141911',  
     '2893708',  '4024104', '15625188', '16492914', '12505273', 
    '11221833', '19812350', '11792678', '20659636', '22610610', 
    '25853821', '10506015', '10101144', '10681375', '24520218', 
    '19447879', '24815348',  '7298740',  '9288499',  '3530584',
    '33515965', '33515705', '33511100'
    }
# NOTE(review): presumably the PubMed ids of abstracts labelled negative by
# hand - not referenced elsewhere in the visible cells; confirm the source.
negids = {
    '33378747', '33378617', '33378573', '33378466', '33378414', 
    '33378399', '33378370', '33378360', '33378269', '33378212',
    '33378198', '33378192', '33378185', '33378184', '33378174', 
    '33378148', '33378128', '33378120', '33378101', '33378084',
    '33377782', '33377769', '33377768', '33377656', '33377640', 
    '33377615', '33377601', '33383557', '33383550', '33383548',
    '33383544', '33383518', '33383511', '33383506', '33383471', 
    '33383435', '33383434', '33383431', '33383420', '33383417',
    '33383382', '33383322', '33383300', '33383244', '33383237', 
    '33383187', '33383155', '33383095', '33383078', '33383001',
    '33382947', '33382946', '33382942', '33382808', '33385978', 
    '33385970', '33385931', '33385910', '33385904', '33385899',
    '33385969', '33385952', '33385920', '33386027', '33386025', 
    '33386020', '33386018', '33386016', '33386015', '33386014',
    '33516063', '33515571', '33515481', '33515103', '33515101',
    '33515003', '33513835', '33513822', '33513565', '33513490',
    '33513207', '33513198', '33513132', '33512810', '33512270',
    '33512020', '33511559', '33511402', '33511154', '33510981',
    '33510744', 
    }

In [13]:
# display the whole reloaded table as a NumPy array of rows (bare expression,
# so the full array - including every abstract - becomes the cell output)
result.to_numpy()

array([['33516063',
        'OBJECTIVE: Progressive myelopathy causes severe handicap in men with adrenomyeloneuropathy (AMN), an X-linked disorder due to ABCD1 pathogenic variants. At present, treatments are symptomatic but disease-modifying therapies are under evaluation. Given the small effect size of clinical scales in AMN, biomarkers with higher effect size are needed. Here we used high-resolution magnetic resonance techniques to identify non-invasive in vivo biomarkers of the brain and spine with high effect sizes. METHODS: We performed a multiparametric imaging and spectroscopy study in 23 male patients with AMN (age: 44 +/- 11) and 23 male controls (age: 43 +/- 11) of similar age and body-mass index. We combined (i) macrostructural analyses of the spine, using cross-sectional area (CSA) and magnetization transfer ratio (MTR), (ii) microstructural analyses of the spine and the brain, using diffusion tensor and the newly developed fixel-based analysis, and (iii) advanced metaboli