In [1]:
import time
from typing import Dict, Optional, Set

import pandas as pd
from Bio import Entrez, Medline

from naivebayes import NaiveBayes

In [2]:
# load the PubMed ids of the positive examples from a comma-separated file
with open( 'positive_examples.txt' ) as file :
    # strip whitespace/newlines around every field and drop empty fields, so a
    # trailing newline or stray comma cannot produce a bogus id such as '123\n' or ''
    posids = { pmid.strip() for pmid in file.read().split( ',' ) if pmid.strip() }

In [3]:
# load the PubMed ids of the negative examples from a comma-separated file
with open( 'negative_examples.txt' ) as file :
    # strip whitespace/newlines around every field and drop empty fields, so a
    # trailing newline or stray comma cannot produce a bogus id such as '123\n' or ''
    negids = { pmid.strip() for pmid in file.read().split( ',' ) if pmid.strip() }

In [4]:
# type aliases: a class label and a document are both plain strings
Class = str
Document = str

# corpus container: one (initially empty) set of documents per class label
data : Dict[ Class, Set[ Document ] ] = { label : set() for label in ( 'positive', 'negative' ) }

In [5]:
def get_abstract( pmid : str ) -> Optional[ str ] :
    '''
    Get the abstract of a single PubMed record.

    Parameters
    ----------
    pmid : PubMed identifier of the record to fetch.

    Returns
    -------
    The value of the record's 'AB' field, or None when the parsed
    record has no 'AB' field.
    '''
    handle = Entrez.efetch(
        db = 'pubmed',
        id = pmid,
        email = 'chiodini.zachary@epa.gov', # contact address sent along with the request
        retmode = 'text',
        rettype = 'medline'
        )
    try :
        return Medline.read( handle ).get( 'AB' )
    finally :
        handle.close() # always release the connection, even if parsing fails

In [6]:
# getting data: fetch every abstract and file it under its class label
for label, pmids in ( ( 'positive', posids ), ( 'negative', negids ) ) :
    for pmid in pmids :
        abstract = get_abstract( pmid )
        if abstract is not None : # records without an 'AB' field yield None; keep None out of the corpus
            data[ label ].add( abstract )
        time.sleep( 1/3 ) # avoid PubMed ban

In [8]:
# create the classifier (NaiveBayes is imported from the `naivebayes` module)
model = NaiveBayes()

In [9]:
# NOTE(review): the confusion matrix displayed after this run sums to 900 (the
# "Examples Trained" count) while only 100 examples are reported as tested, and
# accuracy is a perfect 1.0 -- confirm that kFoldValidate scores held-out folds.
model.kFoldValidate( data, k = 10 ) # 10-fold cross-validation

Examples Trained: 900
Examples Tested : 100
Total Examples  : 1000


In [10]:
# show the predictions recorded by the model as a table; missing cells are shown blank
# (pandas is imported as `pd` in the import cell at the top of the notebook)
pd.DataFrame( model.predictions ).fillna( '' )

Unnamed: 0,positive,negative,model
positive,270.0,0.0,
negative,0.0,630.0,
truth rate,1.0,1.0,
false rate,0.0,0.0,
precision,1.0,1.0,
accuracy,,,1.0(0.0)


In [12]:
# NOTE(review): no random seed is set in the visible cells, so the figures below
# may differ between runs -- confirm whether NaiveBayes seeds its own splits.
model.trainAndTest( data, iters = 1000 ) # Monte-Carlo cross-validation

Examples Trained: 50706
Examples Tested : 49294
Total Examples  : 100000


In [13]:
# tabulate the predictions recorded by the model; missing cells are shown blank
predictions_table = pd.DataFrame( data = model.predictions )
predictions_table.fillna( value = '' )

Unnamed: 0,positive,negative,model
positive,13949.0,2402.0,
negative,1052.0,31891.0,
truth rate,0.929871,0.929957,
false rate,0.0700434,0.0701287,
precision,0.853098,0.968066,
accuracy,,,0.95(0.05)
