In [1]:
import time
from typing import Dict, Optional, Set

import pandas as pd
from Bio import Entrez, Medline

from logisticregression import LogisticRegression

In [2]:
# Load the PubMed IDs of the positive examples (stored comma-separated on disk).
with open( 'positive_examples.txt' ) as file :
    # Strip surrounding whitespace (e.g. the file's trailing newline) from each
    # ID and drop empty tokens, so no malformed or blank ID is sent to PubMed.
    posids = { token.strip() for token in file.read().split( ',' ) if token.strip() }

In [3]:
# Load the PubMed IDs of the negative examples (stored comma-separated on disk).
with open( 'negative_examples.txt' ) as file :
    # Strip surrounding whitespace (e.g. the file's trailing newline) from each
    # ID and drop empty tokens, so no malformed or blank ID is sent to PubMed.
    negids = { token.strip() for token in file.read().split( ',' ) if token.strip() }

In [4]:
# Type aliases: class labels and document texts are both plain strings.
Class = str
Document = str

# Corpus container: one initially empty set of documents per class label.
data : Dict[ Class, Set[ Document ] ] = {
    label : set() for label in ( 'positive', 'negative' )
    }

In [5]:
def get_abstract( pmid : str ) -> Optional[ Document ] :
    '''
    Get the abstract of a single PubMed article.

    Fetches the MEDLINE-formatted text record for `pmid` through
    Entrez.efetch, parses it with Medline.read and returns its 'AB' field.

    Parameters
    ----------
    pmid : str
        The PubMed identifier to look up. Pass exactly one ID, because
        Medline.read parses a single record.

    Returns
    -------
    The value stored under 'AB' in the parsed record, or None when the
    record has no such field.
    '''
    handle = Entrez.efetch(
        db = 'pubmed',
        id = pmid,
        email = 'chiodini.zachary@epa.gov',
        retmode = 'text',
        rettype = 'medline'
        )
    try :
        record = Medline.read( handle )
    finally :
        handle.close() # release the connection even if parsing fails
    return record.get( 'AB' )

In [6]:
# getting data
# The same fetch-and-store steps apply to both classes, so iterate over
# ( label, ids ) pairs instead of repeating the loop.
for label, pmids in ( ( 'positive', posids ), ( 'negative', negids ) ) :
    for pmid in pmids :
        abstract = get_abstract( pmid )
        # get_abstract returns None when the record has no 'AB' field;
        # keep None out of the document sets.
        if abstract is not None :
            data[ label ].add( abstract )
        time.sleep( 1/3 ) # avoid PubMed ban

In [7]:
# Instantiate the classifier from the local `logisticregression` module,
# using its default constructor arguments.
model = LogisticRegression()

In [22]:
# 2-fold cross-validation
kfold_options = dict( rate = 1, convergence = 0.00001, target = 'positive', k = 2 )
model.kFoldValidate( data, **kfold_options )

Examples Trained: 100
Examples Tested : 100
Total Examples  : 200


In [23]:
# Show model.predictions as a table, with missing cells rendered as empty
# strings. pandas is imported once, in the notebook's import cell.
pd.DataFrame( model.predictions ).fillna( '' )

Unnamed: 0,positive,negative,model
positive,30.0,0.0,
negative,0.0,70.0,
truth rate,1.0,1.0,
false rate,0.0,0.0,
precision,1.0,1.0,
accuracy,,,1.0(0.0)


In [16]:
# Monte-Carlo cross-validation
monte_carlo_options = dict(
    target = 'positive', rate = 1,
    batches = 1, convergence = 0.00001, iters = 1000
    )
model.trainAndTest( data, **monte_carlo_options )

Examples Trained: 48709
Examples Tested : 51291
Total Examples  : 100000


In [17]:
# Show model.predictions as a table, with missing cells rendered as empty strings.
predictions_table = pd.DataFrame( model.predictions )
predictions_table.fillna( '' )

Unnamed: 0,positive,negative,model
positive,12375.0,981.0,
negative,3207.0,34728.0,
truth rate,0.794186,0.972528,
false rate,0.0274721,0.205814,
precision,0.92655,0.915461,
accuracy,,,0.93(0.07)
