In [1]:
import time, re
import numpy as np
from math import e
from typing import Dict, List, Set
from nptyping import NDArray, Float64
from Bio import Entrez, Medline

In [2]:
# Load the PubMed IDs of the positive examples from a comma-separated file
with open( 'positive_examples.txt' ) as file :
    # strip whitespace / newlines around each ID and drop empty fields so
    # that a trailing newline or comma does not produce a bogus ID
    posids = {
        pmid.strip() for pmid in file.read().split( ',' ) if pmid.strip()
        }

In [3]:
# Load the PubMed IDs of the negative examples from a comma-separated file
with open( 'negative_examples.txt' ) as file :
    # strip whitespace / newlines around each ID and drop empty fields so
    # that a trailing newline or comma does not produce a bogus ID
    negids = {
        pmid.strip() for pmid in file.read().split( ',' ) if pmid.strip()
        }

In [4]:
# Type aliases used in the annotations below
Count    = int # number of occurrences of a word
Class    = str # class label
Document = str # text of one document
Word     = str # vocabulary term

In [5]:
# One (initially empty) set of documents per class label; 'positive' must
# stay the first key
data : Dict[ Class, Set[ Document ] ] = dict( positive = set(), negative = set() )

In [6]:
# Download the abstract ( 'AB' field ) of every listed PubMed record into `data`
for label, pmids in ( ( 'positive', posids ), ( 'negative', negids ) ) :
    for pmid in pmids :
        handle = Entrez.efetch(
            db = 'pubmed',
            id = pmid,
            email = 'chiodini.zachary@epa.gov',
            retmode = 'text',
            rettype = 'medline'
            )
        try :
            abstract = Medline.read( handle ).get( 'AB' )
        finally :
            handle.close() # always release the connection
        # a record without an abstract yields None, which cannot be
        # tokenised later on, so it is left out
        if abstract is not None :
            data[ label ].add( abstract )
        time.sleep( 1/3 ) # avoid PubMed ban

In [14]:
class LogisticRegression :
    '''
    Binomial Logistic Regression for Text Classification

    A document is encoded as a binary vector with one entry per vocabulary
    word ( 1 when the word occurs in the document as a substring ). After
    training, the vocabulary ends with the empty string, which is contained
    in every document and therefore acts as the bias input.
    '''
    def __init__( self ) :
        '''
        Create an untrained model.

        Reads the comma-separated stop words from 'stopset.txt' in the
        current working directory.
        '''
        self.network = np.array([])    # weights and bias
        self.vocabulary = np.array([]) # features
        # element-wise "is this word contained in the document" test;
        # otypes is given so that an empty vocabulary is accepted as well
        self.__vecfindall = np.vectorize(
            lambda voc, doc : int( voc in doc ),
            otypes = [ int ]
            )
        with open( 'stopset.txt' ) as file :
            # strip stray whitespace / newlines so every stop word can match
            self.stop_words = {
                word.strip() for word in file.read().split( ',' )
                if word.strip()
                }

    def train( self,
        data : Dict[ Class, Set[ Document ] ],
        rate : float = 1.0,
        batches : int = 10,
        convergence : float = 0.01,
        max_epochs : int = 1000,
        ) -> None :
        '''
        Fit the weights by mini-batch gradient descent.

        data        : documents grouped by class label; the first label is
                      the target class ( output 1 ), every other label maps
                      to 0. Documents that are None are skipped.
        rate        : learning rate
        batches     : number of mini-batches per epoch
        convergence : training stops once the mean absolute gradient,
                      summed over the batches of one epoch, is no longer
                      above this value
        max_epochs  : upper bound on the number of epochs; a RuntimeWarning
                      is issued when it is reached without convergence
        '''
        # get features
        self.vocabulary = np.array( list(
            self.__removeFirstQ( self.__extract( data ) )
            ) )
        # append bias ( the empty string is found in every document )
        self.vocabulary = np.append( self.vocabulary, '' )
        # generate random weights
        self.network = np.random.uniform(
            -0.5, 0.5, size = len( self.vocabulary )
            )
        # usable training samples, in label order
        samples = [ ( label, document )
                    for label in data
                    for document in data[ label ]
                    if document is not None ]
        # generate input vectors ( built in one go instead of repeated vstack )
        rows = [ self.getInputFrom( document ) for _, document in samples ]
        X = np.array( rows, dtype = float ).reshape(
            len( rows ), len( self.vocabulary )
            )
        # get target values
        target = next( iter( data ), None )
        Y = np.array( [ int( label == target ) for label, _ in samples ] )
        # NOTE: the batches are taken in label order, without shuffling
        for _ in range( max_epochs ) :
            totgrad = 0.0
            for xbatch, ybatch in zip(
                np.array_split( X, batches ),
                np.array_split( Y, batches )
                ) :
                if len( xbatch ) == 0 :
                    continue # more batches than samples
                error = self.output( xbatch ) - ybatch
                grad = ( error[ :, np.newaxis ] * xbatch ).mean( axis = 0 )
                self.network = self.network - rate*grad
                # absolute values: signed components would cancel out
                totgrad += np.abs( grad ).mean()
            # written as "not >" so that a NaN gradient also ends training
            if not totgrad > convergence :
                break
        else :
            import warnings
            warnings.warn(
                'LogisticRegression.train reached max_epochs without converging',
                RuntimeWarning
                )

    def test( self,
        data : Dict[ Class, Set[ Document ] ],
        boundary : float = 0.5
    ) -> None :
        ''' Placeholder - evaluation is not implemented yet '''
        pass

    def getInputFrom( self, document : str ) -> NDArray[ int ] :
        '''
        Generate Input Vector

        Returns one 0/1 entry per vocabulary word. Matching is a plain
        substring test, so a word also counts when it only occurs inside
        a longer word of the document.
        '''
        return self.__findall( self.vocabulary, document )

    def output( self, X : NDArray[ int ] ) :
        '''
        Logistic Model Output

        X is a single input vector or a matrix with one input vector per
        row; the result is the sigmoid of its dot product with the weights.
        '''
        return self.sigmoid( np.dot( X, self.network ) )

    def sigmoid( self, x : float ) -> float :
        ''' Logistic function 1 / ( 1 + e^-x ) for a scalar or an array '''
        # np.exp saturates to inf for very negative x instead of raising
        # OverflowError, so the result correctly goes to 0
        with np.errstate( over = 'ignore' ) :
            return 1 / ( 1 + np.exp( -x ) )

    def __findall( self,
        vocabulary : NDArray[ Word ],
        document   : str
        ) -> NDArray[ int ] :
        ''' 0/1 vector telling which vocabulary words occur in document '''
        return self.__vecfindall( vocabulary, document )

    def __extract( self,
        data : Dict[ Class, Set[ Document ] ],
        pattern : str = '\\b[a-z]{2,}\\b'
        ) -> Dict[ Word, Count ] :
        '''
        Extract Vocabulary from Dataset

        Counts every match of pattern over all documents, leaving out the
        stop words. Documents that are None are skipped.
        '''
        vocabulary : Dict[ Word, Count ] = {}
        for label in data :
            for document in data[ label ] :
                if document is None :
                    continue
                for word in re.findall(
                    pattern = pattern,
                    string  = document
                    ) :
                    if word not in self.stop_words :
                        vocabulary[ word ] = vocabulary.get( word, 0 ) + 1
        return vocabulary

    def __removeFirstQ( self,
        vocabulary : Dict[ Word, Count ]
        ) -> Dict[ Word, Count ] :
        '''
        Remove First Quartile in Vocabulary

        The limit is taken at one quarter of the sorted list of distinct
        count values; words counted less often than the limit are dropped.
        Returns a new dict, the argument is left unchanged.
        '''
        counts = sorted( set( vocabulary.values() ) )
        if not counts :
            return {} # nothing to filter
        quarter = len( counts ) / 4
        index = int( quarter ) # truncate
        if quarter % 1 == 0 :
            limit = counts[ index ]
        else :
            # clamp the upper neighbour for very short lists
            upper = min( index + 1, len( counts ) - 1 )
            limit = ( counts[ index ] + counts[ upper ] ) / 2
        return { word : count for word, count in vocabulary.items()
                 if count >= limit }

In [15]:
# Instantiate the classifier; its constructor reads 'stopset.txt' from the
# current working directory
model = LogisticRegression()