# In[1]:
import re, requests
from fuzzywuzzy import fuzz
from typing import Dict

# In[2]:
class Crossref:
    """Resolve free-text citation strings against the Crossref works API.

    ``search`` queries the API and memoises the decoded JSON per request
    URL.  ``citation`` takes the top hit for a citation string, checks it
    against the string with the ``match_*`` methods and returns a flat
    record of bibliographic fields.

    Attributes:
        log: Diagnostics of the last ``formatandvalidate(..., log=True)``.
        cache: Maps request URL to the decoded JSON response.
        ckeys: Cached URLs in insertion order (oldest first), used to
            decide which entry to evict.
    """

    API_URL = 'https://api.crossref.org/works?query={s}&rows={n}&sort=relevance'
    CACHE_SIZE = 1000  # maximum number of responses kept in ``cache``
    TIMEOUT = 30       # seconds; passed to requests so a stalled call cannot hang forever

    # Parenthesised text (greedy: first "(" to last ")") and full stops.
    _CLEAN_RE = re.compile(r'\(.*\)|\.')
    _NUMBER_RE = re.compile('[0-9]+')

    def __init__(self) -> None:
        self.log = {}
        self.cache = {}
        self.ckeys = []

    # ------------------------------------------------------------------
    # Remote lookup
    # ------------------------------------------------------------------

    def search(self, search: str, results=10) -> Dict:
        """Query the Crossref works endpoint, sorted by relevance.

        Args:
            search: Free-text query.
            results: Number of rows to request.

        Returns:
            The decoded JSON response.  Responses are cached per request
            URL, so repeating a query does not hit the network again.

        Raises:
            Whatever ``requests.get`` or ``Response.json`` raise (network
            errors, timeouts, undecodable bodies); nothing is cached then.
        """
        # Local import keeps this method self-contained; quote_plus encodes
        # spaces as '+' (as before) and also escapes '&', '#', '%', ... which
        # would otherwise truncate or corrupt the query string.
        from urllib.parse import quote_plus

        url = self.API_URL.format(s=quote_plus(search), n=results)
        if url in self.cache:
            return self.cache[url]
        response = requests.get(
            url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=self.TIMEOUT
        )
        self.cache[url] = response.json()
        # Evict the oldest entry once the cache outgrows its limit.
        if len(self.cache) > self.CACHE_SIZE and self.ckeys:
            del self.cache[self.ckeys.pop(0)]
        self.ckeys.append(url)
        return self.cache[url]

    def citation(self, search: str, log=False) -> Dict:
        """Return bibliographic fields for the best hit matching ``search``.

        Args:
            search: Free-text citation string.
            log: When true, store match diagnostics in ``self.log``.

        Returns:
            A dict with the keys ``doi``, ``url``, ``year``, ``month``,
            ``day``, ``publisher``, ``volume``, ``number``, ``pages``,
            ``author``, ``title``, ``journal`` and ``short``; every value
            is a one-element list.  All values are ``['']`` when the query
            is blank, the API reports failure or returns no items, or the
            top hit does not pass ``formatandvalidate``.
        """
        if not search.strip():
            return self._not_found()
        response = self.search(search, 1)
        if response.get('status') == 'failed':
            return self._not_found()
        message = response.get('message')
        items = message.get('items') if isinstance(message, dict) else None
        if not items:
            return self._not_found()
        document = items[0]
        # formatandvalidate also fills in every key read below.
        if not self.formatandvalidate(search, document, log):
            return self._not_found()

        names = []
        for author in document['author']:
            full = (author['given'].strip('.') + ' '
                    + author['family'].strip('.')).strip()
            if full:  # skip authors without given/family name
                names.append(full)

        year, month, day = document['date-part']
        return {
            'doi': [document.get('DOI', '')],
            'url': [document.get('URL', '')],
            'year': [year],
            'month': [month],
            'day': [day],
            'publisher': [document['publisher']],
            'volume': [document['volume']],
            'number': [document['issue']],
            'pages': [document['page']],
            'author': [', '.join(names)],
            'title': [', '.join(document['title'])],
            'journal': [', '.join(document['container-title'])],
            'short': [', '.join(document['short-container-title'])],
        }

    @staticmethod
    def _not_found() -> Dict:
        """Return a fresh record with the same keys as a hit, all ``['']``."""
        fields = (
            'doi', 'url', 'year', 'month', 'day', 'publisher', 'volume',
            'number', 'pages', 'author', 'title', 'journal', 'short',
        )
        return {field: [''] for field in fields}

    # ------------------------------------------------------------------
    # Validation
    # ------------------------------------------------------------------

    def formatandvalidate(self, search: str, document: Dict, log=False) -> bool:
        """Decide whether ``document`` plausibly is the cited work.

        ``document`` is modified in place: each ``match_*`` call fills in a
        default for the field it inspects when that field is missing, and
        ``match_year`` adds the ``date-part`` key.

        Args:
            search: Citation string; lower-cased and stripped of full
                stops before matching.
            document: One item of a Crossref works response.
            log: When true, record the inputs and the individual match
                results in ``self.log``.

        Returns:
            True if title and author match; or journal, author and year
            match; or (journal or author), year, volume and (issue or
            pages) match.  False otherwise.
        """
        query = search.replace('.', '').lower()
        title_ok = self.match_title(query, document)
        author_ok = self.match_author(query, document)
        journal_ok = self.match_journal(query, document)
        year_ok = self.match_year(query, document)
        issue_ok = self.match_issue(query, document)
        volume_ok = self.match_volume(query, document)
        pages_ok = self.match_pages(query, document)
        if log:
            self.log = {
                'SEARCH': query,
                'DOC_TITLE': ', '.join(document['title']),
                'DOC_JOURNAL': (
                    ', '.join(document['container-title']),
                    ', '.join(document['short-container-title']),
                    document['publisher'],
                ),
                'DOC_AUTHOR': document['author'],
                'DOC_YEAR': document['date-part'][0],
                'DOC_ISSUE': document['issue'],
                'DOC_VOLUME': document['volume'],
                'DOC_PAGES': document['page'],
                'TITLE_FOUND': title_ok, 'JOURNAL_FOUND': journal_ok,
                'AUTHOR_FOUND': author_ok, 'YEAR_FOUND': year_ok,
                'ISSUE_FOUND': issue_ok, 'VOLUME_FOUND': volume_ok,
                'PAGES_FOUND': pages_ok,
            }
        if title_ok and author_ok:
            return True
        if journal_ok and author_ok and year_ok:
            return True
        return bool(
            (journal_ok or author_ok) and year_ok
            and (issue_ok or pages_ok) and volume_ok
        )

    def match_title(self, search: str, document: Dict) -> bool:
        """Return True if any cleaned document title occurs in ``search``.

        Sets ``document['title']`` to ``['']`` when the key is missing.
        """
        if 'title' not in document:
            document['title'] = ['']
            return False
        haystack = search.replace(',', '')
        return any(self._contains(haystack, title) for title in document['title'])

    def match_author(self, search: str, document: Dict) -> bool:
        """Return True if any author's given or family name occurs in ``search``.

        Names are cleaned first and must be longer than one character.
        Missing ``given``/``family`` keys are set to ``''``; a missing
        ``author`` list is replaced by a single blank author.
        """
        if 'author' not in document:
            document['author'] = [{'given': '', 'family': ''}]
            return False
        found = False
        # No early exit: every author must get its missing keys filled in.
        for author in document['author']:
            for part in ('given', 'family'):
                if part not in author:
                    author[part] = ''
                    continue
                name = self.clean(author[part])
                if len(name) > 1 and name in search:
                    found = True
        return found

    def match_journal(self, search: str, document: Dict) -> bool:
        """Return True if the journal or publisher is mentioned in ``search``.

        Tried in order, stopping at the first success: full container
        titles (substring), abbreviated container titles (substring, then
        fuzzy token match on the whole abbreviation and on its
        ``:``/``|``/``-`` separated phrases), publisher (substring).
        Missing ``container-title`` and ``short-container-title`` are set
        to ``['']``, a missing ``publisher`` to ``''``.
        """
        haystack = search.replace(',', '')
        found = False

        if 'container-title' in document:
            found = any(
                self._contains(haystack, title)
                for title in document['container-title']
            )
        else:
            document['container-title'] = ['']

        if 'short-container-title' in document:
            if not found:
                found = any(
                    self._abbreviation_matches(haystack, title)
                    for title in document['short-container-title']
                )
        else:
            document['short-container-title'] = ['']

        if 'publisher' in document:
            if not found:
                found = self._contains(haystack, document['publisher'])
        else:
            document['publisher'] = ''
        return found

    def match_year(self, search: str, document: Dict) -> bool:
        """Return True if the print publication year is a number in ``search``.

        Always sets ``document['date-part']`` to a three-element list
        ``[year, month, day]`` taken from ``published-print``; components
        the document does not provide are ``''``.
        """
        datepart = ['', '', '']
        found = False
        printed = document.get('published-print') or {}
        date_parts = printed.get('date-parts') or []
        # The date may be absent, empty, or shorter than year/month/day.
        datelist = date_parts[0] if date_parts else []
        for index, part in enumerate(datelist[:3]):
            datepart[index] = '' if part is None else part
        if datelist:
            found = str(datelist[0]) in self._numbers(search)
        document['date-part'] = datepart
        return found

    def match_issue(self, search: str, document: Dict) -> bool:
        """Return True if the document's issue is a number in ``search``.

        Sets ``document['issue']`` to ``''`` when the key is missing.
        """
        return self._match_number(search, document, 'issue')

    def match_volume(self, search: str, document: Dict) -> bool:
        """Return True if the document's volume is a number in ``search``.

        Sets ``document['volume']`` to ``''`` when the key is missing.
        """
        return self._match_number(search, document, 'volume')

    def match_pages(self, search: str, document: Dict) -> bool:
        """Return True if every page number of the document is in ``search``.

        The ``page`` value is split on ``-``; each non-empty part must
        appear as a number in ``search``.  Sets ``document['page']`` to
        ``''`` when the key is missing.
        """
        if 'page' not in document:
            document['page'] = ''
            return False
        parts = [p.strip() for p in str(document['page']).split('-')]
        parts = [p for p in parts if p]
        if not parts:
            return False
        numbers = self._numbers(search)
        return all(part in numbers for part in parts)

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _numbers(self, search: str) -> set:
        """Return the set of digit runs occurring in ``search``."""
        return set(self._NUMBER_RE.findall(search))

    def _match_number(self, search: str, document: Dict, key: str) -> bool:
        """Check ``document[key]`` against the numbers in ``search``.

        A missing key is set to ``''`` and counts as no match.
        """
        if key not in document:
            document[key] = ''
            return False
        return str(document[key]) in self._numbers(search)

    def _contains(self, haystack: str, text: str) -> bool:
        """Return True if cleaned, comma-free ``text`` occurs in ``haystack``.

        The length check is applied after cleaning so that text which
        cleans to nothing cannot match every haystack.
        """
        needle = self.clean(text).replace(',', '').strip()
        return len(needle) > 1 and needle in haystack

    def _abbreviation_matches(self, haystack: str, title: str) -> bool:
        """Match one abbreviated journal title against ``haystack``."""
        abbr = self.clean(title)
        needle = abbr.replace(',', '').strip()
        if len(needle) < 2:
            return False
        if needle in haystack:
            return True
        target = haystack.strip()
        if self.fuzzy(needle, target, tokenize=True):
            return True
        # Abbreviations may bundle several names, e.g. "a: b" or "a - b".
        for sep in (':', '|', '-'):
            if sep not in abbr:
                continue
            for phrase in abbr.split(sep):
                phrase = phrase.replace(',', '').strip()
                if phrase and self.fuzzy(phrase, target, tokenize=True):
                    return True
        return False

    def clean(self, string) -> str:
        """Lower-case ``string``, drop parenthesised text and full stops, strip."""
        return self._CLEAN_RE.sub('', string.lower()).strip()

    def fuzzy(self, s1: str, s2: str, tokenize=True) -> bool:
        """Return True if ``s1`` and ``s2`` are similar, case-insensitively.

        With ``tokenize`` the ``fuzz.token_set_ratio`` score must be at
        least 85; otherwise the ``fuzz.ratio`` score must be at least 90.
        """
        if tokenize:
            return fuzz.token_set_ratio(s1.lower(), s2.lower()) >= 85
        return fuzz.ratio(s1.lower(), s2.lower()) >= 90