In [163]:
import hashlib
import os
import pickle
import time
import urllib.parse

import requests

In [387]:
class MetaDataStore(dict):
    """ Store and get the metadata from Crossref

        Dict keyed by DOI. Reading a DOI that is not stored yet triggers
        a query on Crossref; the answer is kept in memory and written to
        the pickle file `cachelocation`. Values are stored as plain dicts
        and wrapped in a `MetaData` object when they are read with `store[doi]`.
    """
    
    def __init__(self, cachelocation = 'data/cachefile.pickle',
                 mailadress = 'xdze2.me@gmail.com', timeout = 30):
        """ Load the cached metadata from the `cachelocation` filename

            cachelocation: path of the pickle file used as cache
            mailadress: sent to Crossref as the `mailto` query parameter
            timeout: seconds given to `requests.get` before it gives up
        """
        self.cachelocation = cachelocation
        self.mailadress = mailadress
        self.timeout = timeout
        
        try:
            with open(cachelocation, 'rb') as f:
                # SECURITY: unpickling can run arbitrary code,
                # only point `cachelocation` to a file you created yourself
                self.update( pickle.load(f) )
                
            print( len(self), 'metadata loaded from `%s`' % cachelocation )
        
        except FileNotFoundError:
            print( '`%s` not found. A new file will be created.' % cachelocation  )

        
    def __missing__(self, doi):
        """ Perform automatically the query on Crossref if missing
            update the cache and save to the pickle file

            Return a MetaData object. A DOI unknown to Crossref (HTTP 404)
            is cached as `{'DOI': doi}`. Any other HTTP error returns the
            same empty metadata but is NOT cached, so that a temporary
            failure is retried at the next access.
            Exceptions raised by `requests.get` (network, timeout) propagate.
        """
        
        url = 'https://api.crossref.org/works/'
        params = { 'mailto':self.mailadress }
        # the DOI is a path segment: percent-encode everything, '/' included
        parsed_url = url + urllib.parse.quote( doi, safe='' )
        
        response = requests.get(parsed_url, params=params, timeout=self.timeout)
        
        if response.ok:
            print( 'metadata retrieved from Crossref in %.3f s.' % response.elapsed.total_seconds() )
            metadata = response.json()['message']
        elif response.status_code == 404:
            print('`%s` not found. Empty metadata created. ' % doi)
            metadata = {'DOI': doi}
        else:
            # rate limit, server error...: do not poison the cache
            print('query error %s for `%s`. Nothing cached.' % (response.status_code, doi))
            return MetaData( {'DOI': doi} )
            
        self[doi] = metadata
        self._save()
            
        return MetaData( metadata )
        
        
    def _save(self):
        """ Write the content of the store to the cache file, create if not exist
        """
        directory = os.path.dirname(self.cachelocation)
        if directory:
            # dirname is '' for a bare filename and os.makedirs('') raises
            os.makedirs(directory, exist_ok=True)
        
        # write next to the target then swap: an interrupted dump
        # leaves the previous cache file untouched
        tmplocation = self.cachelocation + '.tmp'
        with open(tmplocation, 'wb') as f:
            # dump a plain dict so that the file can be read back
            # without this class being defined
            pickle.dump(dict(self.items()), f)
        os.replace(tmplocation, self.cachelocation)
        
        
    def __getitem__(self, key):
        """ Wrap the returned value in a MetaData object
        """
        value = dict.__getitem__(self, key)
        return  MetaData( value )
    
    
    def reset(self):
        """ Empty the cache and delete the cache file

            Ask for a confirmation on the standard input first.
            Work also when the cache file does not exist yet.
        """
        try:
            cachesize = os.path.getsize(self.cachelocation) / 1024**2
        except FileNotFoundError:
            cachesize = 0.0
        message = 'Delete `{}` {:.2f} Mo, \n Are you sure? [type yes] '
        confirm = input( message.format(self.cachelocation, cachesize ) )
        
        if confirm == 'yes':
            self.clear()
            try:
                os.remove(self.cachelocation)
                print('file removed')
            except FileNotFoundError:
                print('no cache file to remove')
        else:
            print('canceled')
        

In [388]:
class MetaData(dict):
    """ Class based on a dict representing the metadata of one article

        The keys read below are `DOI`, `title`, `author`, `issued`,
        `container-title`, `URL` and `reference`.
    """
    
    def __init__(self, metadata=None):
        """ Copy the items of `metadata` (mapping or iterable of pairs)
            An empty MetaData is created if `metadata` is omitted.
        """
        if metadata is not None:
            self.update( metadata )
        
    @property
    def refs_doi(self):
        """ List of doi of the references
            without duplicates, in order of first appearance
        """
        references = self.get('reference', [])
        # dict.fromkeys de-duplicates and keeps the order,
        # a set would give a different order at each session
        referencesWithDoi = dict.fromkeys( ref['DOI'] for ref in references if 'DOI' in ref )
        
        return list( referencesWithDoi )
    
    
    def _first_author(self):
        """ Return the author entry with `sequence == 'first'`
            raise KeyError if there is none
        """
        for auth in self.get('author', []):
            if auth.get('sequence') == 'first':
                return auth
        raise KeyError('first author')
    
    
    @property
    def label(self):
        """ Label for the article as AuthorYEAR
            return five digits derived from the DOI if no metadata is found
        """        
        try:
            year = self['issued']['date-parts'][0][0]
            if year is None:
                # a null year would give a label like `FengNone`
                raise KeyError('issued year')
            familyname = self._first_author()['family']

            label = familyname + str(year)
        except (KeyError, IndexError, TypeError):
            # the builtin hash() of a str changes at each interpreter start,
            # a digest gives the same label in every session
            doi = self.get('DOI', '')
            digest = hashlib.sha1( doi.encode('utf-8') ).hexdigest()
            label = str( int(digest, 16) )[:5]
            
        return label
    
    
    def __str__(self):
        """ Return nicely formated metadata
            or a one line placeholder if title, year or first author is missing
        """
        try:
            title = self['title'][0]
            title = (title[:75].strip() + '...') if len(title) > 75 else title

            year = self['issued']['date-parts'][0][0]
            
            auth = self._first_author()
            first_author = ' '.join( (auth['given'], auth['family']) )
            
            # the journal is optional: key missing or empty list gives ''
            journal = (self.get('container-title') or [''])[0]
            
            info = '({year}) {title}\n'.format(year=year, title=title)
            info += '   ' + first_author + ' et al.'
            info += ' - ' + journal
            info += '\n   ' + self['URL']
            
        except (KeyError, IndexError, TypeError):
            info = '[%s] no meta data :(' % self.get('DOI', 'unknown DOI')
        
        return info

In [389]:
# Open the store: entries already saved in `cache/cache.pickle` are loaded at construction.
store = MetaDataStore('cache/cache.pickle')

2 metadata loaded from `cache/cache.pickle`


In [395]:
# Interactive: asks for a typed `yes` before emptying the store and deleting the cache file.
store.reset()

Delete `cache/cache.pickle` 0.01 Mo, 
 Are you sure? [type yes] no
canceled


In [394]:
# The keys of the store are the DOIs it currently holds.
store.keys()

dict_keys(['10.1063/1.113684', '10.1063/1.113681', '10.1063/1.113680'])

In [393]:
# Reading a DOI that is not in the store yet triggers the Crossref query (see the timing message).
store['10.1063/1.113681']

metadata retrieved from Crossref in 0.625 s.


{'DOI': '10.1063/1.113681',
 'ISSN': ['0003-6951', '1077-3118'],
 'URL': 'http://dx.doi.org/10.1063/1.113681',
 'alternative-id': ['10.1063/1.113681'],
 'author': [{'affiliation': [],
   'family': 'Feng',
   'given': 'J.',
   'sequence': 'first'},
  {'affiliation': [],
   'family': 'Chen',
   'given': 'T. R.',
   'sequence': 'additional'},
  {'affiliation': [],
   'family': 'Zhao',
   'given': 'B.',
   'sequence': 'additional'},
  {'affiliation': [],
   'family': 'Yariv',
   'given': 'A.',
   'sequence': 'additional'}],
 'container-title': ['Applied Physics Letters'],
 'content-domain': {'crossmark-restriction': False, 'domain': []},
 'created': {'date-parts': [[2002, 7, 26]],
  'date-time': '2002-07-26T12:27:32Z',
  'timestamp': 1027686452000},
 'deposited': {'date-parts': [[2016, 12, 29]],
  'date-time': '2016-12-29T03:46:40Z',
  'timestamp': 1482983200000},
 'indexed': {'date-parts': [[2018, 5, 3]],
  'date-time': '2018-05-03T03:45:56Z',
  'timestamp': 1525319156954},
 'is-reference

In [339]:
# print() goes through MetaData.__str__ and shows the short formatted summary.
print( store['10.1063/1.113684'] )

metadata retrieved from Crossref in 0.650 s.
(1995) New CdTe photoconductor array detector for x‐ray applications
   S. S. Yoo et al. - Applied Physics Letters
   http://dx.doi.org/10.1063/1.113684


In [327]:
# DOIs of the references of the article, taken from the entries of `reference` that carry a `DOI`.
store['10.1038/nmat1352'].refs_doi

['10.1016/S0169-409X(01)00239-3',
 '10.1038/35007047',
 '10.1038/35095031',
 '10.1126/science.290.5496.1555',
 '10.1023/A:1020137616302',
 '10.1021/ja972139o',
 '10.1063/1.113679',
 '10.1126/science.290.5496.1540',
 '10.1038/nmat960',
 '10.1038/15100',
 '10.1002/1521-4095(20020517)14:10<743::AID-ADMA743>3.0.CO;2-H',
 '10.1103/PhysRevLett.45.1636',
 '10.1016/S0167-7799(99)01345-1',
 '10.1002/(SICI)1097-4636(2000)53:3<258::AID-JBM11>3.0.CO;2-O',
 '10.1016/S0925-4005(00)00396-8',
 '10.1038/nmat761',
 '10.1016/S0168-583X(99)00118-4',
 '10.1038/21619',
 '10.1021/ac970853i',
 '10.1016/S0169-409X(01)00241-1',
 '10.1039/b210140h',
 '10.1016/S0142-9612(96)00198-6',
 '10.1016/S0378-5173(01)00976-0',
 '10.1038/417388a',
 '10.1016/S0169-409X(01)00246-0',
 '10.1126/science.286.5442.1129',
 '10.1021/bp990058l',
 '10.1021/bc010080b',
 '10.1038/17092',
 '10.1126/science.281.5375.389']

In [305]:
# Number of DOIs held in the store.
len( store )

6

In [200]:
# No query message is printed here: the entry was already in the store.
store['10.1038/nmat1353']

{'DOI': '10.1038/nmat1353',
 'ISSN': ['1476-1122', '1476-4660'],
 'URL': 'http://dx.doi.org/10.1038/nmat1353',
 'alternative-id': ['BFnmat1353'],
 'author': [{'affiliation': [],
   'family': 'Grohol',
   'given': 'Daniel',
   'sequence': 'first'},
  {'affiliation': [],
   'family': 'Matan',
   'given': 'Kittiwit',
   'sequence': 'additional'},
  {'affiliation': [],
   'family': 'Cho',
   'given': 'Jin-Hyung',
   'sequence': 'additional'},
  {'affiliation': [],
   'family': 'Lee',
   'given': 'Seung-Hun',
   'sequence': 'additional'},
  {'affiliation': [],
   'family': 'Lynn',
   'given': 'Jeffrey W.',
   'sequence': 'additional'},
  {'affiliation': [],
   'family': 'Nocera',
   'given': 'Daniel G.',
   'sequence': 'additional'},
  {'affiliation': [],
   'family': 'Lee',
   'given': 'Young S.',
   'sequence': 'additional'}],
 'container-title': ['Nature Materials'],
 'content-domain': {'crossmark-restriction': False, 'domain': []},
 'created': {'date-parts': [[2005, 3, 27]],
  'date-tim