In [None]:
import scholarly, re
from nltk import word_tokenize
from joblib import Parallel, delayed

In [None]:
# Default number of publications to read per author.
nr_papers_per_author = 10

# Authors to look up, written here as "Lastname, Firstname".
authors = [
    "Theeuwes, Jan", "Roelfsema, Pieter", "Ridderinkhof, Richard",
    "Pennartz, Cyriel", "Beek, D van de", "Brussaard, AB", "Willuhn, Ingo",
    "Keysers, Christian", "Levelt, Christiaan", "Oever, MC van den",
    "Hulst, Hanneke", "Heine, Vivi", "kock, Christiaan de",
    "Kant, Rik van der", "Verwijk, Esmee", "Scholte, HS",
    "Opstal, Filip van", "Sligte, Ilja", "Szymanik, Jakub", "Visser, Renee",
    "Vugt, Mark van", "Knapen, Tomas", "Wingen, Guido",
    "Krugers, Harm", "Lansink, Carien", "Lorteije, Jeannette",
]

# Reorder every entry to "Firstname Lastname"; each name is split only once.
authors = [
    parts[1] + ' ' + parts[0]
    for parts in (name.split(', ') for name in authors)
]
print(authors)

# # Now trying to change the names so that the lookup works anyway.
# # Previously removed because the Google Scholar profile was insufficient:
# authors.pop(authors.index('Diederik van de Beek'))
# print(authors)

In [None]:
def _strip_html_tags(text):
    """Return ``str(text)`` with every ``<...>`` tag-like span removed."""
    return re.sub('<[^<]+?>', '', str(text))


def get_abstracts_and_titles_for_author(author_name, nr_papers_per_author=nr_papers_per_author):
    """get_abstracts_and_titles_for_author

    Looks the author up with ``scholarly.search_author``, takes the first
    search hit, and reads at most ``nr_papers_per_author`` entries from the
    start of that author's publication list.

    Parameters
    ----------
    author_name : str
        name of author in "Firstname Lastname" format
    nr_papers_per_author : int, optional
        maximum number of papers to take for this author

    Returns
    -------
    dict
        ``{'abstracts': [...], 'titles': [...]}``. Both lists hold one string
        per publication read, in the same order, so ``abstracts[i]`` and
        ``titles[i]`` always describe the same paper; a publication that
        lacks an abstract or a title contributes ``""`` in that list.
        If the search yields no author, both lists hold
        ``nr_papers_per_author`` empty strings.
    """
    # broadcast author name to know where we are
    print(author_name)

    # Only the "no search hit" case is treated as a missing author; errors
    # raised while filling the profile are no longer mistaken for it.
    search_query = scholarly.search_author(author_name)
    first_hit = next(search_query, None)
    if first_hit is None:
        print("No author with name {author_name}".format(
            author_name=author_name
        ))
        return {'abstracts': ["" for x in range(nr_papers_per_author)],
                'titles': ["" for x in range(nr_papers_per_author)]}
    author = first_hit.fill()

    # adjust the number of papers for junior authors
    nppa = min(nr_papers_per_author, len(author.publications))
    if nppa < nr_papers_per_author:
        print('adjusting nr of publications to {nppa} for {author_name}'.format(
            nppa=nppa,
            author_name=author_name
        ))

    # fill the data for the publications
    filled_publications = [pub.fill() for pub in author.publications[:nppa]]

    # Use "" for a missing field instead of dropping the entry: dropping it
    # would shift one list relative to the other and pair abstracts with the
    # wrong titles.
    abstracts = [_strip_html_tags(pub.bib.get('abstract', ''))
                 for pub in filled_publications]
    titles = [_strip_html_tags(pub.bib.get('title', ''))
              for pub in filled_publications]

    print('    found {nppa} articles for {author_name}'.format(
            nppa=nppa,
            author_name=author_name
        ))

    return {'abstracts': abstracts, 'titles': titles}

In [None]:
# Fetch titles and abstracts sequentially, one author at a time.
# NOTE(review): the slice starts at index 4, so the first four entries of
# `authors` are not fetched here -- confirm this is intended and not a
# leftover from debugging.
all_abstracts_and_titles = list(map(get_abstracts_and_titles_for_author, authors[4:]))

In [None]:
# Parallel alternative to the sequential fetch above (uses joblib); uncomment to use:
# all_abstracts_and_titles = Parallel(n_jobs=8)(delayed(get_abstracts_and_titles_for_author)(auth) for auth in authors)