In [7]:
import os
import pickle
import re

import nltk
import scholarly
import wordcloud
from joblib import Parallel, delayed
from nltk import word_tokenize

In [2]:
# Maximum number of publications fetched per author.
nr_papers_per_author = 10

# Names in "Lastname, Firstname" format. The raw list keeps its own name so
# that re-running this cell always converts from the untouched input instead
# of trying to convert already converted names (which used to raise IndexError).
authors_raw = ["Theeuwes, Jan","Roelfsema, Pieter","Ridderinkhof, Richard",
               "Pennartz, Cyriel","Beek, D van de","Brussaard, AB","Willuhn, Ingo",
               "Keysers, Christian","Levelt, Christiaan","Oever, MC van den",
               "Hulst, Hanneke","Heine, Vivi","kock, Christiaan de",
               "Kant, Rik van der","Verwijk, Esmee","Scholte, HS",
               "Opstal, Filip van","Sligte, Ilja","Szymanik, Jakub","Visser, Renee",
               "Vugt, Mark van","Knapen, Tomas","Wingen, Guido",
               "Krugers, Harm","Lansink, Carien","Lorteije, Jeannette"]

# Reorder each name to "Firstname Lastname". Splitting at most once and
# reversing the pieces leaves a name without ", " unchanged instead of failing.
authors = [' '.join(reversed(raw_name.split(', ', 1))) for raw_name in authors_raw]
print(authors)

# now trying to change the names to work anyway
# deleted because Google Scholar profiles insufficient:
# NOTE(review): 'Diederik van de Beek' is not an entry of `authors` (the list
# holds 'D van de Beek'), so the line below would raise ValueError if enabled.
# authors.pop(authors.index('Diederik van de Beek'))
# print(authors)

['Jan Theeuwes', 'Pieter Roelfsema', 'Richard Ridderinkhof', 'Cyriel Pennartz', 'D van de Beek', 'AB Brussaard', 'Ingo Willuhn', 'Christian Keysers', 'Christiaan Levelt', 'MC van den Oever', 'Hanneke Hulst', 'Vivi Heine', 'Christiaan de kock', 'Rik van der Kant', 'Esmee Verwijk', 'HS Scholte', 'Filip van Opstal', 'Ilja Sligte', 'Jakub Szymanik', 'Renee Visser', 'Mark van Vugt', 'Tomas Knapen', 'Guido Wingen', 'Harm Krugers', 'Carien Lansink', 'Jeannette Lorteije']


In [3]:
def get_abstracts_and_titles_for_author(author_name, nr_papers_per_author=nr_papers_per_author):
    """get_abstracts_and_titles_for_author

    Looks the author up with ``scholarly.search_author``, takes the first
    search hit, fills the leading ``nr_papers_per_author`` entries of its
    publication list and strips HTML-like tags from their abstract and title.

    Parameters
    ----------
    author_name : str
        name of author in "Firstname Lastname" format
    nr_papers_per_author : int, optional
        maximum number of papers per author to take

    Returns
    -------
    dict
        ``{'abstracts': [...], 'titles': [...]}``. Both lists hold one entry
        per fetched publication, in the same order, so ``abstracts[i]`` and
        ``titles[i]`` belong to the same paper; a field that is missing from a
        publication is stored as "". When the search yields no author, both
        lists consist of ``nr_papers_per_author`` empty strings.
    """
    def _clean_field(publication, key):
        # A missing field becomes "" (the same placeholder the not-found branch
        # uses) rather than being dropped: dropping entries would shift the
        # abstracts and titles lists against each other.
        if key not in publication.bib:
            return ""
        # remove anything that looks like an HTML/XML tag
        return re.sub('<[^<]+?>', '', str(publication.bib[key]))

    # broadcast author name to know where we are
    print(author_name)

    search_query = scholarly.search_author(author_name)
    try:
        # only the first search hit is used
        author = next(search_query).fill()
    except StopIteration:
        print("No author with name {author_name}".format(
            author_name=author_name
        ))
        return {'abstracts': [""] * nr_papers_per_author,
                'titles': [""] * nr_papers_per_author}

    # adjust the number of papers for junior authors
    nppa = min(nr_papers_per_author, len(author.publications))
    if nppa < nr_papers_per_author:
        print('adjusting nr of publications to {nppa} for {author_name}'.format(
            nppa=nppa,
            author_name=author_name
        ))

    # fill the data for the publications
    filled_publications = [pub.fill() for pub in author.publications[:nppa]]
    abstracts = [_clean_field(pub, 'abstract') for pub in filled_publications]
    titles = [_clean_field(pub, 'title') for pub in filled_publications]

    print('    found {nppa} articles for {author_name}'.format(
            nppa=nppa,
            author_name=author_name
        ))

    return {'abstracts': abstracts, 'titles': titles}

In [5]:
# Sequential fetch: one result dict per author, in the order of `authors`.
all_abstracts_and_titles = list(map(get_abstracts_and_titles_for_author, authors))

Jan Theeuwes
    found 10 articles for Jan Theeuwes
Pieter Roelfsema
    found 10 articles for Pieter Roelfsema
Richard Ridderinkhof
    found 10 articles for Richard Ridderinkhof
Cyriel Pennartz
    found 10 articles for Cyriel Pennartz
D van de Beek
    found 10 articles for D van de Beek
AB Brussaard
    found 10 articles for AB Brussaard
Ingo Willuhn
    found 10 articles for Ingo Willuhn
Christian Keysers
    found 10 articles for Christian Keysers
Christiaan Levelt
    found 10 articles for Christiaan Levelt
MC van den Oever
No author with name MC van den Oever
Hanneke Hulst
    found 10 articles for Hanneke Hulst
Vivi Heine
No author with name Vivi Heine
Christiaan de kock
    found 10 articles for Christiaan de kock
Rik van der Kant
    found 10 articles for Rik van der Kant
Esmee Verwijk
No author with name Esmee Verwijk
HS Scholte
    found 10 articles for HS Scholte
Filip van Opstal
    found 10 articles for Filip van Opstal
Ilja Sligte
    found 10 articles for Ilja Sligte


In [None]:
# for parallel processing: same call per author as the sequential cell,
# dispatched through joblib with 18 jobs
fetch_tasks = (delayed(get_abstracts_and_titles_for_author)(auth) for auth in authors)
all_abstracts_and_titles = Parallel(n_jobs=18)(fetch_tasks)

In [10]:
# Persist the fetched results. The target directory is created first so that a
# fresh checkout without a data/ folder does not fail with FileNotFoundError
# after the fetch has already been done.
os.makedirs('data', exist_ok=True)
with open('data/all_abstracts_and_titles.pkl', 'wb') as f:
    pickle.dump(all_abstracts_and_titles, f)