Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add get_authors script to list all authors
Not sure whether this use of the author information is what you wanted. Let me know if you have any comments.
- Loading branch information
1 parent
7b7bf91
commit db6286c
Showing
1 changed file
with
108 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
|
||
# coding: utf-8 | ||
|
||
import codecs
import csv
import re
from html.parser import HTMLParser
from urllib.parse import quote

import requests
|
||
# Search URL prefix on semanticscholar.org; the query text is appended
# directly after "q=".
search_engine = "https://www.semanticscholar.org/search?q="

# Remaining query-string parameters, appended after the query text.
post_fix = "&sort=relevance&ae=false"
|
||
class AuthorParser(HTMLParser):
    """Collect author names from the first ``<article>`` of an HTML page.

    The parser tracks the chain of currently open tags, starting at the
    first ``<article>`` element, as a dot-separated string in
    ``tail_string`` (for example ``"article.header.ul"``).  Text found
    while that chain equals ``_AUTHOR_PATH`` is recorded as an author
    name.  Parsing stops once the first article has been closed.

    Call :meth:`clean` before feeding another page to the same instance.
    """

    # Chain of open tags under which the text node is an author name.
    _AUTHOR_PATH = "article.header.ul.li.span.span.a.span.span"

    # HTML void elements never get an end tag, so they must not be pushed
    # on the open-tag chain (they would never be popped again).
    _VOID_TAGS = frozenset((
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    ))

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance state: a class-level list would be shared by every
        # parser instance.
        self.tail_string = ""   # dot-separated chain of open tags
        self.m_Stop = False     # True once the first article was closed
        self.m_authors = []     # author names in document order

    def handle_starttag(self, tag, attr):
        """Push *tag* on the open-tag chain once inside the first article."""
        if self.m_Stop or tag in self._VOID_TAGS:
            return
        if not self.tail_string:
            # Tracking only starts at the first <article>.
            if tag == 'article':
                self.tail_string = tag
            return
        self.tail_string = self.tail_string + "." + tag

    def handle_endtag(self, tag):
        """Pop the most recently opened *tag* from the open-tag chain."""
        if self.m_Stop or tag in self._VOID_TAGS:
            return
        if self.tail_string == "article":
            # ONLY handle the first article
            self.m_Stop = True
        if self.tail_string:
            tags = self.tail_string.split('.')
            # Remove the innermost matching tag; unmatched end tags are
            # ignored so sloppy markup does not empty the chain.
            for pos in range(len(tags) - 1, -1, -1):
                if tags[pos] == tag:
                    del tags[pos]
                    break
            self.tail_string = ".".join(tags)

    def handle_data(self, data):
        """Record *data* as an author when the open-tag chain matches."""
        if self.m_Stop:
            return
        if self.tail_string == self._AUTHOR_PATH:
            self.m_authors.append(data)

    def get_authors(self):
        """Return the list of author names collected so far."""
        return self.m_authors

    def clean(self):
        """Forget all collected state so another page can be fed.

        A new list is bound to ``m_authors``, so a list previously
        returned by :meth:`get_authors` is left untouched.
        """
        # Drop any unprocessed input left in the underlying HTMLParser.
        self.reset()
        self.m_authors = []
        self.tail_string = ""
        self.m_Stop = False
|
||
|
||
def getPaperNames(readme_file):
    """Return the paper titles listed in a README-style markdown file.

    A line contributes a title when it contains ``[[pdf]]`` and matches
    ``**<title>** ... [[pdf]](<url>)``; the text between the first pair of
    ``**`` markers is taken as the title.

    Args:
        readme_file: Path of the UTF-8 encoded file to scan.

    Returns:
        List of titles in file order (duplicates are kept).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Per the original author's note, requiring the bold "**" markers
    # limits the matches to the top-100 papers; relax the expression to
    # pick up the other entries as well.
    # Raw string: "\*" and "\[" are regex escapes, not string escapes.
    pattern = re.compile(r'\*\*(.*?)\*\*.*?\[\[pdf\]\]\((.*?)\)')

    paper_list = []
    with open(readme_file, encoding='utf-8') as f:
        for line in f:
            if '[[pdf]]' not in line:
                continue
            result = pattern.search(line)
            if result:
                paper_list.append(result.group(1))
    return paper_list
|
||
# Script body: look up every paper title from README.md on the search
# engine, collect the authors of the first result, and write a per-author
# score to author.csv.
all_papers = getPaperNames("README.md")

author_parser = AuthorParser()
# Maps author name -> list of (rank, paper title), where rank is the
# 1-based position of the author in that paper's author list.
author_dict = {}
for index, paper in enumerate(all_papers):
    # str.replace returns a new string, so the title must be encoded into
    # the URL explicitly; quote() also escapes '&', '#', '?' and the like.
    query_url = search_engine + quote(paper, safe='') + post_fix
    try:
        search_result = requests.get(query_url, timeout=30)
    except requests.RequestException as err:
        # One failed lookup should not discard the results gathered so far.
        print("Skipped %d |" % index, paper, "(%s)" % err)
        continue
    author_parser.feed(search_result.text)
    authors = author_parser.get_authors()
    for weight, author in enumerate(authors, start=1):
        author_dict.setdefault(author, []).append((weight, paper))
    author_parser.clean()
    print("Processed %d |" % (index), paper)

# example usage of author information
# Each paper contributes 1/rank to its author's score, so earlier positions
# in the author list weigh more.
with open("author.csv", 'w', newline='', encoding='utf-8') as fcsv:
    # csv.writer terminates every row and quotes names containing commas.
    writer = csv.writer(fcsv)
    for author, papers in author_dict.items():
        score = sum((1.0 / weight for weight, _ in papers), 0.0)
        print(author, " score: %.2f" % score)
        writer.writerow([author, "%.2f" % score])
|
||
|
||
|