This notebook uses `scholarly` API (https://github.com/scholarly-python-package/scholarly) to enrich CS faculty info

In [1]:
from scrap_cs_faculty import *

In [2]:
org_list = ["Princeton-CS","UWash-CS", "UPenn-CS"]
idx_org = org_list.index("UWash-CS")
org = org_list[idx_org]
org2 = org.split("-")[0] 

## read CS faculty info scraped previously

In [3]:
file_xlsx = f"faculty-{org}.xlsx"
xlsxf = pd.ExcelFile(file_xlsx)

In [4]:
xlsxf.sheet_names

['Faculty']

In [5]:
df = xlsxf.parse('Faculty')

In [6]:
df

Unnamed: 0,name,job_title,phd_univ,phd_year,research_area,research_concentration,research_focus,url,img_url,phone,email,cell_phone,office_address,department,school,url_profile,url_author
0,Tim Althoff,,,,"Data Science, Data Mining, Social Network Anal...",,,https://homes.cs.washington.edu/~althoff/,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,althoff@cs.washington.edu,,CSE2 313,Computer Science,Univ Washington,,
1,Richard Anderson,,,,"Computing for the developing world, health inf...",,,https://www.cs.washington.edu/people/faculty/a...,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,anderson@cs.washington.edu,,CSE2 344,Computer Science,Univ Washington,,
2,Ruth E. Anderson,,,,"Computer science education, educational techno...",,,http://homes.cs.washington.edu/~rea,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,rea@cs.washington.edu,,CSE 558,Computer Science,Univ Washington,,
3,Tom Anderson,,,,"Distributed systems, networks, operating syste...",,,http://www.cs.washington.edu/people/faculty/tom/,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,tom@cs.washington.edu,,CSE 646,Computer Science,Univ Washington,,
4,Magdalena Balazinska,,,,"Databases, cloud computing, big-data analytics...",,,http://www.cs.washington.edu/people/faculty/ma...,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,magda@cs.washington.edu,,CSE 584,Computer Science,Univ Washington,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,Xi Wang,,,,"Computer systems, security, and programming la...",,,/people/faculty/xi,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,xi@cs.washington.edu,,CSE 580,Computer Science,Univ Washington,,
94,Robbie Weber,,,,,,,http://weberrobbie.com/,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,rtweber2@cs.washington.edu,,,Computer Science,Univ Washington,,
95,James R Wilcox,,,,"Computer science education, programming langua...",,,http://homes.cs.washington.edu/~jrw12/,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,jrw12@cs.washington.edu,,,Computer Science,Univ Washington,,
96,Brett Wortzman,,,,Computer science education; Computer science t...,,,https://homes.cs.washington.edu/~brettwo/,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,brettwo@cs.washington.edu,,CSE 542,Computer Science,Univ Washington,,


In [7]:
names = df["name"].to_list()

In [8]:
len(names), names 

(98,
 ['Tim Althoff',
  'Richard Anderson',
  'Ruth E. Anderson',
  'Tom Anderson',
  'Magdalena Balazinska',
  'Leilani Battle',
  'Paul Beame',
  'Gilbert Bernstein',
  'Rastislav Bodik',
  'Byron Boots',
  'Lauren Bricker',
  'Maya Cakmak',
  'Luis Ceze',
  'Yejin Choi',
  'Andrea Coladangelo',
  'Brian Curless',
  'Simon Du',
  'Michael Ernst',
  'Ali Farhadi',
  'James Fogarty',
  'Dieter Fox',
  'Jon E. Froehlich',
  'Elba Garza',
  'Shyam Gollakota',
  'Matthew Golub',
  'Dan Grossman',
  'Abhishek Gupta',
  'Hannaneh Hajishirzi',
  'Jeffrey Heer',
  'Kurtis Heimerl',
  'Justin Hsia',
  'Scott Sumio Ichikawa',
  'Vikram Iyer',
  'Kevin Jamieson',
  'Natasha Jaques',
  'René Just',
  'Anna Karlin',
  'Baris Kasikci',
  'Ira Kemelmacher-Shlizerman',
  'Pang Wei Koh',
  'David Kohlbrenner',
  'Tadayoshi Kohno',
  'Ranjay Krishna',
  'Arvind Krishnamurthy',
  'Ed Lazowska',
  'James R. Lee',
  'Su-In Lee',
  'Yin Tat Lee',
  'Hank Levy',
  'Huijia (Rachel) Lin',
  'Kevin Lin',
  'Ry

## fetch google scholar data for all CS faculty

In [9]:
ntest = -1  #  2 # 

In [10]:
pub_data = []
for n, name in enumerate(names[:ntest]):
#     if n < 5: continue 
    print(f"n, name = {n}, {name} ...")
    
    author_org = f"{name} {org2}"
    norm_auth_org = normalize_str(author_org)
    file_author = f"data/GScholar_{norm_auth_org}.json"

    try:
        search_query = scholarly.search_author(author_org)
        init_result = next(search_query)
    except Exception as ex:
        print(str(ex))
        init_result = None 
        
    if init_result is None:
        print(f"Failed search_author()")
        continue

    scholar_id = init_result.get("scholar_id", "")
    if not scholar_id:
        print(f"Missing scholar_id")
        continue        
    
    # fetch data
    author = scholarly.fill(init_result)
    
    author_dict = {}
    # fill data cell
    author_dict["name"] = name 
    author_dict["file_author"] = file_author 
    author_dict["scholar_id"] = scholar_id 
    author_dict["affiliation"] = author.get("affiliation", "")
    author_dict["interests"] = "; ".join(author.get("interests", []))
    author_dict["url_author"] = get_scholar_page(scholar_id)
    author_dict["url_picture"] = author.get("url_picture", "")
    author_dict["url_homepage"] = author.get("homepage", "")
    author_dict["citedby"] = author.get("citedby", 0)
    author_dict["citedby5y"] = author.get("citedby5y", 0)
    author_dict["hindex"] = author.get("hindex", 0)
    author_dict["hindex5y"] = author.get("hindex5y", 0)
    author_dict["i10index"] = author.get("i10index", 0)
    author_dict["i10index5y"] = author.get("i10index5y", 0)
    author_dict["num_papers"] = len(author.get("publications", []))
    author_dict["num_coauthors"] = len(author.get("coauthors", []))
      
    # fill row 
    author_data = []
    for c in SCHOLAR_HEADER:
        author_data.append(author_dict.get(c))
        
    # accumulate row
    pub_data.append(author_data)
    
    # persist author data
    with open(Path(file_author), "w", encoding="utf-8") as f:
        f.write(json.dumps(author))
    
    delay = randint(1,5)
    sleep(delay)

n, name = 0, Tim Althoff ...
n, name = 1, Richard Anderson ...
n, name = 2, Ruth E. Anderson ...
n, name = 3, Tom Anderson ...
n, name = 4, Magdalena Balazinska ...
n, name = 5, Leilani Battle ...
n, name = 6, Paul Beame ...
n, name = 7, Gilbert Bernstein ...

Failed search_author()
n, name = 8, Rastislav Bodik ...
n, name = 9, Byron Boots ...
n, name = 10, Lauren Bricker ...

Failed search_author()
n, name = 11, Maya Cakmak ...
n, name = 12, Luis Ceze ...
n, name = 13, Yejin Choi ...
n, name = 14, Andrea Coladangelo ...

Failed search_author()
n, name = 15, Brian Curless ...
n, name = 16, Simon Du ...
n, name = 17, Michael Ernst ...
n, name = 18, Ali Farhadi ...
n, name = 19, James Fogarty ...
n, name = 20, Dieter Fox ...
n, name = 21, Jon E. Froehlich ...
n, name = 22, Elba Garza ...

Failed search_author()
n, name = 23, Shyam Gollakota ...
n, name = 24, Matthew Golub ...

Failed search_author()
n, name = 25, Dan Grossman ...
n, name = 26, Abhishek Gupta ...
n, name = 27, Hannaneh Ha

In [11]:
len(pub_data)

73

## write out xlsx

In [12]:
df_out = pd.DataFrame(pub_data, columns=SCHOLAR_HEADER)

In [13]:
df_out.head()

Unnamed: 0,name,affiliation,interests,num_papers,num_coauthors,citedby,hindex,i10index,citedby5y,hindex5y,i10index5y,scholar_id,url_author,url_picture,url_homepage,file_author
0,Tim Althoff,"Assistant Professor of Computer Science, Unive...",data science; data mining; natural language pr...,98,26,4104,29,42,3617,27,42,yc4nBNgAAAAJ,https://scholar.google.com/citations?user=yc4n...,https://scholar.google.com/citations?view_op=m...,http://www.timalthoff.com/,data/GScholar_tim_althoff_uwash.json
1,Richard Anderson,"Professor of Computer Science and Engineering,...",Information and Computing Technologies for Dev...,1103,0,13823,61,228,3576,28,95,uLsDDUMAAAAJ,https://scholar.google.com/citations?user=uLsD...,https://scholar.google.com/citations?view_op=m...,http://www.cs.washington.edu/people/faculty/an...,data/GScholar_richard_anderson_uwash.json
2,Ruth E. Anderson,University of Washington,,61,0,2435,24,33,596,13,16,X3x2Pi0AAAAJ,https://scholar.google.com/citations?user=X3x2...,https://scholar.google.com/citations?view_op=m...,,data/GScholar_ruth_e_anderson_uwash.json
3,Tom Anderson,"Warren Francis and Wilma Kolm Bradley Chair, C...",Distributed Systems; Networking; Operating Sys...,310,56,59071,98,183,13284,50,109,MYqlcPgAAAAJ,https://scholar.google.com/citations?user=MYql...,https://scholar.google.com/citations?view_op=m...,http://www.cs.washington.edu/homes/tom,data/GScholar_tom_anderson_uwash.json
4,Magdalena Balazinska,University of Washington,Databases; data science; cloud computing; para...,226,0,17167,63,117,5458,39,88,DDxFvcIAAAAJ,https://scholar.google.com/citations?user=DDxF...,https://scholar.google.com/citations?view_op=m...,http://www.cs.washington.edu/people/faculty/magda,data/GScholar_magdalena_balazinska_uwash.json


In [14]:
# import xlsxwriter
file_xlsx = f"data/cs-faculty-gscholar-{org2}-{n}.xlsx"
writer = pd.ExcelWriter(Path(file_xlsx), engine='xlsxwriter')
df_out.to_excel(writer, sheet_name=org2, index=False)
writer.save()

  writer.save()


In [15]:
file_xlsx

'data/cs-faculty-gscholar-UWash-96.xlsx'