This notebook uses `scholarly` API (https://github.com/scholarly-python-package/scholarly) to enrich CS faculty info

In [1]:
from scrap_cs_faculty import *

In [2]:
org_list = ["Princeton-CS","UWash-CS", "UPenn-CS"]
idx_org = org_list.index("UPenn-CS")
org = org_list[idx_org]
org2 = org.split("-")[0] 

## read CS faculty info scraped previously

In [3]:
file_xlsx = f"faculty-{org}.xlsx"
xlsxf = pd.ExcelFile(file_xlsx)

In [4]:
xlsxf.sheet_names

['Faculty']

In [5]:
df = xlsxf.parse('Faculty')

In [6]:
df

Unnamed: 0,name,job_title,phd_univ,phd_year,research_area,research_concentration,research_focus,url,img_url,phone,email,cell_phone,office_address,department,school,url_profile,url_author
0,Shivani Agarwal,"Associate Professor, Rachleff Family Associate...",,,,,,https://directory.seas.upenn.edu/shivani-agarwal/,https://directory.seas.upenn.edu/wp-content/up...,,ashivani@cis.upenn.edu,,,Computer Science,Univ Pennsylvania,,
1,Rajeev Alur,"Professor, Zisman Family Professor, Computer a...",,,,,,https://directory.seas.upenn.edu/rajeev-alur/,https://directory.seas.upenn.edu/wp-content/up...,,ALUR@CIS.UPENN.EDU,,,Computer Science,Univ Pennsylvania,,https://scholar.google.com/citations?user=ZvLa...
2,Sebastian Angel,"Assistant Professor, Raj and Neera Singh Term ...",,,,,,https://directory.seas.upenn.edu/sebastian-angel/,https://directory.seas.upenn.edu/wp-content/up...,,sga001@cis.upenn.edu,,,Computer Science,Univ Pennsylvania,,
3,Ryan Baker,"Associate Professor, Computer and Information ...",,,,,,https://directory.seas.upenn.edu/ryan-baker/,https://directory.seas.upenn.edu/wp-content/up...,,rybaker@upenn.edu,,,Computer Science,Univ Pennsylvania,,
4,Yoseph Barash,"Associate Professor, Computer and Information ...",,,,,,https://directory.seas.upenn.edu/yoseph-barash/,https://directory.seas.upenn.edu/wp-content/up...,(215) 746-8683,yosephb@seas.upenn.edu,,,Computer Science,Univ Pennsylvania,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,Richard Paul,"Professor Emeritus, Computer and Information S...",,,,,,https://directory.seas.upenn.edu/richard-paul/,https://directory.seas.upenn.edu/wp-content/up...,,lou@seas.upenn.edu,,,Computer Science,Univ Pennsylvania,,
108,Mark Steedman,"Adjunct Professor, Computer and Information Sc...",,,,,,https://directory.seas.upenn.edu/mark-steedman/,https://directory.seas.upenn.edu/wp-content/up...,,steedman@seas.upenn.edu,,,Computer Science,Univ Pennsylvania,,
109,Bonnie Webber,"Adjunct Professor, Computer and Information Sc...",,,,,,https://directory.seas.upenn.edu/bonnie-webber/,https://directory.seas.upenn.edu/wp-content/up...,,bonnie@seas.upenn.edu,,,Computer Science,Univ Pennsylvania,,
110,James Weimer,"Adjunct Assistant Professor, Computer and Info...",,,,,,https://directory.seas.upenn.edu/james-weimer/,https://directory.seas.upenn.edu/wp-content/up...,,weimerj@seas.upenn.edu,,,Computer Science,Univ Pennsylvania,,https://scholar.google.com/citations?user=IeuL...


In [7]:
names = df["name"].to_list()

In [8]:
len(names), names 

(112,
 ['Shivani Agarwal',
  'Rajeev Alur',
  'Sebastian Angel',
  'Ryan Baker',
  'Yoseph Barash',
  'Osbert Bastani',
  'Arvind Bhusnurmath',
  'Chris Callison-Burch',
  'Damon Centola',
  'Pratik Chaudhari',
  'Konstantinos Daniilidis',
  'Susan B. Davidson',
  'Anindya De',
  'Andre DeHon',
  'Joseph Devietti',
  'Eric Eaton',
  'Thomas Farmer',
  'Nadia Figueroa',
  'Eric Fouh',
  'Jean H. Gallier',
  'Rajiv Gandhi',
  'Jacob Gardner',
  'James Gee',
  'Surbhi Goel',
  'Sharath Chandra Guntuku',
  'Andreas Haeberlen',
  'Hamed Hassani',
  'Andrew Head',
  'Brett Hemenway',
  'M. Ani Hsieh',
  'Zachary Ives',
  'Dinesh Jayaraman',
  'Kevin B. Johnson',
  'Yasmin Kafai',
  'Sampath K. Kannan',
  'Michael Kearns',
  'Sanjeev Khanna',
  'Bongho Kim',
  'Junhyong Kim',
  'Daniel E. Koditschek',
  'Konrad Paul Kording',
  'Brandon Krakowsky',
  'Vijay Kumar',
  'Stephen Lane',
  'Benjamin C. Lee',
  'Insup Lee',
  'Jing (Jane) Li',
  'Gushu Li',
  'Mark Y. Liberman',
  'Vincent Liu',
  

## fetch google scholar data for all CS faculty

In [9]:
ntest = -1  #  2 # 

In [10]:
pub_data = []
for n, name in enumerate(names[:ntest]):
#     if n < 5: continue 
    print(f"n, name = {n}, {name} ...")
    
    author_org = f"{name} {org2}"
    norm_auth_org = normalize_str(author_org)
    file_author = f"data/GScholar_{norm_auth_org}.json"

    try:
        search_query = scholarly.search_author(author_org)
        init_result = next(search_query)
    except Exception as ex:
        print(str(ex))
        init_result = None 
        
    if init_result is None:
        print(f"Failed search_author()")
        continue

    scholar_id = init_result.get("scholar_id", "")
    if not scholar_id:
        print(f"Missing scholar_id")
        continue        
    
    # fetch data
    author = scholarly.fill(init_result)
    
    author_dict = {}
    # fill data cell
    author_dict["name"] = name 
    author_dict["file_author"] = file_author 
    author_dict["scholar_id"] = scholar_id 
    author_dict["affiliation"] = author.get("affiliation", "")
    author_dict["interests"] = "; ".join(author.get("interests", []))
    author_dict["url_author"] = get_scholar_page(scholar_id)
    author_dict["url_picture"] = author.get("url_picture", "")
    author_dict["url_homepage"] = author.get("homepage", "")
    author_dict["citedby"] = author.get("citedby", 0)
    author_dict["citedby5y"] = author.get("citedby5y", 0)
    author_dict["hindex"] = author.get("hindex", 0)
    author_dict["hindex5y"] = author.get("hindex5y", 0)
    author_dict["i10index"] = author.get("i10index", 0)
    author_dict["i10index5y"] = author.get("i10index5y", 0)
    author_dict["num_papers"] = len(author.get("publications", []))
    author_dict["num_coauthors"] = len(author.get("coauthors", []))
      
    # fill row 
    author_data = []
    for c in SCHOLAR_HEADER:
        author_data.append(author_dict.get(c))
        
    # accumulate row
    pub_data.append(author_data)
    
    # persist author data
    with open(Path(file_author), "w", encoding="utf-8") as f:
        f.write(json.dumps(author))
    
    delay = randint(1,5)
    sleep(delay)

n, name = 0, Shivani Agarwal ...
n, name = 1, Rajeev Alur ...
n, name = 2, Sebastian Angel ...
n, name = 3, Ryan Baker ...
n, name = 4, Yoseph Barash ...
n, name = 5, Osbert Bastani ...
n, name = 6, Arvind Bhusnurmath ...

Failed search_author()
n, name = 7, Chris Callison-Burch ...
n, name = 8, Damon Centola ...
n, name = 9, Pratik Chaudhari ...
n, name = 10, Konstantinos Daniilidis ...
n, name = 11, Susan B. Davidson ...

Failed search_author()
n, name = 12, Anindya De ...
n, name = 13, Andre DeHon ...
n, name = 14, Joseph Devietti ...
n, name = 15, Eric Eaton ...
n, name = 16, Thomas Farmer ...
n, name = 17, Nadia Figueroa ...
n, name = 18, Eric Fouh ...
n, name = 19, Jean H. Gallier ...

Failed search_author()
n, name = 20, Rajiv Gandhi ...

Failed search_author()
n, name = 21, Jacob Gardner ...
n, name = 22, James Gee ...
n, name = 23, Surbhi Goel ...
n, name = 24, Sharath Chandra Guntuku ...
n, name = 25, Andreas Haeberlen ...
n, name = 26, Hamed Hassani ...
n, name = 27, Andrew 

In [11]:
len(pub_data)

75

## write out xlsx

In [12]:
df_out = pd.DataFrame(pub_data, columns=SCHOLAR_HEADER)

In [13]:
df_out.head()

Unnamed: 0,name,affiliation,interests,num_papers,num_coauthors,citedby,hindex,i10index,citedby5y,hindex5y,i10index5y,scholar_id,url_author,url_picture,url_homepage,file_author
0,Shivani Agarwal,University of Pennsylvania,Machine Learning; Learning Theory,80,0,5337,33,55,2082,25,42,Q4ErnU4AAAAJ,https://scholar.google.com/citations?user=Q4Er...,https://scholar.google.com/citations?view_op=m...,https://www.shivani-agarwal.net/,data/GScholar_shivani_agarwal_upenn.json
1,Rajeev Alur,Zisman Family Professor of Computer and Inform...,Formal methods; Trustworthy AI; Cyber-physical...,484,52,51973,94,239,11184,49,147,ZvLa1RIAAAAJ,https://scholar.google.com/citations?user=ZvLa...,https://scholar.google.com/citations?view_op=m...,http://www.cis.upenn.edu/~alur/,data/GScholar_rajeev_alur_upenn.json
2,Sebastian Angel,University of Pennsylvania & Microsoft Research,Operating Systems; Distributed Systems; Networ...,37,10,1546,13,16,1155,13,16,pNaNj8EAAAAJ,https://scholar.google.com/citations?user=pNaN...,https://scholar.google.com/citations?view_op=m...,https://cis.upenn.edu/~sga001,data/GScholar_sebastian_angel_upenn.json
3,Ryan Baker,University of Pennsylvania,Educational Data Mining; Learning Analytics; L...,530,182,27442,74,299,16072,59,256,hvs8PEoAAAAJ,https://scholar.google.com/citations?user=hvs8...,https://scholar.google.com/citations?view_op=m...,http://www.upenn.edu/learninganalytics/ryanbaker/,data/GScholar_ryan_baker_upenn.json
4,Yoseph Barash,Associate Professor of Genetics and Computer &...,Computational RNA Biology; Machine learning; G...,92,0,6760,32,46,4057,24,41,U5G4TJAAAAAJ,https://scholar.google.com/citations?user=U5G4...,https://scholar.google.com/citations?view_op=m...,http://www.biociphers.org/,data/GScholar_yoseph_barash_upenn.json


In [14]:
# import xlsxwriter
file_xlsx = f"data/cs-faculty-gscholar-{org2}-{n}.xlsx"
writer = pd.ExcelWriter(Path(file_xlsx), engine='xlsxwriter')
df_out.to_excel(writer, sheet_name=org2, index=False)
writer.save()

  writer.save()


In [15]:
file_xlsx

'data/cs-faculty-gscholar-UPenn-110.xlsx'