This notebook uses `scholarly` API (https://github.com/scholarly-python-package/scholarly) to enrich CS faculty info

In [2]:
from scrap_cs_faculty import *

In [3]:
org_list = ["CMU-CS","Cornell-CS","MIT-AID","MIT-CS","Stanford-CS","UCB-CS","UIUC-CS",]
idx_org = org_list.index("Cornell-CS")
org = org_list[idx_org]
org2 = org.split("-")[0] 

## read CS faculty info scraped previously

In [4]:
file_xlsx = f"faculty-{org}.xlsx"
xlsxf = pd.ExcelFile(file_xlsx)

In [5]:
xlsxf.sheet_names

['Faculty', 'Research Groups']

In [6]:
df = xlsxf.parse('Faculty')

In [7]:
df

Unnamed: 0,name,job_title,phd_univ,phd_year,research_area,research_concentration,research_focus,url,img_url,phone,email,cell_phone,office_address,department,school
0,Mohamed Abdelfattah,Assistant Professor,Univ Toronto,2016.0,"Artificial Intelligence, Machine Learning",Artificial Intelligence,,https://www.mohsaied.com,https://www.cs.cornell.edu/sites/default/files...,,,,,"Electrical and Computer Engineering, Cornell T...",
1,Jayadev Acharya,Assistant Professor,Univ California San Diego,2014.0,"Artificial Intelligence, Theory of Computing","Artificial Intelligence, Theory of Computation","Information theory, machine learning, and algo...",https://people.ece.cornell.edu/acharya/,https://www.cs.cornell.edu/sites/default/files...,,,,,"Electrical and Computer Engineering, CS Field ...",
2,Rachit Agarwal,Associate Professor,Univ Illinois Urbana-Champaign,2013.0,"Systems and Networking, Theory of Computing","Systems, Theory of Computation","Distributed systems, systems for big data anal...",http://www.cs.cornell.edu/~ragarwal/,https://www.cs.cornell.edu/sites/default/files...,,,,,"Computer Science, CS Field Member",
3,David Albonesi,Professor,Univ Mass Amherst,1996.0,"Computer Architecture & VLSI, Systems and Netw...",Systems,Adaptive and reconfigurable multi-core and pro...,http://www.csl.cornell.edu/~albonesi/,https://www.cs.cornell.edu/sites/default/files...,,,,,"Electrical and Computer Engineering, CS Field ...",
4,Lorenzo Alvisi,Professor,Cornell Univ,1996.0,Systems and Networking,Systems,Theory and practice of dependable distributed ...,http://www.cs.cornell.edu/lorenzo/,https://www.cs.cornell.edu/sites/default/files...,,,,,"Computer Science, CS Field Member, Tisch Unive...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,Christina Lee Yu,Assistant Professor,MIT,,Theory of Computing,Theory of Computation,"Theory of Computing, Artificial Intelligence (...",https://people.orie.cornell.edu/cleeyu/,https://www.cs.cornell.edu/sites/default/files...,,,,,"ORIE, CS Field Member",
118,Haiyuan Yu,Professor,Yale,2005.0,Computational Biology,Scientific Computing and Applications,"Biomedical systems biology, machine learning, ...",https://www.yulab.org,https://www.cs.cornell.edu/sites/default/files...,,,,,"Biological Statistics & Computational Biology,...",
119,Ramin Zabih,Professor,Stanford Univ,1994.0,"Human Interaction, Vision","Artificial Intelligence, Theory of Computation","Computer vision and its applications, especial...",http://www.cs.cornell.edu/~rdz,https://www.cs.cornell.edu/sites/default/files...,,,,,"Computer Science, Cornell Tech, CS Field Member",
120,Cheng Zhang,Assistant Professor,Georgia Institute Technology,2018.0,Artificial Intelligence,Systems,"Ubiquitous Computing, Wearable Computing, Huma...",http://www.czhang.org/,https://www.cs.cornell.edu/sites/default/files...,,,,,"Information Science, CS Field Member",


In [8]:
names = df["name"].to_list()

In [9]:
len(names), names 

(122,
 ['Mohamed Abdelfattah',
  'Jayadev Acharya',
  'Rachit Agarwal',
  'David Albonesi',
  'Lorenzo Alvisi',
  'Yoav Artzi',
  'Shiri Azenkot',
  'Kavita Bala',
  'Siddhartha  Banerjee',
  'Christopher Batten',
  'Tapomayukh Bhattacharjee',
  'David Bindel',
  'Ken Birman',
  'Ronald Brachman',
  'Anne Bracy',
  'Florentina Bunea',
  'Mark Campbell',
  'Claire Cardie',
  'Eshan Chattopadhyay',
  'Sanjiban Choudhury',
  'Tanzeem Choudhury',
  'Michael Clarkson',
  'Robert Constable',
  'Anil Damle',
  'Cristian Danescu-Niculescu-Mizil',
  'Abe Davis',
  'Christopher De Sa',
  'Sarah Dean',
  'Nicola Dell',
  'Alan Demers',
  'Shimon Edelman',
  'Ahmed El Alaoui',
  'Kevin Ellis',
  'Deborah Estrin',
  'K-Y. Daisy Fan',
  'Silvia Ferrari',
  'Nate Foster',
  'Nikhil Garg',
  'Ziv Goldfeld',
  'Carla Gomes',
  'Donald Greenberg',
  'David Gries',
  'Giulia Guidi',
  'Francois Guimbretiere',
  'Joseph Halpern',
  'Bharath Hariharan',
  'Haym Hirsh',
  'Guy Hoffman',
  'John Hopcroft',
 

## fetch google scholar data for all CS faculty

In [10]:
ntest = -1  #  2 # 

In [11]:
pub_data = []
for n, name in enumerate(names[:ntest]):
#     if n < 5: continue 
    print(f"n, name = {n}, {name} ...")
    
    author_org = f"{name} {org2}"
    norm_auth_org = normalize_str(author_org)
    file_author = f"data/GScholar_{norm_auth_org}.json"

    try:
        search_query = scholarly.search_author(author_org)
        init_result = next(search_query)
    except Exception as ex:
        print(str(ex))
        init_result = None 
        
    if init_result is None:
        print(f"Failed search_author()")
        continue

    scholar_id = init_result.get("scholar_id", "")
    if not scholar_id:
        print(f"Missing scholar_id")
        continue        
    
    # fetch data
    author = scholarly.fill(init_result)
    
    author_dict = {}
    # fill data cell
    author_dict["name"] = name 
    author_dict["file_author"] = file_author 
    author_dict["scholar_id"] = scholar_id 
    author_dict["affiliation"] = author.get("affiliation", "")
    author_dict["interests"] = "; ".join(author.get("interests", []))
    author_dict["url_author"] = get_scholar_page(scholar_id)
    author_dict["url_picture"] = author.get("url_picture", "")
    author_dict["url_homepage"] = author.get("homepage", "")
    author_dict["citedby"] = author.get("citedby", 0)
    author_dict["citedby5y"] = author.get("citedby5y", 0)
    author_dict["hindex"] = author.get("hindex", 0)
    author_dict["hindex5y"] = author.get("hindex5y", 0)
    author_dict["i10index"] = author.get("i10index", 0)
    author_dict["i10index5y"] = author.get("i10index5y", 0)
    author_dict["num_papers"] = len(author.get("publications", []))
    author_dict["num_coauthors"] = len(author.get("coauthors", []))
      
    # fill row 
    author_data = []
    for c in SCHOLAR_HEADER:
        author_data.append(author_dict.get(c))
        
    # accumulate row
    pub_data.append(author_data)
    
    # persist author data
    with open(Path(file_author), "w", encoding="utf-8") as f:
        f.write(json.dumps(author))
    
    delay = randint(1,5)
    sleep(delay)

n, name = 0, Mohamed Abdelfattah ...
n, name = 1, Jayadev Acharya ...
n, name = 2, Rachit Agarwal ...
n, name = 3, David Albonesi ...

Failed search_author()
n, name = 4, Lorenzo Alvisi ...
n, name = 5, Yoav Artzi ...
n, name = 6, Shiri Azenkot ...
n, name = 7, Kavita Bala ...
n, name = 8, Siddhartha  Banerjee ...
n, name = 9, Christopher Batten ...
n, name = 10, Tapomayukh Bhattacharjee ...
n, name = 11, David Bindel ...
n, name = 12, Ken Birman ...
n, name = 13, Ronald Brachman ...
n, name = 14, Anne Bracy ...

Failed search_author()
n, name = 15, Florentina Bunea ...
n, name = 16, Mark Campbell ...
n, name = 17, Claire Cardie ...
n, name = 18, Eshan Chattopadhyay ...
n, name = 19, Sanjiban Choudhury ...
n, name = 20, Tanzeem Choudhury ...
n, name = 21, Michael Clarkson ...
n, name = 22, Robert Constable ...

Failed search_author()
n, name = 23, Anil Damle ...
n, name = 24, Cristian Danescu-Niculescu-Mizil ...
n, name = 25, Abe Davis ...
n, name = 26, Christopher De Sa ...
n, name = 

In [12]:
len(pub_data)

107

## write out xlsx

In [13]:
df_out = pd.DataFrame(pub_data, columns=SCHOLAR_HEADER)

In [14]:
df_out.head()

Unnamed: 0,name,affiliation,interests,num_papers,num_coauthors,citedby,hindex,i10index,citedby5y,hindex5y,i10index5y,scholar_id,url_author,url_picture,url_homepage,file_author
0,Mohamed Abdelfattah,Cornell University,Machine Learning; FPGAs; Reconfigurable Computing,57,28,1220,17,22,999,15,18,q4wBpWAAAAAJ,https://scholar.google.com/citations?user=q4wB...,https://scholar.google.com/citations?view_op=m...,http://mohsaied.com/,data/GScholar_mohamed_abdelfattah_cornell.json
1,Jayadev Acharya,"Assistant Professor, Cornell University",Learning theory; information theory; statistic...,89,44,2684,28,52,2044,26,46,70vJVxcAAAAJ,https://scholar.google.com/citations?user=70vJ...,https://scholar.google.com/citations?view_op=m...,https://people.ece.cornell.edu/acharya/,data/GScholar_jayadev_acharya_cornell.json
2,Rachit Agarwal,Cornell University,Systems; Networking; Distributed systems,48,57,2861,26,34,2144,20,25,71IXR1QAAAAJ,https://scholar.google.com/citations?user=71IX...,https://scholar.google.com/citations?view_op=m...,http://www.cs.cornell.edu/~ragarwal/,data/GScholar_rachit_agarwal_cornell.json
3,Lorenzo Alvisi,Tisch University Professor of Computer Science...,Distributed Computing,199,13,15019,56,96,4021,29,51,TIGS4c8AAAAJ,https://scholar.google.com/citations?user=TIGS...,https://scholar.google.com/citations?view_op=m...,http://www.cs.cornell.edu/lorenzo/,data/GScholar_lorenzo_alvisi_cornell.json
4,Yoav Artzi,Cornell University; ASAPP,Natural Language Processing; Machine Learning,71,80,8456,33,46,7505,31,44,XuQW7ogAAAAJ,https://scholar.google.com/citations?user=XuQW...,https://scholar.google.com/citations?view_op=m...,http://yoavartzi.com/,data/GScholar_yoav_artzi_cornell.json


In [15]:
# import xlsxwriter
file_xlsx = f"data/cs-faculty-gscholar-{org2}-{n}.xlsx"
writer = pd.ExcelWriter(Path(file_xlsx), engine='xlsxwriter')
df_out.to_excel(writer, sheet_name=org2, index=False)
writer.save()

  writer.save()


In [16]:
file_xlsx

'data/cs-faculty-gscholar-Cornell-120.xlsx'