This notebook uses the `scholarly` API (https://github.com/scholarly-python-package/scholarly) to enrich previously scraped CS faculty info with Google Scholar data.

In [1]:
from scrap_cs_faculty import *

In [2]:
# Known organization keys; the selected key picks the faculty workbook to read.
org_list = [
    "CMU-CS",
    "Cornell-CS",
    "MIT-AID",
    "MIT-CS",
    "Stanford-CS",
    "UCB-CS",
    "UIUC-CS",
]
# Select the organization to process in this run.
idx_org = org_list.index("MIT-AID")
org = org_list[idx_org]
# Short organization name: everything before the first "-" (e.g. "MIT").
org2 = org.partition("-")[0]

## Read previously scraped CS faculty info

In [3]:
# Path of the faculty workbook for the selected organization (relative to the working directory).
file_xlsx = f"faculty-{org}.xlsx"
# Open the workbook once so its sheets can be listed and parsed in later cells.
xlsxf = pd.ExcelFile(file_xlsx)

In [4]:
# List the sheets available in the workbook.
xlsxf.sheet_names

['Faculty', 'Research Groups']

In [5]:
# Load the 'Faculty' sheet into a DataFrame.
df = xlsxf.parse('Faculty')

In [6]:
# Display the faculty table (pandas truncates long frames in the rendered output).
df

Unnamed: 0,name,job_title,phd_univ,phd_year,research_area,research_concentration,research_focus,url,img_url,phone,email,cell_phone,office_address,department,school
0,Hal Abelson,Class of 1992 Professor,,,"AI and Society, Artificial Intelligence + Deci...",,,https://www.eecs.mit.edu/people/hal-abelson/,https://www.eecs.mit.edu/wp-content/uploads/20...,(617) 253-5856,hal@mit.edu,,Office: 32-G516,Computer Science; AI & Decision-making,Massachusetts Institute Technology
1,Elfar Adalsteinsson,"Education Officer for Electrical Engineering, ...",,,"AI for Healthcare and Life Sciences, Biologica...",,,https://www.eecs.mit.edu/people/elfar-adalstei...,https://www.eecs.mit.edu/people/elfar-adalstei...,(617) 324-3597,elfar@mit.edu,,Office: 36-766,Electrical Engineering; AI & Decision-making,Massachusetts Institute Technology
2,Pulkit Agrawal,Steven and Renee Finn Career Development Profe...,,,"Artificial Intelligence + Machine Learning, Gr...",,,https://www.eecs.mit.edu/people/pulkit-agrawal/,https://www.eecs.mit.edu/people/pulkit-agrawal/,(617) 253-5851,pulkitag@mit.edu,,Office: 32-342,AI & Decision-making; Computer Science,Massachusetts Institute Technology
3,Mohammad Alizadeh,Associate Professor,,,"Multicore Processors & Cloud Computing, Securi...",,,https://www.eecs.mit.edu/people/mohammad-aliza...,https://www.eecs.mit.edu/people/mohammad-aliza...,(617) 253-6042,alizadeh@mit.edu,,Office: 32-G920,Computer Science;,Massachusetts Institute Technology
4,Jacob Andreas,"X-Window Consortium Professor, Assistant Profe...",,,"Artificial Intelligence + Machine Learning, Na...",,,https://www.eecs.mit.edu/people/jacob-andreas/,https://www.eecs.mit.edu/people/jacob-andreas/,617-253-0352,jda@mit.edu,,Office: 32-386H,AI & Decision-making,Massachusetts Institute Technology
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,Ashia Wilson,Lister Brothers (Gordon K. ’30 and Donald K. ’...,,,"AI and Society, Artificial Intelligence + Mach...",,,https://www.eecs.mit.edu/people/ashia-wilson/,https://www.eecs.mit.edu/wp-content/uploads/20...,,ashia07@mit.edu,,Office: 32-D660,AI & Decision-making,Massachusetts Institute Technology
88,Gregory Wornell,Sumitomo Electric Industries Professor in Engi...,,,"Artificial Intelligence + Machine Learning, Co...",,,https://www.eecs.mit.edu/people/gregory-wornell/,https://www.eecs.mit.edu/people/gregory-wornell/,(617) 253-3513,gww@mit.edu,,Office: 36-677,AI & Decision-making,Massachusetts Institute Technology
89,Guangyu Robert Yang,Assistant Professor / Shared appointment in BCS,,,Artificial Intelligence + Decision making,,,https://www.eecs.mit.edu/people/guangyu-robert...,https://www.eecs.mit.edu/wp-content/uploads/20...,(475) 201-7714,yanggr@mit.edu,,Office: 46,AI & Decision-making,Massachusetts Institute Technology
90,Lizhong Zheng,Professor of EE,,,"Communications Systems, Information Science an...",,,https://www.eecs.mit.edu/people/lizhong-zheng/,https://www.eecs.mit.edu/people/lizhong-zheng/,(617) 452-2941,lizhong@mit.edu,,Office: 36-660,AI & Decision-making; Electrical Engineering,Massachusetts Institute Technology


In [7]:
# Faculty names, in sheet order; each one is used to build a Google Scholar author query below.
names = df["name"].to_list()

In [8]:
# Show how many names were loaded, followed by the full list.
len(names), names 

(92,
 ['Hal Abelson',
  'Elfar Adalsteinsson',
  'Pulkit Agrawal',
  'Mohammad Alizadeh',
  'Jacob Andreas',
  'Hari Balakrishnan',
  'Regina Barzilay',
  'Robert Berwick',
  'Sangeeta Bhatia',
  'Duane Boning',
  'Guy Bresler',
  'Tamara Broderick',
  'Michael Carbin',
  'Vincent Chan',
  'YuFeng (Kevin)  Chen',
  'Isaac Chuang',
  'Connor Wilson Coley',
  'Munther Dahleh',
  'Luca Daniel',
  'Constantinos Daskalakis',
  'Randall Davis',
  'Frederic Durand',
  'Dirk Englund',
  'Dennis Freeman',
  'William Freeman',
  'Marzyeh Ghassemi',
  'David Gifford',
  'Polina Golland',
  'W. Eric L Grimson',
  'Dylan Hadfield-Menell',
  'Thomas Heldt',
  'Sam Hopkins',
  'Berthold Horn',
  'Daniel Huttenlocher',
  'Piotr Indyk',
  'Phillip Isola',
  'Tommi Jaakkola',
  'Patrick Jaillet',
  'Stefanie Jegelka',
  'Leslie Kaelbling',
  'Dina Katabi',
  'Manolis Kellis',
  'Yoon Kim',
  'Mina Konakovic Lukovic',
  'Jing Kong',
  'Tim Kraska',
  'Jae Lim',
  'Tomás Lozano-Pérez',
  'Samuel Madden',


## Fetch Google Scholar data for all CS faculty

In [9]:
# Number of leading names to process. None means "all of them".
# Do not use -1 for "all": names[:-1] silently drops the last faculty member.
# Set a small positive integer (e.g. 2) for a quick test run.
ntest = None

In [10]:
# Crawl Google Scholar for every selected faculty member.
#
# For each name this cell:
#   1. searches Google Scholar for "<name> <org2>" and takes the first hit,
#   2. fills that hit with the full author record,
#   3. appends one row (ordered by SCHOLAR_HEADER) to `pub_data`,
#   4. writes the raw author record to data/GScholar_<normalized query>.json.
# A failure for one person is reported and skipped; it never aborts the run.
pub_data = []

# `ntest` limits the run to the first N names. None (or a negative sentinel
# such as -1) means "everyone"; a plain names[:-1] would drop the last person.
names_to_fetch = names if ntest is None or ntest < 0 else names[:ntest]

# The per-author JSON files are written under data/; make sure it exists.
Path("data").mkdir(parents=True, exist_ok=True)

# NOTE: `n` is read again by the export cell to build the output filename,
# so the loop variable keeps that name.
for n, name in enumerate(names_to_fetch):
    print(f"n, name = {n}, {name} ...")

    author_org = f"{name} {org2}"
    norm_auth_org = normalize_str(author_org)
    file_author = f"data/GScholar_{norm_auth_org}.json"

    # Step 1: search. next(..., None) turns "no results" into None instead of
    # a StopIteration whose message is empty.
    try:
        init_result = next(scholarly.search_author(author_org), None)
    except Exception as ex:
        print(f"{type(ex).__name__}: {ex}")
        init_result = None

    # Step 2: fill. Guarded so that a single network/captcha error only skips
    # this author rather than discarding the rest of the crawl.
    author = None
    if init_result is None:
        print("Failed search_author()")
    elif not init_result.get("scholar_id", ""):
        print("Missing scholar_id")
    else:
        try:
            author = scholarly.fill(init_result)
        except Exception as ex:
            print(f"{type(ex).__name__}: {ex}")
            print("Failed fill()")

    if author is not None:
        scholar_id = init_result["scholar_id"]

        # Step 3: collect the fields of interest, with defaults for absent keys.
        author_dict = {
            "name": name,
            "file_author": file_author,
            "scholar_id": scholar_id,
            "affiliation": author.get("affiliation", ""),
            "interests": "; ".join(author.get("interests", [])),
            "url_author": get_scholar_page(scholar_id),
            "url_picture": author.get("url_picture", ""),
            "url_homepage": author.get("homepage", ""),
            "citedby": author.get("citedby", 0),
            "citedby5y": author.get("citedby5y", 0),
            "hindex": author.get("hindex", 0),
            "hindex5y": author.get("hindex5y", 0),
            "i10index": author.get("i10index", 0),
            "i10index5y": author.get("i10index5y", 0),
            "num_papers": len(author.get("publications", [])),
            "num_coauthors": len(author.get("coauthors", [])),
        }

        # Row values follow the SCHOLAR_HEADER column order (None if a column
        # has no matching key).
        pub_data.append([author_dict.get(c) for c in SCHOLAR_HEADER])

        # Step 4: persist the raw author record.
        with open(Path(file_author), "w", encoding="utf-8") as f:
            f.write(json.dumps(author))

    # Throttle every iteration, including failed lookups, so that a run of
    # misses does not send back-to-back requests.
    sleep(randint(1, 5))

n, name = 0, Hal Abelson ...

Failed search_author()
n, name = 1, Elfar Adalsteinsson ...

Failed search_author()
n, name = 2, Pulkit Agrawal ...
n, name = 3, Mohammad Alizadeh ...
n, name = 4, Jacob Andreas ...
n, name = 5, Hari Balakrishnan ...
n, name = 6, Regina Barzilay ...

Failed search_author()
n, name = 7, Robert Berwick ...
n, name = 8, Sangeeta Bhatia ...
n, name = 9, Duane Boning ...
n, name = 10, Guy Bresler ...
n, name = 11, Tamara Broderick ...
n, name = 12, Michael Carbin ...
n, name = 13, Vincent Chan ...
n, name = 14, YuFeng (Kevin)  Chen ...

Failed search_author()
n, name = 15, Isaac Chuang ...

Failed search_author()
n, name = 16, Connor Wilson Coley ...

Failed search_author()
n, name = 17, Munther Dahleh ...
n, name = 18, Luca Daniel ...

Failed search_author()
n, name = 19, Constantinos Daskalakis ...
n, name = 20, Randall Davis ...
n, name = 21, Frederic Durand ...

Failed search_author()
n, name = 22, Dirk Englund ...
n, name = 23, Dennis Freeman ...

Failed s

In [11]:
# Number of faculty members for whom a Google Scholar record was collected.
len(pub_data)

70

## Write results to xlsx

In [12]:
# Build the output table: one row per collected author, columns named by SCHOLAR_HEADER.
df_out = pd.DataFrame(pub_data, columns=SCHOLAR_HEADER)

In [13]:
# Preview the first rows of the output table.
df_out.head()

Unnamed: 0,name,affiliation,interests,num_papers,num_coauthors,citedby,hindex,i10index,citedby5y,hindex5y,i10index5y,scholar_id,url_author,url_picture,url_homepage,file_author
0,Pulkit Agrawal,Massachusetts Institute of Technology,Robotics; Computer Vision; Artificial Intellig...,97,39,8748,30,44,7923,30,43,UpZmJI0AAAAJ,https://scholar.google.com/citations?user=UpZm...,https://scholar.google.com/citations?view_op=m...,https://people.csail.mit.edu/pulkitag/,data/GScholar_pulkit_agrawal_mit.json
1,Mohammad Alizadeh,"Associate Professor of Computer Science, MIT",Computer networks; Systems; Machine learning,164,180,16853,59,103,13727,54,100,6_cxCKQAAAAJ,https://scholar.google.com/citations?user=6_cx...,https://scholar.google.com/citations?view_op=m...,http://people.csail.mit.edu/alizadeh,data/GScholar_mohammad_alizadeh_mit.json
2,Jacob Andreas,MIT,NLP; ML,106,75,7672,39,62,6847,36,58,dnZ8udEAAAAJ,https://scholar.google.com/citations?user=dnZ8...,https://scholar.google.com/citations?view_op=m...,http://web.mit.edu/jda/www,data/GScholar_jacob_andreas_mit.json
3,Hari Balakrishnan,MIT,Networked computer systems; wireless networks;...,451,34,145090,128,247,29652,62,170,Qf4bw4UAAAAJ,https://scholar.google.com/citations?user=Qf4b...,https://scholar.google.com/citations?view_op=m...,http://nms.csail.mit.edu/~hari,data/GScholar_hari_balakrishnan_mit.json
4,Robert Berwick,"Professor, Massachusetts Institute of Technology",computational linguistics; cognitive science,283,0,13935,51,103,4423,28,56,7xH1C7IAAAAJ,https://scholar.google.com/citations?user=7xH1...,https://scholar.google.com/citations?view_op=m...,,data/GScholar_robert_berwick_mit.json


In [14]:
# Export the collected Google Scholar table to an Excel workbook.
# NOTE: `n` is the last index reached by the crawl loop above; it is kept in
# the filename so existing output names stay the same. This cell therefore
# requires the crawl cell to have run over at least one name.
file_xlsx = f"data/cs-faculty-gscholar-{org2}-{n}.xlsx"

# Make sure the output directory exists before the writer opens the file.
Path(file_xlsx).parent.mkdir(parents=True, exist_ok=True)

# Use the writer as a context manager: it saves and closes the workbook on
# exit. ExcelWriter.save() is deprecated and was removed in pandas 2.0.
with pd.ExcelWriter(Path(file_xlsx), engine='xlsxwriter') as writer:
    df_out.to_excel(writer, sheet_name=org2, index=False)

  writer.save()
