This notebook uses the `scholarly` API (https://github.com/scholarly-python-package/scholarly) to enrich previously scraped CS faculty info with Google Scholar data.

In [1]:
from scrap_cs_faculty import *

In [2]:
# Supported organisation tags; exactly one is selected for this run.
org_list = [
    "CMU-CS",
    "Cornell-CS",
    "MIT-AID",
    "MIT-CS",
    "Stanford-CS",
    "UCB-CS",
    "UIUC-CS",
]
# Change the tag below to target a different organisation.
idx_org = org_list.index("UIUC-CS")
org = org_list[idx_org]
# Short name: the part of the tag before the first "-" (e.g. "UIUC").
org2 = org.partition("-")[0]

## read CS faculty info scraped previously

In [3]:
# Open the workbook for the selected organisation; the handle is reused by the
# following cells to list and parse its sheets.
file_xlsx = "faculty-" + org + ".xlsx"
xlsxf = pd.ExcelFile(file_xlsx)

In [4]:
# List the worksheet names available in the opened workbook.
xlsxf.sheet_names

['Faculty']

In [5]:
# Load the 'Faculty' worksheet into a DataFrame.
df = xlsxf.parse('Faculty')

In [6]:
# Display the faculty table as a visual sanity check.
df

Unnamed: 0,name,job_title,phd_univ,phd_year,research_area,research_concentration,research_focus,url,img_url,phone,email,cell_phone,office_address,department,school
0,Tarek Abdelzaher,Sohaib and Sara Abbasi Professor and Willett F...,,,,,,https://cs.illinois.edu/about/people/departmen...,https://ws.engr.illinois.edu/directory/viewpho...,(217) 265-6793,zaher@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign
1,Sarita V. Adve,Richard T. Cheng Professor,,,,,,https://cs.illinois.edu/about/people/departmen...,https://ws.engr.illinois.edu/directory/viewpho...,(217) 333-8461,sadve@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign
2,Vikram Adve,Donald B. Gillies Professor in Computer Science,,,,,,https://cs.illinois.edu/about/people/departmen...,https://ws.engr.illinois.edu/directory/viewpho...,(217) 244-2016,vadve@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign
3,Gul A. Agha,Research Professor and Professor Emeritus,,,,,,https://cs.illinois.edu/about/people/departmen...,https://ws.engr.illinois.edu/directory/viewpho...,(217) 244-3087,agha@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign
4,Ram Alagappan,Assistant Professor,,,,,,https://cs.illinois.edu/about/people/departmen...,https://ws.engr.illinois.edu/directory/viewpho...,,ramn@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,Wenzhen Yuan,Assistant Professor starting Fall 2023,,,,,,https://cs.illinois.edu/about/people/departmen...,https://ws.engr.illinois.edu/directory/viewpho...,,yuanwz@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign
117,ChengXiang Zhai,Donald Biggar Willett Professor in Engineering,,,,,,https://cs.illinois.edu/about/people/departmen...,https://ws.engr.illinois.edu/directory/viewpho...,(217) 244-4943,czhai@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign
118,Lingming Zhang,Associate Professor,,,,,,https://cs.illinois.edu/about/people/departmen...,https://ws.engr.illinois.edu/directory/viewpho...,(217) 244-8921,lingming@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign
119,Han Zhao,Assistant Professor,,,,,,https://cs.illinois.edu/about/people/departmen...,https://ws.engr.illinois.edu/directory/viewpho...,,hanzhao@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign


In [7]:
# Plain Python list of faculty names; these drive the Google Scholar lookups below.
names = df["name"].tolist()

In [8]:
# Show how many faculty names were read, followed by the full list.
len(names), names 

(121,
 ['Tarek  Abdelzaher',
  'Sarita V. Adve',
  'Vikram Adve',
  'Gul A. Agha',
  'Ram Alagappan',
  'Abdussalam Alawini',
  'Nancy M. Amato',
  'Lawrence Angrave',
  'Brian P. Bailey',
  'Arindam  Banerjee',
  'Adam Bates',
  'Mattox Alan Beckman',
  'Matthew Caesar',
  'George  Chacko',
  'Geoffrey Werner Challen',
  'Timothy Moon-Yew Chan',
  'Eshwar  Chandrasekharan',
  'Kevin Chenchuan Chang',
  'Chandra Sekhar Chekuri',
  'Girish  Chowdhary',
  'Camille Cobb',
  'Benjamin Cosman',
  'Katie Cunningham',
  'Ryan Matthew Cunningham',
  'Neal E. Davis',
  'Payam  Delgosha',
  'Mohammed  El-Kebir',
  'Jeff Erickson',
  'Carl Evans',
  'Wade A. Fagen-Ulmschneider',
  'Paul  Fischer',
  'Margaret M. Fleck',
  'Christopher Wardlaw Fletcher',
  'Michael A. Forbes',
  'David Forsyth',
  'Aishwarya Ganesan',
  'Yael  Gertner',
  'Saugata  Ghose',
  'Brighten Godfrey',
  'William D. Gropp',
  'Liangyan  Gui',
  'Carl  Gunter',
  'Elsa  Gunter',
  'Indranil  Gupta',
  'Jiawei  Han',
  'Sar

## fetch google scholar data for all CS faculty

In [9]:
# Number of names to process: None = every name; a small positive int (e.g. 2)
# gives a quick test run.
# Do not use -1 to mean "all": names[:-1] silently drops the last faculty member.
ntest = None

In [10]:
def _fetch_scholar_row(name, org_tag):
    """Look up one faculty member on Google Scholar and build a summary row.

    Searches for ``"<name> <org_tag>"``, takes the first hit, fills it in via
    ``scholarly.fill`` and writes the filled record to a JSON file under
    ``data/``.

    Parameters
    ----------
    name : str
        Faculty name as read from the faculty sheet.
    org_tag : str
        Short organisation name appended to the search query (e.g. "UIUC").

    Returns
    -------
    list or None
        Values ordered per ``SCHOLAR_HEADER``, or ``None`` when the search
        fails, the hit has no ``scholar_id``, or filling the record fails.
    """
    author_org = f"{name} {org_tag}"
    norm_auth_org = normalize_str(author_org)
    file_author = f"data/GScholar_{norm_auth_org}.json"

    try:
        search_query = scholarly.search_author(author_org)
        # A default of None turns "no match" into a value instead of a
        # StopIteration whose message is empty.
        init_result = next(search_query, None)
    except Exception as ex:
        print(f"{type(ex).__name__}: {ex}")
        init_result = None

    if init_result is None:
        print("Failed search_author()")
        return None

    scholar_id = init_result.get("scholar_id", "")
    if not scholar_id:
        print("Missing scholar_id")
        return None

    # Fetch the full record. Guarded so one failing author does not abort the
    # whole run.
    try:
        author = scholarly.fill(init_result)
    except Exception as ex:
        print(f"Failed fill(): {type(ex).__name__}: {ex}")
        return None

    author_dict = {
        "name": name,
        "file_author": file_author,
        "scholar_id": scholar_id,
        "affiliation": author.get("affiliation", ""),
        "interests": "; ".join(author.get("interests", [])),
        "url_author": get_scholar_page(scholar_id),
        "url_picture": author.get("url_picture", ""),
        "url_homepage": author.get("homepage", ""),
        "citedby": author.get("citedby", 0),
        "citedby5y": author.get("citedby5y", 0),
        "hindex": author.get("hindex", 0),
        "hindex5y": author.get("hindex5y", 0),
        "i10index": author.get("i10index", 0),
        "i10index5y": author.get("i10index5y", 0),
        "num_papers": len(author.get("publications", [])),
        "num_coauthors": len(author.get("coauthors", [])),
    }

    # Persist the full author record; create the output folder on first use.
    out_path = Path(file_author)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(author))

    # Order the values per SCHOLAR_HEADER; keys absent from author_dict become None.
    return [author_dict.get(c) for c in SCHOLAR_HEADER]


pub_data = []
# None or a negative ntest means "all names". A plain names[:ntest] with -1
# would drop the last faculty member.
names_to_fetch = names if ntest is None or ntest < 0 else names[:ntest]
for n, name in enumerate(names_to_fetch):
    print(f"n, name = {n}, {name} ...")

    author_data = _fetch_scholar_row(name, org2)
    if author_data is not None:
        pub_data.append(author_data)

    # Throttle after every lookup, including failed ones, so that failures do
    # not send requests back to back.
    sleep(randint(1, 5))

n, name = 0, Tarek  Abdelzaher ...
n, name = 1, Sarita V. Adve ...

Failed search_author()
n, name = 2, Vikram Adve ...
n, name = 3, Gul A. Agha ...

Failed search_author()
n, name = 4, Ram Alagappan ...

Failed search_author()
n, name = 5, Abdussalam Alawini ...
n, name = 6, Nancy M. Amato ...
n, name = 7, Lawrence Angrave ...
n, name = 8, Brian P. Bailey ...
n, name = 9, Arindam  Banerjee ...
n, name = 10, Adam Bates ...
n, name = 11, Mattox Alan Beckman ...

Failed search_author()
n, name = 12, Matthew Caesar ...
n, name = 13, George  Chacko ...
n, name = 14, Geoffrey Werner Challen ...

Failed search_author()
n, name = 15, Timothy Moon-Yew Chan ...

Failed search_author()
n, name = 16, Eshwar  Chandrasekharan ...
n, name = 17, Kevin Chenchuan Chang ...
n, name = 18, Chandra Sekhar Chekuri ...

Failed search_author()
n, name = 19, Girish  Chowdhary ...
n, name = 20, Camille Cobb ...
n, name = 21, Benjamin Cosman ...
n, name = 22, Katie Cunningham ...
n, name = 23, Ryan Matthew Cunni

In [11]:
# Number of faculty for whom a Google Scholar row was collected.
len(pub_data)

85

## write out xlsx

In [12]:
# Assemble the collected rows into a DataFrame whose columns follow SCHOLAR_HEADER.
df_out = pd.DataFrame(pub_data, columns=SCHOLAR_HEADER)

In [13]:
# Preview the first rows of the Google Scholar summary table.
df_out.head()

Unnamed: 0,name,affiliation,interests,num_papers,num_coauthors,citedby,hindex,i10index,citedby5y,hindex5y,i10index5y,scholar_id,url_author,url_picture,url_homepage,file_author
0,Tarek Abdelzaher,University of Illinois,Real-time Systems; wireless sensor networks; c...,751,0,42269,96,330,9888,48,202,cA28Zs0AAAAJ,https://scholar.google.com/citations?user=cA28...,https://scholar.google.com/citations?view_op=m...,,data/GScholar_tarek_abdelzaher_uiuc.json
1,Vikram Adve,University of Illinois at Urbana-Champaign,Compilers; Programming Languages; Parallel Com...,171,36,15388,50,85,5618,27,47,VbruE20AAAAJ,https://scholar.google.com/citations?user=Vbru...,https://scholar.google.com/citations?view_op=m...,http://vikram.cs.illinois.edu/,data/GScholar_vikram_adve_uiuc.json
2,Abdussalam Alawini,"Teaching Assistant Professor, University of Il...",Database Systems; Data Science; Data Provenanc...,38,12,178,9,8,168,8,6,lyoVis4AAAAJ,https://scholar.google.com/citations?user=lyoV...,https://scholar.google.com/citations?view_op=m...,https://cs.illinois.edu/about/people/faculty/a...,data/GScholar_abdussalam_alawini_uiuc.json
3,Nancy M. Amato,"Abel Bliss Professor and Head, Computer Scienc...",motion planning; robotics; computational biolo...,471,7,12485,60,180,3253,28,91,AmaB9c4AAAAJ,https://scholar.google.com/citations?user=AmaB...,https://scholar.google.com/citations?view_op=m...,https://cs.illinois.edu/directory/profile/namato,data/GScholar_nancy_m_amato_uiuc.json
4,Lawrence Angrave,"Teaching Professor of Computer Science, Univer...",,46,9,288,8,8,205,6,5,KEl_ggoAAAAJ,https://scholar.google.com/citations?user=KEl_...,https://scholar.google.com/citations?view_op=m...,,data/GScholar_lawrence_angrave_uiuc.json


In [14]:
# Write the Google Scholar summary to an Excel workbook (uses the xlsxwriter engine).
# NOTE: `n` is the last loop index left over from the fetch loop above, so this
# cell must run after that loop has completed.
file_xlsx = f"data/cs-faculty-gscholar-{org2}-{n}.xlsx"
Path(file_xlsx).parent.mkdir(parents=True, exist_ok=True)
# The context manager saves and closes the workbook on exit. The explicit
# writer.save() call is deprecated and was removed in pandas 2.0.
with pd.ExcelWriter(Path(file_xlsx), engine='xlsxwriter') as writer:
    df_out.to_excel(writer, sheet_name=org2, index=False)

  writer.save()
