This notebook uses `scholarly` API (https://github.com/scholarly-python-package/scholarly) to enrich CS faculty info

In [3]:
from scrap_cs_faculty import *

In [14]:
org_list = ["CMU-CS","Cornell-CS","MIT-AID","MIT-CS","Stanford-CS","UCB-CS","UIUC-CS",]

In [17]:
org = org_list[4]  # stanford

In [18]:
org2 = org.split("-")[0] 

In [19]:
file_xlsx = f"faculty-{org}.xlsx"
xlsxf = pd.ExcelFile(file_xlsx)

In [20]:
xlsxf.sheet_names

['Faculty']

In [21]:
df = xlsxf.parse('Faculty')

In [22]:
df

Unnamed: 0,name,job_title,phd_univ,phd_year,research_area,research_concentration,research_focus,url,img_url,phone,email,cell_phone,office_address,department,school
0,Sara Achour,Regular Faculty,,,,,,,,,sachour@stanford.edu,,Gates 484,Computer Science,Stanford Univ
1,Maneesh Agrawala,Regular Faculty,,,,,,http://graphics.stanford.edu/~maneesh,,,@stanford.edu,,Gates 364,Computer Science,Stanford Univ
2,Alex Aiken,Regular Faculty,,,,,,http://theory.stanford.edu/~aiken,,5-3359,aiken@stanford.edu,,Gates 490,Computer Science,Stanford Univ
3,Nima Anari,Regular Faculty,,,,,,https://nimaanari.com,,,nima.anari@stanford.edu,,Gates 168A,Computer Science,Stanford Univ
4,Clark Barrett,Regular Faculty,,,,,,http://theory.stanford.edu/~barrett,,650-736-0822,barrett@stanford.edu,,Gates 488,Computer Science,Stanford Univ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,Chris Hahn,Visiting and Acting Faculty,,,,,,http://www.christopherhahn.io,,,hahn@stanford.edu,,Gates 481,Computer Science,Stanford Univ
142,Hamed Nemati,Visiting and Acting Faculty,,,,,,https://hnemati.github.io/,,,hnnemati@stanford.edu,,Gates 999,Computer Science,Stanford Univ
143,Dolière Francis Somé,Visiting and Acting Faculty,,,,,,,,,doliere@stanford.edu,,,Computer Science,Stanford Univ
144,Marco Vassena,Visiting and Acting Faculty,,,,,,https://webspace.science.uu.nl/mvassena,,,vassena@stanford.edu,,Gates 478,Computer Science,Stanford Univ


In [23]:
names = df["name"].to_list()

In [133]:
len(names), names 

(146,
 ['Sara Achour',
  'Maneesh Agrawala',
  'Alex Aiken',
  'Nima Anari',
  'Clark Barrett',
  'Gill Bejerano',
  'Michael Bernstein',
  'Jeannette Bohg',
  'Dan Boneh',
  'Adam Bouland',
  'Emma Brunskill',
  'Moses Charikar',
  'Ron Dror',
  'Zakir Durumeric',
  'Dawson Engler',
  'Stefano Ermon',
  'Kayvon Fatahalian',
  'Ron Fedkiw',
  'Chelsea Finn',
  'Emily Fox',
  'Mike Genesereth',
  'Noah Goodman',
  'Carlos Guestrin',
  'Leonidas Guibas',
  'Patrick Hanrahan',
  'Tatsu Hashimoto',
  'John Hennessy',
  'Mark Horowitz',
  'Doug James',
  'Dan Jurafsky',
  'Sachin Katti',
  'Oussama Khatib',
  'Fred Kjoelstad',
  'Fred Kjolstad',
  'Sanmi Koyejo',
  'Christos Kozyrakis',
  'Anshul Kundaje',
  'Monica Lam',
  'James Landay',
  'Jure Leskovec',
  'Philip Levis',
  'Fei-Fei Li',
  'Percy Liang',
  'Karen Liu',
  'Tengyu Ma',
  'Chris Manning',
  'David Mazieres',
  'Nick McKeown',
  'John Mitchell',
  'Subhasish Mitra',
  'Kunle Olukotun',
  'John Ousterhout',
  'Chris Piech',


## fetch google scholar data for all CS faculty

In [119]:
SCHOLAR_HEADER = [
    'name',
    'affiliation',
    'interests',
    'num_papers',
    'num_coauthors',
    'citedby',
    'hindex',
    'i10index',
    'citedby5y',
    'hindex5y',
    'i10index5y',
    'scholar_id',
    'url_author',
    'url_picture',
    'url_homepage',
    'file_author']

In [120]:
ntest = -1  #  2 # 

In [127]:
pub_data = []
for n, name in enumerate(names[:ntest]):
    if n < 5: continue 
    print(f"n, name = {n}, {name} ...")
    
    author_org = f"{name} {org2}"
    norm_auth_org = normalize_str(author_org)
    file_author = f"data/GScholar_{norm_auth_org}.json"

    try:
        search_query = scholarly.search_author(author_org)
        init_result = next(search_query)
    except Exception as ex:
        print(str(ex))
        init_result = None 
        
    if init_result is None:
        print(f"Failed search_author()")
        continue

    scholar_id = init_result.get("scholar_id", "")
    if not scholar_id:
        print(f"Missing scholar_id")
        continue        
    
    # fetch data
    author = scholarly.fill(init_result)
    
    author_dict = {}
    # fill data cell
    author_dict["name"] = name 
    author_dict["file_author"] = file_author 
    author_dict["scholar_id"] = scholar_id 
    author_dict["affiliation"] = author.get("affiliation", "")
    author_dict["interests"] = "; ".join(author.get("interests", []))
    author_dict["url_author"] = get_scholar_page(scholar_id)
    author_dict["url_picture"] = author.get("url_picture", "")
    author_dict["url_homepage"] = author.get("homepage", "")
    author_dict["citedby"] = author.get("citedby", 0)
    author_dict["citedby5y"] = author.get("citedby5y", 0)
    author_dict["hindex"] = author.get("hindex", 0)
    author_dict["hindex5y"] = author.get("hindex5y", 0)
    author_dict["i10index"] = author.get("i10index", 0)
    author_dict["i10index5y"] = author.get("i10index5y", 0)
    author_dict["num_papers"] = len(author.get("publications", []))
    author_dict["num_coauthors"] = len(author.get("coauthors", []))
      
    # fill row 
    author_data = []
    for c in SCHOLAR_HEADER:
        author_data.append(author_dict.get(c))
        
    # accumulate row
    pub_data.append(author_data)
    
    # persist author data
    with open(Path(file_author), "w", encoding="utf-8") as f:
        f.write(json.dumps(author))
    
    delay = randint(1,5)
    sleep(delay)

n, name = 5, Gill Bejerano ...

Failed search_author()
n, name = 6, Michael Bernstein ...
n, name = 7, Jeannette Bohg ...
n, name = 8, Dan Boneh ...
n, name = 9, Adam Bouland ...
n, name = 10, Emma Brunskill ...
n, name = 11, Moses Charikar ...
n, name = 12, Ron Dror ...
n, name = 13, Zakir Durumeric ...
n, name = 14, Dawson Engler ...
n, name = 15, Stefano Ermon ...
n, name = 16, Kayvon Fatahalian ...
n, name = 17, Ron Fedkiw ...

Failed search_author()
n, name = 18, Chelsea Finn ...
n, name = 19, Emily Fox ...
n, name = 20, Mike Genesereth ...
n, name = 21, Noah Goodman ...
n, name = 22, Carlos Guestrin ...
n, name = 23, Leonidas Guibas ...
n, name = 24, Patrick Hanrahan ...
n, name = 25, Tatsu Hashimoto ...
n, name = 26, John Hennessy ...
n, name = 27, Mark Horowitz ...
n, name = 28, Doug James ...
n, name = 29, Dan Jurafsky ...
n, name = 30, Sachin Katti ...
n, name = 31, Oussama Khatib ...
n, name = 32, Fred Kjoelstad ...

Failed search_author()
n, name = 33, Fred Kjolstad ...

Fa

In [128]:
len(pub_data)

102

## write out xlsx

In [129]:
df_out = pd.DataFrame(pub_data, columns=SCHOLAR_HEADER)

In [132]:
df_out.head()

Unnamed: 0,name,affiliation,interests,num_papers,num_coauthors,citedby,hindex,i10index,citedby5y,hindex5y,i10index5y,scholar_id,url_author,url_picture,url_homepage,file_author
0,Michael Bernstein,"Associate Professor, Stanford University",Human-computer interaction; social computing,245,73,63473,61,133,51505,53,118,zkhHirIAAAAJ,https://scholar.google.com/citations?user=zkhH...,https://scholar.google.com/citations?view_op=m...,http://hci.stanford.edu/msb,data/GScholar_michael_bernstein_stanford.json
1,Jeannette Bohg,"Assistant Professor, Stanford University",Robotics; Multi-Modal Perception; Machine Lear...,135,96,7216,36,76,6149,33,67,rjnJnEkAAAAJ,https://scholar.google.com/citations?user=rjnJ...,https://scholar.google.com/citations?view_op=m...,http://web.stanford.edu/~bohg,data/GScholar_jeannette_bohg_stanford.json
2,Dan Boneh,"Professor of Computer Science, Stanford Univer...",Cryptography; Computer Security; Computer Scie...,475,18,101996,132,302,38220,89,249,MwLqCs4AAAAJ,https://scholar.google.com/citations?user=MwLq...,https://scholar.google.com/citations?view_op=m...,http://crypto.stanford.edu/~dabo,data/GScholar_dan_boneh_stanford.json
3,Adam Bouland,Stanford University,Quantum Computing; Theoretical Computer Science,33,44,889,15,18,762,14,17,61uf9p0AAAAJ,https://scholar.google.com/citations?user=61uf...,https://scholar.google.com/citations?view_op=m...,http://theory.stanford.edu/~abouland/,data/GScholar_adam_bouland_stanford.json
4,Emma Brunskill,"Associate Professor of Computer Science, Stanf...",Reinforcement Learning; Machine Learning; Deci...,220,19,10839,49,120,8636,43,105,HaN8b2YAAAAJ,https://scholar.google.com/citations?user=HaN8...,https://scholar.google.com/citations?view_op=m...,http://cs.stanford.edu/people/ebrun/,data/GScholar_emma_brunskill_stanford.json


In [131]:
# import xlsxwriter
file_xlsx = f"data/cs-faculty-gscholar-{org2}-{n}.xlsx"
writer = pd.ExcelWriter(Path(file_xlsx), engine='xlsxwriter')
df_out.to_excel(writer, sheet_name=org2, index=False)
writer.save()

  writer.save()
