This notebook uses `scholarly` API (https://github.com/scholarly-python-package/scholarly) to enrich CS faculty info

When searching google scholar using `scholarly`, don't include org, e.g. 
- `Stuart J. Russell berkeley` returns no result, 
    - but `Stuart J. Russell` does

In [1]:
from scrap_cs_faculty import *

## read CS faculty to be reprocessed

In [29]:
file_csv = "cs-faculty-no_scholar_id-0910.csv"  # "cs-faculty-no_scholar_id.csv"
df = pd.read_csv(file_csv)

In [30]:
df

Unnamed: 0,name,name_school
0,"DeYoung, Henry","DeYoung, Henry CMU"
1,"Eckhardt, Dave","Eckhardt, Dave CMU"
2,"Gligor, Virgil","Gligor, Virgil CMU"
3,"Jahanian, Farnam","Jahanian, Farnam CMU"
4,"Kohlbrenner, Anne","Kohlbrenner, Anne CMU"
...,...,...
113,Matus Jan Telgarsky,Matus Jan Telgarsky UIUC
114,Serge Plotkin,Serge Plotkin Stanford
115,Michael J. Clancy,Michael J. Clancy Berkeley
116,David A. Patterson,David A. Patterson Berkeley


In [31]:
names = df.values

In [32]:
len(names), names 

(118,
 array([['DeYoung, Henry', 'DeYoung, Henry CMU'],
        ['Eckhardt, Dave', 'Eckhardt, Dave CMU'],
        ['Gligor, Virgil', 'Gligor, Virgil CMU'],
        ['Jahanian, Farnam', 'Jahanian, Farnam CMU'],
        ['Kohlbrenner, Anne', 'Kohlbrenner, Anne CMU'],
        ['Kosbie, David', 'Kosbie, David CMU'],
        ['Needham, Kit', 'Needham, Kit CMU'],
        ['Sherry, Justine', 'Sherry, Justine CMU'],
        ['Vinayak, Rashmi Korlakai', 'Vinayak, Rashmi Korlakai CMU'],
        ['David Albonesi', 'David Albonesi nan'],
        ['Anne Bracy', 'Anne Bracy nan'],
        ['Robert Constable', 'Robert Constable nan'],
        ['Alan Demers', 'Alan Demers nan'],
        ['K-Y. Daisy Fan', 'K-Y. Daisy Fan nan'],
        ['Donald Greenberg', 'Donald Greenberg nan'],
        ['Curran Muhlberger', 'Curran Muhlberger nan'],
        ['Alexander "Sasha" Rush', 'Alexander "Sasha" Rush nan'],
        ['G. Edward Suh', 'G. Edward Suh nan'],
        ['Tim Teitelbaum', 'Tim Teitelbaum nan'],
    

## fetch google scholar data for all CS faculty

In [33]:
def search_gscholar(name, author_org=""):
    init_result = None
    
    if author_org:
        try:
            search_query = scholarly.search_author(author_org)
            init_result = next(search_query)
            return init_result
        except Exception as ex:
            print(str(ex))
             
    if not author_org or init_result is None:
        try:
            x = name.split()
            if len(x) > 2:
                name = x[0] + " " + x[-1]
            search_query = scholarly.search_author(name)
            init_result = next(search_query)
            return init_result
        except Exception as ex:
            print(str(ex))
            init_result = None 
        
    if init_result is None:
        print(f"Failed to search google scholar: {name}, {author_org}")
        return None

In [34]:
ntest = -1  #  5 #   

In [35]:
pub_data = []
pub_error = []
for n, r in enumerate(names[:ntest]):
#     if n < 5: continue 
    name, author_org = r
    print(f"\n n, name = {n}, {name} ...")
    
    norm_auth_org = normalize_str(author_org)
    file_author = f"data/GScholar_{norm_auth_org}.json"

    init_result = search_gscholar(name, author_org)
    if init_result is None:
        pub_error.append(r)
        continue

    scholar_id = init_result.get("scholar_id", "")
    if not scholar_id:
        print(f"Missing scholar_id")
        continue        
    
    # fetch data
    author = scholarly.fill(init_result)
    
    author_dict = {}
    # fill data cell
    author_dict["name"] = name 
    author_dict["file_author"] = file_author 
    author_dict["scholar_id"] = scholar_id 
    author_dict["affiliation"] = author.get("affiliation", "")
    author_dict["interests"] = "; ".join(author.get("interests", []))
    author_dict["url_author"] = get_scholar_page(scholar_id)
    author_dict["url_picture"] = author.get("url_picture", "")
    author_dict["url_homepage"] = author.get("homepage", "")
    author_dict["citedby"] = author.get("citedby", 0)
    author_dict["citedby5y"] = author.get("citedby5y", 0)
    author_dict["hindex"] = author.get("hindex", 0)
    author_dict["hindex5y"] = author.get("hindex5y", 0)
    author_dict["i10index"] = author.get("i10index", 0)
    author_dict["i10index5y"] = author.get("i10index5y", 0)
    author_dict["num_papers"] = len(author.get("publications", []))
    author_dict["num_coauthors"] = len(author.get("coauthors", []))
      
    # fill row 
    author_data = []
    for c in SCHOLAR_HEADER:
        author_data.append(author_dict.get(c))
        
    # accumulate row
    pub_data.append(author_data)
    
    # persist author data
    with open(Path(file_author), "w", encoding="utf-8") as f:
        f.write(json.dumps(author))
    
    delay = randint(1,5)
    sleep(delay)


 n, name = 0, DeYoung, Henry ...


Failed to search google scholar: DeYoung, Henry, DeYoung, Henry CMU

 n, name = 1, Eckhardt, Dave ...


Failed to search google scholar: Eckhardt, Dave, Eckhardt, Dave CMU

 n, name = 2, Gligor, Virgil ...


Failed to search google scholar: Gligor, Virgil, Gligor, Virgil CMU

 n, name = 3, Jahanian, Farnam ...


Failed to search google scholar: Jahanian, Farnam, Jahanian, Farnam CMU

 n, name = 4, Kohlbrenner, Anne ...


Failed to search google scholar: Kohlbrenner, Anne, Kohlbrenner, Anne CMU

 n, name = 5, Kosbie, David ...


Failed to search google scholar: Kosbie, David, Kosbie, David CMU

 n, name = 6, Needham, Kit ...


Failed to search google scholar: Needham, Kit, Needham, Kit CMU

 n, name = 7, Sherry, Justine ...


Failed to search google scholar: Sherry, Justine, Sherry, Justine CMU

 n, name = 8, Vinayak, Rashmi Korlakai ...


 n, name = 9, David Albonesi ...


Failed to search google scholar: David Albonesi, David Albonesi nan

 n, name 



Failed to search google scholar: Michael Stonebraker, Michael Stonebraker MIT

 n, name = 99, Fred Kjoelstad ...


Failed to search google scholar: Fred Kjoelstad, Fred Kjoelstad Stanford

 n, name = 100, Aviad Rubinstein ...


Failed to search google scholar: Aviad Rubinstein, Aviad Rubinstein Stanford

 n, name = 101, Jean-Claude Latombe ...


Failed to search google scholar: Jean-Claude Latombe, Jean-Claude Latombe Stanford

 n, name = 102, Domenico Ferrari ...


Failed to search google scholar: Domenico Ferrari, Domenico Ferrari Berkeley

 n, name = 103, Paul N. Hilfinger ...


Failed to search google scholar: Paul Hilfinger, Paul N. Hilfinger Berkeley

 n, name = 104, Timothy Moon-Yew Chan ...


 n, name = 105, Chandra Sekhar Chekuri ...


 n, name = 106, Michael A. Forbes ...


 n, name = 107, Sheldon Howard Jacobson ...


 n, name = 108, Robin Hillary Kravets ...


 n, name = 109, Luther Tychonievich ...


Failed to search google scholar: Luther Tychonievich, Luther Tychonievi

In [36]:
len(pub_data), len(pub_error)

(45, 72)

In [37]:
if len(pub_data) == len(pub_error):
    raise Exception("All searches failed!")

## write out xlsx

In [None]:
df_out = pd.DataFrame(pub_data, columns=SCHOLAR_HEADER)

In [46]:
df_out

Unnamed: 0,name,affiliation,interests,num_papers,num_coauthors,citedby,hindex,i10index,citedby5y,hindex5y,i10index5y,scholar_id,url_author,url_picture,url_homepage,file_author
0,"Vinayak, Rashmi Korlakai","Assistant Professor, UW-Madison",Machine Learning; Statistical Inference; Crowd...,22,7,199,7,5,160,7,5,-dDKMgoAAAAJ,https://scholar.google.com/citations?user=-dDK...,https://scholar.google.com/citations?view_op=m...,https://ramyakv.github.io/,data/GScholar_vinayak_rashmi_korlakai_cmu.json
1,K-Y. Daisy Fan,University of Kentucky,Systems biochemistry,327,0,18872,60,141,10343,43,100,2a_t2xwAAAAJ,https://scholar.google.com/citations?user=2a_t...,https://scholar.google.com/citations?view_op=m...,http://bioinformatics.cesb.uky.edu/bin/view/RC...,data/GScholar_k_y_daisy_fan_nan.json
2,"Alexander ""Sasha"" Rush","Associate Professor, Cornell University",Natural Language Processing; Machine Learning,166,18,26964,56,105,24208,52,98,LIjnUGgAAAAJ,https://scholar.google.com/citations?user=LIjn...,https://scholar.google.com/citations?view_op=m...,http://nlp.seas.harvard.edu/,data/GScholar_alexander_sasha_rush_nan.json
3,G. Edward Suh,KBRI,,893,0,23541,81,311,6194,39,167,ih5KsPkAAAAJ,https://scholar.google.com/citations?user=ih5K...,https://scholar.google.com/citations?view_op=m...,,data/GScholar_g_edward_suh_nan.json
4,YuFeng (Kevin) Chen,Unknown affiliation,,114,0,2462,28,49,1426,22,44,UQwjO6UAAAAJ,https://scholar.google.com/citations?user=UQwj...,https://scholar.google.com/citations?view_op=m...,,data/GScholar_yufeng_kevin_chen_mit.json
5,Connor Wilson Coley,"Department of Chemical Engineering, MIT",drug discovery; machine learning; automation; ...,129,12,8249,39,68,8174,39,68,l015S80AAAAJ,https://scholar.google.com/citations?user=l015...,https://scholar.google.com/citations?view_op=m...,http://coley.mit.edu/,data/GScholar_connor_wilson_coley_mit.json
6,W. Eric L Grimson,The University of Alabama,,19,1,743,9,9,406,6,6,vunn5csAAAAJ,https://scholar.google.com/citations?user=vunn...,https://scholar.google.com/citations?view_op=m...,,data/GScholar_w_eric_l_grimson_mit.json
7,Juan Carlos Niebles Duque,"Professor Emeritus, Duke-NUS",Psychiatry,1333,0,136306,175,772,37908,89,481,gFBjcfMAAAAJ,https://scholar.google.com/citations?user=gFBj...,https://scholar.google.com/citations?view_op=m...,,data/GScholar_juan_carlos_niebles_duque_stanfo...
8,John F. Canny,"University of California, Berkeley",HCI; Ubicomp; ICTD; Data Mining; Health Techno...,435,0,72850,88,230,19161,45,136,LAv0HTEAAAAJ,https://scholar.google.com/citations?user=LAv0...,https://scholar.google.com/citations?view_op=m...,,data/GScholar_john_f_canny_berkeley.json
9,David E. Culler,"University of California, Berkeley",Systems; Networks; Embedded Computing; Archite...,635,49,112366,132,347,14747,58,193,urTiL7QAAAAJ,https://scholar.google.com/citations?user=urTi...,https://scholar.google.com/citations?view_op=m...,,data/GScholar_david_e_culler_berkeley.json


In [39]:
# import xlsxwriter
org = "extra3" # "extra2"
file_xlsx = f"data/cs-faculty-gscholar-{org}.xlsx"
print(file_xlsx)
writer = pd.ExcelWriter(Path(file_xlsx), engine='xlsxwriter')
df_out.to_excel(writer, sheet_name=org, index=False)
writer.close()

data/cs-faculty-gscholar-extra3.xlsx


In [47]:
len(pub_error), pub_error[:10]

(72,
 [array(['DeYoung, Henry', 'DeYoung, Henry CMU'], dtype=object),
  array(['Eckhardt, Dave', 'Eckhardt, Dave CMU'], dtype=object),
  array(['Gligor, Virgil', 'Gligor, Virgil CMU'], dtype=object),
  array(['Jahanian, Farnam', 'Jahanian, Farnam CMU'], dtype=object),
  array(['Kohlbrenner, Anne', 'Kohlbrenner, Anne CMU'], dtype=object),
  array(['Kosbie, David', 'Kosbie, David CMU'], dtype=object),
  array(['Needham, Kit', 'Needham, Kit CMU'], dtype=object),
  array(['Sherry, Justine', 'Sherry, Justine CMU'], dtype=object),
  array(['David Albonesi', 'David Albonesi nan'], dtype=object),
  array(['Anne Bracy', 'Anne Bracy nan'], dtype=object)])

In [44]:
df_err = pd.DataFrame(pub_error, columns=["name", "name_org"])
df_err

Unnamed: 0,name,name_org
0,"DeYoung, Henry","DeYoung, Henry CMU"
1,"Eckhardt, Dave","Eckhardt, Dave CMU"
2,"Gligor, Virgil","Gligor, Virgil CMU"
3,"Jahanian, Farnam","Jahanian, Farnam CMU"
4,"Kohlbrenner, Anne","Kohlbrenner, Anne CMU"
...,...,...
67,Domenico Ferrari,Domenico Ferrari Berkeley
68,Paul N. Hilfinger,Paul N. Hilfinger Berkeley
69,Luther Tychonievich,Luther Tychonievich UIUC
70,Victor Zue,Victor Zue MIT


In [45]:
# import xlsxwriter
org = "error" # "extra2"
file_xlsx = f"data/cs-faculty-gscholar-{org}.xlsx"
print(file_xlsx)
writer = pd.ExcelWriter(Path(file_xlsx), engine='xlsxwriter')
df_err.to_excel(writer, sheet_name=org, index=False)
writer.close()

data/cs-faculty-gscholar-error.xlsx
