This notebook will 
- Scrape CS-info for 3 new schools
    - Princeton
    - UWash
    - UPenn
- For each new school, merge 
    - ./faculty-Princeton-CS.xlsx
    - data/cs-faculty-gscholar-Princeton-CS.xlsx
- append to data/cs_faculty_top6-2023-09-11.xlsx 
- write out data/cs_faculty_top6-2023-09-16.xlsx
- update my github repo: csinfo

In [1]:
from scrap_cs_faculty import *

In [3]:
file_out = "data/cs_faculty_top6-2023-09-16.xlsx"
file_in = "data/cs_faculty_top6-2023-09-11.xlsx"
new_schools = ["Princeton-CS", "UWash-CS", "UPenn-CS",]
file_faculty = "faculty-{school}.xlsx"
file_gscholar = "data/cs-faculty-gscholar-{school}.xlsx"

## read dataset to enrich

In [5]:
df_in = pd.read_excel(Path(file_in))
df_in.fillna("", inplace=True)

df_in.columns 

Index(['name', 'affiliation', 'interests', 'num_papers', 'num_coauthors',
       'citedby', 'hindex', 'i10index', 'citedby5y', 'hindex5y', 'i10index5y',
       'phd_univ', 'phd_year', 'url_homepage', 'url', 'url_author',
       'research_area', 'research_concentration', 'research_focus',
       'job_title', 'url_picture', 'img_url', 'phone', 'email', 'cell_phone',
       'office_address', 'department', 'school', 'scholar_id', 'file_author',
       'not_found_date'],
      dtype='object')

In [51]:
DEBUG = False 
(NSTART, NSTOP) = (0, 1) if DEBUG else (0, -1)

In [59]:
list_df_faculty = []
list_df_gscholar = []
for n, school in enumerate(new_schools[NSTART : NSTOP]):
    df_faculty1 = pd.read_excel(Path(file_faculty.format(school=school)))
    df_faculty1.fillna("", inplace=True)
    
    df_gscholar1 = pd.read_excel(Path(file_gscholar.format(school=school)))
    df_gscholar1.fillna("", inplace=True)
    
    list_df_faculty.append(df_faculty1)
    list_df_gscholar.append(df_gscholar1)
    
df_faculty = pd.concat(list_df_faculty)
df_gscholar = pd.concat(list_df_gscholar)

In [60]:
df_faculty.rename(columns={"url_author": "url_author_orig"}, inplace=True)

In [61]:
df_faculty.tail(3)

Unnamed: 0,name,job_title,phd_univ,phd_year,research_area,research_concentration,research_focus,url,img_url,phone,email,cell_phone,office_address,department,school,url_profile,url_author_orig
95,James R Wilcox,,,,"Computer science education, programming langua...",,,http://homes.cs.washington.edu/~jrw12/,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,jrw12@cs.washington.edu,,,Computer Science,Univ Washington,,
96,Brett Wortzman,,,,Computer science education; Computer science t...,,,https://homes.cs.washington.edu/~brettwo/,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,brettwo@cs.washington.edu,,CSE 542,Computer Science,Univ Washington,,
97,Luke Zettlemoyer,,,,,,,http://www.cs.washington.edu/people/faculty/lsz/,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,lsz@cs.washington.edu,,CSE 534,Computer Science,Univ Washington,,


In [62]:
df_gscholar.tail(3)

Unnamed: 0,name,affiliation,interests,num_papers,num_coauthors,citedby,hindex,i10index,citedby5y,hindex5y,i10index5y,scholar_id,url_author,url_picture,url_homepage,file_author
70,Sheng Wang,Assistant Professor at University of Washington,machine learning; computational biology; cance...,65,0,1889,20,32,1657,19,30,b6wGKhsAAAAJ,https://scholar.google.com/citations?user=b6wG...,https://scholar.google.com/citations?view_op=m...,https://homes.cs.washington.edu/~swang/,data/GScholar_sheng_wang_uwash.json
71,Xi Wang,University of Washington,Computer Systems,38,0,3114,22,30,1765,19,24,uy1OB3YAAAAJ,https://scholar.google.com/citations?user=uy1O...,https://scholar.google.com/citations?view_op=m...,https://homes.cs.washington.edu/~xi/,data/GScholar_xi_wang_uwash.json
72,James R Wilcox,University of Washington,Programming languages; Formal Verification; Di...,38,31,2299,18,19,1459,15,17,rQ0Zl50AAAAJ,https://scholar.google.com/citations?user=rQ0Z...,https://scholar.google.com/citations?view_op=m...,https://jamesrwilcox.com/,data/GScholar_james_r_wilcox_uwash.json


In [63]:
df_faculty_new = df_faculty.merge(df_gscholar, how="left", on="name").fillna("")

In [64]:
df_faculty_new

Unnamed: 0,name,job_title,phd_univ,phd_year,research_area,research_concentration,research_focus,url,img_url,phone,...,hindex,i10index,citedby5y,hindex5y,i10index5y,scholar_id,url_author,url_picture,url_homepage,file_author
0,Parastoo Abtahi,Assistant Professor,Stanford University,2022.0,"Human-Computer Interaction, Augmented Reality,...",,,https://www.cs.princeton.edu/~abtahi,https://www.cs.princeton.edu/sites/all/modules...,(609) 258-9528,...,9.0,7.0,490.0,9.0,7.0,u3eIIOsAAAAJ,https://scholar.google.com/citations?user=u3eI...,https://scholar.google.com/citations?view_op=m...,http://www.parastooabtahi.com/,data/GScholar_parastoo_abtahi_princeton.json
1,Ryan Adams,"Professor, Associate Chair",University of Cambridge,2009.0,"I am interested in machine learning, artificia...",,,https://www.cs.princeton.edu/~rpa,https://www.cs.princeton.edu/sites/all/modules...,(609) 258-8682,...,61.0,130.0,31187.0,55.0,114.0,grQ_GBgAAAAJ,https://scholar.google.com/citations?user=grQ_...,https://scholar.google.com/citations?view_op=m...,http://www.cs.princeton.edu/~rpa/,data/GScholar_ryan_adams_princeton.json
2,Andrew Appel,Eugene Higgins Professor,Carnegie-Mellon University,1985.0,"Software verification, computer security, prog...",,,https://www.cs.princeton.edu/~appel,https://www.cs.princeton.edu/sites/all/modules...,(609) 258-4627,...,59.0,132.0,3695.0,29.0,63.0,wC_ntLYAAAAJ,https://scholar.google.com/citations?user=wC_n...,https://scholar.google.com/citations?view_op=m...,http://www.cs.princeton.edu/~appel/,data/GScholar_andrew_appel_princeton.json
3,Sanjeev Arora,Charles C. Fitzmorris Professor,"University of California, Berkeley",1994.0,Uses of randomness in complexity theory and al...,,,https://www.cs.princeton.edu/~arora,https://www.cs.princeton.edu/sites/all/modules...,(609) 258-3869,...,73.0,136.0,17012.0,51.0,107.0,RUP4S68AAAAJ,https://scholar.google.com/citations?user=RUP4...,https://scholar.google.com/citations?view_op=m...,http://www.cs.princeton.edu/~arora/,data/GScholar_sanjeev_arora_princeton.json
4,David August,Professor,"University of Illinois, Urbana/Champaign",2000.0,Computer Architecture and Compilers,,,https://www.cs.princeton.edu/~august,https://www.cs.princeton.edu/sites/all/modules...,(609) 258-2085,...,46.0,90.0,1852.0,25.0,48.0,Wov5tYoAAAAJ,https://scholar.google.com/citations?user=Wov5...,https://scholar.google.com/citations?view_op=m...,,data/GScholar_david_august_princeton.json
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,Xi Wang,,,,"Computer systems, security, and programming la...",,,/people/faculty/xi,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,...,22.0,30.0,1765.0,19.0,24.0,uy1OB3YAAAAJ,https://scholar.google.com/citations?user=uy1O...,https://scholar.google.com/citations?view_op=m...,https://homes.cs.washington.edu/~xi/,data/GScholar_xi_wang_uwash.json
162,Robbie Weber,,,,,,,http://weberrobbie.com/,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,...,,,,,,,,,,
163,James R Wilcox,,,,"Computer science education, programming langua...",,,http://homes.cs.washington.edu/~jrw12/,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,...,18.0,19.0,1459.0,15.0,17.0,rQ0Zl50AAAAJ,https://scholar.google.com/citations?user=rQ0Z...,https://scholar.google.com/citations?view_op=m...,https://jamesrwilcox.com/,data/GScholar_james_r_wilcox_uwash.json
164,Brett Wortzman,,,,Computer science education; Computer science t...,,,https://homes.cs.washington.edu/~brettwo/,https://s3-us-west-2.amazonaws.com/www-cse-pub...,,...,,,,,,,,,,


In [65]:
# add not_found_date column
def tag_missing_profile(row):
    return datetime.now().strftime('%Y-%m-%d') if row["scholar_id"] == "" else ""

df_faculty_new["not_found_date"] = df_faculty_new.apply(tag_missing_profile, axis=1)

In [66]:
cols_scholar = [c for c in df_gscholar.columns if c != 'name']
cols_faculty = [c for c in df_faculty.columns if c not in cols_scholar]

In [67]:
len(df_faculty_new.columns), len(df_in.columns)

(33, 31)

In [68]:
set(df_faculty_new.columns).difference(set(df_in.columns))

{'url_author_orig', 'url_profile'}

In [69]:
set(df_in.columns).difference(set(df_faculty_new.columns))

set()

In [70]:
df_out = df_in.copy()

In [71]:
for c in set(df_faculty_new.columns).difference(set(df_in.columns)):
    df_out[c] = ""

In [72]:
df_out

Unnamed: 0,name,affiliation,interests,num_papers,num_coauthors,citedby,hindex,i10index,citedby5y,hindex5y,...,email,cell_phone,office_address,department,school,scholar_id,file_author,not_found_date,url_profile,url_author_orig
0,Jiawei Han,"Abel Bliss Professor of Computer Science, Univ...",data mining; database systems; data warehousin...,1653.0,92.0,244378.0,200.0,1022.0,82746.0,117.0,...,hanj@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign,Kv9AbjMAAAAJ,data/GScholar_jiawei_han_uiuc.json,,,
1,Robert Miller,"Professor, Computer Science, MIT",HCI; software engineering; crowdsourcing,1491.0,0.0,35488.0,78.0,324.0,13243.0,49.0,...,rcm@mit.edu,,Office: 32-G718,Computer Science,Massachusetts Institute Technology,e-c3R8QAAAAJ,data/GScholar_robert_miller_mit.json,,,
2,"Kanade, Takeo",Carnegie Mellon University,Computer Vision,1302.0,0.0,147292.0,169.0,636.0,33531.0,74.0,...,kanade@andrew.cmu.edu,,,Computer Science,Carnegie Mellon Univ,LQ87h3sAAAAJ,data/GScholar_kanade_takeo_cmu.json,,,
3,"Faloutsos, Christos",CMU,Data Mining; Graph Mining; Databases,1126.0,51.0,112459.0,148.0,589.0,34139.0,87.0,...,christos@andrew.cmu.edu,,7003 Gates and Hillman Centers,Computer Science,Carnegie Mellon Univ,nd8lQQIAAAAJ,data/GScholar_faloutsos_christos_cmu.json,,,
4,Daniela Rus,Andrew (1956) and Erna Viterbi Professor of Co...,Robotics; Wireless Networks; Distributed Compu...,1071.0,0.0,67599.0,137.0,574.0,38898.0,99.0,...,rus@csail.mit.edu,,Office: 32-374,AI & Decision-making; Computer Science,Massachusetts Institute Technology,910z20QAAAAJ,data/GScholar_daniela_rus_mit.json,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,Anand V Natarajan,"Assistant Professor, EECS, MIT",Quantum computing,20.0,11.0,583.0,11.0,11.0,534.0,11.0,...,anandn@mit.edu,,Office: 32,Computer Science,Massachusetts Institute Technology,7Y4nM-oAAAAJ,data/GScholar_anand_v_natarajan_mit.json,,,
773,Randy H. Katz,"University of California, Berkeley",Computer Systems; Communications Networks; Dis...,687.0,0.0,94359.0,127.0,340.0,16845.0,49.0,...,randy@cs.berkeley.edu,,"751 Soda Hall,",Computer Science,Univ California Berkeley,PkfChMgAAAAJ,data/GScholar_randy_h_katz_berkeley.json,,,
774,Matus Jan Telgarsky,"University of Illinois, Urbana-Champaign",deep learning theory; machine learning theory,54.0,21.0,5075.0,21.0,28.0,4350.0,19.0,...,mjt@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign,Fc-5yRIAAAAJ,data/GScholar_matus_jan_telgarsky_uiuc.json,,,
775,Michael J. Clancy,"Professor of Politics, University of Hartford",political economy; tourism; Ireland; Mexico; s...,37.0,0.0,1584.0,16.0,19.0,523.0,12.0,...,clancy@cs.berkeley.edu,,"784 Soda Hall,",Computer Science,Univ California Berkeley,q-lFwSEAAAAJ,data/GScholar_michael_j_clancy_berkeley.json,,,


In [73]:
df_new = df_faculty_new[df_out.columns]

In [74]:
df_new.tail(2)

Unnamed: 0,name,affiliation,interests,num_papers,num_coauthors,citedby,hindex,i10index,citedby5y,hindex5y,...,email,cell_phone,office_address,department,school,scholar_id,file_author,not_found_date,url_profile,url_author_orig
164,Brett Wortzman,,,,,,,,,,...,brettwo@cs.washington.edu,,CSE 542,Computer Science,Univ Washington,,,2023-09-16,,
165,Luke Zettlemoyer,,,,,,,,,,...,lsz@cs.washington.edu,,CSE 534,Computer Science,Univ Washington,,,2023-09-16,,


In [50]:
df_out.head(2)

Unnamed: 0,name,affiliation,interests,num_papers,num_coauthors,citedby,hindex,i10index,citedby5y,hindex5y,...,email,cell_phone,office_address,department,school,scholar_id,file_author,not_found_date,url_profile,url_author_orig
0,Jiawei Han,"Abel Bliss Professor of Computer Science, Univ...",data mining; database systems; data warehousin...,1653.0,92.0,244378.0,200.0,1022.0,82746.0,117.0,...,hanj@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign,Kv9AbjMAAAAJ,data/GScholar_jiawei_han_uiuc.json,,,
1,Robert Miller,"Professor, Computer Science, MIT",HCI; software engineering; crowdsourcing,1491.0,0.0,35488.0,78.0,324.0,13243.0,49.0,...,rcm@mit.edu,,Office: 32-G718,Computer Science,Massachusetts Institute Technology,e-c3R8QAAAAJ,data/GScholar_robert_miller_mit.json,,,


In [5]:
sql_stmt = f"""
select distinct * from df 
--where scholar_id is null or scholar_id = ''
"""
df_u = duckdb.sql(sql_stmt).df()
df_u 

Unnamed: 0,name,affiliation,interests,num_papers,num_coauthors,citedby,hindex,i10index,citedby5y,hindex5y,...,img_url,phone,email,cell_phone,office_address,department,school,scholar_id,file_author,not_found_date
0,Jiawei Han,"Abel Bliss Professor of Computer Science, Univ...",data mining; database systems; data warehousin...,1653.0,92.0,244378.0,200.0,1022.0,82746.0,117.0,...,https://ws.engr.illinois.edu/directory/viewpho...,(217) 333-6903,hanj@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign,Kv9AbjMAAAAJ,data/GScholar_jiawei_han_uiuc.json,
1,Robert Miller,"Professor, Computer Science, MIT",HCI; software engineering; crowdsourcing,1491.0,0.0,35488.0,78.0,324.0,13243.0,49.0,...,https://www.eecs.mit.edu/people/rob-miller/,(617) 324-6028,rcm@mit.edu,,Office: 32-G718,Computer Science,Massachusetts Institute Technology,e-c3R8QAAAAJ,data/GScholar_robert_miller_mit.json,
2,"Kanade, Takeo",Carnegie Mellon University,Computer Vision,1302.0,0.0,147292.0,169.0,636.0,33531.0,74.0,...,,(412) 268-3016,kanade@andrew.cmu.edu,,,Computer Science,Carnegie Mellon Univ,LQ87h3sAAAAJ,data/GScholar_kanade_takeo_cmu.json,
3,"Faloutsos, Christos",CMU,Data Mining; Graph Mining; Databases,1126.0,51.0,112459.0,148.0,589.0,34139.0,87.0,...,,(412) 268-1457,christos@andrew.cmu.edu,,7003 Gates and Hillman Centers,Computer Science,Carnegie Mellon Univ,nd8lQQIAAAAJ,data/GScholar_faloutsos_christos_cmu.json,
4,Daniela Rus,Andrew (1956) and Erna Viterbi Professor of Co...,Robotics; Wireless Networks; Distributed Compu...,1071.0,0.0,67599.0,137.0,574.0,38898.0,99.0,...,https://www.eecs.mit.edu/people/daniela-rus/,(617) 258-7567,rus@csail.mit.edu,,Office: 32-374,AI & Decision-making; Computer Science,Massachusetts Institute Technology,910z20QAAAAJ,data/GScholar_daniela_rus_mit.json,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,"Donahue, Chris","Research Scientist, Google Magenta",Machine learning for music; Generative modelin...,27.0,37.0,3596.0,13.0,13.0,3578.0,13.0,...,,,chrisdon@andrew.cmu.edu,,,Computer Science,Carnegie Mellon Univ,MgzHAPQAAAAJ,data/GScholar_donahue_chris_cmu.json,
773,Sarah Chasins,"University of California, Berkeley",programming languages; human-computer interact...,47.0,0.0,470.0,12.0,12.0,412.0,12.0,...,https://www2.eecs.berkeley.edu/Faculty/Photos/...,,,,"637 Soda Hall,",Computer Science,Univ California Berkeley,-5Em-tcAAAAJ,data/GScholar_sarah_chasins_berkeley.json,
774,ChengXiang Zhai,University of Illinois at Urbana-Champaign,Information Retrieval; Text Mining; Machine Le...,568.0,1.0,40247.0,92.0,285.0,14642.0,58.0,...,https://ws.engr.illinois.edu/directory/viewpho...,(217) 244-4943,czhai@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign,YU-baPIAAAAJ,data/GScholar_chengxiang_zhai_uiuc.json,
775,Sanjam Garg,"University of California, Berkeley",Cryptography; Security; Privacy; Theoretical C...,122.0,69.0,7673.0,43.0,83.0,4501.0,37.0,...,https://www2.eecs.berkeley.edu/Faculty/Photos/...,,sanjamg@berkeley.edu,,685 Soda Hall,Computer Science,Univ California Berkeley,mb8mQH8AAAAJ,data/GScholar_sanjam_garg_berkeley.json,


In [6]:
sql_stmt = f"""
select distinct * from df 
where scholar_id is null or scholar_id = ''
"""
df_missing = duckdb.sql(sql_stmt).df()
df_missing 

Unnamed: 0,name,affiliation,interests,num_papers,num_coauthors,citedby,hindex,i10index,citedby5y,hindex5y,...,img_url,phone,email,cell_phone,office_address,department,school,scholar_id,file_author,not_found_date
0,"DeYoung, Henry",,,,,,,,,,...,,,hdeyoung@andrew.cmu.edu,,,Computer Science,Carnegie Mellon Univ,,,2023-09-10
1,"Eckhardt, Dave",,,,,,,,,,...,,(412) 268-6720,de0u@andrew.cmu.edu,,4001 Gates and Hillman Centers,Computer Science,Carnegie Mellon Univ,,,2023-09-10
2,"Gligor, Virgil",,,,,,,,,,...,,(412) 268-9833,virgil@andrew.cmu.edu,,2123 Mehrabian Collaborative Innovation Center,Computer Science,Carnegie Mellon Univ,,,2023-09-10
3,"Jahanian, Farnam",,,,,,,,,,...,,(412) 268-2200,farnam@andrew.cmu.edu,,610 Warner Hall,Computer Science,Carnegie Mellon Univ,,,2023-09-10
4,"Kohlbrenner, Anne",,,,,,,,,,...,,,akohlbre@andrew.cmu.edu,,,Computer Science,Carnegie Mellon Univ,,,2023-09-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,Victor Zue,,,,,,,,,,...,https://www.eecs.mit.edu/people/victor-zue/,(617) 253-8513,zue@csail.mit.edu,,Office: 32-G470,AI & Decision-making; Computer Science,Massachusetts Institute Technology,,,2023-09-10
114,Anand V Natarajan,,,,,,,,,,...,https://www.eecs.mit.edu/people/anand-v-natara...,408-910-4116,anandn@mit.edu,,Office: 32,Computer Science,Massachusetts Institute Technology,,,2023-09-10
115,Randy H. Katz,,,,,,,,,,...,https://www2.eecs.berkeley.edu/Faculty/Photos/...,510-642-8778,randy@cs.berkeley.edu,,"751 Soda Hall,",Computer Science,Univ California Berkeley,,,2023-09-10
116,Yury Polyanskiy,,,,,,,,,,...,https://www.eecs.mit.edu/people/yury-polyanskiy/,(617) 324-0047,yp@mit.edu,,Office: 32-D668,AI & Decision-making; Electrical Engineering,Massachusetts Institute Technology,,,2023-09-10


238 rows without scholar_id

## read extra search

In [7]:
df_x = pd.read_excel(Path(file_xlsx_extra))
df_x.fillna("", inplace=True)

In [8]:
df_x.columns 

Index(['name', 'affiliation', 'interests', 'num_papers', 'num_coauthors',
       'citedby', 'hindex', 'i10index', 'citedby5y', 'hindex5y', 'i10index5y',
       'scholar_id', 'url_author', 'url_picture', 'url_homepage',
       'file_author'],
      dtype='object')

In [9]:
df_x

Unnamed: 0,name,affiliation,interests,num_papers,num_coauthors,citedby,hindex,i10index,citedby5y,hindex5y,i10index5y,scholar_id,url_author,url_picture,url_homepage,file_author
0,"Vinayak, Rashmi Korlakai","Assistant Professor, UW-Madison",Machine Learning; Statistical Inference; Crowd...,22,7,199,7,5,160,7,5,-dDKMgoAAAAJ,https://scholar.google.com/citations?user=-dDK...,https://scholar.google.com/citations?view_op=m...,https://ramyakv.github.io/,data/GScholar_vinayak_rashmi_korlakai_cmu.json
1,K-Y. Daisy Fan,University of Kentucky,Systems biochemistry,327,0,18872,60,141,10343,43,100,2a_t2xwAAAAJ,https://scholar.google.com/citations?user=2a_t...,https://scholar.google.com/citations?view_op=m...,http://bioinformatics.cesb.uky.edu/bin/view/RC...,data/GScholar_k_y_daisy_fan_nan.json
2,"Alexander ""Sasha"" Rush","Associate Professor, Cornell University",Natural Language Processing; Machine Learning,166,18,26964,56,105,24208,52,98,LIjnUGgAAAAJ,https://scholar.google.com/citations?user=LIjn...,https://scholar.google.com/citations?view_op=m...,http://nlp.seas.harvard.edu/,data/GScholar_alexander_sasha_rush_nan.json
3,G. Edward Suh,KBRI,,893,0,23541,81,311,6194,39,167,ih5KsPkAAAAJ,https://scholar.google.com/citations?user=ih5K...,https://scholar.google.com/citations?view_op=m...,,data/GScholar_g_edward_suh_nan.json
4,YuFeng (Kevin) Chen,Unknown affiliation,,114,0,2462,28,49,1426,22,44,UQwjO6UAAAAJ,https://scholar.google.com/citations?user=UQwj...,https://scholar.google.com/citations?view_op=m...,,data/GScholar_yufeng_kevin_chen_mit.json
5,Connor Wilson Coley,"Department of Chemical Engineering, MIT",drug discovery; machine learning; automation; ...,129,12,8249,39,68,8174,39,68,l015S80AAAAJ,https://scholar.google.com/citations?user=l015...,https://scholar.google.com/citations?view_op=m...,http://coley.mit.edu/,data/GScholar_connor_wilson_coley_mit.json
6,W. Eric L Grimson,The University of Alabama,,19,1,743,9,9,406,6,6,vunn5csAAAAJ,https://scholar.google.com/citations?user=vunn...,https://scholar.google.com/citations?view_op=m...,,data/GScholar_w_eric_l_grimson_mit.json
7,Juan Carlos Niebles Duque,"Professor Emeritus, Duke-NUS",Psychiatry,1333,0,136306,175,772,37908,89,481,gFBjcfMAAAAJ,https://scholar.google.com/citations?user=gFBj...,https://scholar.google.com/citations?view_op=m...,,data/GScholar_juan_carlos_niebles_duque_stanfo...
8,John F. Canny,"University of California, Berkeley",HCI; Ubicomp; ICTD; Data Mining; Health Techno...,435,0,72850,88,230,19161,45,136,LAv0HTEAAAAJ,https://scholar.google.com/citations?user=LAv0...,https://scholar.google.com/citations?view_op=m...,,data/GScholar_john_f_canny_berkeley.json
9,David E. Culler,"University of California, Berkeley",Systems; Networks; Embedded Computing; Archite...,635,49,112366,132,347,14747,58,193,urTiL7QAAAAJ,https://scholar.google.com/citations?user=urTi...,https://scholar.google.com/citations?view_op=m...,,data/GScholar_david_e_culler_berkeley.json


In [10]:
sql_stmt = f"""
select * from df_x 
where scholar_id is null or scholar_id = ''
"""
df_x1 = duckdb.sql(sql_stmt).df()
df_x1 

Unnamed: 0,name,affiliation,interests,num_papers,num_coauthors,citedby,hindex,i10index,citedby5y,hindex5y,i10index5y,scholar_id,url_author,url_picture,url_homepage,file_author


## perform enrichment

In [11]:
df_u.shape

(777, 31)

In [12]:
col_names_str = ", ".join([c for c in df_x.columns if c != 'name'])
col_names_str

'affiliation, interests, num_papers, num_coauthors, citedby, hindex, i10index, citedby5y, hindex5y, i10index5y, scholar_id, url_author, url_picture, url_homepage, file_author'

In [13]:
sql_upd = f"""
update df_u 
set ({col_names_str}) = (
    select {col_names_str}
    from df_x 
    where name = df_u.name
)
where name in  (
    select name from df_x 
) 
"""

In [14]:
print(sql_upd)


update df_u 
set (affiliation, interests, num_papers, num_coauthors, citedby, hindex, i10index, citedby5y, hindex5y, i10index5y, scholar_id, url_author, url_picture, url_homepage, file_author) = (
    select affiliation, interests, num_papers, num_coauthors, citedby, hindex, i10index, citedby5y, hindex5y, i10index5y, scholar_id, url_author, url_picture, url_homepage, file_author
    from df_x 
    where name = df_u.name
)
where name in  (
    select name from df_x 
) 



duckdb.sql(sql_upd)

Following error was thrown:
```
NotImplementedException                   Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_1596\457050239.py in <module>
----> 1 duckdb.sql(sql_upd)

NotImplementedException: Not implemented Error: Expr of type 354 not implemented


```

Use pandas merge 

In [15]:
cols_x = [c for c in df_x.columns if c != 'name']

In [16]:
cols_old = [c for c in df_u.columns if c not in cols_x]

In [17]:
cols_old

['name',
 'phd_univ',
 'phd_year',
 'url',
 'research_area',
 'research_concentration',
 'research_focus',
 'job_title',
 'img_url',
 'phone',
 'email',
 'cell_phone',
 'office_address',
 'department',
 'school',
 'not_found_date']

In [18]:
df_u_old = df_u[cols_old]

In [19]:
df_x_new = df_x.merge(df_u_old, how="inner", on="name")

In [20]:
df_x_new_2 = df_x_new[[c for c in df_u.columns]]

In [21]:
df_x_new_2

Unnamed: 0,name,affiliation,interests,num_papers,num_coauthors,citedby,hindex,i10index,citedby5y,hindex5y,...,img_url,phone,email,cell_phone,office_address,department,school,scholar_id,file_author,not_found_date
0,"Vinayak, Rashmi Korlakai","Assistant Professor, UW-Madison",Machine Learning; Statistical Inference; Crowd...,22,7,199,7,5,160,7,...,,,rvinayak@andrew.cmu.edu,,9011 Gates and Hillman Centers,Computer Science,Carnegie Mellon Univ,-dDKMgoAAAAJ,data/GScholar_vinayak_rashmi_korlakai_cmu.json,2023-09-10
1,K-Y. Daisy Fan,University of Kentucky,Systems biochemistry,327,0,18872,60,141,10343,43,...,https://www.cs.cornell.edu/sites/default/files...,,,,,Computer Science,,2a_t2xwAAAAJ,data/GScholar_k_y_daisy_fan_nan.json,2023-09-10
2,"Alexander ""Sasha"" Rush","Associate Professor, Cornell University",Natural Language Processing; Machine Learning,166,18,26964,56,105,24208,52,...,https://www.cs.cornell.edu/sites/default/files...,,,,,"Computer Science, Cornell Tech, CS Field Member",,LIjnUGgAAAAJ,data/GScholar_alexander_sasha_rush_nan.json,2023-09-10
3,G. Edward Suh,KBRI,,893,0,23541,81,311,6194,39,...,https://www.cs.cornell.edu/sites/default/files...,,,,,"Electrical and Computer Engineering, CS Field ...",,ih5KsPkAAAAJ,data/GScholar_g_edward_suh_nan.json,2023-09-10
4,YuFeng (Kevin) Chen,Unknown affiliation,,114,0,2462,28,49,1426,22,...,https://www.eecs.mit.edu/people/yufeng-kevin-c...,(617) 253-7351,yufengc@mit.edu,,Office: 10-140H,Electrical Engineering; AI & Decision-making,Massachusetts Institute Technology,UQwjO6UAAAAJ,data/GScholar_yufeng_kevin_chen_mit.json,2023-09-10
5,Connor Wilson Coley,"Department of Chemical Engineering, MIT",drug discovery; machine learning; automation; ...,129,12,8249,39,68,8174,39,...,https://www.eecs.mit.edu/people/connor-wilson-...,(617) 253-9218,ccoley@mit.edu,,Office: 66-548,AI & Decision-making,Massachusetts Institute Technology,l015S80AAAAJ,data/GScholar_connor_wilson_coley_mit.json,2023-09-10
6,W. Eric L Grimson,The University of Alabama,,19,1,743,9,9,406,6,...,https://www.eecs.mit.edu/people/w-eric-l-grimson/,617-253-4645,egrimson@mit.edu,,Office: 3-221,Computer Science; AI & Decision-making,Massachusetts Institute Technology,vunn5csAAAAJ,data/GScholar_w_eric_l_grimson_mit.json,2023-09-10
7,Juan Carlos Niebles Duque,"Professor Emeritus, Duke-NUS",Psychiatry,1333,0,136306,175,772,37908,89,...,,,@stanford.edu,,Gates 301,Computer Science,Stanford Univ,gFBjcfMAAAAJ,data/GScholar_juan_carlos_niebles_duque_stanfo...,2023-09-10
8,John F. Canny,"University of California, Berkeley",HCI; Ubicomp; ICTD; Data Mining; Health Techno...,435,0,72850,88,230,19161,45,...,https://www2.eecs.berkeley.edu/Faculty/Photos/...,510-642-9955,canny@berkeley.edu,,"637 Soda Hall,",Computer Science,Univ California Berkeley,LAv0HTEAAAAJ,data/GScholar_john_f_canny_berkeley.json,2023-09-10
9,David E. Culler,"University of California, Berkeley",Systems; Networks; Embedded Computing; Archite...,635,49,112366,132,347,14747,58,...,https://www2.eecs.berkeley.edu/Faculty/Photos/...,,culler@berkeley.edu,,783 Soda Hall,Computer Science,Univ California Berkeley,urTiL7QAAAAJ,data/GScholar_david_e_culler_berkeley.json,2023-09-10


In [22]:
sql_stmt = """
    select * from df_u
    where not exists (
        select 1 from df_x where df_x.name = df_u.name
    )
"""

df_u_1 = duckdb.sql(sql_stmt).df()
df_u_1

Unnamed: 0,name,affiliation,interests,num_papers,num_coauthors,citedby,hindex,i10index,citedby5y,hindex5y,...,img_url,phone,email,cell_phone,office_address,department,school,scholar_id,file_author,not_found_date
0,Jiawei Han,"Abel Bliss Professor of Computer Science, Univ...",data mining; database systems; data warehousin...,1653.0,92.0,244378.0,200.0,1022.0,82746.0,117.0,...,https://ws.engr.illinois.edu/directory/viewpho...,(217) 333-6903,hanj@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign,Kv9AbjMAAAAJ,data/GScholar_jiawei_han_uiuc.json,
1,Robert Miller,"Professor, Computer Science, MIT",HCI; software engineering; crowdsourcing,1491.0,0.0,35488.0,78.0,324.0,13243.0,49.0,...,https://www.eecs.mit.edu/people/rob-miller/,(617) 324-6028,rcm@mit.edu,,Office: 32-G718,Computer Science,Massachusetts Institute Technology,e-c3R8QAAAAJ,data/GScholar_robert_miller_mit.json,
2,"Kanade, Takeo",Carnegie Mellon University,Computer Vision,1302.0,0.0,147292.0,169.0,636.0,33531.0,74.0,...,,(412) 268-3016,kanade@andrew.cmu.edu,,,Computer Science,Carnegie Mellon Univ,LQ87h3sAAAAJ,data/GScholar_kanade_takeo_cmu.json,
3,"Faloutsos, Christos",CMU,Data Mining; Graph Mining; Databases,1126.0,51.0,112459.0,148.0,589.0,34139.0,87.0,...,,(412) 268-1457,christos@andrew.cmu.edu,,7003 Gates and Hillman Centers,Computer Science,Carnegie Mellon Univ,nd8lQQIAAAAJ,data/GScholar_faloutsos_christos_cmu.json,
4,Daniela Rus,Andrew (1956) and Erna Viterbi Professor of Co...,Robotics; Wireless Networks; Distributed Compu...,1071.0,0.0,67599.0,137.0,574.0,38898.0,99.0,...,https://www.eecs.mit.edu/people/daniela-rus/,(617) 258-7567,rus@csail.mit.edu,,Office: 32-374,AI & Decision-making; Computer Science,Massachusetts Institute Technology,910z20QAAAAJ,data/GScholar_daniela_rus_mit.json,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
727,"Donahue, Chris","Research Scientist, Google Magenta",Machine learning for music; Generative modelin...,27.0,37.0,3596.0,13.0,13.0,3578.0,13.0,...,,,chrisdon@andrew.cmu.edu,,,Computer Science,Carnegie Mellon Univ,MgzHAPQAAAAJ,data/GScholar_donahue_chris_cmu.json,
728,Sarah Chasins,"University of California, Berkeley",programming languages; human-computer interact...,47.0,0.0,470.0,12.0,12.0,412.0,12.0,...,https://www2.eecs.berkeley.edu/Faculty/Photos/...,,,,"637 Soda Hall,",Computer Science,Univ California Berkeley,-5Em-tcAAAAJ,data/GScholar_sarah_chasins_berkeley.json,
729,ChengXiang Zhai,University of Illinois at Urbana-Champaign,Information Retrieval; Text Mining; Machine Le...,568.0,1.0,40247.0,92.0,285.0,14642.0,58.0,...,https://ws.engr.illinois.edu/directory/viewpho...,(217) 244-4943,czhai@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign,YU-baPIAAAAJ,data/GScholar_chengxiang_zhai_uiuc.json,
730,Sanjam Garg,"University of California, Berkeley",Cryptography; Security; Privacy; Theoretical C...,122.0,69.0,7673.0,43.0,83.0,4501.0,37.0,...,https://www2.eecs.berkeley.edu/Faculty/Photos/...,,sanjamg@berkeley.edu,,685 Soda Hall,Computer Science,Univ California Berkeley,mb8mQH8AAAAJ,data/GScholar_sanjam_garg_berkeley.json,


In [23]:
df_12 = pd.concat([df_u_1, df_x_new_2])

In [24]:
df_12

Unnamed: 0,name,affiliation,interests,num_papers,num_coauthors,citedby,hindex,i10index,citedby5y,hindex5y,...,img_url,phone,email,cell_phone,office_address,department,school,scholar_id,file_author,not_found_date
0,Jiawei Han,"Abel Bliss Professor of Computer Science, Univ...",data mining; database systems; data warehousin...,1653.0,92.0,244378.0,200.0,1022.0,82746.0,117.0,...,https://ws.engr.illinois.edu/directory/viewpho...,(217) 333-6903,hanj@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign,Kv9AbjMAAAAJ,data/GScholar_jiawei_han_uiuc.json,
1,Robert Miller,"Professor, Computer Science, MIT",HCI; software engineering; crowdsourcing,1491.0,0.0,35488.0,78.0,324.0,13243.0,49.0,...,https://www.eecs.mit.edu/people/rob-miller/,(617) 324-6028,rcm@mit.edu,,Office: 32-G718,Computer Science,Massachusetts Institute Technology,e-c3R8QAAAAJ,data/GScholar_robert_miller_mit.json,
2,"Kanade, Takeo",Carnegie Mellon University,Computer Vision,1302.0,0.0,147292.0,169.0,636.0,33531.0,74.0,...,,(412) 268-3016,kanade@andrew.cmu.edu,,,Computer Science,Carnegie Mellon Univ,LQ87h3sAAAAJ,data/GScholar_kanade_takeo_cmu.json,
3,"Faloutsos, Christos",CMU,Data Mining; Graph Mining; Databases,1126.0,51.0,112459.0,148.0,589.0,34139.0,87.0,...,,(412) 268-1457,christos@andrew.cmu.edu,,7003 Gates and Hillman Centers,Computer Science,Carnegie Mellon Univ,nd8lQQIAAAAJ,data/GScholar_faloutsos_christos_cmu.json,
4,Daniela Rus,Andrew (1956) and Erna Viterbi Professor of Co...,Robotics; Wireless Networks; Distributed Compu...,1071.0,0.0,67599.0,137.0,574.0,38898.0,99.0,...,https://www.eecs.mit.edu/people/daniela-rus/,(617) 258-7567,rus@csail.mit.edu,,Office: 32-374,AI & Decision-making; Computer Science,Massachusetts Institute Technology,910z20QAAAAJ,data/GScholar_daniela_rus_mit.json,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40,Anand V Natarajan,"Assistant Professor, EECS, MIT",Quantum computing,20,11,583,11,11,534,11,...,https://www.eecs.mit.edu/people/anand-v-natara...,408-910-4116,anandn@mit.edu,,Office: 32,Computer Science,Massachusetts Institute Technology,7Y4nM-oAAAAJ,data/GScholar_anand_v_natarajan_mit.json,2023-09-10
41,Randy H. Katz,"University of California, Berkeley",Computer Systems; Communications Networks; Dis...,687,0,94359,127,340,16845,49,...,https://www2.eecs.berkeley.edu/Faculty/Photos/...,510-642-8778,randy@cs.berkeley.edu,,"751 Soda Hall,",Computer Science,Univ California Berkeley,PkfChMgAAAAJ,data/GScholar_randy_h_katz_berkeley.json,2023-09-10
42,Matus Jan Telgarsky,"University of Illinois, Urbana-Champaign",deep learning theory; machine learning theory,54,21,5075,21,28,4350,19,...,https://ws.engr.illinois.edu/directory/viewpho...,(217) 300-4507,mjt@illinois.edu,,,Computer Science,Univ Illinois Urbana-Champaign,Fc-5yRIAAAAJ,data/GScholar_matus_jan_telgarsky_uiuc.json,2023-09-10
43,Michael J. Clancy,"Professor of Politics, University of Hartford",political economy; tourism; Ireland; Mexico; s...,37,0,1584,16,19,523,12,...,https://www2.eecs.berkeley.edu/Faculty/Photos/...,510-642-7017,clancy@cs.berkeley.edu,,"784 Soda Hall,",Computer Science,Univ California Berkeley,q-lFwSEAAAAJ,data/GScholar_michael_j_clancy_berkeley.json,2023-09-10


add a new column to tag those faculties without Google Scholar profiles

In [25]:
from datetime import datetime
str(datetime.now())

'2023-09-11 06:33:35.472174'

In [26]:
def tag_missing_profile(row):
    return datetime.now().strftime('%Y-%m-%d') if row["scholar_id"] == "" else ""

In [27]:
df_12["not_found_date"] = df_12.apply(tag_missing_profile, axis=1)

In [28]:
sql_stmt = """
    select not_found_date, count(*) from df_12
    group by not_found_date
"""

df_12_count = duckdb.sql(sql_stmt).df()
df_12_count

Unnamed: 0,not_found_date,count_star()
0,,704
1,2023-09-11,73


## write out result

In [75]:
dt = datetime.now().strftime('%Y-%m-%d')
file_xlsx = f"data/cs_faculty_top9-{dt}.xlsx"
writer = pd.ExcelWriter(Path(file_xlsx), engine='xlsxwriter')
pd.concat([df_out, df_new]).to_excel(writer, sheet_name="CS-Scholars", index=False)
writer.close()