In [1]:
from IPython.display import display, Markdown, Latex
from scrap_cs_faculty import *

with open("readme.md") as f:
    readme_md = f.read()

display(Markdown(readme_md)) 

## US CS Faculty Dataset
- [CS Faculty Composition and Hiring Trends (Blog)](https://jeffhuang.com/computer-science-open-data/#cs-faculty-composition-and-hiring-trends)
- [2200 Computer Science Professors in 50 top US Graduate Programs](https://cs.brown.edu/people/apapouts/faculty_dataset.html)
- [CS Professors (Data Explorer)](https://drafty.cs.brown.edu/csprofessors?src=csopendata)
- [Drafty Project](https://drafty.cs.brown.edu/)
- [CSRankings.org](https://csrankings.org/#/fromyear/2011/toyear/2023/index?ai&vision&mlmining&nlp&inforet&act&crypt&log&us)


## Top 6 schools
Stanford, UCB, MIT, CMU, Cornell, UIUC

## add more schools

- https://www.cs.princeton.edu/people/faculty?type=main
- https://www.cs.washington.edu/people/faculty
- https://directory.seas.upenn.edu/computer-and-information-science/
- https://www.cms.caltech.edu/cms-people/faculty
- https://www.cs.columbia.edu/people/faculty/
- https://cse.engin.umich.edu/people/faculty/
- https://samueli.ucla.edu/search-faculty/#cs
- https://cse.ucsd.edu/people/faculty-profiles
- https://seas.harvard.edu/computer-science/people?role[46]=46
- https://www.cc.gatech.edu/people/faculty?page=24  (page=[0,25))


In [2]:
SCHOOL = "UPenn-CS"
URL = SCHOOL_DICT[SCHOOL]["url"]  #  "https://www.eecs.mit.edu/role/faculty-aid/"
print(URL)

https://directory.seas.upenn.edu/computer-and-information-science/


In [3]:
base_url = "/".join(URL.split("/")[:3])
base_url

'https://directory.seas.upenn.edu'

In [4]:
page = requests.get(URL, headers=BROWSER_HEADERS)

In [5]:
soup = BeautifulSoup(page.content, "html.parser")

We take 2 passes of processing on div classes: StaffListPhoto, StaffListMeta, separately, 

build up a dict with url as key

## Find Profile with image

In [6]:
photos = soup.find_all("div", class_="StaffListPhoto")
# print(results.prettify())

In [7]:
school, dept = map_school_dept(SCHOOL)
school, dept

('Univ Pennsylvania', 'Computer Science')

In [8]:
len(photos), photos[1]

(112,
 <div class="StaffListPhoto">
 <a href="https://directory.seas.upenn.edu/rajeev-alur/">
 <img alt="Rajeev Alur" src="https://directory.seas.upenn.edu/wp-content/uploads/2020/03/alur-rajeev.jpg"/>
 </a>
 </div>)

In [9]:
all_data_dict = {}

for n, url_img in enumerate(photos):
    data_dict = {"school": school, "department": dept}

    # parse name, url
    url = url_img.find("a")["href"]
    if not url:
        print(f"[ERROR] {n} missing URL: {url_img.prettify()}")
    
    img = url_img.find("img")
    if img:
        name = img.get("alt", "")
        img_url = img.get("src", "")
        
    data_dict['url'] = url
    data_dict['name'] = name if name else pick_name_from_profile_url(url)
    data_dict['img_url'] = img_url

    all_data_dict.update({url : data_dict})

In [10]:
all_data_dict

{'https://directory.seas.upenn.edu/shivani-agarwal/': {'school': 'Univ Pennsylvania',
  'department': 'Computer Science',
  'url': 'https://directory.seas.upenn.edu/shivani-agarwal/',
  'name': 'Shivani Agarwal',
  'img_url': 'https://directory.seas.upenn.edu/wp-content/uploads/2020/03/Agarwal-Shivani.jpg'},
 'https://directory.seas.upenn.edu/rajeev-alur/': {'school': 'Univ Pennsylvania',
  'department': 'Computer Science',
  'url': 'https://directory.seas.upenn.edu/rajeev-alur/',
  'name': 'Rajeev Alur',
  'img_url': 'https://directory.seas.upenn.edu/wp-content/uploads/2020/03/alur-rajeev.jpg'},
 'https://directory.seas.upenn.edu/sebastian-angel/': {'school': 'Univ Pennsylvania',
  'department': 'Computer Science',
  'url': 'https://directory.seas.upenn.edu/sebastian-angel/',
  'name': 'Sebastian Angel',
  'img_url': 'https://directory.seas.upenn.edu/wp-content/uploads/2020/03/angel-sebastian.jpg'},
 'https://directory.seas.upenn.edu/ryan-baker/': {'school': 'Univ Pennsylvania',
  'de

## Find Person Detail

In [11]:
metas = soup.find_all("div", class_="StaffListMeta")
# print(results.prettify())

In [12]:
len(metas), metas[1]

(112,
 <div class="StaffListMeta">
 <div class="StaffListName">
 <a href="https://directory.seas.upenn.edu/rajeev-alur/">
 	 			Rajeev Alur	 		</a>
 </div>
 <div class="StaffListTitles">
 <div class="d-none">
 						Professor					</div>
 <div>
 							Zisman Family Professor						</div>
 <div>
 																Computer and Information Science										</div>
 </div>
 <div class="StaffListSpecialTitle">
 <div>
 		 		 	Founding Director, ASSET Center		 		</div>
 </div>
 <div class="StaffListSocial">
 <ul>
 <li><a href="mailto:ALUR@CIS.UPENN.EDU"><i class="sls-icon email"></i></a></li> <li><a href="https://scholar.google.com/citations?user=ZvLa1RIAAAAJ&amp;hl=en" target="_blank"><i class="sls-icon gschool"></i></a></li> <li><a href="https://twitter.com/RajeevAlur" target="_blank"><i class="sls-icon twitter"></i></a></li> </ul>
 </div>
 </div>)

In [13]:
all_data_dict2 = {}
for m, person_detail in enumerate(metas):
    data_dict = {"school": school, "department": dept}
    job_titles = []

    # parse name
    name = person_detail.find("div", class_="StaffListName")
    if name:
        x = name.find("a")
        url = x["href"]
        data_dict["url"] = url
        data_dict["name"] = x.text.strip()

    # parse title
    titles = person_detail.find("div", class_="StaffListTitles")
    if titles:
        job_titles += [x.text.strip() for x in titles.find_all("div") if x.text.strip()]

    titles = person_detail.find("div", class_="StaffListSpecialTitle")
    if titles:
        job_titles += [x.text.strip() for x in titles.find_all("div") if x.text.strip()]

    if job_titles:
        data_dict["job_title"] = ", ".join(job_titles)

    # parse social
    social = person_detail.find("div", class_="StaffListSocial")
    if social:
        items = social.find_all("li")
        num_lis = len(items)
        n = 0
        for li in items:
            href = li.find("a")["href"]
            if "mailto:" in href:
                data_dict["email"] = href.replace("mailto:", "")
                n += 1
            elif "scholar.google." in href:
                data_dict["url_author"] = href
                n += 1
            elif "twitter." in href:
                data_dict["url_twitter"] = href
                n += 1
            elif "tel:" in href:
                data_dict["phone"] = href.replace("tel:", "")
                n += 1

        if n != num_lis:
            print(f"[ERROR] {m} : {url} has new social tag:\n {social.prettify()}")

    if data_dict.get("url", ""):
        all_data_dict2.update({data_dict["url"] : data_dict})

[ERROR] 32 : https://directory.seas.upenn.edu/kevin-b-johnson/ has new social tag:
 <div class="StaffListSocial">
 <ul>
  <li>
   <a href="mailto:Kevin.Johnson1@Pennmedicine.upenn.edu">
    <i class="sls-icon email">
    </i>
   </a>
  </li>
  <li>
   <a href="https://scholar.google.com/citations?user=RFcbxjoAAAAJ&amp;hl=en&amp;oi=ao" target="_blank">
    <i class="sls-icon gschool">
    </i>
   </a>
  </li>
  <li>
   <a href="@kbjohnsonmd" target="_blank">
    <i class="sls-icon twitter">
    </i>
   </a>
  </li>
 </ul>
</div>



## Merge two dicts 

In [14]:
all_data_dict["https://directory.seas.upenn.edu/kevin-b-johnson/"]

{'school': 'Univ Pennsylvania',
 'department': 'Computer Science',
 'url': 'https://directory.seas.upenn.edu/kevin-b-johnson/',
 'name': 'Kevin B. Johnson',
 'img_url': 'https://directory.seas.upenn.edu/wp-content/uploads/2021/12/Johnson-Kevin-directory.jpg'}

In [15]:
all_data_dict2["https://directory.seas.upenn.edu/kevin-b-johnson/"]

{'school': 'Univ Pennsylvania',
 'department': 'Computer Science',
 'url': 'https://directory.seas.upenn.edu/kevin-b-johnson/',
 'name': 'Kevin B. Johnson',
 'job_title': 'Professor, David L. Cohen University Professor, Annenberg School for Communication, Bioengineering, Biostatistics, Epidemiology and Informatics, Computer and Information Science, Pediatrics, Vice President for Applied Informatics, University of Pennsylvania Health System',
 'email': 'Kevin.Johnson1@Pennmedicine.upenn.edu',
 'url_author': 'https://scholar.google.com/citations?user=RFcbxjoAAAAJ&hl=en&oi=ao'}

## Prepare dataframe

In [16]:
DEBUG = True

In [17]:
data = []
for n, k in enumerate(all_data_dict2.keys()):
    data_dict = all_data_dict2[k]
    if "img_url" not in data_dict and k in all_data_dict:
        data_dict.update({"img_url": all_data_dict[k]["img_url"]})
        
    print(f"========\t n={n}\t=============")
    if DEBUG and n < 3:
        print(f"name= {data_dict.get('name','')}")
        print(f"job_title= {data_dict.get('job_title','')}")
        print(f"phone= {data_dict.get('phone','')}")
        print(f"office= {data_dict.get('office_address','')}")
        print(f"email= {data_dict.get('email','')}")
        print(f"url= {data_dict.get('url','')}")
        print(f"img_url= {data_dict.get('img_url','')}")
        print(f"phd_univ= {data_dict.get('phd_univ','')}")
        print(f"phd_year= {data_dict.get('phd_year','')}")
        print(f"research_area= {data_dict.get('research_area','')}")
        print(f"department= {data_dict.get('department','')}")
        print(f"school= {data_dict.get('school','')}")
        print(f"url_profile= {data_dict.get('url_profile','')}")
        print(f"url_author= {data_dict.get('url_author','')}")

    row_data = []
    for c in COLUMNS_v2:
        cell = data_dict.get(c,"")
        row_data.append(cell)
    data.append(row_data)

name= Shivani Agarwal
job_title= Associate Professor, Rachleff Family Associate Professor, Computer and Information Science
phone= 
office= 
email= ashivani@cis.upenn.edu
url= https://directory.seas.upenn.edu/shivani-agarwal/
img_url= https://directory.seas.upenn.edu/wp-content/uploads/2020/03/Agarwal-Shivani.jpg
phd_univ= 
phd_year= 
research_area= 
department= Computer Science
school= Univ Pennsylvania
url_profile= 
url_author= 
name= Rajeev Alur
job_title= Professor, Zisman Family Professor, Computer and Information Science, Founding Director, ASSET Center
phone= 
office= 
email= ALUR@CIS.UPENN.EDU
url= https://directory.seas.upenn.edu/rajeev-alur/
img_url= https://directory.seas.upenn.edu/wp-content/uploads/2020/03/alur-rajeev.jpg
phd_univ= 
phd_year= 
research_area= 
department= Computer Science
school= Univ Pennsylvania
url_profile= 
url_author= https://scholar.google.com/citations?user=ZvLa1RIAAAAJ&hl=en
name= Sebastian Angel
job_title= Assistant Professor, Raj and Neera Singh T

In [18]:
len(data) , data[-1]

(112,
 ['Stella Yu',
  'Adjunct Assistant Professor, Computer and Information Science',
  '',
  '',
  '',
  '',
  '',
  'https://directory.seas.upenn.edu/stella-yu/',
  'https://directory.seas.upenn.edu/wp-content/uploads/2020/03/Yu-Stella.jpg',
  '510-666-2900',
  'stellayu@berkeley.edu',
  '',
  '',
  'Computer Science',
  'Univ Pennsylvania',
  '',
  ''])

In [19]:
df = pd.DataFrame(data, columns=COLUMNS_v2)

In [20]:
print(f"Number of faculties at {SCHOOL}: {df.shape[0]}")

Number of faculties at UPenn-CS: 112


In [21]:
df

Unnamed: 0,name,job_title,phd_univ,phd_year,research_area,research_concentration,research_focus,url,img_url,phone,email,cell_phone,office_address,department,school,url_profile,url_author
0,Shivani Agarwal,"Associate Professor, Rachleff Family Associate...",,,,,,https://directory.seas.upenn.edu/shivani-agarwal/,https://directory.seas.upenn.edu/wp-content/up...,,ashivani@cis.upenn.edu,,,Computer Science,Univ Pennsylvania,,
1,Rajeev Alur,"Professor, Zisman Family Professor, Computer a...",,,,,,https://directory.seas.upenn.edu/rajeev-alur/,https://directory.seas.upenn.edu/wp-content/up...,,ALUR@CIS.UPENN.EDU,,,Computer Science,Univ Pennsylvania,,https://scholar.google.com/citations?user=ZvLa...
2,Sebastian Angel,"Assistant Professor, Raj and Neera Singh Term ...",,,,,,https://directory.seas.upenn.edu/sebastian-angel/,https://directory.seas.upenn.edu/wp-content/up...,,sga001@cis.upenn.edu,,,Computer Science,Univ Pennsylvania,,
3,Ryan Baker,"Associate Professor, Computer and Information ...",,,,,,https://directory.seas.upenn.edu/ryan-baker/,https://directory.seas.upenn.edu/wp-content/up...,,rybaker@upenn.edu,,,Computer Science,Univ Pennsylvania,,
4,Yoseph Barash,"Associate Professor, Computer and Information ...",,,,,,https://directory.seas.upenn.edu/yoseph-barash/,https://directory.seas.upenn.edu/wp-content/up...,(215) 746-8683,yosephb@seas.upenn.edu,,,Computer Science,Univ Pennsylvania,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,Richard Paul,"Professor Emeritus, Computer and Information S...",,,,,,https://directory.seas.upenn.edu/richard-paul/,https://directory.seas.upenn.edu/wp-content/up...,,lou@seas.upenn.edu,,,Computer Science,Univ Pennsylvania,,
108,Mark Steedman,"Adjunct Professor, Computer and Information Sc...",,,,,,https://directory.seas.upenn.edu/mark-steedman/,https://directory.seas.upenn.edu/wp-content/up...,,steedman@seas.upenn.edu,,,Computer Science,Univ Pennsylvania,,
109,Bonnie Webber,"Adjunct Professor, Computer and Information Sc...",,,,,,https://directory.seas.upenn.edu/bonnie-webber/,https://directory.seas.upenn.edu/wp-content/up...,,bonnie@seas.upenn.edu,,,Computer Science,Univ Pennsylvania,,
110,James Weimer,"Adjunct Assistant Professor, Computer and Info...",,,,,,https://directory.seas.upenn.edu/james-weimer/,https://directory.seas.upenn.edu/wp-content/up...,,weimerj@seas.upenn.edu,,,Computer Science,Univ Pennsylvania,,https://scholar.google.com/citations?user=IeuL...


In [22]:
# import xlsxwriter
file_xlsx = f"faculty-{SCHOOL}.xlsx"
print(f"{file_xlsx}")
writer = pd.ExcelWriter(file_xlsx, engine='xlsxwriter')
df.to_excel(writer, sheet_name="Faculty", index=False)
# if not df_research.empty:
#     df_research.to_excel(writer, sheet_name = "Research Groups", index=False)
writer.close()

faculty-UPenn-CS.xlsx
