In [1]:
from IPython.display import display, Markdown, Latex
from scrap_cs_faculty import *

with open("readme.md") as f:
    readme_md = f.read()

display(Markdown(readme_md)) 

## US CS Faculty Dataset
- [CS Faculty Composition and Hiring Trends (Blog)](https://jeffhuang.com/computer-science-open-data/#cs-faculty-composition-and-hiring-trends)
- [2200 Computer Science Professors in 50 top US Graduate Programs](https://cs.brown.edu/people/apapouts/faculty_dataset.html)
- [CS Professors (Data Explorer)](https://drafty.cs.brown.edu/csprofessors?src=csopendata)
- [Drafty Project](https://drafty.cs.brown.edu/)
- [CSRankings.org](https://csrankings.org/#/fromyear/2011/toyear/2023/index?ai&vision&mlmining&nlp&inforet&act&crypt&log&us)


## Top 6 schools
Stanford, UCB, MIT, CMU, Cornell, UIUC

## add more schools

- https://www.cs.princeton.edu/people/faculty?type=main
- https://www.cs.washington.edu/people/faculty
- https://directory.seas.upenn.edu/computer-and-information-science/
- https://www.cms.caltech.edu/cms-people/faculty
- https://www.cs.columbia.edu/people/faculty/
- https://cse.engin.umich.edu/people/faculty/
- https://samueli.ucla.edu/search-faculty/#cs
- https://cse.ucsd.edu/people/faculty-profiles
- https://seas.harvard.edu/computer-science/people?role[46]=46
- https://www.cc.gatech.edu/people/faculty?page=24  (page=[0,25))


In [2]:
SCHOOL = "Princeton-CS"
URL = SCHOOL_DICT[SCHOOL]["url"]  #  "https://www.eecs.mit.edu/role/faculty-aid/"
print(URL)

https://www.cs.princeton.edu/people/faculty?type=main


In [3]:
base_url = "/".join(URL.split("/")[:3])
base_url

'https://www.cs.princeton.edu'

In [4]:
page = requests.get(URL, headers=BROWSER_HEADERS)

In [5]:
soup = BeautifulSoup(page.content, "html.parser")

In [6]:
results = soup.find("div", class_="people people-facstaff")
print(results.prettify())

<div class="people people-facstaff">
 <a name="A">
 </a>
 <div class="person clearfix">
  <div class="person-photo">
   <a href="/people/profile/abtahi">
    <img alt="Photo of Parastoo Abtahi" src="/sites/all/modules/custom/cs_people/generate_thumbnail.php?id=6671&amp;thumb="/>
   </a>
  </div>
  <div class="person-details">
   <h2 class="person-name">
    <a href="/people/profile/abtahi">
     Parastoo Abtahi
    </a>
   </h2>
   <div class="person-title">
    Assistant Professor
   </div>
   <div class="person-degree">
    Ph.D., Stanford University, 2022
   </div>
   <div class="person-links">
    <a class="btn btn-xs btn-default" href="https://www.cs.princeton.edu/~abtahi">
     <span class="glyphicon glyphicon-globe">
     </span>
     Homepage
    </a>
    <a class="btn btn-xs btn-default" href="/people/profile/abtahi">
     <span class="glyphicon glyphicon-arrow-right">
     </span>
     Profile
    </a>
   </div>
   <div class="person-address">
    <span class="person-address-

## Find Elements by HTML Class Name

In [7]:
cs_persons = soup.find_all("div", class_="person clearfix")

In [8]:
len(cs_persons) , cs_persons[0]

(69,
 <div class="person clearfix">
 <div class="person-photo">
 <a href="/people/profile/abtahi"><img alt="Photo of Parastoo Abtahi" src="/sites/all/modules/custom/cs_people/generate_thumbnail.php?id=6671&amp;thumb="/></a>
 </div>
 <div class="person-details">
 <h2 class="person-name">
 <a href="/people/profile/abtahi">Parastoo Abtahi</a>
 </h2>
 <div class="person-title">
         Assistant Professor      </div>
 <div class="person-degree">
         Ph.D., Stanford University, 2022      </div>
 <div class="person-links">
 <a class="btn btn-xs btn-default" href="https://www.cs.princeton.edu/~abtahi"><span class="glyphicon glyphicon-globe"></span>Homepage</a>
 <a class="btn btn-xs btn-default" href="/people/profile/abtahi"><span class="glyphicon glyphicon-arrow-right"></span>Profile</a>
 </div>
 <div class="person-address">
 <span class="person-address-item">
 <span class="glyphicon glyphicon-envelope"></span> abtahi  (@cs.princeton.edu)
       </span>
 <span class="person-address-item

### Extract Text From HTML Elements

You can add .text to a Beautiful Soup object to return only the text content of the HTML elements that the object contains:

In [9]:
DEBUG =  False # True # 

if DEBUG:
    N_START, N_STOP = 0, 1
else:
    N_START, N_STOP = 0, -1

school, dept = map_school_dept(SCHOOL)
data = []
all_research_dict = {}
for n, person in enumerate(cs_persons[N_START:N_STOP]):
    try:
        data_dict = {"school": school, "department": dept} # default
#         if DEBUG and n > 0: break  # debug
        
        # parse profile and image URL
        url_img = person.find("div", class_="person-photo")
        url = url_img.find("a")["href"]
        img_url = url_img.find("img")["src"]
        data_dict['url_profile'] = f"{base_url}{url}"
        data_dict['img_url'] = f"{base_url}{img_url}"
        
        person_detail = person.find("div", class_="person-details")
        
        # parse name
        name = person_detail.find("h2", class_="person-name")
        if name:
            data_dict['name'] = name.text.strip()
        
        # parse title 
        title = person_detail.find("div", class_="person-title")
        if title:
            data_dict['job_title'] = title.text.strip()

        # parse phd school, year 
        school_year = person_detail.find("div", class_="person-degree")
        if school_year:
            x = [i.strip() for i in school_year.text.strip().split(",") if i.strip()]
            
            if re.match(r"[0-9]{4}", x[-1]):
                data_dict['phd_year'] = x[-1]
                
            if x[0].lower().replace(".", "").strip() == "phd" and 'phd_year' in data_dict:
                data_dict['phd_univ'] = ", ".join(x[1:-1])
                
        # parse homepage/profile url
        links = person_detail.find("div", class_="person-links")
        if links:
            hrefs = links.find_all("a")
            if hrefs:
                data_dict['url'] = hrefs[0]["href"]
            
        # parse contact info
        for item in person_detail.find_all("span", class_="person-address-item"):
            if item.find("span", class_="glyphicon glyphicon-envelope"):
                data_dict['email'] = "".join([i.strip() for i in item.text.strip().replace("(","").replace(")","").split() if i.strip()])
            if item.find("span", class_="glyphicon glyphicon-earphone"):
                data_dict['phone'] = item.text.strip()
            if item.find("span", class_="glyphicon glyphicon-briefcase"):
                data_dict['office_address'] = item.text.strip()
            
        # parse interest
        interest = person_detail.find("p", class_="person-research-interests")
        if interest:
            data_dict['research_area'] = interest.text.replace("Research Interests:", "").strip()          
                                
        if DEBUG:
            print(f"========\t n={n}\t=============")
            print(f"name= {data_dict.get('name','')}")
            print(f"job_title= {data_dict.get('job_title','')}")
            print(f"phone= {data_dict.get('phone','')}")
            print(f"office= {data_dict.get('office_address','')}")
            print(f"email= {data_dict.get('email','')}")
            print(f"url= {data_dict.get('url','')}")
            print(f"img_url= {data_dict.get('img_url','')}")
            print(f"phd_univ= {data_dict.get('phd_univ','')}")
            print(f"phd_year= {data_dict.get('phd_year','')}")
            print(f"research_area= {data_dict.get('research_area','')}")
            print(f"department= {data_dict.get('department','')}")
            print(f"school= {data_dict.get('school','')}")
            print(f"url_profile= {data_dict.get('url_profile','')}")
            
        if data_dict:
            row_data = []
            for c in COLUMNS_v2:
                cell = data_dict.get(c,"")
                row_data.append(cell)
            data.append(row_data)
    except Exception as e:
        print(f"[Error] {str(e)}")
#         print(f"{person.prettify()}")

In [10]:
len(data) , data[-1]

(68,
 ['Huacheng Yu',
  'Assistant Professor',
  'Stanford University',
  '2017',
  'Data structures, streaming algorithms, communication complexity',
  '',
  '',
  'https://www.cs.princeton.edu/~hy2',
  'https://www.cs.princeton.edu/sites/all/modules/custom/cs_people/generate_thumbnail.php?id=5209&thumb=',
  '(609) 258-5962',
  'hy2@cs.princeton.edu',
  '',
  '310 Computer Science',
  'Computer Science',
  'Princeton Univ',
  'https://www.cs.princeton.edu/people/profile/hy2',
  ''])

In [11]:
df = pd.DataFrame(data, columns=COLUMNS_v2)

In [12]:
print(f"Number of faculties at {SCHOOL}: {df.shape[0]}")

Number of faculties at Princeton-CS: 68


In [13]:
df

Unnamed: 0,name,job_title,phd_univ,phd_year,research_area,research_concentration,research_focus,url,img_url,phone,email,cell_phone,office_address,department,school,url_profile,url_author
0,Parastoo Abtahi,Assistant Professor,Stanford University,2022,"Human-Computer Interaction, Augmented Reality,...",,,https://www.cs.princeton.edu/~abtahi,https://www.cs.princeton.edu/sites/all/modules...,(609) 258-9528,abtahi@cs.princeton.edu,,419 Computer Science,Computer Science,Princeton Univ,https://www.cs.princeton.edu/people/profile/ab...,
1,Ryan Adams,"Professor, Associate Chair",University of Cambridge,2009,"I am interested in machine learning, artificia...",,,https://www.cs.princeton.edu/~rpa,https://www.cs.princeton.edu/sites/all/modules...,(609) 258-8682,rpa@cs.princeton.edu,,411 Computer Science,Computer Science,Princeton Univ,https://www.cs.princeton.edu/people/profile/rpa,
2,Andrew Appel,Eugene Higgins Professor,Carnegie-Mellon University,1985,"Software verification, computer security, prog...",,,https://www.cs.princeton.edu/~appel,https://www.cs.princeton.edu/sites/all/modules...,(609) 258-4627,appel@cs.princeton.edu,,209 Computer Science,Computer Science,Princeton Univ,https://www.cs.princeton.edu/people/profile/appel,
3,Sanjeev Arora,Charles C. Fitzmorris Professor,"University of California, Berkeley",1994,Uses of randomness in complexity theory and al...,,,https://www.cs.princeton.edu/~arora,https://www.cs.princeton.edu/sites/all/modules...,(609) 258-3869,arora@cs.princeton.edu,,407 Computer Science,Computer Science,Princeton Univ,https://www.cs.princeton.edu/people/profile/arora,
4,David August,Professor,"University of Illinois, Urbana/Champaign",2000,Computer Architecture and Compilers,,,https://www.cs.princeton.edu/~august,https://www.cs.princeton.edu/sites/all/modules...,(609) 258-2085,august@cs.princeton.edu,,221 Computer Science,Computer Science,Princeton Univ,https://www.cs.princeton.edu/people/profile/au...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,Olga Troyanskaya,Professor,Stanford University,2003,Bioinformatics; analysis of large-scale biolog...,,,/people/profile/ogt,https://www.cs.princeton.edu/sites/all/modules...,(609) 258-1749,ogt@cs.princeton.edu,,320 Computer Science,Computer Science,Princeton Univ,https://www.cs.princeton.edu/people/profile/ogt,
64,David Walker,"Professor, Director of Undergraduate Studies",Cornell University,2001,"Programming languages, type systems, compilers...",,,https://www.cs.princeton.edu/~dpw,https://www.cs.princeton.edu/sites/all/modules...,(609) 258-7654,dpw@cs.princeton.edu,,211 Computer Science,Computer Science,Princeton Univ,https://www.cs.princeton.edu/people/profile/dpw,
65,Kevin Wayne,Phillip Y. Goldman '86 University Lecturer,Cornell University,1999,Algorithms and data structures; computer scien...,,,https://www.cs.princeton.edu/~wayne,https://www.cs.princeton.edu/sites/all/modules...,(609) 258-4455,wayne@cs.princeton.edu,,040 Corwin Hall,Computer Science,Princeton Univ,https://www.cs.princeton.edu/people/profile/wayne,
66,Matthew Weinberg,Associate Professor,Massachusetts Institute of Technology,2014,,,,https://www.cs.princeton.edu/~smattw,https://www.cs.princeton.edu/sites/all/modules...,(609) 258-0944,smattw@cs.princeton.edu,,317 Computer Science,Computer Science,Princeton Univ,https://www.cs.princeton.edu/people/profile/sm...,


In [14]:
# import xlsxwriter
file_xlsx = f"faculty-{SCHOOL}.xlsx"
print(f"{file_xlsx}")
writer = pd.ExcelWriter(file_xlsx, engine='xlsxwriter')
df.to_excel(writer, sheet_name="Faculty", index=False)
# if not df_research.empty:
#     df_research.to_excel(writer, sheet_name = "Research Groups", index=False)
writer.close()

faculty-Princeton-CS.xlsx
