In [1]:
from IPython.display import display, Markdown, Latex
from scrap_cs_faculty import *

In [2]:
with open("readme.md") as f:
    readme_md = f.read()
    readme_md += "\n- Schools:"
    
for skool in SCHOOL_DICT.keys():
    readme_md += f"\n\t- [{skool}]({SCHOOL_DICT[skool]['url']})"

display(Markdown(readme_md)) 

## US CS Faculty Dataset
- [CS Faculty Composition and Hiring Trends (Blog)](https://jeffhuang.com/computer-science-open-data/#cs-faculty-composition-and-hiring-trends)
- [2200 Computer Science Professors in 50 top US Graduate Programs](https://cs.brown.edu/people/apapouts/faculty_dataset.html)
- [CS Professors (Data Explorer)](https://drafty.cs.brown.edu/csprofessors?src=csopendata)
- [Drafty Project](https://drafty.cs.brown.edu/)
- [CSRankings.org](https://csrankings.org/#/fromyear/2011/toyear/2023/index?ai&vision&mlmining&nlp&inforet&act&crypt&log&us)

- Schools:
	- [Cornell-CS](https://www.cs.cornell.edu/people/faculty)
	- [UIUC-CS](https://cs.illinois.edu/about/people/department-faculty)
	- [MIT-AID](https://www.eecs.mit.edu/role/faculty-aid/)
	- [MIT-CS](https://www.eecs.mit.edu/role/faculty-cs/)
	- [CMU-CS](https://csd.cmu.edu/people/faculty)
	- [UCB-CS](https://www2.eecs.berkeley.edu/Faculty/Lists/CS/faculty.html)
	- [Stanford-CS](https://cs.stanford.edu/directory/faculty)

In [4]:
print(readme_md)

## US CS Faculty Dataset
- [CS Faculty Composition and Hiring Trends (Blog)](https://jeffhuang.com/computer-science-open-data/#cs-faculty-composition-and-hiring-trends)
- [2200 Computer Science Professors in 50 top US Graduate Programs](https://cs.brown.edu/people/apapouts/faculty_dataset.html)
- [CS Professors (Data Explorer)](https://drafty.cs.brown.edu/csprofessors?src=csopendata)
- [Drafty Project](https://drafty.cs.brown.edu/)
- [CSRankings.org](https://csrankings.org/#/fromyear/2011/toyear/2023/index?ai&vision&mlmining&nlp&inforet&act&crypt&log&us)

- Schools:
	- [Cornell-CS](https://www.cs.cornell.edu/people/faculty)
	- [UIUC-CS](https://cs.illinois.edu/about/people/department-faculty)
	- [MIT-AID](https://www.eecs.mit.edu/role/faculty-aid/)
	- [MIT-CS](https://www.eecs.mit.edu/role/faculty-cs/)
	- [CMU-CS](https://csd.cmu.edu/people/faculty)
	- [UCB-CS](https://www2.eecs.berkeley.edu/Faculty/Lists/CS/faculty.html)
	- [Stanford-CS](https://cs.stanford.edu/directory/faculty)


In [3]:
SCHOOL = "Cornell-CS"
URL = SCHOOL_DICT[SCHOOL]["url"]  #  "https://www.cs.cornell.edu/people/faculty"
URL

'https://www.cs.cornell.edu/people/faculty'

In [4]:
page = requests.get(URL, headers=BROWSER_HEADERS)
soup = BeautifulSoup(page.content, "html.parser")

## Find Elements by HTML Class Name

In [5]:
cs_persons = soup.find_all("div", class_="person-list")

In [6]:
len(cs_persons), cs_persons[0]

(123,
 <div class="person-list">
 <div class="person-image">
 <img alt="" height="200" src="https://www.cs.cornell.edu/sites/default/files/styles/icon-100x100/public/prof_pic.jpg?itok=7zCvPZz4" width="200">
 </img></div>
 <div class="person-listing">
 <strong class="inline-field">Assistant Professor</strong>
 <span class="inline-field">; Electrical and Computer Engineering, Cornell Tech, CS Field Member;</span>
 <span class="inline-field">; Ph.D., University of Toronto, 2016</span>
 <p> <strong>Research Areas: </strong> <a href="/research/ai">Artificial Intelligence</a>, <a href="/research/machinelearning">Machine Learning</a> </p>
 <p class="views-field views-field-field-research-concentration"> <strong class="views-label views-label-field-research-concentration">Research Concentration: </strong>    Artificial Intelligence  </p>
 <div class="views-field views-field-nothing"> <span class="field-content"></span></div><!-- end person-listing div -->
 </div><!-- end person-list div --> </

### Extract Text From HTML Elements

You can add .text to a Beautiful Soup object to return only the text content of the HTML elements that the object contains:

In [7]:
DEBUG = False # True # 

school, dept = map_school_dept(SCHOOL)
data = []
all_research_dict = {}
for n, person in enumerate(cs_persons):
    if DEBUG: print(f"n={n}")
    try:
        data_dict = {"school": school, "department": dept} # default
        if DEBUG and n > 0: break  # debug
        
        # get name/url
        name_url = person.parent.find("a")
        name = name_url.text.strip()
        url = name_url['href']
        data_dict["name"] = name
        data_dict["url"] = url

#         if DEBUG and name != "Anil Damle": continue
        
        # get image
        img = person.find("img")
        img_url = img["src"]
        data_dict["img_url"] = img_url

        job_title = person.find("strong", class_="inline-field").text
        data_dict["job_title"] = job_title

        x = person.find_all("span", class_="inline-field")
        if len(x) == 2:
            dept = x[0].text.strip()
            data_dict["department"] = dept[1:].strip() if dept.startswith(";") else dept
            _, data_dict['phd_univ'], data_dict['phd_year'] = cornell_parse_dept_phd(x[1].text)
        elif len(x) == 1:
            data_dict["department"], data_dict['phd_univ'], data_dict['phd_year'] = cornell_parse_dept_phd(x[0].text)
                    
        # get research
        research = [i.text.strip() for i in person.find_all("strong") if i and i.text.strip()]
        research = [i for i in research if i != job_title]
        research_dict = {}
        research2 = [i.text.strip() for i in person.find_all("p") if i and i.text.strip()]
        for i in research:
            for j in research2:
                if i in j:
                    research_dict[i]  = j.replace(i, "").strip()
                    break
        data_dict["research"] = str(research_dict)

        for i in research_dict.get('Research Areas:',"").split(","):
            if i.strip():
                all_research_dict[i.strip()] = True
        for i in research_dict.get('Research Concentration:',"").split(","):
            if i.strip():
                all_research_dict[i.strip()] = True
            
        if DEBUG:
            print(f"n={n}\t=============")
            print(f"name= {data_dict.get('name','')}")
            print(f"job_title= {data_dict.get('job_title','')}")
            print(f"department= {data_dict.get('department','')}")
            print(f"phd_univ= {data_dict.get('phd_univ','')}")
            print(f"phd_year= {data_dict.get('phd_year','')}")
            print(f"phone= {data_dict.get('phone','')}")
            print(f"email= {data_dict.get('email','')}")
            print(f"url= {data_dict.get('url','')}")
            print(f"img_url= {data_dict.get('img_url','')}")
            print(f"research = {str(research_dict)}")
            
        if data_dict:
            row_data = []
            for c in COLUMNS:
                if c == 'research_area':
                    cell = research_dict.get('Research Areas:', "")
                elif c == 'research_concentration':
                    cell = research_dict.get('Research Concentration:', "")
                elif c == 'research_focus':
                    cell = research_dict.get('Research Focus:', "")
                else:
                    cell = data_dict.get(c,"")
                row_data.append(cell)
            data.append(row_data)
    except Exception as e:
        print(f"[Error] {str(e)}\n{person.prettify()}")

[Error] 'NoneType' object has no attribute 'text'
<div class="person-list">
 <div class="person-image">
  <img alt="" height="200" src="https://www.cs.cornell.edu/sites/default/files/styles/icon-100x100/public/bailey_graeme_1.jpg?itok=i1PUSc_O" width="200"/>
 </div>
 <div class="person-listing">
  <strong class="inline-field">
   Adjunct Professor
  </strong>
  <span class="inline-field">
   ; Ph.D., University of Birmingham
  </span>
  <p>
   <strong>
    Research Focus:
   </strong>
   Mathematics
  </p>
  <div class="views-field views-field-nothing">
   <span class="field-content">
   </span>
  </div>
  <!-- end person-listing div -->
 </div>
 <!-- end person-list div -->
</div>



In [8]:
data[-1]

['Zhiru Zhang',
 'Assistant Professor',
 'Univ California LA',
 '2007',
 'Systems and Networking',
 'Systems',
 'Computer-aided design methodologies, optimization algorithms, compilers, and computer architectures of gigascale integrated systems; esp. system-on-chips',
 'http://www.csl.cornell.edu/~zhiruz/',
 'https://www.cs.cornell.edu/sites/default/files/styles/icon-100x100/public/Zhang%2C%20Shiru.jpg?itok=NH-hY5hI',
 '',
 '',
 '',
 '',
 'Electrical and Computer Engineering, CS Field Member',
 '']

In [9]:
# prepare research group dataframe
cols = ["research_group", "url"]
gropus = []
for i in all_research_dict.keys():
    gropus.append([i, ""])
    
df_research = pd.DataFrame(gropus, columns=cols)
df_research

Unnamed: 0,research_group,url
0,Artificial Intelligence,
1,Machine Learning,
2,Theory of Computing,
3,Theory of Computation,
4,Systems and Networking,
5,Systems,
6,Computer Architecture & VLSI,
7,Human Interaction,
8,Scientific Computing and Applications,
9,Graphics,


In [10]:
df = pd.DataFrame(data, columns=COLUMNS)

In [11]:
print(f"Number of faculties at {SCHOOL}: {df.shape[0]}")

Number of faculties at Cornell-CS: 122


In [12]:
df

Unnamed: 0,name,job_title,phd_univ,phd_year,research_area,research_concentration,research_focus,url,img_url,phone,email,cell_phone,office_address,department,school
0,Mohamed Abdelfattah,Assistant Professor,Univ Toronto,2016,"Artificial Intelligence, Machine Learning",Artificial Intelligence,,https://www.mohsaied.com,https://www.cs.cornell.edu/sites/default/files...,,,,,"Electrical and Computer Engineering, Cornell T...",
1,Jayadev Acharya,Assistant Professor,Univ California San Diego,2014,"Artificial Intelligence, Theory of Computing","Artificial Intelligence, Theory of Computation","Information theory, machine learning, and algo...",https://people.ece.cornell.edu/acharya/,https://www.cs.cornell.edu/sites/default/files...,,,,,"Electrical and Computer Engineering, CS Field ...",
2,Rachit Agarwal,Associate Professor,Univ Illinois Urbana-Champaign,2013,"Systems and Networking, Theory of Computing","Systems, Theory of Computation","Distributed systems, systems for big data anal...",http://www.cs.cornell.edu/~ragarwal/,https://www.cs.cornell.edu/sites/default/files...,,,,,"Computer Science, CS Field Member",
3,David Albonesi,Professor,Univ Mass Amherst,1996,"Computer Architecture & VLSI, Systems and Netw...",Systems,Adaptive and reconfigurable multi-core and pro...,http://www.csl.cornell.edu/~albonesi/,https://www.cs.cornell.edu/sites/default/files...,,,,,"Electrical and Computer Engineering, CS Field ...",
4,Lorenzo Alvisi,Professor,Cornell Univ,1996,Systems and Networking,Systems,Theory and practice of dependable distributed ...,http://www.cs.cornell.edu/lorenzo/,https://www.cs.cornell.edu/sites/default/files...,,,,,"Computer Science, CS Field Member, Tisch Unive...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,Christina Lee Yu,Assistant Professor,MIT,,Theory of Computing,Theory of Computation,"Theory of Computing, Artificial Intelligence (...",https://people.orie.cornell.edu/cleeyu/,https://www.cs.cornell.edu/sites/default/files...,,,,,"ORIE, CS Field Member",
118,Haiyuan Yu,Professor,Yale,2005,Computational Biology,Scientific Computing and Applications,"Biomedical systems biology, machine learning, ...",https://www.yulab.org,https://www.cs.cornell.edu/sites/default/files...,,,,,"Biological Statistics & Computational Biology,...",
119,Ramin Zabih,Professor,Stanford Univ,1994,"Human Interaction, Vision","Artificial Intelligence, Theory of Computation","Computer vision and its applications, especial...",http://www.cs.cornell.edu/~rdz,https://www.cs.cornell.edu/sites/default/files...,,,,,"Computer Science, Cornell Tech, CS Field Member",
120,Cheng Zhang,Assistant Professor,Georgia Institute Technology,2018,Artificial Intelligence,Systems,"Ubiquitous Computing, Wearable Computing, Huma...",http://www.czhang.org/,https://www.cs.cornell.edu/sites/default/files...,,,,,"Information Science, CS Field Member",


In [13]:
file_xlsx = f"faculty-{SCHOOL}.xlsx"
writer = pd.ExcelWriter(file_xlsx, engine='xlsxwriter')
df.to_excel(writer, sheet_name="Faculty", index=False)

if not df_research.empty:
    df_research.to_excel(writer, sheet_name = "Research Groups", index=False)
    
writer.save()

# The End