## US CS Faculty Dataset
- [CS Faculty Composition and Hiring Trends (Blog)](https://jeffhuang.com/computer-science-open-data/#cs-faculty-composition-and-hiring-trends)
- [2200 Computer Science Professors in 50 top US Graduate Programs](https://cs.brown.edu/people/apapouts/faculty_dataset.html)
- [CS Professors (Data Explorer)](https://drafty.cs.brown.edu/csprofessors?src=csopendata)
- [Drafty Project](https://drafty.cs.brown.edu/)


Use Beautiful Soup to scrape CS faculty info

In [1]:
from scrap_cs_faculty import *

In [2]:
# Key into SCHOOL_DICT that selects which school's faculty page is scraped.
SCHOOL = "CMU-CS"
URL = SCHOOL_DICT[SCHOOL]["url"]  # faculty listing URL registered for SCHOOL
print(URL)

https://csd.cmu.edu/people/faculty


In [3]:
# Download the faculty listing page.
# - timeout: without one, requests waits indefinitely on a stalled connection.
# - raise_for_status: fail here on a 4xx/5xx response instead of silently
#   parsing an error page in the cells below.
page = requests.get(URL, headers=BROWSER_HEADERS, timeout=30)
page.raise_for_status()

In [4]:
# Parse the downloaded bytes with Python's built-in HTML parser.
soup = BeautifulSoup(page.content, "html.parser")

## Find Elements by HTML Tag Name

In [5]:
# Each faculty member is one <tr> inside the first <tbody> on the page.
# soup.find returns None when nothing matches, so check explicitly and fail
# with a readable message rather than an AttributeError on None.
faculty_tbody = soup.find("tbody")
if faculty_tbody is None:
    raise ValueError(f"No <tbody> found at {URL}; the page layout may have changed")
cs_persons = faculty_tbody.find_all("tr")

In [6]:
# Sanity check: how many rows were found, plus the raw HTML of the first one.
len(cs_persons) , cs_persons[0]

(120,
 <tr>
 <td class="views-field views-field-field-first-name" data-label="" headers="view-field-first-name-table-column"><a href="/people/faculty/umut-acar">Acar, Umut</a> </td>
 <td class="views-field views-field-field-professional-title-term" data-label="" headers="view-field-professional-title-term-table-column">Associate Professor        </td>
 <td class="views-field views-field-nothing" data-label="" headers="view-nothing-table-column">9101 Gates and Hillman Centers<br/>
 (412) 268-6791<br/>
 <a href="mailto:uacar@andrew.cmu.edu">uacar@andrew.cmu.edu</a> </td>
 </tr>)

### Extract Text From HTML Elements

You can add .text to a Beautiful Soup object to return only the text content of the HTML elements that the object contains:

In [7]:
DEBUG = False  # True: process only the first row and print its parsed fields

school, dept = map_school_dept(SCHOOL)
data = []  # one list per faculty member, values ordered like COLUMNS
all_research_dict = {}  # not populated by this cell; read by the research-group cell

for n, person in enumerate(cs_persons):
    if DEBUG and n > 0:
        break
    try:
        # Every row starts with the school-level defaults.
        data_dict = {"school": school, "department": dept}

        for x in person.find_all("td"):
            # Only the first <a> in a cell is inspected. In the listing it is
            # either the profile link (name cell) or the mailto link (contact cell).
            name_x = x.find("a")
            if name_x:
                link_text = name_x.text.strip()
                if "@" in link_text:
                    # Contact cell: the link text is the e-mail address; the
                    # remaining text lines hold the office and/or phone number.
                    data_dict['email'] = link_text
                    for y in x.text.strip().split("\n"):
                        # Strip each fragment: re.match anchors at the start, so
                        # leading whitespace would make a phone number fall through
                        # to office_address. Skip blanks so they cannot overwrite
                        # an address that was already captured.
                        y = y.strip()
                        if not y or "@" in y:
                            continue
                        digits = y.replace("(", "").replace(")", "").replace("-", "")
                        if re.match(r"\d{3}\s*\d{7}", digits):
                            data_dict['phone'] = y
                        else:
                            data_dict['office_address'] = y
                else:
                    # Name cell: link text is the name, last href segment is the profile id.
                    data_dict['name'] = link_text
                    uid = name_x["href"].split("/")[-1]
                    data_dict['url'] = f"{URL}/{uid}"

            # is_job_title is an external helper; presumably it recognises
            # title keywords in the lower-cased cell text -- verify in scrap_cs_faculty.
            if is_job_title(x.text.lower()):
                data_dict['job_title'] = x.text.strip()

        if DEBUG:
            print(f"n={n}\t=============")
            debug_fields = (
                ("name", "name"),
                ("job_title", "job_title"),
                ("phone", "phone"),
                ("office", "office_address"),
                ("email", "email"),
                ("url", "url"),
                ("img_url", "img_url"),
                ("research_area", "research_area"),
                ("department", "department"),
            )
            for label, key in debug_fields:
                print(f"{label}= {data_dict.get(key, '')}")

        # data_dict always holds at least school/department, so every row that
        # parses without error is kept; fields missing from the page become "".
        data.append([data_dict.get(c, "") for c in COLUMNS])
    except Exception as e:
        # Best-effort scrape: report the offending row and continue with the next.
        print(f"[Error] {str(e)}\n{person.prettify()}")

In [8]:
len(data) # number of faculty rows collected by the loop above

120

In [9]:
# Build the faculty table; each row in data is already ordered like COLUMNS.
df = pd.DataFrame(data, columns=COLUMNS)

In [10]:
# Report the row count of the faculty table for the selected school.
print(f"Number of faculties at {SCHOOL}: {df.shape[0]}")

Number of faculties at CMU-CS: 120


In [11]:
# Display the scraped faculty table.
df

Unnamed: 0,name,job_title,phd_univ,phd_year,research_area,research_concentration,research_focus,url,img_url,phone,email,cell_phone,office_address,department,school
0,"Acar, Umut",Associate Professor,,,,,,https://csd.cmu.edu/people/faculty/umut-acar,,(412) 268-6791,uacar@andrew.cmu.edu,,9101 Gates and Hillman Centers,Computer Science,Carnegie Mellon Univ
1,"Ada, Anil",Associate Teaching Professor,,,,,,https://csd.cmu.edu/people/faculty/anil-ada,,(412) 268-3835,aada@andrew.cmu.edu,,6215 Gates and Hillman Centers,Computer Science,Carnegie Mellon Univ
2,"Akoglu, Leman","Associate Professor, Affiliated Faculty",,,,,,https://csd.cmu.edu/people/faculty/leman-akoglu,,(412) 268-3043,lakoglu@andrew.cmu.edu,,2118C Hamburg Hall,Computer Science,Carnegie Mellon Univ
3,"Aldrich, Jonathan","Professor, Affiliated Faculty",,,,,,https://csd.cmu.edu/people/faculty/jonathan-al...,,(412) 268-7278,aldrich@cs.cmu.edu,,422 TCS Hall,Computer Science,Carnegie Mellon Univ
4,"Amvrosiadis, George","Assistant Research Professor, Affiliated Faculty",,,,,,https://csd.cmu.edu/people/faculty/george-amvr...,,,gamvrosi@andrew.cmu.edu,,2311 Mehrabian Collaborative Innovation Center,Computer Science,Carnegie Mellon Univ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,"Wing, Jeannette",Adjunct Faculty,,,,,,https://csd.cmu.edu/people/faculty/jeannette-wing,,(412) 268-2000,jw35@andrew.cmu.edu,,,Computer Science,Carnegie Mellon Univ
116,"Woodruff, David",Professor,,,,,,https://csd.cmu.edu/people/faculty/david-woodr...,,,dwoodruf@andrew.cmu.edu,,7217 Gates and Hillman Centers,Computer Science,Carnegie Mellon Univ
117,"Xhakaj, Franceska",Assistant Teaching Professor,,,,,,https://csd.cmu.edu/people/faculty/franceska-x...,,,francesx@andrew.cmu.edu,,4003 Gates and Hillman Centers,Computer Science,Carnegie Mellon Univ
118,"Zhang, Hui",Consulting Professor,,,,,,https://csd.cmu.edu/people/faculty/hui-zhang-0,,,huiz1@andrew.cmu.edu,,,Computer Science,Carnegie Mellon Univ


In [12]:
# Prepare the research-group dataframe from all_research_dict
# (research group -> url).
# The rows go into their own list instead of rebinding `data`: `data` holds
# the faculty rows, and overwriting it would break a re-run of the cell that
# builds `df` from it.
cols = ["research_group", "url"]
research_rows = []
for group, group_url in all_research_dict.items():
    research_rows.append([group, group_url])
    print(f"{group}:\t {group_url}")

df_research = pd.DataFrame(research_rows, columns=cols)
df_research

Unnamed: 0,research_group,url


In [13]:
# Write the results to an Excel workbook (engine='xlsxwriter' needs the
# xlsxwriter package installed).
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0; the
# context manager saves and closes the file on exit, even if to_excel raises.
file_xlsx = f"faculty-{SCHOOL}.xlsx"
with pd.ExcelWriter(file_xlsx, engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name="Faculty", index=False)
    # Only add the second sheet when there is something to put in it.
    if not df_research.empty:
        df_research.to_excel(writer, sheet_name="Research Groups", index=False)