## US CS Faculty Dataset
- [CS Faculty Composition and Hiring Trends (Blog)](https://jeffhuang.com/computer-science-open-data/#cs-faculty-composition-and-hiring-trends)
- [2200 Computer Science Professors in 50 top US Graduate Programs](https://cs.brown.edu/people/apapouts/faculty_dataset.html)
- [CS Professors (Data Explorer)](https://drafty.cs.brown.edu/csprofessors?src=csopendata)
- [Drafty Project](https://drafty.cs.brown.edu/)


Use beautiful_soup to scrap CS Faculty info

In [1]:
from scrap_cs_faculty import *

In [2]:
SCHOOL = "Stanford-CS"
URL = SCHOOL_DICT[SCHOOL]["url"]  #  "https://www.eecs.mit.edu/role/faculty-aid/"
print(URL)

https://cs.stanford.edu/directory/faculty


In [3]:
base_url = "/".join(URL.split("/")[:3])

In [4]:
page = requests.get(URL, headers=BROWSER_HEADERS)

In [5]:
soup = BeautifulSoup(page.content, "html.parser")

## Find Elements by HTML Class Name

In [6]:
cs_category = soup.find_all("h3")

len(cs_category)

6

In [7]:
faculty_cat = []

In [8]:
for cat in cs_category:
    faculty_cat.append(cat.text.strip())

In [9]:
faculty_cat

['Regular Faculty',
 'Lecturers',
 'Emeritus Faculty',
 'Courtesy Faculty',
 'Adjunct Faculty',
 'Visiting and Acting Faculty']

In [10]:
cs_tables = soup.find_all("table")

len(cs_tables) 

6

### Extract Text From HTML Elements

You can add .text to a Beautiful Soup object to return only the text content of the HTML elements that the object contains:

In [11]:
DEBUG =  False # True #  

school, dept = map_school_dept(SCHOOL)
data = []

for itable in range(len(faculty_cat)):
    cs_persons = cs_tables[itable].find_all("tr")
    for n, person in enumerate(cs_persons):
        try:
            data_dict = {"school": school, 
                         "department": dept,
                         'job_title': faculty_cat[itable]
                        } # default
            if n < 1: continue # skip header

#             if DEBUG and n > 10: break  # debug

            td = person.find_all("td")

            data_dict['name'] = td[0].text.strip()
            x_url = td[0].find("a")
            if x_url:
                data_dict['url'] = x_url["href"]
            data_dict['phone'] = td[1].text.strip()
            data_dict['office_address'] = td[2].text.strip()
            email = td[3].text.strip()
            data_dict['email'] = "" if " " in email else f"{email}@stanford.edu"


            if DEBUG:
                print(f"\nN={n}\t=============")
                print(f"name= {data_dict.get('name','')}")
                print(f"job_title= {data_dict.get('job_title','')}")
                print(f"phone= {data_dict.get('phone','')}")
                print(f"office= {data_dict.get('office_address','')}")
                print(f"email= {data_dict.get('email','')}")
                print(f"url= {data_dict.get('url','')}")
                print(f"img_url= {data_dict.get('img_url','')}")
                print(f"phd_univ= {data_dict.get('phd_univ','')}")
                print(f"phd_year= {data_dict.get('phd_year','')}")
                print(f"research_area= {data_dict.get('research_area','')}")
                print(f"department= {data_dict.get('department','')}")

            if data_dict:
                row_data = []
                for c in COLUMNS:
                    cell = data_dict.get(c,"")
                    row_data.append(cell)
                data.append(row_data)
        except Exception as e:
            print(f"[Error] {str(e)}\n{person.prettify()}")

In [12]:
len(data) , data[-1]

(146,
 ['Zhikun Zhang',
  'Visiting and Acting Faculty',
  '',
  '',
  '',
  '',
  '',
  'http://zhangzhk.com/',
  '',
  '',
  'zhikun@stanford.edu',
  '',
  '',
  'Computer Science',
  'Stanford Univ'])

In [13]:
df = pd.DataFrame(data, columns=COLUMNS)

In [14]:
print(f"Number of faculties at {SCHOOL}: {df.shape[0]}")

Number of faculties at Stanford-CS: 146


In [15]:
df

Unnamed: 0,name,job_title,phd_univ,phd_year,research_area,research_concentration,research_focus,url,img_url,phone,email,cell_phone,office_address,department,school
0,Sara Achour,Regular Faculty,,,,,,,,,sachour@stanford.edu,,Gates 484,Computer Science,Stanford Univ
1,Maneesh Agrawala,Regular Faculty,,,,,,http://graphics.stanford.edu/~maneesh,,,@stanford.edu,,Gates 364,Computer Science,Stanford Univ
2,Alex Aiken,Regular Faculty,,,,,,http://theory.stanford.edu/~aiken,,5-3359,aiken@stanford.edu,,Gates 490,Computer Science,Stanford Univ
3,Nima Anari,Regular Faculty,,,,,,https://nimaanari.com,,,nima.anari@stanford.edu,,Gates 168A,Computer Science,Stanford Univ
4,Clark Barrett,Regular Faculty,,,,,,http://theory.stanford.edu/~barrett,,650-736-0822,barrett@stanford.edu,,Gates 488,Computer Science,Stanford Univ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,Chris Hahn,Visiting and Acting Faculty,,,,,,http://www.christopherhahn.io,,,hahn@stanford.edu,,Gates 481,Computer Science,Stanford Univ
142,Hamed Nemati,Visiting and Acting Faculty,,,,,,https://hnemati.github.io/,,,hnnemati@stanford.edu,,Gates 999,Computer Science,Stanford Univ
143,Dolière Francis Somé,Visiting and Acting Faculty,,,,,,,,,doliere@stanford.edu,,,Computer Science,Stanford Univ
144,Marco Vassena,Visiting and Acting Faculty,,,,,,https://webspace.science.uu.nl/mvassena,,,vassena@stanford.edu,,Gates 478,Computer Science,Stanford Univ


In [16]:
# import xlsxwriter
file_xlsx = f"faculty-{SCHOOL}.xlsx"
writer = pd.ExcelWriter(file_xlsx, engine='xlsxwriter')
df.to_excel(writer, sheet_name="Faculty", index=False)

writer.save()