## US CS Faculty Dataset
- [CS Faculty Composition and Hiring Trends (Blog)](https://jeffhuang.com/computer-science-open-data/#cs-faculty-composition-and-hiring-trends)
- [2200 Computer Science Professors in 50 top US Graduate Programs](https://cs.brown.edu/people/apapouts/faculty_dataset.html)
- [CS Professors (Data Explorer)](https://drafty.cs.brown.edu/csprofessors?src=csopendata)
- [Drafty Project](https://drafty.cs.brown.edu/)


Use beautiful_soup to scrap CS Faculty info

In [2]:
from scrap_cs_faculty import *

In [3]:
SCHOOL = "UCB-CS"
URL = SCHOOL_DICT[SCHOOL]["url"]  #  "https://www.eecs.mit.edu/role/faculty-aid/"
print(URL)

https://www2.eecs.berkeley.edu/Faculty/Lists/CS/faculty.html


In [10]:
base_url = "/".join(URL.split("/")[:3])

In [4]:
page = requests.get(URL, headers=BROWSER_HEADERS)

In [5]:
soup = BeautifulSoup(page.content, "html.parser")

## Find Elements by HTML Class Name

In [6]:
cs_persons = soup.find_all("div", class_="media")

In [7]:
len(cs_persons) , cs_persons[0]

(131,
 <div class="media">
 <div class="media-left">
 <a href="/Faculty/Homepages/abbeel.html">
 <img alt="Photo for Pieter Abbeel" src="/Faculty/Photos/Homepages/abbeel.jpg" width="120"/>
 </a>
 </div>
 <div class="media-body">
 <h3 class="media-heading">
 <span id="A"></span>
 <a href="/Faculty/Homepages/abbeel.html">Pieter Abbeel</a>
 </h3>
 <p>
 <strong>Professor</strong>
 <br/>746 Sutardja Dai Hall, (510) 642-7034; pabbeel@cs.berkeley.edu
                 <br/><strong>Research Interests:</strong>
 <a href="/Research/Areas/AI">Artificial Intelligence (AI)</a>; <a href="/Research/Areas/CIR">Control, Intelligent Systems, and Robotics (CIR)</a>
 <br/><strong>Education:</strong>
                     2008, Ph.D., Computer Science, Stanford University; 2000, M.S., Electrical Engineering, KU Leuven, Belgium
                 <br/><strong>Office Hours:</strong>
                     arrange via email
                 
                                 <br/><strong>Teaching Schedule (Spring 20

### Extract Text From HTML Elements

You can add .text to a Beautiful Soup object to return only the text content of the HTML elements that the object contains:

In [34]:
DEBUG =  False # True # 

school, dept = map_school_dept(SCHOOL)
data = []
all_research_dict = {}
for n, person in enumerate(cs_persons):
    try:
        data_dict = {"school": school, "department": dept} # default
        # if DEBUG and n > 0: break  # debug
        
        url_img = person.find("div", class_="media-left")
        url = url_img.find("a")["href"]
        img_url = url_img.find("img")["src"]
        data_dict['url'] = f"{base_url}{url}"
        data_dict['img_url'] = f"{base_url}{img_url}"
        
        name = person.find("h3", class_="media-heading")
        data_dict['name'] = name.text.strip()
        
        # Debug record
        # if data_dict['name'] != "Rediet Abebe": continue
        # if data_dict['name'] != "Vern Paxson": continue
        
        
        p_text = person.find("p").text
        if DEBUG:
            print(f"p_text = {p_text}")
        
        if "Office Hours:" in p_text:
            p_text = p_text.split("Office Hours:")[0]

        if "Education:" in p_text: 
            contact_research, education = p_text.split("Education:")
        else:
            contact_research = p_text
            education = ""
        edu = ""
        if education:
            for x in education.strip().split(";"):
                if "Ph.D." in x:
                    edu = x
                
        if edu:
            edu_list = edu.split(",")
            phd_year = edu_list[0]
            phd_univ = edu_list[-1]
            
        if "Research Interests:" in contact_research:
            contact, research_interest = contact_research.split("Research Interests:")
        else:
            contact = contact_research
            research_interest = ""
        
        tmp = [x.strip()  for x in contact.split("\n") if x.strip()]
        if len(tmp) < 2:
            office_phone_email = ""
        else:
            office_phone_email = tmp[-1]
        
        if ";" in office_phone_email:
            tmp2 = office_phone_email.split(";")
            email = tmp2[-1].strip()            
            if "@" in email:
                office_phone = ";".join(tmp2[:-1])
            else:
                office_phone = office_phone_email
                email = ""
        else:
            office_phone = office_phone_email
            email = ""
        email = email.strip()
        if email:
            if "," in office_phone:
                phone = office_phone.split(",")[-1].strip()
                office = office_phone.replace(phone, "").strip()
            else:
                office = office_phone
                phone = ""
            data_dict['email'] = email
            data_dict['phone'] = phone
            data_dict['office_address'] = office

        else:
            data_dict['email'] = ""
            data_dict['phone'] = ""
            data_dict['office_address'] = office
            

        research_dict = {}
        if research_interest:
            for x in research_interest.split(";"):
                research_dict[x.strip()] = ""
        
        for x in person.find_all("strong"):
            token = x.text.strip()
            if is_job_title(token.lower()):
                data_dict['job_title'] = token
            
            if token.startswith("Research"):
                data_dict['research_area'] = ";".join(research_dict.keys())

            if token.startswith("Education"):
                data_dict['phd_univ'] = phd_univ
                data_dict['phd_year'] = phd_year
                
        for x in person.find_all("a"):
            if "/Research" in x["href"]:
                key = x.text.strip()
                if not key in all_research_dict:
                    all_research_dict[key] = f"{base_url}{x['href']}"
            
                                
        if DEBUG:
            print(f"n={n}\t=============")
            print(f"name= {data_dict.get('name','')}")
            print(f"job_title= {data_dict.get('job_title','')}")
            print(f"phone= {data_dict.get('phone','')}")
            print(f"office= {data_dict.get('office_address','')}")
            print(f"email= {data_dict.get('email','')}")
            print(f"url= {data_dict.get('url','')}")
            print(f"img_url= {data_dict.get('img_url','')}")
            print(f"phd_univ= {data_dict.get('phd_univ','')}")
            print(f"phd_year= {data_dict.get('phd_year','')}")
            print(f"research_area= {data_dict.get('research_area','')}")
            print(f"department= {data_dict.get('department','')}")
        
        if data_dict:
            row_data = []
            for c in COLUMNS:
                cell = data_dict.get(c,"")
                row_data.append(cell)
            data.append(row_data)
    except Exception as e:
        print(f"[Error] {str(e)}\n{person.prettify()}")

In [35]:
len(data) , data[-1]

(131,
 ['Matei Zaharia',
  'Associate Professor',
  ' Carnegie Mellon University',
  '2003',
  'Operating Systems & Networking (OSNT)',
  '',
  '',
  'https://www2.eecs.berkeley.edu/Faculty/Homepages/matei.html',
  'https://www2.eecs.berkeley.edu/Assets/placeholder_white_150x210.png',
  '',
  '',
  '',
  '367 Evans Hall,',
  'Computer Science',
  'Univ California Berkeley'])

In [36]:
all_research_dict

{'Artificial Intelligence (AI)': 'https://www2.eecs.berkeley.edu/Research/Areas/AI',
 'Control, Intelligent Systems, and Robotics (CIR)': 'https://www2.eecs.berkeley.edu/Research/Areas/CIR',
 'Information, Data, Network, and Communication Sciences (IDNCS)': 'https://www2.eecs.berkeley.edu/Research/Areas/IDNCS',
 'Theory (THY)': 'https://www2.eecs.berkeley.edu/Research/Areas/THY',
 'Computer Architecture & Engineering (ARC)': 'https://www2.eecs.berkeley.edu/Research/Areas/ARC',
 'Integrated Circuits (INC)': 'https://www2.eecs.berkeley.edu/Research/Areas/INC',
 'Operating Systems & Networking (OSNT)': 'https://www2.eecs.berkeley.edu/Research/Areas/OSNT',
 'Design, Modeling and Analysis (DMA)': 'https://www2.eecs.berkeley.edu/Research/Areas/DMA',
 'Education (EDUC)': 'https://www2.eecs.berkeley.edu/Research/Areas/EDUC',
 'Signal Processing (SP)': 'https://www2.eecs.berkeley.edu/Research/Areas/SP',
 'Biosystems & Computational Biology (BIO)': 'https://www2.eecs.berkeley.edu/Research/Areas/

In [37]:
df = pd.DataFrame(data, columns=COLUMNS)

In [38]:
print(f"Number of faculties at {SCHOOL}: {df.shape[0]}")

Number of faculties at UCB-CS: 131


In [39]:
df

Unnamed: 0,name,job_title,phd_univ,phd_year,research_area,research_concentration,research_focus,url,img_url,phone,email,cell_phone,office_address,department,school
0,Pieter Abbeel,Professor,Stanford University,2008,"Artificial Intelligence (AI);Control, Intellig...",,,https://www2.eecs.berkeley.edu/Faculty/Homepag...,https://www2.eecs.berkeley.edu/Faculty/Photos/...,(510) 642-7034,pabbeel@cs.berkeley.edu,,"746 Sutardja Dai Hall,",Computer Science,Univ California Berkeley
1,Rediet Abebe,Assistant Professor,,,"Artificial Intelligence (AI);Information, Data...",,,https://www2.eecs.berkeley.edu/Faculty/Homepag...,https://www2.eecs.berkeley.edu/Faculty/Photos/...,,,,"746 Sutardja Dai Hall,",Computer Science,Univ California Berkeley
2,Ahmed Alaa,Below The Line Assistant Professor,Stanford University,2008,,,,https://www2.eecs.berkeley.edu/Faculty/Homepag...,https://www2.eecs.berkeley.edu/Assets/placehol...,,,,"746 Sutardja Dai Hall,",Computer Science,Univ California Berkeley
3,Krste Asanović,Professor,Stanford University,2008,Computer Architecture & Engineering (ARC);Inte...,,,https://www2.eecs.berkeley.edu/Faculty/Homepag...,https://www2.eecs.berkeley.edu/Faculty/Photos/...,510-642-6506,krste@berkeley.edu,,"579B Soda Hall,",Computer Science,Univ California Berkeley
4,Babak Ayazifar,Teaching Professor,Massachusetts Institute of Technology,2003,Education (EDUC);Signal Processing (SP),,,https://www2.eecs.berkeley.edu/Faculty/Homepag...,https://www2.eecs.berkeley.edu/Faculty/Photos/...,510-642-9945,ayazifar@berkeley.edu,,"517 Cory Hall,",Computer Science,Univ California Berkeley
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,Justin Yokota,Lecturer,MIT,1991,,,,https://www2.eecs.berkeley.edu/Faculty/Homepag...,https://www2.eecs.berkeley.edu/Assets/placehol...,,,,"50A Lawrence Berkeley National Laboratory,",Computer Science,Univ California Berkeley
127,Nir Yosef,Associate Professor,,,Biosystems & Computational Biology (BIO),,,https://www2.eecs.berkeley.edu/Faculty/Homepag...,https://www2.eecs.berkeley.edu/Faculty/Photos/...,,niryosef@eecs.berkeley.edu,,629 Soda Hall,Computer Science,Univ California Berkeley
128,Bin Yu,Professor,Berkeley,1990,Signal Processing (SP),,,https://www2.eecs.berkeley.edu/Faculty/Homepag...,https://www2.eecs.berkeley.edu/Faculty/Photos/...,510-642-2021,binyu@stat.berkeley.edu,,"367 Evans Hall,",Computer Science,Univ California Berkeley
129,Stella Yu,Adjunct Professor,Carnegie Mellon University,2003,"Artificial Intelligence (AI);Control, Intellig...",,,https://www2.eecs.berkeley.edu/Faculty/Homepag...,https://www2.eecs.berkeley.edu/Faculty/Photos/...,,,,"367 Evans Hall,",Computer Science,Univ California Berkeley


In [41]:
# prepare research group dataframe
cols = ["research_group", "url"]
data = []
for i in all_research_dict.keys():
    data.append([i, all_research_dict.get(i,'')])
    print(f"{i}:\t {all_research_dict.get(i,'')}")

Artificial Intelligence (AI):	 https://www2.eecs.berkeley.edu/Research/Areas/AI
Control, Intelligent Systems, and Robotics (CIR):	 https://www2.eecs.berkeley.edu/Research/Areas/CIR
Information, Data, Network, and Communication Sciences (IDNCS):	 https://www2.eecs.berkeley.edu/Research/Areas/IDNCS
Theory (THY):	 https://www2.eecs.berkeley.edu/Research/Areas/THY
Computer Architecture & Engineering (ARC):	 https://www2.eecs.berkeley.edu/Research/Areas/ARC
Integrated Circuits (INC):	 https://www2.eecs.berkeley.edu/Research/Areas/INC
Operating Systems & Networking (OSNT):	 https://www2.eecs.berkeley.edu/Research/Areas/OSNT
Design, Modeling and Analysis (DMA):	 https://www2.eecs.berkeley.edu/Research/Areas/DMA
Education (EDUC):	 https://www2.eecs.berkeley.edu/Research/Areas/EDUC
Signal Processing (SP):	 https://www2.eecs.berkeley.edu/Research/Areas/SP
Biosystems & Computational Biology (BIO):	 https://www2.eecs.berkeley.edu/Research/Areas/BIO
Graphics (GR):	 https://www2.eecs.berkeley.edu/Re

In [42]:
if all_research_dict:
    df_research = pd.DataFrame(data, columns=cols)
    df_research
else:
    df_research = None

In [43]:
df.to_csv(f"faculty-{SCHOOL}.csv", index=False)

In [45]:
# import xlsxwriter
file_xlsx = f"faculty-{SCHOOL}.xlsx"
writer = pd.ExcelWriter(file_xlsx, engine='xlsxwriter')
df.to_excel(writer, sheet_name = "CS Faculty-MIT", index=False)
if not df_research.empty:
    df_research.to_excel(writer, sheet_name = "Research Groups", index=False)
writer.save()