In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Set up the web driver
driver = webdriver.Chrome()
driver.get("https://info.lse.ac.uk/Staff/Departments-and-Institutes")

# Find and click the department
department = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.LINK_TEXT, 'Department of Statistics')))
driver.execute_script("arguments[0].scrollIntoView();", department)
department.click()

# Find and click "People"
people = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.LINK_TEXT, 'People')))
people.click()
people_url = driver.current_url

# Find and click "Academic Faculty"
academic_faculty = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.LINK_TEXT, 'Academic faculty')))
driver.execute_script("arguments[0].scrollIntoView();", academic_faculty)
academic_faculty.click()

In [2]:
from bs4 import BeautifulSoup
import requests
url = driver.current_url
r = requests.get(url)
soup = BeautifulSoup(r.content,'lxml')
academic_faculty = soup.find("div", attrs={'class': "accordion__content"})
academic_faculty
text_block = academic_faculty.find_all("div", attrs={'class': "accordion__txt"})

professors = []
for professor in text_block:
    prof_names = professor.find("a", attrs={'class': "sys_0 sys_t0"})
    professors.append(prof_names)

In [3]:
professor_urls = []
for professor in professors:
    url = professor.get("href")
    url = "https://www.lse.ac.uk" + url
    if url not in professor_urls:
        professor_urls.append(url)

In [4]:
import re

professors_dict = {}
professor_name_list = []
professor_prefix_list = []
key_expertise_list = []
professor_title_list = []
languages_list = []
title_list = []
modules_list = []

for one_url in professor_urls:
    
    r_2 = requests.get(one_url)
    soup_2 = BeautifulSoup(r_2.content,'lxml')

    # Get professor prefix and name
    professor = soup_2.find("h1", attrs={'class': 'people__name'})
    professor_prefix = professor.find('span', class_='people__title').text
    professor_name = professor.text.strip("Dr")
    professor_name = professor.text.strip("Professor")
    professor_name_list.append(professor_name)
    professor_prefix_list.append(professor_prefix)

    # Get the key expertise
    key_expertise_locate = soup_2.find('div', class_='peopleContact__method', text="Key Expertise")
    if key_expertise_locate: # if key_expertise_locate exists and doesn't return None
        key_expertise = key_expertise_locate.find_next_sibling('div').text
        key_expertise_list.append(key_expertise)
    else:
        key_expertise_list.append(None)

    # Get the languages
    languages_locate = soup_2.find('div', class_='peopleContact__method', text="Languages")
    if languages_locate:
        languages = languages_locate.find_next_sibling('div').text
        languages_list.append(languages)
    else:
        languages_list.append(None)

    
    # Get professor title
    professor_title = soup_2.find('h2', class_='people__position').text
    professor_title_list.append(professor_title)

    # Get courses that the professor is teaching
    pattern = r"ST\d{3}"
    text = soup_2.find('div', class_='people__bio').text
    modules = re.findall(pattern, text)
    modules_list.append(modules)
    
professors_dict["Professor Name"] = professor_name_list
professors_dict["Professor Prefix"] = professor_prefix_list
professors_dict["Key Expertise"] = key_expertise_list
professors_dict["Languages"] = languages_list
professors_dict["Title"] = professor_title_list
professors_dict["Modules"] = modules_list


In [5]:
import pandas as pd
professors_df = pd.DataFrame(professors_dict)
professors_df

Unnamed: 0,Professor Name,Professor Prefix,Key Expertise,Languages,Title,Modules
0,Dr James Abdey,Dr,,English,Associate Professorial Lecturer,[]
1,Dr Mona Azadkia,Dr,"Non-parametric Statistics, Causal inference, H...","English, Persian",Assistant Professor,[]
2,Dr Marcos Barret,Dr,"Big Data Linkage & Analytics, Artificial Intel...","English, Portuguese",Assistant Professorial Lecturer,"[ST446, ST449, ST456, ST207, ST498]"
3,Pauline Barrieu,Professor,,English,Professor and Head of Department,[]
4,Dr Erik Baurdoux,Dr,,English,Associate Professor,[]
5,Wicher Bergsma,Professor,,English,Professor and Deputy Head of Department (Teach...,[]
6,Umut Cetin,Professor,,English,Professor,[]
7,Dr Yining Chen,Dr,,English,Associate Professor,[]
8,Dr Yunxiao Chen,Dr,"Multivariate Statistics, Social Statistics, La...","English, Mandarin",Assistant Professor,[]
9,Angelos Dassi,Professor,,English,Professor,[]


In [6]:
professors_df.to_csv("stats.csv")