In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Set up the web driver
driver = webdriver.Chrome()
driver.get("https://info.lse.ac.uk/Staff/Departments-and-Institutes")

# Find and click the department
department = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.LINK_TEXT, 'Department of Mathematics')))
driver.execute_script("arguments[0].scrollIntoView();", department)
department.click()

# Find and click "People"
people = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.LINK_TEXT, 'People')))
people.click()
people_url = driver.current_url

# Find and click "Academic Faculty"
academic_faculty = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.LINK_TEXT, 'Academic Faculty')))
driver.execute_script("arguments[0].scrollIntoView();", academic_faculty)
academic_faculty.click()

In [2]:
from bs4 import BeautifulSoup
import requests
url = driver.current_url
r = requests.get(url)
soup = BeautifulSoup(r.content,'lxml')
academic_faculty = soup.find("div", attrs={'class': "accordion__content"})
img_text = academic_faculty.find_all("div", attrs={'class': "accordion__imgTxt"})
professors = []
#print(img_text)
for i in img_text:
    one_text_block = i.find("div",attrs={'class': "accordion__txt"})
    one_prof = one_text_block.find("a",attrs={"class": "sys_16"})
    another_prof = one_text_block.find("a",attrs={"class": "sys_0 sys_t0"})
    #if not one_prof:
     #   one_prof = one_text_block.find("a",attrs={"class": "sys_0 sys_t0"})
    professors.append(one_prof)
    professors.append(another_prof)

professors = list(filter(None, professors))
filtered_professors = []
for professor in professors:
    if professor['href'].startswith('http://www.lse.ac.uk') or professor['href'].startswith('/Mathematics'):
        filtered_professors.append(professor)

In [3]:
professor_urls = []
for professor in filtered_professors:
    url = professor.get("href")
    if  url.startswith('/Mathematics'):
        url = "http://www.lse.ac.uk" + url
    if url not in professor_urls:
        professor_urls.append(url)
professor_urls

['http://www.lse.ac.uk/Mathematics/people/Ahmad-Abdi',
 'http://www.lse.ac.uk/Mathematics/people/Peter-Allen',
 'http://www.lse.ac.uk/Mathematics/people/Martin-Anthony',
 'http://www.lse.ac.uk/Mathematics/People/Galit-Ashkenazi-Golan',
 'http://www.lse.ac.uk/Mathematics/people/Tugkan-Batu',
 'http://www.lse.ac.uk/Mathematics/people/Julia-Boettcher',
 'http://www.lse.ac.uk/Mathematics/people/Graham-Brightwell',
 'http://www.lse.ac.uk/Mathematics/people/Christoph-Czichowsky',
 'http://www.lse.ac.uk/Mathematics/people/Albina-Danilova',
 'http://www.lse.ac.uk/Mathematics/people/Pavel-Gapeev',
 'http://www.lse.ac.uk/Mathematics/people/Olivier-Gossner',
 'http://www.lse.ac.uk/Mathematics/People/Jan-van-den-Heuvel',
 'http://www.lse.ac.uk/Mathematics/People/Grammateia-Kotsialou',
 'http://www.lse.ac.uk/Mathematics/people/Ioannis-Kouletsis',
 'http://www.lse.ac.uk/Mathematics/people/Andrew-Lewis-Pye',
 'http://www.lse.ac.uk/Mathematics/people/Arne-Lokka',
 'http://www.lse.ac.uk/Mathematics/Peo

In [4]:
professors_dict = {}
professor_name_list = []
professor_prefix_list = []
key_expertise_list = []
professor_title_list = []
languages_list = []
title_list = []
modules_list = []

for one_url in professor_urls:
    
    r_2 = requests.get(one_url)
    soup_2 = BeautifulSoup(r_2.content,'lxml')

    # Get professor prefix and name
    professor = soup_2.find("h1", attrs={'class': 'people__name'})
    professor_prefix = professor.find('span', class_='people__title').text
    professor_name = professor.text.strip("Dr")
    professor_name = professor.text.strip("Professor")
    professor_name_list.append(professor_name)
    professor_prefix_list.append(professor_prefix)

    # Get the key expertise
    key_expertise_locate = soup_2.find('div', class_='peopleContact__method', text="Key Expertise")
    if key_expertise_locate: # if key_expertise_locate exists and doesn't return None
        key_expertise = key_expertise_locate.find_next_sibling('div').text
        key_expertise_list.append(key_expertise)
    else:
        key_expertise_list.append(None)

    # Get the languages
    languages_locate = soup_2.find('div', class_='peopleContact__method', text="Languages")
    if languages_locate:
        languages = languages_locate.find_next_sibling('div').text
        languages_list.append(languages)
    else:
        languages_list.append(None)

    
    # Get professor title
    professor_title = soup_2.find('h2', class_='people__position').text
    professor_title_list.append(professor_title)

    # Get courses that the professor is teaching
    teaching = soup_2.find(name=["h3","h2", "p"], text= ["Teaching", "Teaching:"])
    if teaching:
        module_list = teaching.find_next('ul')
        modules = [li.text for li in module_list.find_all('li')] # extract the items from the <li> elements
        modules = [module.replace('\xa0', " ").strip("\n") for module in modules]
        modules_list.append(modules)
    else:
        modules_list.append(None)

professors_dict["Professor Name"] = professor_name_list
professors_dict["Professor Prefix"] = professor_prefix_list
professors_dict["Key Expertise"] = key_expertise_list
professors_dict["Languages"] = languages_list
professors_dict["Title"] = professor_title_list
professors_dict["Modules"] = modules_list

import pandas as pd
professors_df = pd.DataFrame(professors_dict)

In [5]:
professors_df
# no modules listed

Unnamed: 0,Professor Name,Professor Prefix,Key Expertise,Languages,Title,Modules
0,Dr Ahmad Abdi,Dr,"Combinatorial Optimization, Integer and Linear...","English, Farsi",Assistant Professor,
1,Peter Allen,Professor,"Graph Theory, Probabilistic Combinatorics, Ext...",English,Professor,
2,Martin Anthony,Professor,Mathematical Aspects of Theory of Machine Lear...,English,Professor,
3,Dr Galit Ashkenazi-Golan,Dr,"Game Theory, Repeated Games, Stochastic Games,...","English, French, Hebrew",Assistant Professor,
4,Dr Tuğkan Batu,Dr,"Algorithms and Theory of Computation, Property...","English, Turkish",Assistant Professor,
5,Julia Böttch,Professor,"Extremal Combinatorics, Random Discrete Struct...","English, German",Professor,
6,Graham Brightwell,Professor,"Combinatorics, partially ordered sets, random ...",English,Professor,
7,Dr Christoph Czichowsky,Dr,"Financial Mathematics, Stochastic Analysis, Tr...","English, German",Associate Professor,
8,Dr Albina Danilova,Dr,"asymmetric information, derivative pricing, st...","English, Russian",Associate Professor,
9,Dr Pavel Gapeev,Dr,"Optimal Stopping Problems, Stochastic Control ...","English, Russian",Associate Professor,


In [6]:
professors_df.to_csv('maths.csv')