In [7]:
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests

## LTI Faculty Data
[https://lti.cs.cmu.edu/directory/all/154/1](https://lti.cs.cmu.edu/directory/all/154/1)

In [10]:
# Browser session used by the scraping loop further down in this cell.
driver = webdriver.Chrome()

# Pages to scrape: the base directory URL and the same URL with ?page=1.
urls = [
    "https://www.lti.cmu.edu/directory/all/154/1",
    "https://www.lti.cmu.edu/directory/all/154/1?page=1",
]

# Collects one record per faculty member; turned into a CSV at the end.
scrape_data = list()

# Output directory and base file name shared by the .txt and .csv outputs.
data_folder = '../../data/'
filename = 'lti_department_data'

# Opening in 'w' mode creates the text file or empties an existing one, so the
# later append-mode writes always start from a blank file.
with open(data_folder + filename + '.txt', 'w') as f:
    pass

def combine_faculty_info_into_paragraph(name, title, email, office, phone, research_areas):
    """Render one faculty member's details as a single line of prose.

    Every argument is a string. The sentinel value 'N/A' marks a missing
    field, and the sentence for that field is left out.

    Args:
        name: Faculty member's name; always used.
        title: Job title, or 'N/A' / empty when unknown.
        email: Email address, or 'N/A'.
        office: Office location, or 'N/A'.
        phone: Phone number, or 'N/A'.
        research_areas: Research areas as one string, or 'N/A'.

    Returns:
        The sentences joined into one paragraph, terminated by a newline.
    """
    if title and title != 'N/A':
        # Pick the article from the title's first letter so that titles such
        # as "Associate Professor" read "an Associate Professor".
        article = 'an' if title[0].lower() in 'aeiou' else 'a'
        res = name + ' is ' + article + ' ' + title + ' at LTI. '
    else:
        # No usable title: avoid emitting "is a N/A at LTI".
        res = name + ' is affiliated with LTI. '
    if email != 'N/A':
        res += 'Their email is ' + email + '. '
    if office != 'N/A':
        res += 'Their office is located at ' + office + '. '
    if phone != 'N/A':
        res += 'You can reach ' + name + ' at ' + phone + '. '
    if research_areas != 'N/A':
        res += 'Their research areas include ' + research_areas + '.'
    res += '\n'
    return res

def _field_value(element):
    """Return the value part of a directory field element, or "N/A" if absent.

    The field text is expected to look like "Label: value" (TODO confirm
    against the live page). Only the first colon is treated as the separator,
    so a value that itself contains a colon is kept whole, and text with no
    colon at all is returned unchanged instead of raising IndexError.
    """
    if element is None:
        return "N/A"
    return element.text.strip().split(':', 1)[-1].strip()


# Iterate through each URL and scrape the data.
# try/finally guarantees the browser is closed even when a page fails to parse.
try:
    # Open the text output once in append mode (the file was emptied earlier).
    # UTF-8 is set explicitly so non-ASCII names do not depend on the
    # platform's default encoding.
    with open(data_folder + filename + '.txt', 'a', encoding='utf-8') as f:
        for url in urls:
            driver.get(url)
            soup = bs(driver.page_source, 'html.parser')

            table = soup.find('table', class_='views-view-grid')
            if table is None:
                # Fail with a clear message rather than an AttributeError on None.
                raise RuntimeError('No "views-view-grid" table found at ' + url)

            for row in table.find_all('tr'):
                for cell in row.find_all('td'):
                    # A cell without an <h2> holds no faculty entry; skip it.
                    name_element = cell.find('h2')
                    if name_element is None:
                        continue
                    name = name_element.text.strip()

                    # Impute "N/A" for any field missing from the cell.
                    title_element = cell.find('div', class_='views-field-field-computed-prof-title')
                    title = title_element.text.strip() if title_element is not None else "N/A"

                    email_element = cell.find('a', href=lambda href: href and 'mailto' in href)
                    email = email_element.text.strip() if email_element is not None else "N/A"

                    office = _field_value(cell.find('div', class_='views-field-field-computed-building'))
                    phone = _field_value(cell.find('div', class_='views-field-field-computed-phone'))
                    research_areas = _field_value(cell.find('div', class_='views-field-field-research-areas'))

                    # One prose paragraph per faculty member goes to the txt file.
                    f.write(combine_faculty_info_into_paragraph(name, title, email, office, phone, research_areas))

                    # The same fields are kept as a record for the CSV output.
                    scrape_data.append({
                        'Name': name,
                        'Title': title,
                        'Email': email,
                        'Office': office,
                        'Phone': phone,
                        'Research Areas': research_areas
                    })
finally:
    driver.quit()

df = pd.DataFrame(scrape_data)
df.to_csv(data_folder + filename + '.csv', index=False)


## CMU Commencement

In [11]:
# pages = ['schedule', 'graduates', 'visitors', 'faq', 'contact-us']
# base_url = 'https://www.cmu.edu/commencement/'

# def extract_info(url):
#     response = requests.get(url)
#     soup = bs(response.content, 'html.parser')
    
#     content_divs = soup.find_all("div", class_="content")
#     content_data = [div.get_text(separator="\n", strip=True) for div in content_divs]
    
#     grid_div = soup.find("div", class_="grid")
#     grid_data = grid_div.get_text(separator="\n", strip=True) if grid_div else ""
    
#     return content_data, grid_data

# # Save each page scraped data
# for page in pages:
#     url = base_url + page + '/index.html'
#     content_data, grid_data = extract_info(url)   
#     scrape_content = pd.DataFrame({"content": content_data})    
#     scrape_grid = pd.DataFrame({"grid": [grid_data]})
    
#     scrape = pd.concat([scrape_content, scrape_grid], axis=1)
#     # scrape.to_csv('commencement/' + page + '_data.csv', index=False)
#     break

### Commencement Schedule

In [50]:
def extract_info(url, timeout=30):
    """Download a page and return the text of its "content" and "grid" divs.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on an unresponsive host.

    Returns:
        A list of strings, one per matching <div>, in document order. Each
        string is the div's text with fragments stripped and joined by a
        single space.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never scraped as if it were real content.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = bs(response.content, 'html.parser')

    # The predicate is evaluated against a div's class value(s); a div matches
    # when one of them is exactly "content" or "grid". The original predicate
    # also tested x != "sidebar", which could never change the outcome, so the
    # same set of divs is selected here.
    target_divs = soup.find_all('div', class_=lambda x: x in ("content", "grid"))
    content_grid_data = [div.get_text(separator=" ", strip=True) for div in target_divs]

    return content_grid_data


In [65]:
# create a txt file to store the data
data_folder = '../../data/'
filename = 'commencement_schedule_data'
output_path = data_folder + filename + '.txt'
# Opening in 'w' mode empties any previous output before the appends below.
with open(output_path, 'w', encoding='utf-8') as f:
    f.write('')


def _normalize_spaces(texts):
    """Return `texts` with every non-breaking space (U+00A0) replaced by ' '."""
    return [text.replace(u'\xa0', ' ') for text in texts]


def _append_lines(path, lines):
    """Append each string in `lines` to the file at `path`, one per line.

    UTF-8 is set explicitly so scraped non-ASCII characters do not depend on
    the platform's default encoding.
    """
    with open(path, 'a', encoding='utf-8') as f:
        for line in lines:
            f.write(line + '\n')


############ commencement index page ############
url = 'https://www.cmu.edu/commencement/index.html'
content_grid_data = _normalize_spaces(extract_info(url))
_append_lines(output_path, content_grid_data)


############ schedule index page ############
url = 'https://www.cmu.edu/commencement/schedule/index.html'
content_grid_data = _normalize_spaces(extract_info(url))
_append_lines(output_path, content_grid_data)


############## main ceremony ############
# Main Commencement Ceremony
# 2023 Honorary Degree Recipients
# Student Speaker
url = 'https://www.cmu.edu/commencement/schedule/main-ceremony.html'
content_grid_data = _normalize_spaces(extract_info(url))

# Combine the honorary degree recipients (entries 1 and 2) into one paragraph.
# The guard avoids an IndexError if the page yielded no matching divs.
if content_grid_data:
    honorary_degree_recipients = ' '.join(content_grid_data[1:3])
    content_grid_data = [content_grid_data[0], honorary_degree_recipients] + content_grid_data[3:]
_append_lines(output_path, content_grid_data)


############# Diploma Ceremonies page ############
url = 'https://www.cmu.edu/commencement/schedule/diploma-ceremonies.html'
# A timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status stops an HTTP error page from being scraped as content.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = bs(response.content, 'html.parser')
target_divs = soup.find_all('div', class_="content")
content_data = [div.get_text(separator=" ", strip=True) for div in target_divs]
### find the grid div: contains schedule for each diploma ceremony
grid_div = soup.find("div", class_="grid")
# Collect every div inside grid_div; if the page has no grid, fall back to an
# empty list so the content divs are still written.
div_in_grid_div = grid_div.find_all('div') if grid_div is not None else []
# extract the text from each div
grid_data = [div.get_text(separator=" ", strip=True) for div in div_in_grid_div]
# prefix each entry so it is identifiable as a diploma ceremony schedule
grid_data = ['Diploma Ceremonies schedule for ' + data for data in grid_data]
# Apply the same non-breaking-space cleanup used for the other pages.
_append_lines(output_path, _normalize_spaces(content_data + grid_data))