The HTML parsing code in this notebook was written largely with the help of ChatGPT.

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
import time

import datetime

# Stamp the run so the scraped snapshot can be dated from the cell output.
current_time = datetime.datetime.now()
print("Current Date and Time:", f"{current_time:%Y-%m-%d %H:%M:%S}")

Current Date and Time: 2024-03-07 22:12:55


## LTI Faculty Data
[https://lti.cs.cmu.edu/directory/all/154/1](https://lti.cs.cmu.edu/directory/all/154/1)

In [2]:
# Chrome session (via Selenium) used below to load the LTI directory pages.
driver = webdriver.Chrome()
# The two directory pages to scrape; the second URL carries a ?page=1 query.
urls = [
    "https://www.lti.cmu.edu/directory/all/154/1",
    "https://www.lti.cmu.edu/directory/all/154/1?page=1",
]

# Collects one dict per faculty member for the CSV export.
scrape_data = []

# Start from an empty text file; faculty paragraphs are appended to it later.
data_folder = '../../data/'
filename = 'lti_department_data'
with open(data_folder + filename + '.txt', 'w') as f:
    pass

def combine_faculty_info_into_paragraph(name, title, email, office, phone, research_areas):
    """Render one faculty record as a single newline-terminated paragraph.

    Every argument is a string; the literal ``'N/A'`` marks a missing field and
    causes the corresponding sentence to be left out.

    Args:
        name: Faculty member's display name.
        title: Job title, or ``'N/A'`` / empty when unknown.
        email: Email address, or ``'N/A'``.
        office: Office location, or ``'N/A'``.
        phone: Phone number, or ``'N/A'``.
        research_areas: Research areas as one string, or ``'N/A'``.

    Returns:
        The sentences joined by single spaces, followed by ``'\\n'``.
    """
    if title and title != 'N/A':
        # Pick the article from the title's first letter ("an Associate ...").
        article = 'an' if title[0].lower() in ('a', 'e', 'i', 'o', 'u') else 'a'
        sentences = [name + ' is ' + article + ' ' + title + ' at LTI.']
    else:
        # No usable title: still state the affiliation instead of printing "N/A".
        sentences = [name + ' is at LTI.']

    if email != 'N/A':
        sentences.append('Their email is ' + email + '.')
    if office != 'N/A':
        sentences.append('Their office is located at ' + office + '.')
    if phone != 'N/A':
        sentences.append('You can reach ' + name + ' at ' + phone + '.')
    if research_areas != 'N/A':
        sentences.append('Their research areas include ' + research_areas + '.')

    # Joining avoids the trailing space the last sentence would otherwise leave.
    return ' '.join(sentences) + '\n'

def _text_after_label(element):
    """Return the text following the first ':' in ``element``'s text, or "N/A".

    ``element`` is a parsed tag or ``None``. When the text has no colon the whole
    stripped text is returned; an empty result is reported as "N/A".
    """
    if element is None:
        return "N/A"
    text = element.text.strip()
    # partition never raises and keeps any further ':' inside the value.
    _label, colon, value = text.partition(':')
    value = value.strip() if colon else text
    return value or "N/A"


# Iterate through each URL and scrape the data.
# try/finally guarantees the browser is closed even if a page fails to parse.
try:
    # Open the text file once; UTF-8 so names with non-ASCII characters are
    # written the same way on every platform.
    with open(data_folder + filename + '.txt', 'a', encoding='utf-8') as txt_file:
        for url in urls:
            driver.get(url)
            soup = bs(driver.page_source, 'html.parser')

            table = soup.find('table', class_='views-view-grid')
            if table is None:
                print('No faculty grid found at ' + url + '; skipping this page.')
                continue

            for row in table.find_all('tr'):
                for cell in row.find_all('td'):
                    # Extracting info; impute N/A if a particular field is missing.
                    name_element = cell.find('h2')
                    if name_element is None:
                        # Grid cells without a heading carry no faculty record.
                        continue
                    name = name_element.text.strip()

                    title_element = cell.find('div', class_='views-field-field-computed-prof-title')
                    title = title_element.text.strip() if title_element is not None else "N/A"

                    email_element = cell.find('a', href=lambda href: href and 'mailto' in href)
                    email = email_element.text.strip() if email_element is not None else "N/A"

                    # These three fields are rendered as "Label: value".
                    office = _text_after_label(
                        cell.find('div', class_='views-field-field-computed-building'))
                    phone = _text_after_label(
                        cell.find('div', class_='views-field-field-computed-phone'))
                    research_areas = _text_after_label(
                        cell.find('div', class_='views-field-field-research-areas'))

                    # One paragraph per faculty member goes to the text file.
                    txt_file.write(combine_faculty_info_into_paragraph(
                        name, title, email, office, phone, research_areas))

                    # Keep the structured record for the CSV output below.
                    scrape_data.append({
                        'Name': name,
                        'Title': title,
                        'Email': email,
                        'Office': office,
                        'Phone': phone,
                        'Research Areas': research_areas
                    })
finally:
    driver.quit()

df = pd.DataFrame(scrape_data)
df.to_csv(data_folder + filename + '.csv', index=False)


## CMU Commencement

In [3]:
# pages = ['schedule', 'graduates', 'visitors', 'faq', 'contact-us']
# base_url = 'https://www.cmu.edu/commencement/'

# def extract_info(url):
#     response = requests.get(url)
#     soup = bs(response.content, 'html.parser')
    
#     content_divs = soup.find_all("div", class_="content")
#     content_data = [div.get_text(separator="\n", strip=True) for div in content_divs]
    
#     grid_div = soup.find("div", class_="grid")
#     grid_data = grid_div.get_text(separator="\n", strip=True) if grid_div else ""
    
#     return content_data, grid_data

# # Save each page scraped data
# for page in pages:
#     url = base_url + page + '/index.html'
#     content_data, grid_data = extract_info(url)   
#     scrape_content = pd.DataFrame({"content": content_data})    
#     scrape_grid = pd.DataFrame({"grid": [grid_data]})
    
#     scrape = pd.concat([scrape_content, scrape_grid], axis=1)
#     # scrape.to_csv('commencement/' + page + '_data.csv', index=False)
#     break

### commencement schedule

In [4]:
def extract_info(url, timeout=30):
    """Fetch ``url`` and return the text of every ``content`` or ``grid`` div.

    Args:
        url: Page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one string per matching ``<div>``; inner text fragments are
        stripped and joined by single spaces.

    Raises:
        requests.RequestException: on connection problems, timeouts, or an HTTP
            error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were content.
    response.raise_for_status()
    soup = bs(response.content, 'html.parser')

    # NOTE(review): a div that also has a "sidebar" class is still selected here;
    # an earlier `x != "sidebar"` clause in this filter never excluded anything.
    target_divs = soup.find_all(
        'div', class_=lambda css_class: css_class in ("content", "grid"))
    return [div.get_text(separator=" ", strip=True) for div in target_divs]


In [5]:
# Scrape the commencement pages into one text file, one passage per line.
data_folder = '../../data/'
filename = 'commencement_schedule_data'


def _normalize_spaces(texts):
    """Return ``texts`` with every non-breaking space replaced by a plain space."""
    return [text.replace('\xa0', ' ') for text in texts]


# All passages are collected first and written in a single pass at the end.
commencement_lines = []

############ commencement index page ############
url = 'https://www.cmu.edu/commencement/index.html'
commencement_lines += _normalize_spaces(extract_info(url))

############ schedule index page ############
url = 'https://www.cmu.edu/commencement/schedule/index.html'
commencement_lines += _normalize_spaces(extract_info(url))

############## main ceremony ############
# Main Commencement Ceremony
# 2023 Honorary Degree Recipients
# Student Speaker
url = 'https://www.cmu.edu/commencement/schedule/main-ceremony.html'
content_grid_data = _normalize_spaces(extract_info(url))
# Entries 1 and 2 hold the honorary degree recipients; merge them into one
# paragraph. Skipped when the page yields fewer than three passages.
if len(content_grid_data) >= 3:
    honorary_degree_recipients = ' '.join(content_grid_data[1:3])
    content_grid_data = [content_grid_data[0], honorary_degree_recipients] + content_grid_data[3:]
commencement_lines += content_grid_data

############# Diploma Ceremonies page ############
url = 'https://www.cmu.edu/commencement/schedule/diploma-ceremonies.html'
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = bs(response.content, 'html.parser')
content_data = [div.get_text(separator=" ", strip=True)
                for div in soup.find_all('div', class_="content")]
# The grid div contains the schedule for each diploma ceremony; every div
# nested inside it becomes its own prefixed line.
grid_div = soup.find("div", class_="grid")
div_in_grid_div = grid_div.find_all('div') if grid_div is not None else []
grid_data = ['Diploma Ceremonies schedule for ' + div.get_text(separator=" ", strip=True)
             for div in div_in_grid_div]
commencement_lines += _normalize_spaces(content_data + grid_data)

# Single write with an explicit encoding so the output does not depend on the
# platform's default text encoding.
with open(data_folder + filename + '.txt', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in commencement_lines)

## Spring Carnival schedule

In [6]:
# data_folder = '../../data/'
# filename = 'carnival_schedule_data'
# with open(data_folder + filename + '.txt', 'w') as f:
#     f.write('')

# ## Spring Carnival schedule
# driver = webdriver.Chrome() 
# url = 'https://web.cvent.com/event/ab7f7aba-4e7c-4637-a1fc-dd1f608702c4/websitePage:645d57e4-75eb-4769-b2c0-f201a0bfc6ce?locale=en'
# headers = {
#     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
# }
# response = requests.get(url, headers=headers)
# soup = bs(response.content, 'html.parser')

# # Weekend Highlights
# target_divs = soup.find('div', class_='container')
# content_data = [div.get_text(separator=" ", strip=True) for div in target_divs]
# content_data

# <div class="Grid__row___K84Rr Grid__small___joj2e"><div class="Grid__column___L6Qbn Grid__col_4___eGcOB Grid__small___joj2e"><div id="widget:4f6248af-a58f-443b-a807-4fee751ffa37" class="App__widgetIdcontainer___ms2gn"><div data-cvent-id="widget-NucleusText-widget:4f6248af-a58f-443b-a807-4fee751ffa37" class="Text__container___Z6gol" style="border-width: 0px; border-color: rgb(255, 255, 255); border-style: none; padding: 15px; background-color: transparent; border-radius: 0px; flex: 1 1 auto;"><div class="" style="font-family: &quot;Open Sans&quot;, Helvetica, Arial, sans-serif; font-weight: 400; font-size: 16px; color: rgb(102, 102, 102); line-height: 1.3; font-style: normal; text-align: center; background-color: transparent; border-radius: 0px; border-width: 0px; border-color: rgb(255, 255, 255); border-style: none; padding: 0px;"><div class="css-vsf5of"><p style="text-align:left;" class="carina-rte-public-DraftStyleDefault-block"><span style="color: rgb(102,102,102);"><span style="font-weight: bold;">Booth, Rides and Dog Houses</span></span></p><ul class="carina-rte-public-DraftStyleDefault-ul"><li style="text-align:left;"><span style="color: rgb(102,102,102);">Thursday: 3:30-11 p.m.</span></li><li style="text-align:left;"><span style="color: rgb(102,102,102);">Friday &amp; Saturday: 11 a.m.-11 p.m.&nbsp; </span></li></ul><p style="text-align:left;" class="carina-rte-public-DraftStyleDefault-block"><span style="color: rgb(102,102,102);"><span style="font-weight: bold;"><br>Carnival Headquarters Tent <br></span><span style="font-style: italic;">Check-In &amp; Registration</span></span></p><ul class="carina-rte-public-DraftStyleDefault-ul"><li style="text-align:left;"><span style="color: rgb(102,102,102);">Thursday: 8 a.m.-7 p.m.</span></li><li style="text-align:left;"><span style="color: rgb(102,102,102);">Friday: 8 a.m.-7 p.m.</span></li><li style="text-align:left;"><span style="color: rgb(102,102,102);">Saturday: 8 a.m.-7 
p.m.</span></li></ul></div></div></div></div></div><div class="Grid__column___L6Qbn Grid__col_4___eGcOB Grid__small___joj2e"><div id="widget:c8425b98-71bd-4a7c-9202-737a3d08aa06" class="App__widgetIdcontainer___ms2gn"><div data-cvent-id="widget-NucleusImage-widget:c8425b98-71bd-4a7c-9202-737a3d08aa06" class="Image__container___bkibo" style="border-width: 0px; border-color: rgb(255, 255, 255); border-style: none; padding: 15px; background-color: transparent; border-radius: 0px; flex: 1 1 auto; justify-content: center;"><div class="Image__image___K4qxU" style="text-align: center; font-family: Arial, Helvetica, sans-serif; font-weight: 400; font-style: normal;"><div class="Image__imageWrapper___yrxys"><img alt="" srcset="https://images.cvent.com/3acb589e68044db09aa08f6a32c3e88e/pix/07114a9d814e406fac771607eaa45b20!_!17653e632bb8ab2cb04369b6a1f4eabe.png?d=320&amp;f=webp 320w,
#         https://images.cvent.com/3acb589e68044db09aa08f6a32c3e88e/pix/07114a9d814e406fac771607eaa45b20!_!17653e632bb8ab2cb04369b6a1f4eabe.png?d=480&amp;f=webp 480w,
#         https://images.cvent.com/3acb589e68044db09aa08f6a32c3e88e/pix/07114a9d814e406fac771607eaa45b20!_!17653e632bb8ab2cb04369b6a1f4eabe.png?f=webp 600w" src="https://images.cvent.com/3acb589e68044db09aa08f6a32c3e88e/pix/07114a9d814e406fac771607eaa45b20!_!17653e632bb8ab2cb04369b6a1f4eabe.png?f=webp" style="max-width: 600px;"></div></div></div></div></div><div class="Grid__column___L6Qbn Grid__col_4___eGcOB Grid__small___joj2e"><div id="widget:d12b93e6-edf2-46a5-89ad-0014743ed8e2" class="App__widgetIdcontainer___ms2gn"><div data-cvent-id="widget-NucleusText-widget:d12b93e6-edf2-46a5-89ad-0014743ed8e2" class="Text__container___Z6gol" style="border-width: 0px; border-color: rgb(255, 255, 255); border-style: none; padding: 15px; background-color: transparent; border-radius: 0px; flex: 1 1 auto;"><div class="" style="font-family: &quot;Open Sans&quot;, Helvetica, Arial, sans-serif; font-weight: 400; font-size: 16px; color: rgb(102, 102, 102); line-height: 1.3; font-style: normal; text-align: center; background-color: transparent; border-radius: 0px; border-width: 0px; border-color: rgb(255, 255, 255); border-style: none; padding: 0px;"><div class="css-vsf5of"><p style="text-align:left;" class="carina-rte-public-DraftStyleDefault-block"><span style="color: rgb(102,102,102);"><span style="font-weight: bold;">Buggy Races and Donut Tent</span></span></p><ul class="carina-rte-public-DraftStyleDefault-ul"><li style="text-align:left;"><span style="color: rgb(102,102,102);">Friday's Preliminary Sweepstakes Race: 8 a.m.-Noon</span></li><li style="text-align:left;"><span style="color: rgb(102,102,102);">Saturday's Final Sweepstakes Race: 8 a.m.-Noon</span></li></ul><p style="text-align:left;" class="carina-rte-public-DraftStyleDefault-block"><br></p><p style="text-align:left;" class="carina-rte-public-DraftStyleDefault-block"><span style="color: rgb(102,102,102);"><span style="font-weight: bold;">Scotch'n'Soda Performance of </span><span style="font-style: italic;"><span 
style="font-weight: bold;">The Little Mermaid</span></span></span></p><ul class="carina-rte-public-DraftStyleDefault-ul"><li style="text-align:left;"><span style="color: rgb(102,102,102);">Thursday: 7-9 p.m.</span></li><li style="text-align:left;"><span style="color: rgb(102,102,102);">Friday: 7-9 p.m. and 11 p.m.-1 a.m.</span></li><li style="text-align:left;"><span style="color: rgb(102,102,102);">Saturday: 3-5 p.m. and 7-9 p.m.</span></li></ul></div></div></div></div></div></div>

# driver.quit()

## History at CMU

### 25 great things SCS

In [7]:
def extract_info_h(url, headers):
    """Scrape a page made of an ``<h1>``, an intro block and ``<fieldset>`` items.

    Args:
        url: Page to download.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        A list of strings: the ``<h1>`` text, the text of the
        ``collapse-text-text`` div inside the first ``field`` div, then one
        ``"<h1 text>: <fieldset text>"`` entry per ``<fieldset>`` in that div.
    """
    page = requests.get(url, headers=headers)
    soup = bs(page.content, 'html.parser')

    # The page heading doubles as a prefix for every fieldset entry.
    head = soup.find('h1').get_text(separator=" ", strip=True)

    field = soup.find('div', class_='field')
    intro = field.find('div', class_='collapse-text-text').get_text(separator=" ", strip=True)

    target_data = [head, intro]
    for fieldset in field.find_all('fieldset'):
        target_data.append(head + ': ' + fieldset.get_text(separator=" ", strip=True))
    return target_data


In [8]:
# Output file for the SCS "25 great things" page; emptied before scraping.
data_folder = '../../data/'
filename = 'history_25_great_things'
with open(data_folder + filename + '.txt', 'w') as f:
    pass


url = 'https://www.cs.cmu.edu/scs25/25things'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
}
# Non-breaking spaces are swapped for plain spaces before the text is saved.
target_data = [text.replace('\xa0', ' ') for text in extract_info_h(url, headers)]

# One scraped passage per line.
with open(data_folder + filename + '.txt', 'a') as f:
    f.writelines(data + '\n' for data in target_data)

### history of SCS

In [9]:
def extract_info_h2(url, headers, timeout=30):
    """Scrape a page whose body is split into sections by ``<h2>`` headings.

    Args:
        url: Page to download.
        headers: HTTP headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of strings, each prefixed with the page's ``<h1>`` text. The first
        entry holds the ``<p>`` siblings that precede the first ``<h2>`` (or all
        ``<p>`` tags of the field div when it has no ``<h2>``); every further
        entry is an ``<h2>`` heading followed by the ``<p>`` siblings up to the
        next ``<h2>``. Pieces are separated by single spaces.

    Raises:
        requests.RequestException: on connection problems, timeouts, or an HTTP
            error status.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = bs(response.content, 'html.parser')

    target_data = []
    head = soup.find('h1').get_text(separator=" ", strip=True)
    target_divs = soup.find('div', class_='field')

    first_h2 = target_divs.find('h2')
    if first_h2 is not None:
        # find_previous_siblings yields the nearest sibling first; reverse it so
        # the intro reads in document order.
        intro_paragraphs = list(reversed(first_h2.find_previous_siblings('p')))
    else:
        intro_paragraphs = target_divs.find_all('p')
    text_before_h2 = ' '.join(
        p.get_text(separator=" ", strip=True) for p in intro_paragraphs)
    target_data.append(head + ': ' + text_before_h2)

    for h2_tag in target_divs.find_all('h2'):
        parts = [head + ': ' + h2_tag.get_text(separator=" ", strip=True)]
        # Walk the following siblings until the next section heading.
        next_element = h2_tag.find_next_sibling()
        while next_element is not None and next_element.name != 'h2':
            if next_element.name == 'p':
                parts.append(next_element.get_text(separator=" ", strip=True))
            next_element = next_element.find_next_sibling()
        # Space-join so the heading and paragraphs do not run into each other.
        target_data.append(' '.join(parts))

    return target_data


In [10]:
# Output file for the SCS history page; emptied before scraping.
data_folder = '../../data/'
filename = 'history_of_scs'
with open(data_folder + filename + '.txt', 'w') as f:
    pass

url = 'https://www.cs.cmu.edu/scs25/history'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
}
scraped_sections = extract_info_h2(url, headers)
# Replace non-breaking spaces with plain spaces.
target_data = [section.replace('\xa0', ' ') for section in scraped_sections]

# Append each section as its own line.
with open(data_folder + filename + '.txt', 'a') as f:
    for data in target_data:
        f.write(data + '\n')

### history CMU

In [11]:
def extract_info_h3(url, headers):
    """Scrape the divs whose class attribute starts with ``grid column2``.

    Args:
        url: Page to download.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        A ``(target_data, target_divs)`` tuple: ``target_data`` holds one
        ``"<page title>: <div text>"`` string per selected div, and
        ``target_divs`` is the list of selected tags themselves.
    """
    page = requests.get(url, headers=headers)
    soup = bs(page.content, 'html.parser')

    # Collapse any run of whitespace inside the <title> text to single spaces.
    title = ' '.join(soup.title.string.split())

    target_divs = soup.select('div[class^="grid column2"]')
    target_data = []
    for div in target_divs:
        target_data.append(title + ': ' + div.get_text(separator=" ", strip=True))

    return target_data, target_divs


In [12]:
# Output file for the CMU history page; emptied before scraping.
data_folder = '../../data/'
filename = 'history_of_cmu'
with open(data_folder + filename + '.txt', 'w') as f:
    pass

url = 'https://www.cmu.edu/about/history.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
}
# `divs` keeps the raw tags around for inspection in later cells.
target_data, divs = extract_info_h3(url, headers)
target_data = [passage.replace('\xa0', ' ') for passage in target_data]

# One passage per line.
with open(data_folder + filename + '.txt', 'a') as f:
    f.writelines(data + '\n' for data in target_data)

### buggy

In [13]:
# def extract_info_b(url, headers):
#     response = requests.get(url, headers=headers)
#     soup = bs(response.content, 'html.parser')

#     target_data = []
#     # find the title
#     title = soup.title.string
#     # remove extra spaces
#     title = ' '.join(title.split())

#     target_divs = soup.find('div', class_='content')
#     target_data += [div.get_text(separator=" ", strip=True) for div in target_divs]
#     # target_divs_div = target_divs.find('div', class_='collapse-text-text')

    
#     return target_data, target_divs


In [14]:
# # create a txt file to store the data
# data_folder = '../../data/'
# filename = 'buggy'
# with open(data_folder + filename + '.txt', 'w') as f:
#     f.write('')
    
# url = 'https://www.cmu.edu/news/stories/archives/2019/april/spring-carnival-buggy.html'
# headers = {
#     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
# }
# target_data, divs = extract_info_b(url, headers)
# target_data = [data.replace(u'\xa0', r' ') for data in target_data]

# for data in target_data:
#     with open(data_folder + filename + '.txt', 'a') as f:
#         f.write(data + '\n')

### Athletics - Tartans

In [15]:
def extract_info_a(url, headers, timeout=30):
    """Scrape an article whose sections start with a ``<p>`` containing ``<strong>``.

    Args:
        url: Page to download.
        headers: HTTP headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(target_data, contents)`` tuple. ``target_data`` holds one
        ``"<page title>: <subtitle> <section text>"`` string per subtitle
        paragraph; ``contents`` is the list of every ``<p>`` tag found inside the
        ``article-text`` div.

    Raises:
        requests.RequestException: on connection problems, timeouts, or an HTTP
            error status.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = bs(response.content, 'html.parser')

    # Collapse any run of whitespace inside the <title> text to single spaces.
    title = ' '.join(soup.title.string.split())
    print(title)

    target_div = soup.find('div', class_='article-text')
    # recursive=True collects every <p> nested anywhere inside the article,
    # not only its direct children.
    contents = target_div.find_all('p', recursive=True)

    target_data = []
    for paragraph in contents:
        if not paragraph.strong:
            # Only paragraphs containing <strong> open a new section.
            continue
        parts = [title + ': ' + paragraph.get_text(separator=" ", strip=True)]
        # Gather the following siblings until the next subtitle paragraph.
        sibling = paragraph.find_next_sibling()
        while sibling is not None and not sibling.strong:
            text = sibling.get_text(separator=" ", strip=True)
            if text:
                parts.append(text)
            sibling = sibling.find_next_sibling()
        # Space-join so consecutive paragraphs do not run into each other.
        target_data.append(' '.join(parts))

    return target_data, contents


In [16]:
# Output file shared by the athletics pages; emptied before scraping.
data_folder = '../../data/'
filename = 'athletics'
with open(data_folder + filename + '.txt', 'w') as f:
    pass

url = 'https://athletics.cmu.edu/athletics/tartanfacts'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
}
# Only the text is needed here; the raw tags are discarded.
target_data, _ = extract_info_a(url, headers)
target_data = [fact.replace('\xa0', ' ') for fact in target_data]

# One section per line.
with open(data_folder + filename + '.txt', 'a') as f:
    f.writelines(data + '\n' for data in target_data)

Tartan Facts - Carnegie Mellon University Athletics


### Athletics - Scotty

In [17]:
def extract_info_a2(url, headers):
    """Scrape the divs whose class attribute starts with ``content``.

    Args:
        url: Page to download.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        A ``(target_data, target_divs)`` tuple: ``target_data`` holds one
        ``"<first h2 text>: <div text>"`` string per selected div, and
        ``target_divs`` is the list of selected tags.
    """
    page = requests.get(url, headers=headers)
    soup = bs(page.content, 'html.parser')

    # The first <h2> on the page is used as the prefix for every entry.
    head = soup.find('h2').get_text(separator=" ", strip=True)

    target_divs = soup.select('div[class^="content"]')
    target_data = []
    for div in target_divs:
        target_data.append(head + ': ' + div.get_text(separator=" ", strip=True))

    return target_data, target_divs


In [18]:
# Appends to the same athletics file as the Tartan Facts cell: `data_folder`
# and `filename` are deliberately not reassigned here, and the file is not
# truncated.

url = 'https://athletics.cmu.edu/athletics/mascot/about'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
}
# Only the text is needed here; the raw tags are discarded.
target_data, _ = extract_info_a2(url, headers)
target_data = [passage.replace('\xa0', ' ') for passage in target_data]

# One passage per line, added after the existing content.
with open(data_folder + filename + '.txt', 'a') as f:
    f.writelines(data + '\n' for data in target_data)

### Athletics - Kiltie Band

In [19]:
def extract_info_a3(url, headers):
    """Scrape the text of the first ``<table>`` on a page.

    Args:
        url: Page to download.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        A ``(target_data, target_divs)`` tuple: ``target_data`` is a one-element
        list containing ``"<h1 text>: <table text>"``, and ``target_divs`` is the
        table tag itself.
    """
    page = requests.get(url, headers=headers)
    soup = bs(page.content, 'html.parser')

    head = soup.find('h1').get_text(separator=" ", strip=True)

    # Despite the name, this is a single tag: the first table on the page.
    target_divs = soup.find('table')
    table_text = target_divs.get_text(separator=" ", strip=True)

    return [head + ': ' + table_text], target_divs


In [20]:
# Appends to the same athletics file as the Tartan Facts cell: `data_folder`
# and `filename` are deliberately not reassigned here, and the file is not
# truncated.

url = 'https://athletics.cmu.edu/athletics/kiltieband/index'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
}
target_data, target_divs = extract_info_a3(url, headers)
target_data = [passage.replace('\xa0', ' ') for passage in target_data]

# One passage per line, added after the existing content.
with open(data_folder + filename + '.txt', 'a') as f:
    f.writelines(data + '\n' for data in target_data)

## Course Schedule

In [49]:
import json

def extract_info_sche(headers, timeout=30):
    """Scrape the four schedule-of-classes pages into one string per table row.

    Each ``<tr>`` with more than one ``<td>`` becomes a string of
    ``"<column>: <value>; "`` pairs, starting with the semester label. Blank
    Course, Title and Units cells are filled with the most recent non-blank
    value seen for that column; these carried values are not reset between
    semesters.

    Args:
        headers: HTTP headers passed to ``requests.get``.
        timeout: Seconds to wait for each page before giving up.

    Returns:
        A list of strings, one per qualifying table row, in page order.

    Raises:
        requests.RequestException: on connection problems, timeouts, or an HTTP
            error status.
    """
    col_name = ["Course", "Title", "Units", "Lec/Sec", "Days", "Begin", "End", "Bldg/Room", "Location", "Instructor(s)"]
    courses = []

    urls = ['https://enr-apps.as.cmu.edu/assets/SOC/sched_layout_fall.htm',
            'https://enr-apps.as.cmu.edu/assets/SOC/sched_layout_spring.htm',
            'https://enr-apps.as.cmu.edu/assets/SOC/sched_layout_summer_1.htm',
            'https://enr-apps.as.cmu.edu/assets/SOC/sched_layout_summer_2.htm'
            ]
    semesters = ['Fall 2023', 'Spring 2024', 'Summer One/All 2024', 'Summer Two 2024']

    # Last non-blank value seen for the columns that continuation rows leave empty.
    carried = {'Course': '', 'Title': '', 'Units': ''}

    for url, semester in zip(urls, semesters):
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = bs(response.content, 'html.parser')

        # The first <tr> is skipped; it was assumed to be the header row.
        # NOTE(review): rows that repeat the column names further down are
        # still emitted as data - confirm whether they should be filtered.
        for row in soup.find_all('tr')[1:]:
            cols = row.find_all('td')
            if len(cols) <= 1:
                # Title/spacer rows: emit nothing rather than reusing the
                # previous row's record.
                continue

            course = {'Semester': semester}
            # zip stops at the shorter sequence, so extra cells are ignored.
            for column, col in zip(col_name, cols):
                text = col.text.strip()
                if column in carried:
                    if text == '':
                        text = carried[column]
                    else:
                        carried[column] = text
                course[column] = text

            courses.append(''.join(key + ': ' + value + '; ' for key, value in course.items()))

    return courses

In [50]:
# Browser-like User-Agent sent with every schedule request; the page URLs
# themselves are defined inside extract_info_sche.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
courses = extract_info_sche(headers)

In [51]:
# Preview only the first few rows; the full list is far too long to display.
courses[:5]

['Semester: Fall 2023; Course: Course; Title: Title; Units: Units; Lec/Sec: Lec/Sec; Days: Days; Begin: Begin; End: End; Bldg/Room: Bldg/Room; Location: Location; Instructor(s): Instructor(s); ',
 'Semester: Fall 2023; Course: Architecture; Title: Title; Units: Units; Lec/Sec: ; Days: ; Begin: ; End: ; Bldg/Room: ; Location: ; Instructor(s): ; ',
 'Semester: Fall 2023; Course: 48050; Title: Study Abroad; Units: 0.0; Lec/Sec: A; Days: TBA; Begin: ; End: ; Bldg/Room: DNM DNM; Location: Pittsburgh, Pennsylvania; Instructor(s): Instructor TBA; ',
 'Semester: Fall 2023; Course: 48095; Title: Spatial Concepts for Non-Architecture Majors; Units: 10.0; Lec/Sec: A; Days: TR; Begin: 01:00PM; End: 02:50PM; Bldg/Room: CFA 211; Location: Pittsburgh, Pennsylvania; Instructor(s): Barbuto; ',
 'Semester: Fall 2023; Course: 48100; Title: Architecture Design Studio: POIESIS STUDIO 1; Units: 15.0; Lec/Sec: Lec; Days: MWF; Begin: 02:00PM; End: 04:50PM; Bldg/Room: MM A14; Location: Pittsburgh, Pennsylvania

In [52]:
# Output file for the course schedule; emptied first, then filled line by line.
data_folder = '../../data/documents/'
filename = 'schedule_2324'
with open(data_folder + filename + '.txt', 'w') as f:
    pass

# One course row per line.
with open(data_folder + filename + '.txt', 'a') as f:
    f.writelines(data + '\n' for data in courses)