In [8]:
import requests
from bs4 import BeautifulSoup
import time

# URL of the webpage to scrape
url = "https://catalog.tulane.edu/programs/?optionlessH#filter=.filter_1"

# Function to scrape the page with retries
def scrape_page(url, max_retries=3, retry_delay=5):
    retries = 0
    while retries < max_retries:
        try:
            response = requests.get(url)
            response.raise_for_status()
            return response.content
        except (requests.exceptions.RequestException, ConnectionError) as e:
            print(f"Error retrieving page, retrying in {retry_delay} seconds: {e}")
            retries += 1
            time.sleep(retry_delay)
    raise Exception("Failed to retrieve page after multiple attempts")

# Scrape the page
html_content = scrape_page(url)

# Parse the HTML content using Beautiful Soup
soup = BeautifulSoup(html_content, "html.parser")

# Find all the major titles
major_titles = soup.find_all("span", class_="title")

# Find the school and major type information
major_info = soup.find_all("span", class_="keyword")

# Loop through each major title and extract the information
for i in range(len(major_titles)):
    # Get the major name
    major_name = major_titles[i].text.strip()

    # Get the school and major type
    if 2*i < len(major_info):
        school = major_info[2*i].text.strip()
        major_type = major_info[2*i+1].text.strip()
    else:
        school = "N/A"
        major_type = "N/A"

    # Construct the links
    major_links = soup.find_all("a", class_="item-container")
    


    if i < len(major_links):
        major_href = major_links[i]["href"]
        parts = major_href.split('/')
        print("Part:", parts)
        if len(parts) >= 3:
            home_link = f"https://catalog.tulane.edu/{'/'.join(parts[1:-1])}/{parts[-1]}/"
            req_link = home_link + "#requirementstext"
        else:
            home_link = "N/A"
            req_link = "N/A"
    else:
        home_link = "N/A"
        req_link = "N/A"

'''
    print(f"Major: {major_name}")
    print(f"School: {school}")
    print(f"Major Type: {major_type}")
    print(f"Home Link: {home_link}")
    print(f"Requirements Link: {req_link}")
    print()

'''


for title in major_titles:
    # Get the major name
    major_name = title.text.strip()
    
    # Find the 'a' tag that wraps this 'span' with class "title"
    major_link_tag = title.find_parent("a")
    major_href = major_link_tag['href'] if major_link_tag and 'href' in major_link_tag.attrs else "N/A"

    # If a valid href is found, construct the full URL
    if major_href != "N/A":
        parts = major_href.split('/')
        home_link = f"https://catalog.tulane.edu/{'/'.join(parts[1:-1])}/{parts[-1]}/"
        req_link = home_link + "#requirementstext"
    else:
        home_link = "N/A"
        req_link = "N/A"
    
    # Print the information
    print(f"Major: {major_name}")
    print(f"Home Link: {home_link}")
    print(f"Requirements Link: {req_link}")
    print()

Major: Accounting Fundamentals Certificate
Home Link: https://catalog.tulane.edu/professional-advancement/business-leadership-studies/accounting-fundamentals-certificate//
Requirements Link: https://catalog.tulane.edu/professional-advancement/business-leadership-studies/accounting-fundamentals-certificate//#requirementstext

Major: Accounting Minor (Freeman School of Business)
Home Link: https://catalog.tulane.edu/business/accounting/accounting-minor//
Requirements Link: https://catalog.tulane.edu/business/accounting/accounting-minor//#requirementstext

Major: Accounting, MACCT
Home Link: https://catalog.tulane.edu/business/accounting/accounting-mac//
Requirements Link: https://catalog.tulane.edu/business/accounting/accounting-mac//#requirementstext

Major: Admiralty, LMA
Home Link: https://catalog.tulane.edu/law/master-laws/admiralty-lma//
Requirements Link: https://catalog.tulane.edu/law/master-laws/admiralty-lma//#requirementstext

Major: Advanced Emergency Management Certificate (G

Making dictionarys for furether exploration

In [9]:
# ... previous code ...

# Initialize a dictionary to store the major title and associated links
majors_dict = {}

for title in major_titles:
    # Get the major name
    major_name = title.text.strip()
    
    # Find the 'a' tag that wraps this 'span' with class "title"
    major_link_tag = title.find_parent("a")
    major_href = major_link_tag['href'] if major_link_tag and 'href' in major_link_tag.attrs else "N/A"

    # If a valid href is found, construct the full URL
    if major_href != "N/A":
        parts = major_href.split('/')
        home_link = f"https://catalog.tulane.edu/{'/'.join(parts[1:-1])}/{parts[-1]}/"
        req_link = home_link + "#requirementstext"
    else:
        home_link = "N/A"
        req_link = "N/A"
    
    # Add the information to the dictionary
    majors_dict[major_name] = [home_link, req_link]

# Print the dictionary
for major, links in majors_dict.items():
    print(f"Major: {major}")
    print(f"Home Link: {links[0]}")
    print(f"Requirements Link: {links[1]}")
    print()


Major: Accounting Fundamentals Certificate
Home Link: https://catalog.tulane.edu/professional-advancement/business-leadership-studies/accounting-fundamentals-certificate//
Requirements Link: https://catalog.tulane.edu/professional-advancement/business-leadership-studies/accounting-fundamentals-certificate//#requirementstext

Major: Accounting Minor (Freeman School of Business)
Home Link: https://catalog.tulane.edu/business/accounting/accounting-minor//
Requirements Link: https://catalog.tulane.edu/business/accounting/accounting-minor//#requirementstext

Major: Accounting, MACCT
Home Link: https://catalog.tulane.edu/business/accounting/accounting-mac//
Requirements Link: https://catalog.tulane.edu/business/accounting/accounting-mac//#requirementstext

Major: Admiralty, LMA
Home Link: https://catalog.tulane.edu/law/master-laws/admiralty-lma//
Requirements Link: https://catalog.tulane.edu/law/master-laws/admiralty-lma//#requirementstext

Major: Advanced Emergency Management Certificate (G

In [10]:
import requests
from bs4 import BeautifulSoup

# ... (rest of your previous code)

def extract_content_from_link(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Will raise an HTTPError if the HTTP request returned an unsuccessful status code
        
        # Parse the content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find the element with id 'col-content'
        col_content = soup.find(id='col-content')
        
        # If the 'col-content' section exists, extract paragraph texts
        if col_content:
            paragraphs = col_content.find_all('p')
            paragraph_texts = [p.get_text(strip=True) for p in paragraphs]
            return '\n'.join(paragraph_texts)  # Join all paragraph texts with a newline
        else:
            return "No content found under 'col-content'."
        
    except requests.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")  # e.g. response code != 200
    except Exception as err:
        print(f"An error occurred: {err}")

# Initialize a new dictionary to store the content for each major
majors_content_dict = {}

# Iterate over the previously created majors_dict to extract content from each link
for major_name, links in majors_dict.items():
    home_link = links[0]
    
    # Skip if no valid link was found
    if home_link == "N/A":
        continue
    
    # Extract the content from the home link
    home_content = extract_content_from_link(home_link)
    
    # Store the content in the new dictionary
    majors_content_dict[major_name] = home_content

# Now, you can print or work with the majors_content_dict as needed
for major, content in majors_content_dict.items():
    print(f"Major: {major}")
    print("Content:")
    print(content)
    print()


Major: Accounting Fundamentals Certificate
Content:
For students whose positions include bookkeeping and basic accounting responsibilities, the 18-credit Accounting Fundamentals Certificate provides a solid foundation, grounded in the Generally Accepted Accounting Principles (GAAP) that govern the accounting profession. Coursework progresses to the software tools that facilitate accounting processes and the laws of taxation that impact each transaction, overlain at each level with the legal and ethical constructs that govern all business relationships.

Major: Accounting Minor (Freeman School of Business)
Content:
At the undergraduate level, the Freeman School offers an accounting minor for students who are enrolled as business majors. Students must complete 9 credit hours including one required class (3 credit hours) and an additional 6 credit hours chosen from approved electives.
For additional information about academic minors, seeAcademic Requirements.

Major: Accounting, MACCT
Con

In [11]:
def extract_content_from_link(url, content_id):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        content_div = soup.find(id=content_id)
        if content_div:
            return ' '.join(content_div.stripped_strings)
        else:
            return "No content found under the specified ID."
    except requests.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except Exception as err:
        print(f"An error occurred: {err}")

# Initialize dictionaries to store the contents for each major
majors_home_content_dict = {}
majors_requirements_content_dict = {}

# Iterate over the previously created majors_dict to extract content from each link
for major_name, links in majors_dict.items():
    home_link, req_link = links
    
    # Extract content from the home page
    if home_link != "N/A":
        home_content = extract_content_from_link(home_link, "col-content")
        majors_home_content_dict[major_name] = home_content
    
    # Extract content from the requirements page
    if req_link != "N/A":
        req_content = extract_content_from_link(req_link, "requirementstextcontainer")
        majors_requirements_content_dict[major_name] = req_content

# Now, you can print or work with the majors_home_content_dict and majors_requirements_content_dict as needed
for major, content in majors_home_content_dict.items():
    print(f"Major: {major} - Home Content")
    print(content)
    print()

for major, content in majors_requirements_content_dict.items():
    print(f"Major: {major} - Requirements Content")
    print(content)
    print()

Major: Accounting Fundamentals Certificate - Home Content
Accounting Fundamentals Certificate Overview Requirements For students whose positions include bookkeeping and basic accounting responsibilities, the 18-credit Accounting Fundamentals Certificate provides a solid foundation, grounded in the Generally Accepted Accounting Principles (GAAP) that govern the accounting profession. Coursework progresses to the software tools that facilitate accounting processes and the laws of taxation that impact each transaction, overlain at each level with the legal and ethical constructs that govern all business relationships. Course List Course ID Title Credits BSLS 1110 Accounting I 3 BSLS 1120 Accounting II 3 BSLS 2210 Accounting Information Systems 3 BSLS 3210 Business Taxation 3 BSLS 3380 Business Ethics 3 BSLS 3400 Legal Aspects of Business 3 Total Credit Hours 18

Major: Accounting Minor (Freeman School of Business) - Home Content
Accounting Minor (Freeman School of Business) Overview Requi

In [12]:
import os

# Ensure the RAG_DATA directory exists
os.makedirs('RAG_DATA', exist_ok=True)

# Function to safely create a valid filename from a title
def safe_filename(title):
    # Remove invalid file characters in title
    return "".join(c for c in title if c.isalnum() or c in (' ', '.', '_')).rstrip()

# Iterate over majors and write the content to files
for major in majors_home_content_dict:
    # Construct the filename based on the major title
    filename = safe_filename(major) + ".txt"
    filepath = os.path.join('RAG_DATA', filename)

    # Open the file in write mode
    with open(filepath, 'w', encoding='utf-8') as file:
        # Write the home content
        file.write(majors_home_content_dict[major])
        file.write("\n\n")  # Separate home content and requirements content

        # If there's requirements content, append it
        if major in majors_requirements_content_dict:
            file.write(majors_requirements_content_dict[major])

    print(f"Content for {major} written to {filepath}")


Content for Accounting Fundamentals Certificate written to RAG_DATA/Accounting Fundamentals Certificate.txt
Content for Accounting Minor (Freeman School of Business) written to RAG_DATA/Accounting Minor Freeman School of Business.txt
Content for Accounting, MACCT written to RAG_DATA/Accounting MACCT.txt
Content for Admiralty, LMA written to RAG_DATA/Admiralty LMA.txt
Content for Advanced Emergency Management Certificate (Graduate) written to RAG_DATA/Advanced Emergency Management Certificate Graduate.txt
Content for Advertising, Certificate written to RAG_DATA/Advertising Certificate.txt
Content for Advertising, Minor written to RAG_DATA/Advertising Minor.txt
Content for Africana Studies Major written to RAG_DATA/Africana Studies Major.txt
Content for Africana Studies Minor written to RAG_DATA/Africana Studies Minor.txt
Content for Aging Studies, PhD written to RAG_DATA/Aging Studies PhD.txt
Content for American Law, AML written to RAG_DATA/American Law AML.txt
Content for Anatomic Pat

Gathering Professor data

In [2]:

import requests
from bs4 import BeautifulSoup

In [9]:
import requests
from bs4 import BeautifulSoup
import re

# URL of the main page
base_url = 'https://www.ratemyprofessors.com/school/1041'

# Get the HTML of the main page
response = requests.get(base_url)
soup = BeautifulSoup(response.text, 'html.parser')

# Define the patterns for the classes
name_pattern = re.compile(r'CardName__StyledCardName')
comment_pattern = re.compile(r'Comments__StyledComments')
difficulty_pattern = re.compile(r'FeedbackItem__FeedbackNumber')

# Find all professors' names and their links (IDs are embedded in links)
professors = soup.find_all('div', class_=name_pattern)
professor_links = soup.find_all('a', href=re.compile(r'/professor/\d+$'))


print(professors)

professor_data = []

for professor, link in zip(professors, professor_links):
    name = professor.text
    href = link['href']
    professor_id = href.split('/')[-1]
    professor_url = f'https://www.ratemyprofessors.com/professor/{professor_id}'

    # Fetch professor page
    prof_response = requests.get(professor_url)
    prof_soup = BeautifulSoup(prof_response.text, 'html.parser')

    # Extract comments and difficulty ratings using the regex patterns
    comments = prof_soup.find_all('div', class_=comment_pattern)
    difficulties = prof_soup.find_all('div', class_=difficulty_pattern)

    comment_texts = [comment.text.strip() for comment in comments]
    difficulty_ratings = [difficulty.text.strip() for difficulty in difficulties]

    professor_data.append({
        'name': name,
        'id': professor_id,
        'comments': comment_texts,
        'difficulties': difficulty_ratings
    })

# Output the data
for data in professor_data:
    print(data)


[]


Web Scraping Schedule of classes page

In [12]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode

# Base URL of the Search page
base_url = "https://classschedule.tulane.edu/Search.aspx"

# Send a GET request to the base URL
response = requests.get(base_url)

# Parse the HTML content of the Search page
soup = BeautifulSoup(response.content, "html.parser")

# Find the dropdown by its ID
dropdown = soup.find("select", id="ddlsubject")

# Extract the option values
option_values = [option.get_text() for option in dropdown.find_all("option")]

for option_value in option_values:
    print(option_value)



All
Accounting
Admiralty Law
Africana Studies
Aging Studies
American Sign Language Studies
Anatomy - Graduate
Anthropology
Arabic
Arch - Design
Architecture
Art History
Art Studio
Asian Studies
Astronomy
Biochemistry & Molecular Biol
Bioethic & Medical Humanities
Biomedical Engineering
Biomedical Informatics
Biomedical Sciences
Biostatistics
Business & Leadership Studies
Business Doctoral Courses
Business General Courses
Career Devel & Mgmt
Career Development
Cell & Molecular Biology
Chemical Engineering
Chemistry
Chinese Language
Cinema Studies
City, Culture, and Community
Classics
Clinical Research
Cognitive Studies
Colloquia
Communication
Computational Science
Computer Science
Ctr for Engd Learning/Teaching
Cybersecurity Management
Dance
Data Literacy
Digital Design
Digital Media Practices
Disaster Resilience Leader Sci
Earth & Environmental Sciences
Ecology & Evolutionary Biology
Economics
Education
Education - Liberal Arts
Emergency and Security Studies
Energy
Energy Law
Engineeri

Functions needed to save the scraped data

In [28]:
import json
from collections import defaultdict

def load_data(filepath):
    try:
        with open(filepath, 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        return defaultdict(dict)  # Return an empty defaultdict if the file does not exist

def save_data(data, filepath):
    with open(filepath, 'w') as f:
        json.dump(data, f, indent=4)

In [37]:
def scrape_page(driver):
    data = []
    # Wait for the rows to be present
    bold_tags = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.TAG_NAME, "b")))

    


    for b in bold_tags:
        label = b.text  # The text of the <b> tag
        parent_td = b.find_element(By.XPATH, "./ancestor::td[1]")  # Find the <td> that contains this <b> tag
        following_siblings = parent_td.find_elements(By.XPATH, "./following-sibling::td")  # Find all following sibling <td> elements

        # The actual text might be in the immediate sibling or further down the nested structure
        associated_text = None
        if following_siblings:
            # Check if immediate sibling <td> contains text
            if following_siblings[0].text.strip() != "":
                associated_text = following_siblings[0].text.strip()
            else:
                # Otherwise, look deeper into the structure
                nested_divs = following_siblings[0].find_elements(By.XPATH, ".//div")
                for div in nested_divs:
                    nested_tds = div.find_elements(By.TAG_NAME, "td")
                    for td in nested_tds:
                        if td.text.strip() != "":
                            associated_text = td.text.strip()
                            break
                    if associated_text:
                        break

        # If we found associated text, add it to the data
        if associated_text:
            print(f"Label: {label}, Text: {associated_text}")
            data.append({label: associated_text})

    return data

        


Scrape Page with json return

In [38]:
def scrape_page(driver):
    page_data = {}  # A dictionary to store data grouped by the subject code
    current_subject_code = ''  # To keep track of the subject code we are currently processing
    course_details = {}  # A dictionary to hold the details of the current course

    # Wait for the page elements to be loaded
    bold_tags = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.TAG_NAME, "b"))
    )

    for b in bold_tags:
        # Check if the text in the <b> tag is a course code with a subject prefix
        if '-' in b.text:
            potential_subject_code = b.text.split('-')[0]
            # If it's a new subject code, reset the course details
            if potential_subject_code.isalpha() and potential_subject_code != current_subject_code:
                # If there is a current subject code, it means we've collected details for it
                # and need to add it to the page data before resetting it
                if current_subject_code:
                    page_data[current_subject_code] = course_details
                # Reset the course details and update the current subject code
                course_details = {}
                current_subject_code = potential_subject_code
        
        # Now that we have ensured that the subject code is set or updated,
        # we can proceed to collect the course details

        # The label and its associated data are in the same <td>
        # Get the following sibling <td> which contains the value
        value_td = b.find_element(By.XPATH, "./following-sibling::td")
        label = b.text.strip()
        value = value_td.text.strip() if value_td.text.strip() else "N/A"

        # Add the value to the course details using label as the key
        course_details[label] = value

    # After finishing the loop, add the last subject's details if there are any
    if current_subject_code and course_details:
        page_data[current_subject_code] = course_details

    return page_data


Working page iterator. DO NOT MESS WITH!!!

In [39]:
from selenium import webdriver
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time  # For sleep

# Initialize the Chrome driver
driver = webdriver.Chrome()

# Navigate to the Search page
driver.get("https://classschedule.tulane.edu/Search.aspx")

# Wait for the dropdown to be visible and interactable
wait = WebDriverWait(driver, 10)
dropdown = wait.until(EC.visibility_of_element_located((By.ID, "ddlsubject")))
select = Select(dropdown)

# Get all options except the first one
options = [o.get_attribute("value") for o in select.options if o.get_attribute("value")]

#------------------------------------- possibly remove this block

filepath = 'Data/courses_data.json'
courses_data = load_data(filepath)


#------------------------------------- possibly remove this block



# Navigate through each option in the dropdown
for value in options:
    print("Processing:", value)

    # Reacquire the dropdown element on each iteration to avoid StaleElementReferenceException
    dropdown = wait.until(EC.visibility_of_element_located((By.ID, "ddlsubject")))
    select = Select(dropdown)

    # Now interact with the dropdown
    select.select_by_value(value)

    submit_button = wait.until(EC.element_to_be_clickable((By.ID, "btnSearchAll")))
    submit_button.click()
    
    # Add a delay to allow the page to load, this is not the best practice
    # but can be used for simplicity's sake
    time.sleep(5)  # Adjust time as necessary based on observed load times
    

    page_number = 1
    while True:

        print(f"Scraping page {page_number} for subject {value}.")

        new_data = scrape_page(driver)
        
        print(new_data)

        courses_data.update(new_data)

        next_page_number = page_number + 1
        try:
            # Look for the link to the next page
            next_page_link = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, str(next_page_number))))
            next_page_link.click()
            page_number = next_page_number
            # Add a delay after clicking to allow the new page to load
            time.sleep(5)  # Adjust time as necessary
        except TimeoutException:
            # If the next page link isn't found, it means we're on the last page
            print(f"Finished processing {value}")
            driver.get("https://classschedule.tulane.edu/Search.aspx")
            break
            

            #just for testing sj'ajfdkla;kfjak ds

        save_data(courses_data, filepath)
        

# Close the driver
save_data(courses_data, filepath)
driver.quit()


Processing: Accounting
Scraping page 1 for subject Accounting.


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"./following-sibling::td"}
  (Session info: chrome=124.0.6367.62); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x0000000105066934 chromedriver + 4368692
1   chromedriver                        0x000000010505edc8 chromedriver + 4337096
2   chromedriver                        0x0000000104c82c04 chromedriver + 289796
3   chromedriver                        0x0000000104cc4e00 chromedriver + 560640
4   chromedriver                        0x0000000104cbb368 chromedriver + 521064
5   chromedriver                        0x0000000104cfd5ec chromedriver + 792044
6   chromedriver                        0x0000000104cb9ab4 chromedriver + 514740
7   chromedriver                        0x0000000104cba50c chromedriver + 517388
8   chromedriver                        0x000000010502ae50 chromedriver + 4124240
9   chromedriver                        0x000000010502fc40 chromedriver + 4144192
10  chromedriver                        0x0000000105010818 chromedriver + 4016152
11  chromedriver                        0x0000000105030570 chromedriver + 4146544
12  chromedriver                        0x00000001050022cc chromedriver + 3957452
13  chromedriver                        0x000000010504feb8 chromedriver + 4275896
14  chromedriver                        0x0000000105050034 chromedriver + 4276276
15  chromedriver                        0x000000010505ea28 chromedriver + 4336168
16  libsystem_pthread.dylib             0x000000019111bfa8 _pthread_start + 148
17  libsystem_pthread.dylib             0x0000000191116da0 thread_start + 8


Scraping Courses Website. The plan here is to obtain links for all course pages, and then separate each individual course into it's own text file that will be vectorized. Hopefully, this will create a dynamic course search, where you can search for certain qualities you want in a class, and they will come up

In [50]:
import requests
from bs4 import BeautifulSoup

base_url = "https://catalog.tulane.edu/courses/"
url = base_url
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

abbreviations = {}

# Find all unordered lists within "letternav-head" class
letternav_heads = soup.find_all("h2", class_="letternav-head")

for head in letternav_heads:
    # Find the next unordered list after the "letternav-head"
    ul = head.find_next_sibling("ul")
    if ul:
        # Find all links within the unordered list
        abbr_links = ul.find_all("a")
        for abbr_link in abbr_links:
            abbr = abbr_link.text.split()[0]
        
                 # Extract the abbreviation
            link = base_url + abbr_link["href"][9:]
            
              # Extract the link and prepend base_url

            # Add the abbreviation and link to the dictionary if not present
            if abbr not in abbreviations:
                abbreviations[abbr] = link

# Print the dictionary
for abbr, link in abbreviations.items():
    print(f"{abbr}: {link}")

360: https://catalog.tulane.edu/courses/circ/
Accounting: https://catalog.tulane.edu/courses/accn/
Admiralty: https://catalog.tulane.edu/courses/admr/
Africana: https://catalog.tulane.edu/courses/afrs/
Aging: https://catalog.tulane.edu/courses/agst/
American: https://catalog.tulane.edu/courses/asls/
Anatomy: https://catalog.tulane.edu/courses/anat/
Anthropology: https://catalog.tulane.edu/courses/anth/
Arabic: https://catalog.tulane.edu/courses/arbc/
Architecture: https://catalog.tulane.edu/courses/arch/
Art: https://catalog.tulane.edu/courses/arhs/
Asian: https://catalog.tulane.edu/courses/asta/
Astronomy: https://catalog.tulane.edu/courses/astr/
Biochemistry: https://catalog.tulane.edu/courses/gbch/
Bioethics: https://catalog.tulane.edu/courses/bemh/
Biology: https://catalog.tulane.edu/courses/pabi/
Biomedical: https://catalog.tulane.edu/courses/bmen/
Biostatistics: https://catalog.tulane.edu/courses/bios/
Business: https://catalog.tulane.edu/courses/bsls/
Career: https://catalog.tul

In [52]:

def scrape_content(abbr, link):
    response = requests.get(link)
    content_soup = BeautifulSoup(response.content, "html.parser")
    content_div = content_soup.find("main", id="content")
    if content_div:
        content_text = content_div.get_text(strip=True)
        save_to_file(abbr, content_text)

def save_to_file(abbr, content_text):
    # Create the 'Data' folder if it doesn't exist
    #os.makedirs("RAG_DATA", exist_ok=True)
    # Save the content to a text file with the abbreviation as the filename
    filename = f"RAG_DATA/{abbr}.txt"
    with open(filename, "w", encoding="utf-8") as file:
        file.write(content_text)    

# Scrape the content for each link in the dictionary
for abbr, link in abbreviations.items():
    scrape_content(abbr, link)

Testing Splitter Function on The one subject

In [72]:
import re


def parse_and_save_courses(text, directory):
    # Create directory if it doesn't exist
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    # Regex to split on any four-letter uppercase word followed by space and four digits
    course_pattern = r"(?=[A-Z]{4} \d{4})"
    courses = re.split(course_pattern, text)
    for course in courses:
        if "Special Topics" in course or "Independent Study" in course or "Transfer Coursework" in course or "Ind. Study" in course or "Semester Abroad" in course or "Study Abroad" in course:
            continue  # Skip courses that are special topics or independent study
        
        # Extract the course code using a generalized regex pattern
        try:
            course_code = re.search(r'([A-Z]{4} \d{4})', course).group(1)
            filename = os.path.join(directory, f"{course_code}.txt")  # Use the directory parameter
            with open(filename, 'w', encoding='utf-8') as file:
                file.write(course.strip())
            
        except AttributeError:
            # Handle the case where the course code might not be found
            print("Could not find a valid course code in:", course[:50])
            
        
# Example usage
course_text = """
DATA 1010Introduction to Data(3)DATA 1010aims to provide students with an overview to what data is, how it is used correctly and incorrectly, how it is found, stored, and managed, and how it can be used as a basis for decision making and analysis. The overall goal of this course is to increase data literacy, such that students are more confidently able to work with the increasing amounts of data in their lives, jobs, and academic careers. This course is aimed towards students in all schools and fields and has no prerequisites.DATA 1940Transfer Course Work(0-4)Transfer CourseworkMaximum Hours:99DATA 2020Data Analysis(3)This course provides an overview of the statistical tools most commonly used to analyze quantitative data. Topics include describing data, statistical inference, statistical significance, hypothesis testing, and regression analyses. The course focuses on understanding how to use appropriate analytical techniques and interpret the results of statistical analyses for variables with different levels of measurement. For each topic area, the methodology, including the underlying theory, assumptions, and mechanics of how each analytical tool works, is discussed, along with the appropriate interpretation of results. Concepts are presented in the context of real-world examples using publicly available data sets. The course will also introduce students to statistical software. Students of all skill levels are welcome, including those with limited or no statistical, mathematical, or programming backgrounds. All data analysis skills will be taught in class.DATA 2030Data Visualization(3)Students will examine different creative and analytical theories and techniques for understanding and developing data visualizations, including maps, graphs, charts, and interactive tools such as dashboards. Students will access and clean data for visualizing potential, analyze data visualizations for bias and persuasive intent, and create data visualizations to communicate findings and tell engaging stories for diverse audiences. Students will also consider the societal role that data visualizations play in validating knowledge while exploring ethical concerns and critiques around communicating arguments visually. As practice, students will storyboard, create, peer review, and justify design choices when using a variety of open-source data visualizations. Students of all skill levels are welcome, and all data visualization skills will be taught in class.DATA 2150Applied Generative AI(3)The introduction of widely available and accessible generative Artificial Intelligence tools, such as ChatGPT, democratizes expertise, unlocks knowledge, and bestows impressive abilities. This hands-on course provides students with practical experience employing generative AI to perform real-world tasks. By the end of the course, students will be able to effectively collect accurate historical and real-time information, generate high-quality text and media, transform content between formats, analyze data to derive insights and deploy generative AI to tackle private and professional challenges.DATA 2810Special Topics(3)Special Topics in Data Literacy. Course may be repeated up to unlimited credit hours.Maximum Hours:99DATA 2940Transfer Coursework(0-20)Transfer CourseworkMaximum Hours:99DATA 3810Special Topics(3)Special Topics in Data Literacy. Course may be repeated up to unlimited credit hours.Maximum Hours:99DATA 3940Transfer Course Work(0-4)Transfer CourseworkMaximum Hours:99
"""
#parse_and_save_courses(course_text, "Course_singular_Data")


In [73]:
def process_files(input_directory, output_directory):
    # Ensure the output directory exists
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # List all files in the input directory
    for filename in os.listdir(input_directory):
        # Construct the full file path
        print("Procesing: ", filename)
        if filename.startswith('.'):
            print(f"Skipping hidden file: {filename}")
            continue
        file_path = os.path.join(input_directory, filename)

        # Skip directories
        if os.path.isdir(file_path):
            continue
        
        # Read the content of each file
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        
        # Apply the course parsing function
        parse_and_save_courses(content, output_directory)

      

In [76]:
input_directory = "Course_Data"
output_directory = "RAG_DATA"

# Call the function to process all files
process_files(input_directory, output_directory)

Procesing:  Accounting.txt
Could not find a valid course code in: 
Procesing:  German.txt
Could not find a valid course code in: 
Procesing:  Managerial.txt
Could not find a valid course code in: 
Procesing:  Mathematics.txt
Could not find a valid course code in: 
Procesing:  Neuroscience.txt
Could not find a valid course code in: 
Procesing:  Wellness.txt
Could not find a valid course code in: 
Procesing:  Arabic.txt
Could not find a valid course code in: 
Procesing:  Energy.txt
Could not find a valid course code in: 
Procesing:  Japanese.txt
Could not find a valid course code in: 
Procesing:  Chemistry.txt
Could not find a valid course code in: 
Procesing:  Taylor.txt
Could not find a valid course code in: 
Procesing:  Biostatistics.txt
Could not find a valid course code in: 
Procesing:  Music.txt
Could not find a valid course code in: 
Procesing:  Digital.txt
Could not find a valid course code in: 
Procesing:  Master.txt
Could not find a valid course code in: 
Procesing:  Taxation.t