In [8]:
import requests
from bs4 import BeautifulSoup

def fetch_course_page(url, timeout=30):
    """
    Download a course page and return the text of its main content region.

    :param url: Address of the course page to fetch
    :param timeout: Seconds to wait for the server before giving up; passed
        straight to ``requests.get``
    :return: The ``.text`` of the first ``<div>`` found with
        ``class_="col-sm-8 bs-region bs-region--left"``, or None when the
        response status is not 200 or no such ``<div>`` is found
    :raises requests.exceptions.RequestException: on connection failures or
        when the timeout expires
    """
    # Without a timeout, requests waits indefinitely on a stalled server,
    # which would freeze a batch run over many URLs.
    response = requests.get(url, timeout=timeout)

    # Any status other than 200 is reported to the caller as "no content"
    if response.status_code != 200:
        return None

    # Parse the content of the webpage
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the specific <div> with the given class
    course_info = soup.find('div', class_="col-sm-8 bs-region bs-region--left")
    if not course_info:
        return None
    return course_info.text

import os

def save_string_to_file(text, filename, encoding='utf-8'):
    """
    Save a string to a text file, creating the folder if it doesn't exist.

    :param text: The string to save
    :param filename: The name of the file, including its path
    :param encoding: Text encoding used when writing the file (UTF-8 by default)
    """
    # Extract the directory from the filename; empty for a bare file name
    folder = os.path.dirname(filename)

    # exist_ok=True replaces the separate "exists?" check, so there is no
    # window in which the folder can appear between the check and the create.
    if folder:
        os.makedirs(folder, exist_ok=True)

    # An explicit encoding keeps the output independent of the platform
    # default, which may be unable to represent characters found in web text.
    with open(filename, 'w', encoding=encoding) as file:
        file.write(text)


In [9]:
def url2txt(url, output_dir='../data/rag_data/course/'):
    """
    Fetch one course page and save its text as ``<last URL segment>.txt``.

    Prints a notice and writes nothing when ``fetch_course_page`` returns
    no content for the URL.

    :param url: Address of the course page
    :param output_dir: Folder that receives the .txt file
    """
    course_info_string = fetch_course_page(url)
    if not course_info_string:
        print(f"{url} cannot find content")
        return

    # Strip a trailing '/' first: otherwise the last segment is empty and
    # every such URL would be written to the same '.txt' file.
    page_name = url.rstrip('/').split('/')[-1]
    save_string_to_file(course_info_string, os.path.join(output_dir, page_name + '.txt'))

In [10]:
import pandas as pd


url = 'https://cs.duke.edu/course-catalog'
# A timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(url, timeout=30)

# Stop on a failed request: silently skipping it would leave `df` undefined
# (or stale from an earlier run) for the cells that follow.
if response.status_code != 200:
    raise RuntimeError(f"Request to {url} failed with status {response.status_code}")

# Parse the content of the webpage
soup = BeautifulSoup(response.content, 'html.parser')

# Find the catalog <table> with the given class
course_info = soup.find('table', class_="tablesaw tablesaw-stack table table-hover views-table views-view-table cols-3")
if course_info is None:
    raise RuntimeError(f"Course catalog table not found at {url}")

# Column names come from the table's <th> cells
headers = [header.text.strip() for header in course_info.find_all('th')]

# Every <tr> after the first one (the header row) is a data row
rows = course_info.find_all('tr')[1:]
data = []

for row in rows:
    cell_data = []
    for cell in row.find_all('td'):
        # Keep the cell text and, when the cell contains a link, its href
        link = cell.find('a')
        cell_data.append({
            'text': cell.text.strip(),
            'href': link.get('href', '') if link else '',
        })
    data.append(cell_data)

# One DataFrame row per table row; each value is a {'text', 'href'} dict
df = pd.DataFrame(data, columns=headers)


In [11]:
# Absolute course URLs: the site root followed by the href stored in each
# cell of the 'Title' column
urls = [
    'https://cs.duke.edu' + title_cell['href']
    for title_cell in df['Title']
]
print(urls)

['https://cs.duke.edu/courses/principles-computer-science', 'https://cs.duke.edu/courses/first-year-seminar-0', 'https://cs.duke.edu/courses/technical-and-social-analysis-information-and-internet-0', 'https://cs.duke.edu/courses/history-computing-cryptography-and-robotic-devices', 'https://cs.duke.edu/courses/programming-and-problem-solving-0', 'https://cs.duke.edu/courses/programming-and-problem-solving-1', 'https://cs.duke.edu/courses/introduction-computer-science-0', 'https://cs.duke.edu/courses/introduction-computer-science-1', 'https://cs.duke.edu/courses/interdisciplinary-introduction-computer-science', 'https://cs.duke.edu/courses/computing-and-brain', 'https://cs.duke.edu/courses/information-society-culture-bass-connections-gateway', 'https://cs.duke.edu/courses/introduction-digital-feminism', 'https://cs.duke.edu/courses/foundations-data-science-0', 'https://cs.duke.edu/courses/topics-computer-science-2', 'https://cs.duke.edu/courses/duke-administered-study-abroad-special-topi

In [12]:
# Sanity check: how many course URLs were built from the 'Title' column
print(len(urls))

124


In [13]:
from tqdm import tqdm


# Run url2txt on every course URL, with a tqdm progress bar.
# Note: the loop variable rebinds the notebook-level name `url`.
for url in tqdm(urls, desc="Processing URLs"):
    url2txt(url)

Processing URLs: 100%|██████████| 124/124 [02:22<00:00,  1.15s/it]


In [1]:
import os

# Folder holding the scraped course pages (.txt files)
# NOTE(review): absolute, machine-specific path; presumably the same folder
# url2txt writes to through '../data/rag_data/course/'. Confirm.
directory = '/Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/course'

# Tidy every .txt file in place: drop blank lines, strip leading whitespace,
# and delete any file that mentions "No Overview".
for filename in os.listdir(directory):
    if not filename.endswith('.txt'):
        continue
    filepath = os.path.join(directory, filename)

    # Explicit UTF-8 so reading does not depend on the platform default encoding
    with open(filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Keep only lines with visible content; lstrip leaves the trailing newline
    cleaned_lines = [line.lstrip() for line in lines if line.strip()]

    # Delete the file when it contains "No Overview"
    if any("No Overview" in line for line in cleaned_lines):
        os.remove(filepath)
        print(f"Deleted file: {filepath} (contains 'No Overview')")
    else:
        # Write the cleaned lines back to the same file
        with open(filepath, 'w', encoding='utf-8') as file:
            file.writelines(cleaned_lines)
        print(f"Processed file: {filepath}")


Processed file: /Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/course/algorithms-real-world.txt
Processed file: /Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/course/modern-optimization-statistical-learning.txt
Processed file: /Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/course/geometric-algorithms.txt
Processed file: /Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/course/high-dimensional-statistics-and-machine-learning.txt
Processed file: /Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/course/topics-computer-science-3.txt
Processed file: /Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/course/topics-computer-science-2.txt
Processed file: /Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/course/delivering-software-concept-client-0.txt
Processed file: /Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/course/digital-systems-0.txt
Processed file: /Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/course/artificial-intelligence-1.txt
Processed file

In [2]:
# Report how many entries the course folder holds; os.listdir counts every
# entry, not only the .txt files
print(f'now the folder have total {len(os.listdir(directory))} files')

now the folder have total 124 files


In [6]:
import os

# Folder holding the course .txt files
# NOTE(review): absolute, machine-specific path. Confirm it matches the folder
# the scraping cells write to.
directory = '/Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/course'

# Running totals across all .txt files
total_word_count = 0
file_count = 0

for filename in os.listdir(directory):
    if not filename.endswith('.txt'):
        continue
    filepath = os.path.join(directory, filename)

    # Explicit UTF-8 so reading does not depend on the platform default encoding
    with open(filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # A "word" is any whitespace-separated token
    word_count = sum(len(line.split()) for line in lines)

    total_word_count += word_count
    file_count += 1

    print(f"{filename} has {word_count} words")

# Report the average, guarding against a folder with no .txt files
if file_count > 0:
    average_word_count = total_word_count / file_count
    print(f"Average word count per file: {average_word_count:.2f}")
else:
    print("No .txt files found in the directory.")

algorithms-real-world.txt has 86 words
modern-optimization-statistical-learning.txt has 110 words
geometric-algorithms.txt has 42 words
high-dimensional-statistics-and-machine-learning.txt has 72 words
topics-computer-science-3.txt has 18 words
topics-computer-science-2.txt has 2 words
delivering-software-concept-client-0.txt has 67 words
digital-systems-0.txt has 72 words
artificial-intelligence-1.txt has 38 words
computer-science-education-research.txt has 44 words
foundations-blockchains.txt has 118 words
introduction-design-and-analysis-algorithms-0.txt has 41 words
technical-and-social-analysis-information-and-internet-1.txt has 99 words
computer-network-architecture-0.txt has 109 words
topics-computer-science-0.txt has 8 words
topics-computer-science-1.txt has 36 words
topological-data-analysis.txt has 39 words
technical-and-social-analysis-information-and-internet-0.txt has 67 words
duke-administered-study-abroad-advanced-special-topics-computer-science-0.txt has 6 words
topics-