In [None]:
# web_page to all professor's personal page
web_page = 'https://cs.duke.edu/people/appointed-faculty/primary-faculty'

In [13]:
import requests
from bs4 import BeautifulSoup

# URL of the webpage to scrape
url = "https://cs.duke.edu/people/appointed-faculty/primary-faculty"
all_personal_pages = set()

# Send a GET request to the webpage
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content of the webpage
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all links (anchor tags) in the webpage
    links = soup.find_all('a', href=True)
    
    # Filter and print URLs that could be personal pages
    print("Personal URLs found:")
    for link in links:
        href = link['href']
        # Check if the link contains "http" (external link) and possibly represents a personal page
        if "https://scholars.duke.edu/person/" in href:
            all_personal_pages.add(href)
    all_personal_pages = list(all_personal_pages)
    print(all_personal_pages)
    print(f'in total, we have {len(all_personal_pages)} unique pages')
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


Personal URLs found:
['https://scholars.duke.edu/person/debmalya.panigrahi', 'https://scholars.duke.edu/person/brandon.fain', 'https://scholars.duke.edu/person/tomasi', 'https://scholars.duke.edu/person/alberto', 'https://scholars.duke.edu/person/ashwin', 'https://scholars.duke.edu/person/Kristin.Stephens-Martinez', 'https://scholars.duke.edu/person/rongge', 'https://scholars.duke.edu/person/tananun.songdechakraiwut', 'https://scholars.duke.edu/person/sudeepa', 'https://scholars.duke.edu/person/xiaowei.yang', 'https://scholars.duke.edu/person/kamesh', 'https://scholars.duke.edu/person/nicki.washington', 'https://scholars.duke.edu/person/rcd', 'https://scholars.duke.edu/person/monica.agrawal', 'https://scholars.duke.edu/person/rudin', 'https://scholars.duke.edu/person/raluca.gordan', 'https://scholars.duke.edu/person/yesenia.velasco', 'https://scholars.duke.edu/person/Jian.Pei', 'https://scholars.duke.edu/person/rodger', 'https://scholars.duke.edu/person/pardis.emami_naeini', 'https://s

In [36]:
def get_overview(url, cnt=0):
    rst = ''
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the webpage
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find the div with class 'excerpt'
        excerpt_div = soup.find('div', class_='excerpt')
        name_span = soup.find('span', class_='pr-3 leading-d1') 

        
        # Extract the data-excerpt-text or the full-text if available
        if excerpt_div and name_span:
            cleaned_text = excerpt_div.get_text(strip=True)  # Strips leading/trailing whitespace
            name_span = name_span.get_text(strip=True)
            rst = f'Overview of professor {name_span}: \n' + cleaned_text
            return rst
        else:
            rst = "No Overview"
            return rst
    else:
        if cnt>2:
            print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
            print(url)
            print('-----------')
        else:
            get_overview(url, cnt+1)

In [37]:
print(get_overview('https://scholars.duke.edu/person/pankaj'))

Overview of professor Pankaj K. Agarwal: 
Geometric algorithms, discrete geometry, geometric data analysis, data structures, database systems and data mining, robotics algorithms, geographic information systems.


In [38]:
import os

def save_string_to_file(text, filename):
    """
    Save a string to a text file, creating the folder if it doesn't exist.
    
    :param text: The string to save
    :param filename: The name of the file, including its path
    """
    # Extract the directory from the filename
    folder = os.path.dirname(filename)
    
    # Create the folder if it doesn't exist
    if folder and not os.path.exists(folder):
        os.makedirs(folder)
    
    # Write the text to the file
    with open(filename, 'w') as file:
        file.write(text)


In [39]:
from tqdm import tqdm


for url in tqdm(all_personal_pages, desc="Processing URLs"):
    text = get_overview(url)
    save_string_to_file(text, '../data/rag_data/professor/'+url.split('/')[-1]+'.txt')

Processing URLs: 100%|██████████| 43/43 [00:16<00:00,  2.55it/s]


In [40]:
import os

# Define the directory path
directory = '/Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/professor'

# Iterate over all .txt files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        filepath = os.path.join(directory, filename)
        
        # Read the file and process the lines
        with open(filepath, 'r') as file:
            lines = file.readlines()

        # Remove empty lines and strip leading whitespace
        cleaned_lines = [line.lstrip() for line in lines if line.strip()]

        # Check if the file contains "No Overview" and delete it if so
        if any("No Overview" in line for line in cleaned_lines):
            os.remove(filepath)
            print(f"Deleted file: {filepath} (contains 'No Overview')")
        else:
            # Write the cleaned lines back to the file
            with open(filepath, 'w') as file:
                file.writelines(cleaned_lines)
            print(f"Processed file: {filepath}")


Processed file: /Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/professor/amink.txt
Deleted file: /Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/professor/anru.zhang.txt (contains 'No Overview')
Processed file: /Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/professor/bhuwan.dhingra.txt
Processed file: /Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/professor/bruce.donald.txt
Processed file: /Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/professor/Jian.Pei.txt
Processed file: /Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/professor/danyang.zhuo.txt
Processed file: /Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/professor/Michael.Reiter.txt
Processed file: /Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/professor/ola.txt
Processed file: /Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/professor/pardis.emami_naeini.txt
Deleted file: /Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/professor/alexander.steiger.txt (contains 'No Overview')


In [41]:
print(f'now the folder have total {len(os.listdir(directory))} files')

now the folder have total 33 files


In [42]:
import os

# Define the directory path
directory = '/Users/zihengs/Desktop/Duke_AI_advisor/data/rag_data/professor'

# Initialize counters for total word count and file count
total_word_count = 0
file_count = 0

# Iterate over all .txt files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        filepath = os.path.join(directory, filename)
        
        # Read the file and count the words
        with open(filepath, 'r') as file:
            lines = file.readlines()
            word_count = sum(len(line.split()) for line in lines)  # Count words in each line
        
        # Update counters
        total_word_count += word_count
        file_count += 1
        
        print(f"{filename} has {word_count} words")

# Calculate and print the average word count
if file_count > 0:
    average_word_count = total_word_count / file_count
    print(f"Average word count per file: {average_word_count:.2f}")
else:
    print("No .txt files found in the directory.")

amink.txt has 31 words
bhuwan.dhingra.txt has 18 words
bruce.donald.txt has 38 words
Jian.Pei.txt has 19 words
danyang.zhuo.txt has 21 words
Michael.Reiter.txt has 74 words
ola.txt has 18 words
pardis.emami_naeini.txt has 93 words
xiaowei.yang.txt has 9 words
pankaj.txt has 25 words
chase.txt has 21 words
reif.txt has 28 words
brandon.fain.txt has 30 words
robert.calderbank.txt has 413 words
rudin.txt has 312 words
yesenia.velasco.txt has 19 words
rongge.txt has 11 words
bmm.txt has 24 words
Kristin.Stephens-Martinez.txt has 120 words
tomasi.txt has 58 words
kate.o.hanlon.txt has 19 words
parr.txt has 24 words
Kartik.Nayak.txt has 30 words
debmalya.panigrahi.txt has 22 words
xiaobai.sun.txt has 32 words
Matthew.Lentz.txt has 12 words
rodger.txt has 11 words
Benjamin.Rossman.txt has 58 words
rcd.txt has 19 words
junyang.txt has 60 words
alberto.txt has 109 words
kamesh.txt has 43 words
sudeepa.txt has 204 words
Average word count per file: 61.36
