In [7]:
""" Python program that scraps all the table data from website https://oedb.org/rankings/graduation-rate/page using Selenium
which automates the process. Beautiful soup is used in the extract process. Empty lists are initialized to store the table
headers and table rows and data. The lists are converted into data frames. The data frames are written into CSV files"""


import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time

# This main function will configure the browser to run in normal mode and not headless (but the option exists to run in headless mode)
# The message "Chrome is being controlled by automated test software" will be displayed. The extract function will be called and 
# the empty lists will be initialized and the data frames will be written using pandas csv functions.

def main():
    options = Options()
    options.add_argument("window-size=1920,1200")
    # options.headless = True   # Option to run browser in headless mode
    ua = UserAgent()
    a = ua.random
    user_agent = ua.random
    
    
    options.add_argument(f'user-agent={user_agent}')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    
#Empty header and table data lists to be used in the scraping
    title=[]
    row_data = []
    
# Loop through all the pages (there are 571 rows in the table in 6 pages, so setting it to 7 ensures all 6 pages are scraped)
# All data is saved to a dataframe df and the extract function is executed with arguments of the url info and the empty dataframe.
    
    print("Begin Scraping Data....")
    
    for i in range(1,7):
        url = "https://oedb.org/rankings/graduation-rate/page" + str(i)
        driver.get(url)
        extract(driver,row_data)  # Call the data scraping function and give driver and empty list as inputs
          
    df = pd.DataFrame(row_data)
    print("Finished Scraping Data....")
    
# Create the headers by searching for the th (Cannot be done inside loop as we need only the title headers once) 
    headers_t = []  # empty list for headers
    html1 = driver.page_source
    soup1 = BeautifulSoup(html1, 'html.parser')
    table1 = soup1.find('table', {'data-waypoint': 'table-rankings'})
    
# Find all table headers and loop through the page and save them to an empty list.Append headers to the title empty list.

    for j in table1.find_all('th'):
        title = j.text  
        headers_t.append(title)   # Append headers to the title
        
# These titles are needed because on extracting the data, it was found that under td tags, more data than columns
# So dummy titles below will be needed to serve as titles. They will dropped later during the transformation stage.
# Display the data frame on the screen with display

    x = ["Title A","Title B","Title C", "Title D", "Title E", "Title F"]
    headers2t = headers_t + x   # Add the dummy titles to the actual titles 
    
# Convert lists to dataframe and save data frame to CSV file with index turned to False

    df_head = pd.DataFrame(headers2t)  # Convert the list to a dataframe
    df.columns = headers2t
    df.to_csv("college_final.csv", index=False)
    df_head.to_csv("output_headers_final.csv", index = False)
    
    time.sleep(5)
    
    print("Data Frame Loading .....")
    
    display(df) # display the dataframe on the screen
    
""" This function scrapes the table data from the website with BeautifulSoup.. There is a pause of 5 seconds between scraping the 
6 pages so as not to bombard the webserver. Function expects url and empty list arguments. BeautifulSoup is used with a html parser
to find the table class and also find the tag tr and td and append them to the empty list""" 
    
def extract(driver,row_data):
        
    time.sleep(5)
        
    html = driver.page_source
       
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', {'data-waypoint': 'table-rankings'})  # Searching all html for table tag, class data-waypoint

    for row in table.find_all('tr'):
        col = row.find_all('td')
        col = [ele.text.strip() for ele in col]
        row_data.append(col)
        
if __name__ == '__main__':
    main()

Begin Scraping Data....
Finished Scraping Data....
Data Frame Loading .....


Unnamed: 0,Rank,School,Student to Faculty Ratio,Graduation Rate,Retention Rate,Acceptance Rate,Enrollment Rate,Institutional Aid Rate,Default Rate,Title A,Title B,Title C,Title D,Title E,Title F
0,,,,,,,,,,,,,,,
1,1,Luther Rice University & Seminary,23 to 1,100%,100%,100%,100%,65%,,,,,",",N/AN/A,
2,2,Maine College of Health Professions,5 to 1,100%,100%,25%,25%,59%,,,,,",",N/AN/A,
3,3,Averett University-Non-Traditional Programs,7 to 1,100%,,55%,17%,34%,11%,,,,",",N/AN/A,
4,4,Virginia Baptist College,5 to 1,100%,25%,,,38%,,,,,",",N/AN/A,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
572,567,Grace College of Divinity,6 to 1,,83%,53%,50%,24%,,,,,",",N/AN/A,
573,568,Midwives College of Utah,5 to 1,,100%,100%,100%,30%,,,,,",",N/AN/A,
574,569,Touro University Worldwide,13 to 1,,100%,,,76%,4%,,,,",",N/AN/A,
575,570,Unitek College,16 to 1,,100%,,,20%,,,,,",",N/AN/A,
