In [1]:
import requests
from bs4 import BeautifulSoup
import csv
from tqdm import tqdm
import json

session = requests.Session()

# Seconds to wait for a server response before giving up on a URL. Without a
# timeout a single stalled connection blocks the whole scrape indefinitely.
REQUEST_TIMEOUT = 30

# open the urls.csv for scraping; the context manager closes the file even if
# parsing fails, and newline='' is what the csv module expects for its input
with open('urls.csv', newline='') as file:
    data = list(csv.reader(file, delimiter=","))

# separate urls based on Cloud Provider: column 0 is Azure, 1 is AWS, 2 is Google.
# Rows that are too short for a column, and blank cells, are skipped instead of
# raising IndexError or producing a pointless request for an empty URL.
provider_columns = {'Azure': 0, 'AWS': 1, 'Google': 2}
providers = {provider: [row[column] for row in data
                        if len(row) > column and row[column].strip()]
             for provider, column in provider_columns.items()}

# One entry per successfully scraped course; index k of every list describes
# the same course, so the lists must always be appended to together.
name, course_name, course_URL, price, enrollment, rating, num_ratings = [], [], [], [], [], [], []

# scraping loop for each cloud providers urls
for provider, urls in providers.items():
    for URL in tqdm(urls, desc=provider):
        # open the webpage and grab the html; only network/URL errors are
        # caught so that Ctrl-C (KeyboardInterrupt) still stops the run
        try:
            html = session.get(URL, timeout=REQUEST_TIMEOUT).text
        except requests.RequestException as exc:
            print(URL, "- request failed:", exc)
            continue
        soup = BeautifulSoup(html, 'html.parser')

        # Extract every field into a temporary first. soup.find() returns None
        # when an element is missing, so a missing element surfaces as
        # AttributeError (or IndexError for the empty span list) and the course
        # is skipped as a whole - nothing is appended for a partial scrape.
        try:
            course_name_temp = soup.find("h1", {"class": "ud-heading-xl clp-lead__title clp-lead__title--small"}).text

            # get the price (non sale) of each course; may be None when the
            # meta tag exists but has no content attribute
            meta_price = soup.find("meta", {"property": "udemy_com:price"})
            price_temp = meta_price.get("content", None)

            # check the number of subscribers for each course
            enrollment_temp = soup.find("div", {"class": "enrollment"}).text

            # find the rating out of 5 for each course
            rating_wrapper = soup.find("span", {"class": "star-rating--star-wrapper--5Fj0L star-rating--medium--3Lhzz star-rating--dark-background--4E2W3"})
            rating_temp = rating_wrapper.find("span", {"class": "ud-heading-sm star-rating--rating-number--3l80q"}).text

            # find the number of ratings for each course: the last <span>
            # inside the link of the lead row
            lead_row = soup.find("div", {"class": "clp-lead__element-item clp-lead__element-item--row"})
            num_ratings_temp = lead_row.find("a").find_all("span")[-1].text
        except (AttributeError, IndexError):
            print(URL)
            continue

        # All fields were found: record the course in every list at once.
        name.append(provider)
        course_name.append(course_name_temp)
        course_URL.append(URL)
        price.append(price_temp)
        enrollment.append(enrollment_temp)
        rating.append(rating_temp)
        num_ratings.append(num_ratings_temp)


 52%|█████▏    | 215/412 [02:22<02:22,  1.39it/s]

https://www.udemy.com/course/microsoft-azure-az-900-microsoft-fundamentals-certification/


100%|██████████| 412/412 [05:08<00:00,  1.34it/s]
 62%|██████▏   | 254/412 [04:24<48:43, 18.50s/it]

https://www.udemy.com/course/aws-certified-sysops-admin-associate/


100%|██████████| 412/412 [04:54<00:00,  1.40it/s]
100%|██████████| 412/412 [00:45<00:00,  8.97it/s]


In [2]:
# cleanup function: strips the currency symbol / unit words around the scraped values
def cleanup(price, enrollment, rating, num_ratings):
    """Strip decoration characters from the scraped price, enrollment and rating-count text.

    The four lists are expected to be parallel (same length, one entry per
    course). They are modified in place and also returned, so both
    ``cleanup(...)`` and ``a, b, c, d = cleanup(...)`` work.

    Args:
        price: price strings; leading/trailing "$" characters are removed.
        enrollment: enrollment strings; leading/trailing characters from the
            set " studen" are removed.
        rating: rating strings; returned untouched, only its length is checked.
        num_ratings: rating-count strings; leading/trailing characters from
            the set "() ratings" are removed.

    Returns:
        The tuple ``(price, enrollment, rating, num_ratings)``.

    Notes:
        Entries that are not strings (for example a ``None`` price) are left
        as they are instead of raising ``AttributeError``.
        If the lists differ in length nothing is cleaned and a warning is
        issued, so the mismatch does not pass unnoticed.
    """
    import warnings

    def strip_chars(value, chars):
        # str.strip treats `chars` as a SET of characters to trim from both
        # ends, not as a substring; digits, commas and dots are never in the
        # sets used below, so the numeric part is preserved.
        return value.strip(chars) if isinstance(value, str) else value

    info = [price, enrollment, rating, num_ratings]

    if any(len(column) != len(price) for column in info[1:]):
        warnings.warn(
            "cleanup skipped: input lists have different lengths "
            + str([len(column) for column in info]),
            stacklevel=2,
        )
        return price, enrollment, rating, num_ratings

    for i in range(len(price)):
        price[i] = strip_chars(price[i], "$")
        enrollment[i] = strip_chars(enrollment[i], " studen")
        num_ratings[i] = strip_chars(num_ratings[i], "() ratings")

    return price, enrollment, rating, num_ratings

In [3]:
# Clean the scraped text columns (the lists are rebound to cleanup's return value).
price, enrollment, rating, num_ratings = cleanup(price, enrollment, rating, num_ratings)

# Sanity check: print the length of every column; they should all be equal.
print(*map(len, (name, course_name, course_URL, price, enrollment, rating, num_ratings)))

766 766 766 766 766 766 766


In [None]:
# creates csv file with all scraping information
# The zipped columns must appear in exactly the same order as header_list;
# course_name and course_URL were previously missing, which shifted every
# value after `name` under the wrong header.
header_list = ('name','course_name','course_URL','price', 'enrollment', 'rating', 'num_ratings')
rows = zip(name, course_name, course_URL, price, enrollment, rating, num_ratings)

# newline='' lets the csv module control line endings (avoids blank rows on
# Windows); utf-8 so course titles with non-ASCII characters can be written.
with open('udemy_data.csv', "w", newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(header_list)
    writer.writerows(rows)