In [None]:
import csv
from bs4 import BeautifulSoup
import requests

# --- Scraper configuration ---------------------------------------------------
COURSERA_BASE_URL = 'https://www.coursera.org'
OUTPUT_CSV = 'courseradata.csv'
# Listing pages are numbered from 1; 84 was the last page that existed at the
# time of scraping.
LAST_PAGE = 84
# Seconds to wait on a single HTTP request; without a timeout one stalled
# connection would block the whole scrape indefinitely.
REQUEST_TIMEOUT = 30
# The metadata line separates its parts with a middle dot (U+00B7). Written as
# an escape so the literal cannot be corrupted by a re-encoding of this file.
MIDDLE_DOT = '\u00b7'
# The same separator as it appears when UTF-8 bytes were decoded as Latin-1.
MOJIBAKE_MIDDLE_DOT = '\u00c2\u00b7'
CSV_HEADER = ['Course', 'Organisation', 'Rating', 'Reviews_count', 'Link', 'Level',
              'Certificate_type', 'Duration', 'Skills', 'Description']
# Class string shared by the reviews paragraph and the metadata paragraph.
CARD_TEXT_CLASS = 'cds-119 cds-Typography-base css-dmxkm1 cds-121'
DESCRIPTION_CLASS = 'cds-119 cds-Typography-base css-80vnnb cds-121'

# List of listing URLs to loop through; the page number is appended to each.
# Kept under its original name in case other cells/scripts reference it.
urls = [
    'https://www.coursera.org/courses?page=',  # Coursera all-courses webpage
    'https://www.coursera.org/search?topic=Data%20Science&page=',  # Data Science
    'https://www.coursera.org/courses?topic=Math%20and%20Logic&page=',  # Math & Logic
    'https://www.coursera.org/courses?topic=Physical%20Science%20and%20Engineering&page=',  # Physical Science
    'https://www.coursera.org/courses?topic=Arts%20and%20Humanities&page=',  # Arts & Humanities
    'https://www.coursera.org/courses?topic=Business&page=',  # Business
    'https://www.coursera.org/courses?topic=Health&page=',  # Health
    'https://www.coursera.org/courses?topic=Computer%20Science&page=',  # Computer Science
    'https://www.coursera.org/courses?topic=Information%20Technology&page=',  # Information Technology
    'https://www.coursera.org/courses?topic=Language%20Learning&page=',  # Language Learning
    'https://www.coursera.org/courses?topic=Personal%20Development&page=',  # Personal Development
]


def split_metadata(metadata_text):
    """Split a card's metadata line into ``(level, certificate_type, duration)``.

    The parts are separated by a middle dot. Both the real character and its
    mojibake form are accepted. Missing or empty parts become ``'N/A'``; any
    parts beyond the third are ignored.
    """
    normalised = metadata_text.replace(MOJIBAKE_MIDDLE_DOT, MIDDLE_DOT)
    parts = [part.strip() or 'N/A' for part in normalised.split(MIDDLE_DOT)]
    parts += ['N/A'] * (3 - len(parts))
    return parts[0], parts[1], parts[2]


def extract_skills(skills_element):
    """Return the stripped text node that follows *skills_element*, or ``'N/A'``.

    ``next_sibling`` may be ``None`` or another tag rather than text, so only a
    string sibling is used.
    """
    sibling = skills_element.next_sibling if skills_element is not None else None
    return sibling.strip() if isinstance(sibling, str) else 'N/A'


def text_or_na(element):
    """Return the element's text, or ``'N/A'`` when the lookup found nothing."""
    return element.text if element is not None else 'N/A'


def fetch_soup(url, what):
    """GET *url* and return its parsed HTML, or ``None`` on any failure.

    *what* names the kind of page in the failure message. Network errors and
    non-200 responses are reported with ``print`` instead of aborting the run.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as exc:
        print(f'Failed to retrieve {what}. Error: {exc}')
        return None
    if response.status_code != 200:
        print(f'Failed to retrieve {what}. Status code: {response.status_code}')
        return None
    return BeautifulSoup(response.text, 'html.parser')


def find_card_columns(soup):
    """Collect, per field, every matching element on a listing page.

    Returns seven lists in the order: titles, organisations, ratings, review
    containers, title links, metadata containers, skills labels.
    """
    return (
        soup.find_all('h3', class_='cds-119 cds-CommonCard-title css-e7lgfl cds-121'),
        soup.find_all('p', class_='cds-119 cds-ProductCard-partnerNames css-dmxkm1 cds-121'),
        soup.find_all('p', class_='cds-119 css-11uuo4b cds-121'),
        soup.find_all('div', class_='product-reviews css-pn23ng'),
        soup.select('a.cds-119.cds-113.cds-115.cds-CommonCard-titleLink.css-si869u.cds-142'),
        soup.find_all('div', class_='cds-CommonCard-metadata'),
        soup.find_all('b', class_='css-14m26ju'),
    )


def build_row(course, organisation, rating, reviews_div, link, metadata_div, skills_element):
    """Build one CSV row for a course card, fetching its page for the description.

    Returns the row as a list in ``CSV_HEADER`` order, or ``None`` when the
    course page could not be retrieved (the course is then skipped).
    """
    reviews_text = text_or_na(reviews_div.find('p', class_=CARD_TEXT_CLASS))
    metadata_text = text_or_na(metadata_div.find('p', class_=CARD_TEXT_CLASS))
    level, certificate_type, duration = split_metadata(metadata_text)

    link_href = link.get('href')
    if link_href:
        course_url = COURSERA_BASE_URL + link_href
        course_soup = fetch_soup(course_url, 'course page')
        if course_soup is None:
            return None
        description_paragraph = course_soup.find('p', class_=DESCRIPTION_CLASS)
        description = (description_paragraph.text.strip()
                       if description_paragraph is not None else 'N/A')
    else:
        # No href on the anchor: there is nothing to fetch, but the data from
        # the listing card is still worth recording.
        course_url = 'N/A'
        description = 'N/A'

    return [course.text, organisation.text, rating.text, reviews_text, course_url,
            level, certificate_type, duration, extract_skills(skills_element), description]


def main():
    """Scrape every listing page of every URL in ``urls`` into ``OUTPUT_CSV``.

    The CSV is opened in append mode so new courses can be added to an existing
    file; the header row is written only when the file is empty. Each written
    row's description and the running row count are printed as progress.
    """
    with open(OUTPUT_CSV, 'a', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)

        # Write the header row if the file is empty
        if csvfile.tell() == 0:
            csvwriter.writerow(CSV_HEADER)

        rows_written = 0
        for url in urls:
            for page in range(1, LAST_PAGE + 1):
                full_url = f'{url}{page}'
                soup = fetch_soup(full_url, 'the page')
                if soup is None:
                    continue

                columns = find_card_columns(soup)
                # NOTE: the fields are collected independently and paired by
                # position, so a card lacking one element shifts every later
                # value on the page. Warn so the misalignment is not silent.
                if len({len(column) for column in columns}) > 1:
                    print(f'Warning: element counts differ on {full_url}; '
                          'rows from this page may be misaligned')

                for card in zip(*columns):
                    row = build_row(*card)
                    if row is None:
                        continue
                    print(row[-1])
                    csvwriter.writerow(row)
                    rows_written += 1
                    print(rows_written)


if __name__ == '__main__':
    main()
