
# **Web Scraping Class Central dengan Selenium**
---
Pada bagian ini, akan dilakukan web scraping dari situs [Class Central](https://www.classcentral.com/subject/cs) menggunakan **Selenium**.

Tujuan scraping ini adalah untuk mengambil data kursus seperti judul, provider, rating, bahasa, ketersediaan sertifikat, dan status gratis/berbayar.

In [18]:
import requests
import time
import json
import logging
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

from webdriver_manager.chrome import ChromeDriverManager

print("All libraries installed successfully!")

All libraries installed successfully!


In [20]:
def setup_driver():
    """Create and return a headless Chrome WebDriver.

    The chromedriver binary is resolved through ``ChromeDriverManager`` and
    Chrome is started with ``--headless``, ``--no-sandbox`` and
    ``--disable-dev-shm-usage``. The original notes describe these flags as
    the setup used to run the browser reliably on Google Colab.

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    options = Options()
    for flag in (
        "--headless",  # drop this flag to show the browser window
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        options.add_argument(flag)

    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(chromedriver_path), options=options)

## Scraping Data Class Central

Mengambil data hingga 100 halaman pertama untuk setiap subjek (Data Science, Machine Learning, Data Analysis, Data Engineering, dan Computer Science) di Class Central. Untuk setiap kursus, kita ambil informasi:
- Judul kursus
- Kategori (subjek)
- Provider/platform
- Bahasa
- Sertifikat (tersedia/tidak)
- Rating rata-rata
- Status (gratis atau berbayar)
- Jumlah ulasan
- Durasi
- Ringkasan (overview) kursus
- Link ke kursus



---



Menggunakan struktur HTML dan atribut `data-track-props` untuk mengekstrak informasi tersebut.


In [21]:
from urllib.parse import urljoin

# Site root used to resolve course links that are not already absolute.
_CLASSCENTRAL_BASE_URL = "https://www.classcentral.com"


def _card_text_or_default(root, selector, default):
    """Return the stripped text of the first CSS match under ``root``.

    ``default`` is returned when no element matches ``selector``.
    """
    try:
        return root.find_element(By.CSS_SELECTOR, selector).text.strip()
    except NoSuchElementException:
        return default


def _parse_classcentral_card(card, subject):
    """Build one course record (a dict) from a course card element.

    Raises whatever Selenium/JSON error occurs when the title anchor is
    missing or its ``data-track-props`` payload is not valid JSON; the caller
    logs the error and skips the card.
    """
    # The title anchor is mandatory: it carries the name, the link and the
    # JSON metadata attribute.
    title_elem = card.find_element(By.CSS_SELECTOR, 'a.color-charcoal.course-name')
    title = title_elem.text.strip()
    link = title_elem.get_attribute('href')

    # Hidden JSON attribute with the structured course metadata. A missing
    # attribute falls back to the per-field defaults below instead of
    # discarding the course.
    data_props_raw = title_elem.get_attribute('data-track-props')
    data_props = json.loads(data_props_raw) if data_props_raw else {}

    provider = data_props.get("course_provider", "Unknown")
    certificate = (
        "Certificate Available"
        if data_props.get("course_certificate", False)
        else "No Certificate"
    )
    language = data_props.get("course_language", "N/A")

    # A null or non-numeric rating must not discard the whole course.
    try:
        avg_rating = round(float(data_props.get("course_avg_rating") or 0.0), 1)
    except (TypeError, ValueError):
        avg_rating = 0.0

    price_type = "Free" if data_props.get("course_is_free", False) else "Paid"

    # find_element raises instead of returning a falsy value, so optional
    # fields need an explicit fallback.
    course_overview = _card_text_or_default(
        card, 'a.color-charcoal.block.hover-no-underline.break-word', ""
    )
    duration = _card_text_or_default(
        card, 'span[aria-label="Workload and duration"]', "N/A"
    )

    # The review count sits two levels above the title anchor.
    try:
        reviews = (
            title_elem.find_element(By.XPATH, '../..')
            .find_element(By.CSS_SELECTOR, 'span.color-gray')
            .text.strip()
        )
    except NoSuchElementException:
        reviews = "0 reviews"

    return {
        'title': title,
        'category': subject,
        'provider': provider,
        'language': language,
        'certificate': certificate,
        'avg_rating': avg_rating,
        'price_type': price_type,
        'reviews': reviews,
        'duration': duration,
        'overview': course_overview,
        # Selenium returns the resolved (absolute) href, so the site root is
        # only applied when the value is relative; urljoin leaves absolute
        # URLs untouched instead of doubling the prefix.
        'link': urljoin(_CLASSCENTRAL_BASE_URL, link or ""),
    }


def scrape_classcentral(driver, urls_to_scrape, max_pages=100):
    """Scrape course listings from Class Central subject pages.

    Args:
        driver: Selenium WebDriver used to load the listing pages.
        urls_to_scrape: Mapping of subject label to the subject's base URL.
        max_pages: Number of listing pages requested per subject
            (``?page=1`` .. ``?page=max_pages``). Defaults to 100, the value
            that was previously hard-coded.

    Returns:
        A pandas DataFrame with one row per course and the columns
        ``title``, ``category``, ``provider``, ``language``, ``certificate``,
        ``avg_rating``, ``price_type``, ``reviews``, ``duration``,
        ``overview`` and ``link``. The frame is empty (no columns) when no
        course could be read.
    """
    all_courses_data = []
    for subject, base_url in urls_to_scrape.items():
        logging.info(f"===== Memulai scraping untuk subjek: {subject.upper()} =====")

        for page in range(1, max_pages + 1):
            url = f"{base_url}?page={page}"
            logging.info(f"Scraping halaman {page}: {url}")
            driver.get(url)
            time.sleep(3)  # fixed pause to give the page time to load

            # Every course on the listing page is rendered as one list item.
            course_cards = driver.find_elements(By.CSS_SELECTOR, 'li.course-list-course')
            logging.info(f"Halaman {page}: ditemukan {len(course_cards)} kursus.")

            for card in course_cards:
                try:
                    all_courses_data.append(_parse_classcentral_card(card, subject))
                except Exception as e:
                    # Best effort: one unreadable card must not stop the run.
                    logging.warning(f"Error saat membaca 1 kursus di halaman {page}: {e}")

    # Convert the collected records to a DataFrame.
    return pd.DataFrame(all_courses_data)


In [23]:
# Run the Class Central scrape end to end: start a driver, scrape every
# subject, de-duplicate, rename the columns for presentation and save a CSV.
driver = None
try:
    driver = setup_driver()

    # Subject label -> listing URL.
    target_urls = {
        "Data Science": "https://www.classcentral.com/subject/data-science",
        "Machine Learning": "https://www.classcentral.com/subject/machine-learning",
        "Data Analysis": "https://www.classcentral.com/subject/data-analysis",
        "Data Engineering": "https://www.classcentral.com/subject/data-engineering",
        "Computer Science": "https://www.classcentral.com/subject/cs"
    }

    scraped_data = scrape_classcentral(driver, target_urls)

    if scraped_data.empty:
        logging.warning("Scraping tidak menghasilkan data.")
    else:
        # Scraper column name -> presentation header.
        new_column_names = {
            'title': 'Title',
            'category': 'Category',
            'provider': 'Provider',
            'language': 'Language',
            'certificate': 'Certificate',
            'avg_rating': 'Average Rating',
            'price_type': 'Price Type',
            'reviews': 'Reviews',
            'duration': 'Duration',
            'overview': 'Overview',
            'link': 'Link'
        }

        # Keep the first occurrence of each (title, provider) pair, then
        # apply the presentation headers.
        scraped_data = scraped_data.drop_duplicates(
            subset=['title', 'provider'], keep='first'
        ).rename(columns=new_column_names)
        logging.info("Nama kolom berhasil diubah.")

        # Save to CSV using the renamed headers.
        output_file_csv = 'courses_data.csv'
        scraped_data.to_csv(output_file_csv, index=False)

        print(f"\n✅ Scraping selesai. Data disimpan ke: '{output_file_csv}'")
        print("Contoh 5 baris pertama data dengan header baru:")
        print(scraped_data.head())

finally:
    # Always release the browser, even when scraping failed part-way.
    if driver is not None:
        driver.quit()
        logging.info("WebDriver ditutup.")



In [25]:
# Reload 'courses_data.csv' (the file name the Class Central cell writes) and
# display it as the cell output.
df = pd.read_csv('courses_data.csv')
df

FileNotFoundError: [Errno 2] No such file or directory: 'courses_data.csv'

## Scraping Data JobStreet

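Mengambil data lowongan kerja dari JobStreet Malaysia, Singapura, dan Indonesia berdasarkan kata kunci *data scientist*, *data analyst*, *machine learning*, *data engineer*, *data science*, dan *computer science*. Untuk setiap lowongan, kita ambil informasi:
- Judul lowongan
- Perusahaan
- Negara
- Lokasi
- Kategori
- Tipe pekerjaan
- Gaji (jika tersedia)
- Persyaratan (diekstrak dari deskripsi)
- Deskripsi
- Link ke lowongan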



---



Menggunakan struktur HTML dan atribut `data-automation` untuk mengekstrak informasi tersebut.


In [None]:
keywords = ["data scientist", "data analyst", "machine learning", "data engineer", "data science", "computer science"]

# Function for scraping per country
def scrape_jobstreet(domain, country_label, driver=None):
    """Collect job-detail links from one JobStreet country site.

    One search page is loaded for every entry in the module-level
    ``keywords`` list and the ``href`` of every job-title anchor is collected.

    Args:
        domain: Host name of the country site, e.g. ``"jobstreet.com.my"``.
        country_label: Label stored alongside every link found on that site.
        driver: Selenium WebDriver to use. When omitted, the notebook-level
            ``driver`` variable is used, which keeps two-argument calls
            working.

    Returns:
        A list of ``(href, country_label)`` tuples. Keywords whose result
        page times out contribute nothing.

    Raises:
        RuntimeError: If no driver was passed and no notebook-level
            ``driver`` exists.
    """
    if driver is None:
        driver = globals().get("driver")
        if driver is None:
            raise RuntimeError(
                "No WebDriver available: pass `driver` or create one with setup_driver() first."
            )

    job_title_selector = "a[data-automation='jobTitle']"
    all_hrefs = []
    for keyword in keywords:
        search_keyword = keyword.replace(" ", "-")
        search_url = f"https://{domain}/en/job-search/{search_keyword}-jobs/"
        print(f"\nSearching on {country_label.upper()} for: {keyword.upper()} jobs")
        driver.get(search_url)
        try:
            # Wait until at least one job-title anchor is present.
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, job_title_selector))
            )
            hrefs = []
            for el in driver.find_elements(By.CSS_SELECTOR, job_title_selector):
                href = el.get_attribute("href")  # read once per element
                if href:
                    hrefs.append((href, country_label))
            all_hrefs.extend(hrefs)
            print(f"  Found {len(hrefs)} job links in {country_label} for '{keyword}'")
        except TimeoutException:
            print(f"  Timeout loading results for: {keyword} in {country_label}")
    return all_hrefs

# Substrings (lower-case) that mark a description line as a requirement line.
_REQUIREMENT_MARKERS = ("requirement", "qualification", "kualifikasi")


def scrape_jobstreet_details(job_links, driver):
    """Open every unique job link and collect the posting details.

    Args:
        job_links: Iterable of ``(href, country_label)`` pairs, as returned by
            ``scrape_jobstreet``.
        driver: Selenium WebDriver used to load the job pages.

    Returns:
        A pandas DataFrame with the columns ``Title``, ``Company``,
        ``Country``, ``Location``, ``Category``, ``Work Type``, ``Salary``,
        ``Requirements``, ``Description`` and ``Link``. Postings that time out
        or lack a mandatory element are reported and skipped.
    """
    # Deduplicate by URL; the first country seen for a URL wins.
    unique_href_map = {}
    for href, country in job_links:
        if href and href not in unique_href_map:
            unique_href_map[href] = country

    scraped_jobs = []
    for i, (href, country_name) in enumerate(unique_href_map.items(), start=1):
        if not href.startswith("http"):
            continue

        print(f"\n[{i}] Navigating to: {href}")
        try:
            driver.get(href)
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "h1[data-automation='job-detail-title']"))
            )

            title = driver.find_element(By.CSS_SELECTOR, "h1[data-automation='job-detail-title']").text
            company = driver.find_element(By.CSS_SELECTOR, "span[data-automation='advertiser-name']").text
            location = driver.find_element(By.CSS_SELECTOR, "span[data-automation='job-detail-location']").text
            category = driver.find_element(By.CSS_SELECTOR, "span[data-automation='job-detail-classifications']").text
            work_type = driver.find_element(By.CSS_SELECTOR, "span[data-automation='job-detail-work-type']").text
            description = driver.find_element(By.CSS_SELECTOR, "div[data-automation='jobAdDetails']").text

            # Salary is optional on a posting.
            try:
                salary = driver.find_element(By.CSS_SELECTOR, "span[data-automation='job-detail-salary']").text
            except NoSuchElementException:
                salary = "Not specified"

            # Simple requirement extraction: keep the description lines that
            # mention a requirement/qualification keyword. Joining avoids the
            # dangling " |" separator at the end of the field.
            requirement_lines = [
                line.strip()
                for line in description.splitlines()
                if any(marker in line.lower() for marker in _REQUIREMENT_MARKERS)
            ]
            requirements = " | ".join(requirement_lines) or "Not specified"

            scraped_jobs.append({
                "Title": title.strip(),
                "Company": company.strip(),
                "Country": country_name.strip().title(),
                "Location": location.strip(),
                "Category": category.strip(),
                "Work Type": work_type.strip(),
                "Salary": salary.strip(),
                "Requirements": requirements.strip(),
                "Description": description.strip(),
                "Link": href.strip()
            })

            print(f"  Title: {title}")
            print(f"  Company: {company}")
            print(f"  Country: {country_name}")
            print(f"  Location: {location}")
            print(f"  Category: {category}")
            print(f"  Work Type: {work_type}")
            print(f"  Salary: {salary}")
            print(f"  Requirements: {requirements}")
            print(f"  Description preview: {description[:200]}...")
            print(f"  Link: {href}")

        except (TimeoutException, NoSuchElementException) as e:
            print(f"  Error scraping {href}: {e}")

    return pd.DataFrame(scraped_jobs)


# This cell owns its browser session: the Class Central cell quits the driver
# it created, so a fresh one is started here and always released afterwards.
driver = setup_driver()
try:
    # Scrape per country
    all_job_hrefs_malaysia = scrape_jobstreet("jobstreet.com.my", "malaysia")
    all_job_hrefs_singapore = scrape_jobstreet("jobstreet.com.sg", "singapore")
    all_job_hrefs_indonesia = scrape_jobstreet("id.jobstreet.com", "indonesia")

    # Combine the links from all countries
    all_job_hrefs = all_job_hrefs_malaysia + all_job_hrefs_singapore + all_job_hrefs_indonesia

    # Deduplicate the links and scrape the detail page of every job
    df_jobs = scrape_jobstreet_details(all_job_hrefs, driver)
finally:
    driver.quit()

print("\nTotal jobs scraped:", len(df_jobs))

# Main Execution

In [17]:
if __name__ == "__main__":
    # Full run: JobStreet first, then Class Central, sharing one browser
    # session that is always released at the end.
    driver = setup_driver()
    try:
        # Jobstreet Scraping
        # scrape_jobstreet is defined in this notebook with the parameters
        # (domain, country_label) and drives the notebook-level `driver`
        # assigned just above, so no driver argument is passed here.
        job_links = []
        job_links += scrape_jobstreet("jobstreet.com.my", "malaysia")
        job_links += scrape_jobstreet("jobstreet.com.sg", "singapore")
        job_links += scrape_jobstreet("id.jobstreet.com", "indonesia")

        # NOTE(review): scrape_jobstreet_details must be defined before this
        # cell runs; confirm it exists in the notebook.
        df_jobstreet = scrape_jobstreet_details(job_links, driver)
        df_jobstreet.to_csv("cleaned_jobstreet.csv", index=False)
        print("\n✅ JobStreet scraping done. Saved to cleaned_jobstreet.csv")

        # Class Central Scraping
        urls = {
            "Data Science": "https://www.classcentral.com/subject/data-science",
            "Machine Learning": "https://www.classcentral.com/subject/machine-learning",
            "Data Analysis": "https://www.classcentral.com/subject/data-analysis",
            "Data Engineering": "https://www.classcentral.com/subject/data-engineering",
            "Computer Science": "https://www.classcentral.com/subject/cs"
        }

        df_classcentral = scrape_classcentral(driver, urls)
        # scrape_classcentral returns lower-case column names ('title',
        # 'provider', ...); an empty result has no columns at all, so the
        # de-duplication is skipped in that case.
        if not df_classcentral.empty:
            df_classcentral.drop_duplicates(subset=["title", "provider"], inplace=True)
        df_classcentral.to_csv("cleaned_classentral.csv", index=False)
        print("\n✅ Class Central scraping done. Saved to cleaned_classentral.csv")
    finally:
        driver.quit()


Searching on MALAYSIA for: DATA SCIENTIST
  Timeout loading results for: data scientist in malaysia

Searching on MALAYSIA for: DATA ANALYST
  Timeout loading results for: data analyst in malaysia

Searching on MALAYSIA for: MACHINE LEARNING
  Timeout loading results for: machine learning in malaysia

Searching on MALAYSIA for: DATA ENGINEER
  Timeout loading results for: data engineer in malaysia

Searching on MALAYSIA for: DATA SCIENCE
  Timeout loading results for: data science in malaysia

Searching on SINGAPORE for: DATA SCIENTIST
  Timeout loading results for: data scientist in singapore

Searching on SINGAPORE for: DATA ANALYST
  Timeout loading results for: data analyst in singapore

Searching on SINGAPORE for: MACHINE LEARNING
  Timeout loading results for: machine learning in singapore

Searching on SINGAPORE for: DATA ENGINEER
  Timeout loading results for: data engineer in singapore

Searching on SINGAPORE for: DATA SCIENCE
  Timeout loading results for: data science in si

In [11]:
# Load a previously exported CSV into df_job and display it.
# NOTE(review): the path is rooted at "\dataset\" with backslash separators and
# the recorded run of this cell failed with FileNotFoundError; the file name
# says "classcentral" while the variable is called df_job, so confirm which
# file was intended.
df_job = pd.read_csv('\\dataset\\classcentral_data.csv')
df_job

FileNotFoundError: [Errno 2] No such file or directory: '\\dataset\\classcentral_data.csv'

In [None]:
# Additional libraries for text preprocessing

import re

import nltk  # required by the nltk.download(...) calls below
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download the NLTK resources used for English text
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt_tab')

print("Library berhasil di-import.")

In [None]:
# Remove HTML tags, URLs, hashtags, emoji/punctuation, digits and extra spaces
def clean_noise(text):
    """Strip markup and noise characters from a piece of text.

    Args:
        text: The text to clean. ``None`` and float NaN (how pandas represents
            an empty CSV cell) are treated as empty text; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string with single spaces between words and no leading or
        trailing whitespace.
    """
    # Missing cells must not crash re.sub; `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Tags, URLs and hashtags are removed before the generic punctuation
    # pass, which would otherwise strip the characters that identify them.
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove emoji and punctuation (anything that is not a word char or space)
    text = re.sub(r'[^\w\s]', '', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Collapse runs of whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
# Stopword removal

# English stopword set used as the filter
from nltk.corpus import stopwords
list_stopwords = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` without the tokens found in ``list_stopwords``.

    The text is split on whitespace, every token that is an exact member of
    ``list_stopwords`` is dropped, and the remaining tokens are re-joined
    with single spaces.
    """
    return ' '.join(token for token in text.split() if token not in list_stopwords)

In [None]:
# Remove HTML tags, URLs, hashtags, emoji/punctuation, digits and extra spaces
def clean_noise(text):
    """Strip markup and noise characters from a piece of text.

    Args:
        text: The text to clean. ``None`` and float NaN (how pandas represents
            an empty CSV cell) are treated as empty text; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string with single spaces between words and no leading or
        trailing whitespace.
    """
    # Missing cells must not crash re.sub; `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Tags, URLs and hashtags are removed before the generic punctuation
    # pass, which would otherwise strip the characters that identify them.
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove emoji and punctuation (anything that is not a word char or space)
    text = re.sub(r'[^\w\s]', '', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Collapse runs of whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
# Stopword removal

# English stopword set used as the filter
from nltk.corpus import stopwords
list_stopwords = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` without the tokens found in ``list_stopwords``.

    The text is split on whitespace, every token that is an exact member of
    ``list_stopwords`` is dropped, and the remaining tokens are re-joined
    with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in list_stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Stemming
# Module-level stemmer and lemmatizer instances, created once for reuse.
# NOTE(review): in the code visible here only `stemmer` is used (by
# cleaning_pipeline); `lemmatizer` is not referenced, so confirm it is needed.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
def cleaning_pipeline(text):
    """Normalise one text value into a string of stemmed, stopword-free tokens.

    Steps: noise removal via ``clean_noise``, lower-casing, removal of every
    character outside ``a-z`` and whitespace, tokenisation with
    ``word_tokenize``, stopword filtering against ``list_stopwords`` and
    stemming with the module-level ``stemmer``.

    Args:
        text: The value to clean. ``None`` and float NaN (how pandas
            represents an empty CSV cell) yield an empty string; other
            non-string values are converted with ``str()``.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    # Applied column-wise to DataFrames read from CSV, where empty cells are
    # NaN; `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = clean_noise(str(text))

    # 1. Lowercase
    text = text.lower()

    # 2. Remove non-alphabetic characters
    text = re.sub(r'[^a-z\s]', '', text)

    # 3. Tokenize
    tokens = word_tokenize(text)

    # 4. Remove stopwords, 5. stem, 6. join back
    return ' '.join(
        stemmer.stem(word) for word in tokens if word not in list_stopwords
    )

In [None]:
# =====================
# POST-SCRAPING PIPELINE
# =====================
# Load the scraped job postings, add a cleaned copy of every text column and
# export the result as CSV and JSON.

import pandas as pd

df_jobs = pd.read_csv("/content/jobstreet_data.csv")

if df_jobs.empty:
    print("⚠️ Tidak ada data yang berhasil di-scrape.")
else:
    print("\n✅ Scraping berhasil. Memulai pipeline post-processing...")

    # Text columns to clean
    text_columns = ['Title', 'Company', 'Country', 'Location', 'Category',
                    'Work Type', 'Salary', 'Requirements', 'Description']

    # Keep one raw row as a reference from before preprocessing
    sample_row = df_jobs.iloc[0]

    # Each cleaned column is stored next to its source as 'Cleaned <name>'
    cleaned_columns = [f'Cleaned {col}' for col in text_columns]
    for col, cleaned_col in zip(text_columns, cleaned_columns):
        df_jobs[cleaned_col] = df_jobs[col].apply(cleaning_pipeline)

    print("\n--- CONTOH HASIL CLEANING (5 Data Pertama) ---")
    print(df_jobs[cleaned_columns].head(5))

    # Column order: raw columns, cleaned columns, then the link
    final_columns = [*text_columns, *cleaned_columns, 'Link']
    df_jobs = df_jobs[final_columns]

    # Write both export formats
    output_csv = "jobstreet_final_data.csv"
    df_jobs.to_csv(output_csv, index=False)
    print(f"\n📁 Data berhasil disimpan ke: '{output_csv}'")

    output_json = "jobstreet_final_data.json"
    df_jobs.to_json(output_json, orient='records', indent=4)
    print(f"📁 Data berhasil disimpan ke: '{output_json}'")