In [1]:
import random
import time
from urllib.parse import urlsplit, urlunsplit

import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
pd.set_option('display.max_colwidth', 200)

In [2]:
def _build_page_url(search_url, page_number):
    """Return ``search_url`` with its ``page`` query parameter set to ``page_number``.

    Any ``page=`` parameter already present in the query string is dropped
    first, so the resulting URL never carries two conflicting page values.
    All other parameters are kept exactly as written (no re-encoding).
    """
    parts = urlsplit(search_url)
    params = [
        param for param in parts.query.split('&')
        if param and not param.startswith('page=')
    ]
    params.append(f'page={page_number}')
    return urlunsplit(parts._replace(query='&'.join(params)))


def scrape_multiple_keywords(api_url, keywords, max_pages, timeout=30):
    """Collect job ids, titles and company names for several search keywords.

    Parameters
    ----------
    api_url : str
        Search URL template. Every ``keywords=`` substring in it is replaced
        by ``keywords=<keyword>``, so the template is expected to carry an
        empty ``keywords=`` placeholder.
    keywords : iterable of str
        Search terms, inserted into the URL verbatim (callers pass them
        already query-encoded, e.g. ``'computer+science'``).
    max_pages : int
        Number of result pages requested per keyword (pages 1..max_pages).
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot
        hang the notebook indefinitely.

    Returns
    -------
    tuple of (list, list, list)
        ``(job_id, titles, companies)``; the three lists are index-aligned.
        A job returned for several keywords appears once per keyword.
    """
    job_id = []
    titles = []
    companies = []

    for keyword in keywords:
        print(f"Scraping jobs for keyword: {keyword}")
        api_url_keyword = api_url.replace("keywords=", f"keywords={keyword}")

        for page_number in range(1, max_pages + 1):
            page_url = _build_page_url(api_url_keyword, page_number)

            # Request the keyword/page-specific URL, not the bare template.
            response = requests.get(page_url, timeout=timeout)
            if response.status_code != 200:
                print(f"Failed to retrieve data from the API. Status Code: {response.status_code}")
                # Give up on this keyword only; the next keyword is still tried.
                break

            items = response.json().get('data') or []
            if not items:
                # An empty page means there is nothing further to request
                # for this keyword.
                break

            for item in items:
                # Read every field before appending so the three lists stay
                # index-aligned even if a field lookup raises.
                jid = item['id']
                title = item['title']
                advertiser = item.get('advertiser') or {}
                company = advertiser.get('description', '')

                job_id.append(jid)
                titles.append(title)
                companies.append(company)

    return job_id, titles, companies

# Search endpoint template. The `keywords=` parameter is left empty on purpose:
# scrape_multiple_keywords substitutes each search term into it.
api_url = 'https://id.jobstreet.com/api/jobsearch/v5/search?siteKey=ID-Main&sourcesystem=houston&userqueryid=45d2e71105d82100f674c0d5ac35d3cc-1751202&userid=89aba55b-b35f-40aa-9485-dfacb5738b52&usersessionid=89aba55b-b35f-40aa-9485-dfacb5738b52&eventCaptureSessionId=89aba55b-b35f-40aa-9485-dfacb5738b52&page=1&keywords=&classification=6281&pageSize=32&include=seodata,relatedsearches,joracrosslink,gptTargeting,pills&baseKeywords=informatics&locale=id-ID&solId=d06fed81-7ac5-4601-9b9d-bfd4d8ae15c4&relatedSearchesCount=12'

# Search terms, written with '+' between words so they can be dropped
# straight into the query string.
keywords = [
    'informatics',
    'teknik+informatika',
    'ilmu+komputer',
    'computer+science',
    'information+systems',
    'sistem+informasi',
    'manajemen+informatika',
]

# Number of result pages requested for each keyword.
max_pages = 1

job_id, titles, companies = scrape_multiple_keywords(
    api_url=api_url,
    keywords=keywords,
    max_pages=max_pages,
)

Scraping jobs for keyword: informatics
Scraping jobs for keyword: teknik+informatika
Scraping jobs for keyword: ilmu+komputer
Scraping jobs for keyword: computer+science
Scraping jobs for keyword: information+systems
Scraping jobs for keyword: sistem+informasi
Scraping jobs for keyword: manajemen+informatika


In [3]:
def setup_driver(chrome_driver_path=r"D:/SKRIPSI/skillviz/SkillMapping/chromedriver.exe"):
    """Create a headless Chrome WebDriver configured for scraping.

    Parameters
    ----------
    chrome_driver_path : str or None, optional
        Location of the chromedriver executable. The default is the path this
        notebook was written against, so existing calls keep working; pass a
        different path to run on another machine. When ``None`` (or empty),
        ``Service()`` is created without an explicit path.
        NOTE(review): recent Selenium releases presumably resolve the driver
        themselves in that case (Selenium Manager) - confirm for the
        installed version.

    Returns
    -------
    selenium.webdriver.Chrome
        A started browser session; the caller is responsible for ``quit()``.
    """
    browser_flags = (
        "--headless",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-blink-features=AutomationControlled",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36",
    )

    options = Options()
    for flag in browser_flags:
        options.add_argument(flag)

    if chrome_driver_path:
        service = Service(executable_path=chrome_driver_path)
    else:
        service = Service()

    return webdriver.Chrome(service=service, options=options)


def scrape_with_selenium(job_id):
    """Fetch the bullet-list text of one job ad page.

    Opens ``https://id.jobstreet.com/id/job/<job_id>`` in a fresh browser
    session, waits a random 5-7 seconds, and collects the text of every
    ``<ul>`` found under the ``data-automation="jobAdDetails"`` container.

    Parameters
    ----------
    job_id
        Identifier interpolated into the job ad URL.

    Returns
    -------
    str or None
        The list texts joined with newlines (an empty string when no list is
        found), or ``None`` if loading or reading the page raised.
    """
    url = f'https://id.jobstreet.com/id/job/{job_id}'
    driver = setup_driver()

    try:
        driver.get(url)
        # Randomised pause before reading the page.
        time.sleep(random.uniform(5, 7))

        job_desc_ul_elements = driver.find_elements(By.XPATH, '//div[@data-automation="jobAdDetails"]//ul')
        list_texts = [ul.text for ul in job_desc_ul_elements]
        # Separate the lists with a newline; joining with "" would fuse the
        # last word of one list with the first word of the next.
        job_desc = "\n".join(text for text in list_texts if text)
        print(f"Job {job_id} scraped")

        return job_desc

    except Exception as e:
        # Best-effort: a failed page is reported and yields None so the
        # caller can carry on with the remaining jobs.
        print("Gagal mengambil data pekerjaan:", e)
        return None

    finally:
        # Always release the browser, even when scraping failed.
        driver.quit()

def scrape_and_store_text(job_ids, job_titles=None, job_companies=None):
    """Scrape the description of every job and assemble a column dict.

    Parameters
    ----------
    job_ids : iterable
        Job identifiers; each one is passed to ``scrape_with_selenium``.
    job_titles : sequence, optional
        Titles aligned index-by-index with ``job_ids``. When omitted, the
        notebook-level ``titles`` list is used (the original behaviour).
    job_companies : sequence, optional
        Company names aligned with ``job_ids``. When omitted, the
        notebook-level ``companies`` list is used (the original behaviour).

    Returns
    -------
    dict
        Keys ``job_id``, ``job_title``, ``company``, ``descriptions`` (lists
        of equal length) and ``desc_extracted`` (an empty string placeholder).

    Raises
    ------
    ValueError
        If the ids, titles and companies differ in length. This is checked
        before any page is scraped so a mismatch does not surface only after
        the whole scraping run has finished.
    """
    # Fall back to the notebook globals so existing one-argument calls work.
    if job_titles is None:
        job_titles = titles
    if job_companies is None:
        job_companies = companies

    job_ids = list(job_ids)
    if not (len(job_ids) == len(job_titles) == len(job_companies)):
        raise ValueError(
            "job_ids, job_titles and job_companies must have the same length "
            f"(got {len(job_ids)}, {len(job_titles)}, {len(job_companies)})"
        )

    data = {'job_id': [], 'job_title': [], 'company': [], 'descriptions': [], 'desc_extracted': []}

    for job_id in job_ids:
        data['job_id'].append(job_id)
        data['descriptions'].append(scrape_with_selenium(job_id))
        print(f"Desc {job_id} stored")

    # Copy the inputs so the returned dict does not alias the caller's lists.
    data['job_title'] = list(job_titles)
    data['company'] = list(job_companies)
    # Placeholder value; kept as a single empty string as in the original.
    data['desc_extracted'] = ''

    return data

In [4]:
# Scrape every collected job id and load the result into a DataFrame.
data = scrape_and_store_text(job_id)
result_df = pd.DataFrame(data)
print(result_df)

print("Jumlah baris sebelum dihapus duplikat:", len(result_df))

# Columns that identify a row: used both to drop repeated rows and to drop
# rows where any of them is missing.
dedup_columns = ['job_id', 'job_title', 'descriptions']
df_cleaned = (
    result_df
    .drop_duplicates(subset=dedup_columns)
    .dropna(subset=dedup_columns)
)

print("Jumlah baris setelah dihapus duplikat:", len(df_cleaned)) 

# Persist the cleaned table without the index column.
df_cleaned.to_excel("jobstreet_scrape_16Juni2025.xlsx", index=False)
print("Data scraping selesai!")

Job 84498400 scraped
Desc 84498400 stored
Job 84894842 scraped
Desc 84894842 stored
Job 84842928 scraped
Desc 84842928 stored
Job 84684916 scraped
Desc 84684916 stored
Job 84895414 scraped
Desc 84895414 stored
Job 84831724 scraped
Desc 84831724 stored
Job 84718301 scraped
Desc 84718301 stored
Job 84623690 scraped
Desc 84623690 stored
Job 84614844 scraped
Desc 84614844 stored
Job 84709426 scraped
Desc 84709426 stored
Job 84881355 scraped
Desc 84881355 stored
Job 84761700 scraped
Desc 84761700 stored
Job 84645827 scraped
Desc 84645827 stored
Job 84537288 scraped
Desc 84537288 stored
Job 84831154 scraped
Desc 84831154 stored
Job 84804211 scraped
Desc 84804211 stored
Job 84501186 scraped
Desc 84501186 stored
Job 84819013 scraped
Desc 84819013 stored
Job 84650204 scraped
Desc 84650204 stored
Job 84938106 scraped
Desc 84938106 stored
Job 84907679 scraped
Desc 84907679 stored
Job 84895675 scraped
Desc 84895675 stored
Job 84464183 scraped
Desc 84464183 stored
Job 84634647 scraped
Desc 84634647