In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

# Check how many job openings match the search, and show the active filters.
# Both values are read from the <meta property="og:..."> tags of the first
# results page.
url = 'https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007001004%2C2007001016&expansionType=area%2Cspec%2Ccom%2Cjob%2Cwf%2Cwktm&area=6001001000%2C6001002000&order=16&asc=0&sctp=M&scmin=40000&scstrict=1&scneg=0&page=1&jobexp=1%2C3&mode=s&jobsource=2018indexpoc&langFlag=0&langStatus=0&recommendJob=1&hotJob=1'
# A timeout keeps the cell from hanging forever on a stalled connection;
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
result = requests.get(url, timeout=30)
result.raise_for_status()
soup = BeautifulSoup(result.text, 'html.parser')

# The description text is expected to contain "<number> 個工作機會".
description_tag = soup.find('meta', property='og:description')
meta_content = description_tag.get('content', '') if description_tag else ''
pattern = r'(\d+) 個工作機會'
match = re.search(pattern, meta_content)
# jobs_count stays a string of digits; it is None when the page carried no
# count (missing meta tag or changed wording) rather than raising.
jobs_count = match.group(1) if match else None

# Search filters, as summarised in the page title meta tag.
title_tag = soup.find('meta', property='og:title')
filter_condition = title_tag.get('content', '') if title_tag else ''
print("篩選條件:", filter_condition)
print("職缺數量:", jobs_count)


# Collect the job listing from every page of the search results.
# Maps each output column to the data-* attribute of the job <article> tag
# it is read from.
job_attributes = {
    'Customer Name': 'data-cust-name',
    'Customer Number': 'data-cust-no',
    'Industry Category': 'data-indcat',
    'Industry Category Description': 'data-indcat-desc',
    'Is Apply': 'data-is-apply',
    'Is Save': 'data-is-save',
    'Job Name': 'data-job-name',
    'Job Number': 'data-job-no',
    'Job RO': 'data-job-ro',
    'Job Source': 'data-jobsource',
    'QA ID': 'data-qa-id',
}

job_list = []
page_number = 1
while True:
    url = f'https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007001004%2C2007001016&expansionType=area%2Cspec%2Ccom%2Cjob%2Cwf%2Cwktm&area=6001001000%2C6001002000&order=16&asc=0&sctp=M&scmin=40000&scstrict=1&scneg=0&page={page_number}&jobexp=1%2C3&mode=s&jobsource=2018indexpoc&langFlag=0&langStatus=0&recommendJob=1&hotJob=1'
    # A timeout keeps the loop from hanging forever on a stalled connection.
    result = requests.get(url, timeout=30)
    if not result.ok:
        # Report the failure instead of mistaking an error page for the end
        # of the results; what was collected so far is kept.
        print("HTTP", result.status_code, url)
        page_number -= 1
        break
    soup = BeautifulSoup(result.text, 'html.parser')

    job_items = soup.find_all(
        'article', class_='b-block--top-bord job-list-item b-clearfix js-job-item')

    if not job_items:
        # A page without job items means the previous page was the last one;
        # step back so the summary below reports the last page scraped.
        page_number -= 1
        break

    for item in job_items:
        # Missing attributes become '' so one odd item cannot abort the scrape.
        temp = {column: item.get(attribute, '')
                for column, attribute in job_attributes.items()}

        link_tag = item.find('a', class_='js-job-link')
        href = link_tag.get('href', '') if link_tag else ''
        # Only a protocol-relative href ("//host/path") needs a scheme; a
        # blanket replace of '//' would corrupt absolute URLs and any later
        # '//' in the address.
        temp['Link'] = 'https:' + href if href.startswith('//') else href

        job_list.append(temp)
    print("第", page_number, "頁")
    page_number += 1

print("Done", "頁數:", page_number, "筆數:", len(job_list))

In [None]:
# Fetch each job's detail page (via its link) and extract the detailed fields.

# Labels of the detail fields. Each regex captures the text that follows the
# label on the same line. Compiled once here rather than on every iteration.
DETAIL_LABELS = (
    '工作性質', '上班地點', '管理責任', '出差外派', '上班時段',
    '休假制度', '可上班日', '需求人數', '工作經歷', '學歷要求',
    '科系要求', '語文條件', '擅長工具', '工作技能', '其他條件',
)
DETAIL_PATTERNS = {label: re.compile(label + r'\s+(.+)') for label in DETAIL_LABELS}


def _element_text(element):
    """Return the element's text, or '' when the lookup found nothing."""
    return element.text if element is not None else ''


def _parse_job_details(details_text):
    """Extract one value per label in DETAIL_PATTERNS from the detail text.

    Every label is present in the result and defaults to ''. When a label
    matches on several lines, the last match wins. For '其他條件', every
    non-blank line after the first line mentioning it is appended to its
    value, separated by spaces.
    """
    fields = {label: '' for label in DETAIL_PATTERNS}
    other_conditions = []
    in_other_conditions = False

    for line in details_text.split('\n'):
        for label, pattern in DETAIL_PATTERNS.items():
            match = pattern.search(line)
            if match:
                fields[label] = match.group(1)

        if '其他條件' in line:
            in_other_conditions = True
        elif in_other_conditions and line.strip():
            other_conditions.append(line.strip())

    # strip() so that a job without any "other conditions" yields '' and not
    # a lone space.
    fields['其他條件'] = ' '.join([fields['其他條件'], *other_conditions]).strip()
    return fields


jobs_data = []
for ind, job in enumerate(job_list):
    job_name = job['Job Name']
    job_link = job['Link']

    record = {
        '公司名稱': job['Customer Name'],
        '公司類別': job['Industry Category Description'],
        '職缺名稱': job_name,
        '職缺連結': job_link,
    }

    try:
        # A timeout keeps the loop from hanging forever on a stalled connection.
        result = requests.get(job_link, timeout=30)
        result.raise_for_status()
    except requests.RequestException as e:
        # Best effort: report the job that could not be fetched and move on.
        print(ind + 1, job_name, "skipped:", e)
        continue

    soup = BeautifulSoup(result.text, 'html.parser')

    # Elements that are missing from a page yield '' instead of raising, so
    # a single absent section no longer discards the whole record.
    job_description = _element_text(soup.find(
        'p', class_='mb-5 r3 job-description__content text-break')).replace('\n', ' ')

    job_types = [n.text.strip()
                 for n in soup.find_all('div', {'data-gtm-content': '職務類別'})]

    job_salary = _element_text(
        soup.find('div', class_='list-row row mb-2 identity-type')
    ).replace(' ', '').replace('\n', '').replace('工作待遇', '')

    # The first matching row is skipped, as before.
    # NOTE(review): presumably it is not one of the labelled rows - confirm.
    rows = soup.find_all('div', class_='list-row row mb-2')
    job_details_text = ''.join(row.text + '\n' for row in rows[1:])

    record.update(_parse_job_details(job_details_text))
    # These three values used to be scraped and then thrown away. They are
    # appended after the existing columns so earlier column positions stay
    # the same.
    record.update({
        '工作內容': job_description,
        '職務類別': ', '.join(job_types),
        '工作待遇': job_salary,
    })
    jobs_data.append(record)

    print(ind + 1, job_name)


In [None]:
# Save the collected job records to an Excel workbook.
# Each entry of jobs_data becomes one row; the DataFrame index is not written.
jobs_data_df = pd.DataFrame(data=jobs_data)
jobs_data_df.to_excel(excel_writer='jobs104.xlsx', index=False)
print("File Saved")