## scraping API

In [6]:
import requests
import pandas as pd
import time
import random
import json
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Session cookie for the target site, read from the COOKIE environment variable.
cookie_value = os.getenv("COOKIE")
if not cookie_value:
    # Surface the misconfiguration up front instead of discovering it later
    # through a long run of per-ID error lines.
    print("[WARN] COOKIE environment variable is missing or empty; "
          "API requests will likely be rejected.")

# Headers sent with every API request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Cookie": cookie_value
}

# ==========================================
# 1. Configuration
# ==========================================
# Inclusive range of opening IDs to probe.
START_ID = 1000
END_ID = 2000
# Derived from the range above so the file name can never drift out of sync
# with the IDs that were actually scraped.
OUTPUT_FILENAME = f"cedt_intern_data_{START_ID}_{END_ID}.csv"

# URL of the target API ({} is the placeholder for the opening ID).
API_URL_TEMPLATE = "https://cedtintern.cp.eng.chula.ac.th/api/sessions/5/openings/{}"

# Accumulates one dict per successfully scraped opening.
all_job_data = []


In [2]:
# ==========================================
# 2. Scraping loop
# ==========================================
def _nested_value(payload, outer_key, inner_key):
    """Return ``payload[outer_key][inner_key]``, or None if either level is absent or null."""
    inner = payload.get(outer_key)
    if isinstance(inner, dict):
        return inner.get(inner_key)
    return None


def _extract_job_info(data, url):
    """Flatten one opening's decoded JSON object into a single CSV row.

    Args:
        data: Dict decoded from the API response for one opening.
        url: URL the object was fetched from; stored in the row as-is.

    Returns:
        A dict with one entry per output column. Fields that are missing or
        null in ``data`` come back as None; ``tags`` comes back as "".
    """
    tags = data.get("tags")
    if not isinstance(tags, list):
        # Covers both a missing key and an explicit JSON null.
        tags = []
    tag_names = [
        str(tag["tagName"])
        for tag in tags
        if isinstance(tag, dict) and tag.get("tagName") is not None
    ]

    return {
        "id": data.get("openingId"),
        "company_name": _nested_value(data, "company", "companyNameTh"),
        "position_title": data.get("title"),
        "quota": data.get("quota"),
        "salary_amount": data.get("compensationAmount"),
        "salary_type": _nested_value(data, "compensationType", "compensationType"),
        "work_type": data.get("workingCondition"),
        "location": data.get("officeName"),
        # All tag names joined into one comma-separated string.
        "tags": ", ".join(tag_names),
        # Raw description; may still contain HTML markup.
        "description_html": data.get("description"),
        "api_url": url,
    }


print(f"Starting scrape from ID {START_ID} to {END_ID}...")

for job_id in range(START_ID, END_ID + 1):
    url = API_URL_TEMPLATE.format(job_id)

    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        # Only a 200 response is expected to carry a JSON body worth decoding.
        data = response.json() if response.status_code == 200 else None
    except (requests.RequestException, ValueError) as e:
        # Network/timeout failures and undecodable JSON: log and move on.
        print(f"[ERR] ID {job_id}: Exception occurred - {e}")
    else:
        if response.status_code == 200:
            if isinstance(data, dict):
                job_info = _extract_job_info(data, url)
                all_job_data.append(job_info)
                print(f"[OK] ID {job_id}: Found '{job_info['position_title']}'")
            else:
                print(f"[ERR] ID {job_id}: Unexpected response body ({type(data).__name__})")
        elif response.status_code == 404:
            # No opening exists under this ID.
            print(f"[SKIP] ID {job_id}: Not Found")
        else:
            # Any other status (e.g. 401/403 when the cookie is rejected).
            print(f"[ERR] ID {job_id}: Status {response.status_code}")

    # ==========================================
    # 3. Random delay
    # ==========================================
    # Pause a random 1.0-3.0 s between requests to keep the request rate low.
    # Nothing follows the last ID, so there is no point waiting after it.
    if job_id < END_ID:
        time.sleep(random.uniform(1.0, 3.0))

# ==========================================
# 4. Export to CSV
# ==========================================
print("-" * 30)
if all_job_data:
    df = pd.DataFrame(all_job_data)

    # 'utf-8-sig' adds a BOM so Excel renders the Thai text correctly.
    df.to_csv(OUTPUT_FILENAME, index=False, encoding='utf-8-sig')

    print(f"Scraping Finished! Successfully saved {len(df)} records.")
    print(f"File saved as: {OUTPUT_FILENAME}")

    # Preview the first five rows.
    print(df.head())
else:
    print("Scraping Finished, but NO data was found. Please check your Cookie or ID range.")

Starting scrape from ID 1000 to 2000...
[SKIP] ID 1000: Not Found
[SKIP] ID 1001: Not Found
[SKIP] ID 1002: Not Found
[SKIP] ID 1003: Not Found
[SKIP] ID 1004: Not Found
[SKIP] ID 1005: Not Found
[SKIP] ID 1006: Not Found
[SKIP] ID 1007: Not Found
[SKIP] ID 1008: Not Found
[SKIP] ID 1009: Not Found
[SKIP] ID 1010: Not Found
[SKIP] ID 1011: Not Found
[SKIP] ID 1012: Not Found
[SKIP] ID 1013: Not Found
[SKIP] ID 1014: Not Found
[SKIP] ID 1015: Not Found
[SKIP] ID 1016: Not Found
[SKIP] ID 1017: Not Found
[SKIP] ID 1018: Not Found
[SKIP] ID 1019: Not Found
[SKIP] ID 1020: Not Found
[SKIP] ID 1021: Not Found
[SKIP] ID 1022: Not Found
[SKIP] ID 1023: Not Found
[SKIP] ID 1024: Not Found
[SKIP] ID 1025: Not Found
[SKIP] ID 1026: Not Found
[SKIP] ID 1027: Not Found
[SKIP] ID 1028: Not Found
[SKIP] ID 1029: Not Found
[SKIP] ID 1030: Not Found
[SKIP] ID 1031: Not Found
[SKIP] ID 1032: Not Found
[SKIP] ID 1033: Not Found
[SKIP] ID 1034: Not Found
[SKIP] ID 1035: Not Found
[SKIP] ID 1036: Not Foun