# Indeed Web Crawler

In [7]:
import requests
import csv
import time
import os
import json

# Adzuna API credentials
APP_ID = "87405112"
APP_KEY = "4ac7a99919cc2847aa9dc0cfeb89b66e"

# Base URL for API requests
BASE_URL = "https://api.adzuna.com/v1/api/jobs/{country}/search/"

# Maximum results per page
RESULTS_PER_PAGE = 50

# Countries to query
countries = ['fr', 'de', 'lu', 'nl', 'gb']

# Data-related job titles to query
data_related_jobs = [
    'data science', 'machine learning engineer', 'NLP', 'data engineering',
    'data analyst', 'data architect', 'data scientist', 'AI engineer',
    'big data engineer', 'deep learning engineer'
]

# Number of jobs per role per country
JOBS_PER_ROLE_PER_COUNTRY = 200
PAGES_PER_ROLE = JOBS_PER_ROLE_PER_COUNTRY // RESULTS_PER_PAGE

# Base parameters for API requests
params = {
    'app_id': APP_ID,
    'app_key': APP_KEY,
    'results_per_page': RESULTS_PER_PAGE,
    'content-type': 'application/json'
}

# CSV file path
CSV_FILE_PATH = 'adzuna_data_related_jobs.csv'
# Progress log file path
PROGRESS_LOG_PATH = 'progress_log.json'

def save_progress(country_code, job_title, page):
    """Save current progress to log file"""
    progress = {
        'country_code': country_code,
        'job_title': job_title,
        'page': page
    }
    with open(PROGRESS_LOG_PATH, 'w') as log_file:
        json.dump(progress, log_file)
    print(f"Progress saved: {progress}")

def load_progress():
    """Load progress log, return None if not exists"""
    if os.path.exists(PROGRESS_LOG_PATH):
        with open(PROGRESS_LOG_PATH, 'r') as log_file:
            return json.load(log_file)
    return None

def get_job_data(country_code, job_title, page):
    """Send API request and return JSON data"""
    try:
        params['what'] = job_title
        response = requests.get(BASE_URL.format(country=country_code) + str(page), params=params)
        response.raise_for_status()
        job_data = response.json()
        return job_data
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from page {page} for {job_title} in {country_code}: {e}")
        return None

def parse_job_data(job_data, country_code, job_title):
    """Parse JSON data from API response"""
    job_list = []

    for job in job_data.get('results', []):
        job_info = {
            'Country': country_code,
            'Job Title': job.get('title', 'N/A'),
            'Company': job.get('company', {}).get('display_name', 'N/A'),
            'Company Size': job.get('company', {}).get('size', 'N/A'),
            'Industry': job.get('category', {}).get('label', 'N/A'),
            'Job Type': job.get('contract_type', 'N/A'),
            'Remote Type': job.get('contract_time', 'N/A'),
            'Salary Min': job.get('salary_min', 'N/A'),
            'Salary Max': job.get('salary_max', 'N/A'),
            'Experience': job.get('experience', 'N/A'),
            'Post Date': job.get('created', 'N/A'),
            'Required Language': job.get('description', 'N/A'),
            'Redirect URL': job.get('redirect_url', 'N/A'),
            'Location': job.get('location', {}).get('display_name', 'N/A'),
            'Job Category': job_title,
            'Description': job.get('description', 'N/A')
        }
        job_list.append(job_info)

    return job_list

def save_to_csv(job_list, file_path):
    """Save job data to CSV file"""
    try:
        write_header = not os.path.exists(file_path)

        with open(file_path, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=job_list[0].keys())
            
            if write_header:
                writer.writeheader()
                
            writer.writerows(job_list)
        print(f"Data successfully saved to {file_path}")
    except IOError as e:
        print(f"Error saving data to CSV: {e}")ssia for its invasion of Ukraine. Japan and Russia each expelled a number of diplomats and Russia halted peace ...

# Main function
if __name__ == "__main__":
    # Load previous progress
    progress = load_progress()
    if progress:
        start_country = progress['country_code']
        start_job_title = progress['job_title']
        start_page = progress['page']
        print(f"Resuming from {start_country}, {start_job_title}, page {start_page}")
    else:
        start_country = countries[0]
        start_job_title = data_related_jobs[0]
        start_page = 1
        print("Starting from the beginning.")

    resume_flag = False if progress else True

    # Query jobs for each country and job title
    for country_code in countries:
        for job_title in data_related_jobs:
            for page in range(1, PAGES_PER_ROLE + 1):
                if not resume_flag:
                    if country_code == start_country and job_title == start_job_title and page == start_page:
                        resume_flag = True
                    else:
                        continue

                job_data = get_job_data(country_code, job_title, page)

                if job_data and 'results' in job_data:
                    jobs = parse_job_data(job_data, country_code, job_title)
                    if jobs:
                        save_to_csv(jobs, CSV_FILE_PATH)

                if not job_data or not job_data.get('results'):
                    print(f"No more results for {job_title} in {country_code} on page {page}.")
                    break

                save_progress(country_code, job_title, page)

                time.sleep(1)
    
    if os.path.exists(PROGRESS_LOG_PATH):
        os.remove(PROGRESS_LOG_PATH)


Resuming from fr, data scientist, page 2
Data successfully saved to adzuna_data_related_jobs.csv
Progress saved: {'country_code': 'fr', 'job_title': 'data scientist', 'page': 2}
Data successfully saved to adzuna_data_related_jobs.csv
Progress saved: {'country_code': 'fr', 'job_title': 'data scientist', 'page': 3}
Data successfully saved to adzuna_data_related_jobs.csv
Progress saved: {'country_code': 'fr', 'job_title': 'data scientist', 'page': 4}
Data successfully saved to adzuna_data_related_jobs.csv
Progress saved: {'country_code': 'fr', 'job_title': 'AI engineer', 'page': 1}
Data successfully saved to adzuna_data_related_jobs.csv
Progress saved: {'country_code': 'fr', 'job_title': 'AI engineer', 'page': 2}
Data successfully saved to adzuna_data_related_jobs.csv
Progress saved: {'country_code': 'fr', 'job_title': 'AI engineer', 'page': 3}
No more results for AI engineer in fr on page 4.
Data successfully saved to adzuna_data_related_jobs.csv
Progress saved: {'country_code': 'fr', 'j

In [2]:
import requests
import csv
import time
import os
import json
from datetime import datetime, timedelta

# Adzuna API 凭证
APP_ID = "87405112"
APP_KEY = "4ac7a99919cc2847aa9dc0cfeb89b66e"

# API 请求的基础URL
BASE_URL = "https://api.adzuna.com/v1/api/jobs/{country}/search/"

# 每页返回的最大结果数
RESULTS_PER_PAGE = 50

# 定义你要查询的国家
countries = ['fr', 'de', 'nl', 'gb']  # 法国、德国、卢森堡、荷兰、英国

# 定义你要查询的与数据相关的岗位
data_related_jobs = [
    'data science', 'machine learning engineer', 'NLP', 'data engineering',
    'data analyst', 'data architect', 'data scientist', 'AI engineer',
    'big data engineer', 'deep learning engineer'
]

# 每个岗位每个国家请求 200 个工作岗位
JOBS_PER_ROLE_PER_COUNTRY = 200
PAGES_PER_ROLE = JOBS_PER_ROLE_PER_COUNTRY // RESULTS_PER_PAGE

# API 请求的基础参数
params = {
    'app_id': APP_ID,
    'app_key': APP_KEY,
    'results_per_page': RESULTS_PER_PAGE,  # 每次请求返回的结果数
    'content-type': 'application/json'
}

# CSV 文件保存路径
CSV_FILE_PATH = 'adzuna_data_related_jobs.csv'
# 进度日志文件路径
PROGRESS_LOG_PATH = 'progress_log.json'

def save_progress(country_code, job_title, page, max_days_old):
    """保存当前进度到日志文件"""
    progress = {
        'country_code': country_code,
        'job_title': job_title,
        'page': page,
        'max_days_old': max_days_old
    }
    with open(PROGRESS_LOG_PATH, 'w') as log_file:
        json.dump(progress, log_file)
    print(f"Progress saved: {progress}")

def load_progress():
    """加载进度日志，如果不存在返回 None"""
    if os.path.exists(PROGRESS_LOG_PATH):
        with open(PROGRESS_LOG_PATH, 'r') as log_file:
            return json.load(log_file)
    return None

def get_job_data(country_code, job_title, page, max_days_old):
    """发送API请求并返回JSON数据，按最大天数限制过滤"""
    try:
        params['what'] = job_title  # 设置当前请求的工作职位关键词
        params['max_days_old'] = max_days_old  # 添加最大天数限制
        response = requests.get(BASE_URL.format(country=country_code) + str(page), params=params)
        response.raise_for_status()  # 检查请求是否成功
        job_data = response.json()  # 返回JSON格式的数据
        return job_data
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from page {page} for {job_title} in {country_code}: {e}")
        return None

def parse_job_data(job_data, country_code, job_title):
    """解析API返回的JSON数据，提取关键信息"""
    job_list = []

    for job in job_data.get('results', []):
        job_info = {
            'Country': country_code,
            'Job Title': job.get('title', 'N/A'),
            'Company': job.get('company', {}).get('display_name', 'N/A'),
            'Company Size': job.get('company', {}).get('size', 'N/A'),
            'Industry': job.get('category', {}).get('label', 'N/A'),
            'Job Type': job.get('contract_type', 'N/A'),
            'Remote Type': job.get('contract_time', 'N/A'),
            'Salary Min': job.get('salary_min', 'N/A'),
            'Salary Max': job.get('salary_max', 'N/A'),
            'Experience': job.get('experience', 'N/A'),
            'Post Date': job.get('created', 'N/A'),
            'Required Language': job.get('description', 'N/A'),
            'Redirect URL': job.get('redirect_url', 'N/A'),  # 新增字段
            'Location': job.get('location', {}).get('display_name', 'N/A'),  # 位置名称
            'Job Category': job_title,  # 添加岗位类别信息
            'Description': job.get('description', 'N/A')  # 职位描述
        }
        job_list.append(job_info)

    return job_list

def save_to_csv(job_list, file_path):
    """将工作数据保存到CSV文件"""
    try:
        write_header = not os.path.exists(file_path)  # 使用 os.path.exists() 检查文件是否存在

        with open(file_path, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=job_list[0].keys())
            
            # 只有第一次写入文件时才写入表头
            if write_header:
                writer.writeheader()
                
            writer.writerows(job_list)
        print(f"Data successfully saved to {file_path}")
    except IOError as e:
        print(f"Error saving data to CSV: {e}")

def get_max_days_old_values():
    """生成过去两年的 max_days_old 值，每次间隔约30天"""
    max_days_old_values = []
    today = datetime.today()
    for months_ago in range(24):  # 过去两年 = 24 个月
        days_ago = months_ago * 30
        max_days_old_values.append(days_ago)
    return max_days_old_values

# 主函数
if __name__ == "__main__":
    # 加载上次的进度
    progress = load_progress()
    if progress:
        start_country = progress['country_code']
        start_job_title = progress['job_title']
        start_page = progress['page']
        max_days_old = progress['max_days_old']
        print(f"Resuming from {start_country}, {start_job_title}, page {start_page}, max_days_old {max_days_old}")
    else:
        start_country = countries[0]
        start_job_title = data_related_jobs[0]
        start_page = 1
        max_days_old = 0  # 默认从最新数据开始
        print("Starting from the beginning.")

    # 初始化标志，找到开始位置
    resume_flag = False if progress else True

    max_days_old_values = get_max_days_old_values()

    # 针对每个国家和每个数据相关岗位进行请求
    for country_code in countries:
        for job_title in data_related_jobs:
            for max_days_old in max_days_old_values:
                for page in range(1, PAGES_PER_ROLE + 1):
                    # 跳过已完成的部分，直到找到上次中断的地方
                    if not resume_flag:
                        if country_code == start_country and job_title == start_job_title and page == start_page and max_days_old == progress['max_days_old']:
                            resume_flag = True  # 找到恢复位置
                        else:
                            continue

                    # 获取工作数据
                    job_data = get_job_data(country_code, job_title, page, max_days_old)

                    if job_data and 'results' in job_data:  # 检查job_data是否为None且包含'results'
                        jobs = parse_job_data(job_data, country_code, job_title)
                        save_to_csv(jobs, CSV_FILE_PATH)  # 每次请求后立即保存数据

                    # 如果没有数据了，则跳出循环
                    if not job_data or not job_data.get('results'):
                        print(f"No more results for {job_title} in {country_code} on page {page}.")
                        break

                    # 保存当前进度
                    save_progress(country_code, job_title, page, max_days_old)

                    # 加入延迟避免触发速率限制
                    time.sleep(1)
    
    # 删除进度日志，任务完成
    if os.path.exists(PROGRESS_LOG_PATH):
        os.remove(PROGRESS_LOG_PATH)

Starting from the beginning.
Data successfully saved to adzuna_data_related_jobs.csv
Progress saved: {'country_code': 'fr', 'job_title': 'data science', 'page': 1, 'max_days_old': 0}
Data successfully saved to adzuna_data_related_jobs.csv
Progress saved: {'country_code': 'fr', 'job_title': 'data science', 'page': 2, 'max_days_old': 0}
Data successfully saved to adzuna_data_related_jobs.csv
Progress saved: {'country_code': 'fr', 'job_title': 'data science', 'page': 3, 'max_days_old': 0}
Data successfully saved to adzuna_data_related_jobs.csv
Progress saved: {'country_code': 'fr', 'job_title': 'data science', 'page': 4, 'max_days_old': 0}
Data successfully saved to adzuna_data_related_jobs.csv
Progress saved: {'country_code': 'fr', 'job_title': 'data science', 'page': 1, 'max_days_old': 30}
Data successfully saved to adzuna_data_related_jobs.csv
Progress saved: {'country_code': 'fr', 'job_title': 'data science', 'page': 2, 'max_days_old': 30}
Data successfully saved to adzuna_data_relate

IndexError: list index out of range