In [1]:
import shutil
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests


In [2]:
import FileUtils as FileUtils


In [3]:

# Constants and configurations
DATASET = 'amm_opiskelijat_ja_tutkinnot_vuosi_tutkinto'
LIMIT = 5000
MAX_RETRIES = 3
BASE_URL = f"https://api.vipunen.fi/api/resources/{DATASET}/data?limit={LIMIT}&offset="
COUNT_URL = f"https://api.vipunen.fi/api/resources/{DATASET}/data/count"
HEADERS = {
    "Content-Type": "application/json",
    "Accept": "application/json",
    "Caller-Id": "0201689-0.rastor-instituutti"
}

DATA_DIR = "../data/raw"

In [4]:
def backup_existing_file(file_path: Path):
    """
    Backs up an existing file by moving it to a backup directory.

    Args:
        file_path (Path): The path to the file to be backed up.

    Returns:
        None
    """
    if file_path.exists():
        modified_date = datetime.fromtimestamp(file_path.stat().st_mtime).strftime('%Y-%m-%d')
        backup_file_path = file_path.parent / f"old_api_calls_output/{file_path.stem}_{modified_date}.csv"
        shutil.move(str(file_path), str(backup_file_path))

def fetch_data(url):
    """
    Fetches data from the specified URL using GET request.

    Args:
        url (str): The URL to fetch data from.

    Returns:
        dict: The JSON response from the URL.

    Raises:
        requests.RequestException: If an error occurs during the request.

    """
    for attempt in range(MAX_RETRIES):
        try:
            response = requests.get(url, headers=HEADERS)
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            print(f"Attempt {attempt+1}: {e}")
            if attempt == MAX_RETRIES - 1:
                raise

def save_data(data, file_path, mode='a'):
    """
    Save data to a CSV file.

    Parameters:
    - data: The data to be saved. It should be a list of dictionaries or a pandas DataFrame.
    - file_path: The file path where the data will be saved.
    - mode: The file mode. Default is 'a' (append), but can also be 'w' (write).

    Returns:
    None
    """
    header = (mode == 'w')
    df = pd.DataFrame(data)
    df.to_csv(file_path, mode=mode, sep=';', na_rep='', header=header, 
              index=False, encoding='utf-8', quoting=0, quotechar='"', lineterminator="\n", escapechar="$")

def main(directory, dataset):
    """
    Fetches data from an API and saves it to a CSV file.

    Args:
        directory (str): The directory where the CSV file will be saved.
        dataset (str): The name of the dataset.

    Returns:
        None
    """
    file_path = Path(directory) / f"{dataset}.csv"
    backup_existing_file(file_path)

    max_rows = requests.get(COUNT_URL, headers=HEADERS).json()
    print(f"Total rows to fetch: {max_rows}")

    mode = 'w'
    for offset in range(0, max_rows, LIMIT):
        data = fetch_data(BASE_URL + str(offset))
        save_data(data, file_path, mode=mode)
        mode = 'a'  # Switch to append mode after first write

    print("Data fetching complete.")
    print(f"Data saved to {file_path}")

# Example usage
main(DATA_DIR, DATASET)


Total rows to fetch: 60755
Data fetching complete.
Data saved to ..\data\raw\amm_opiskelijat_ja_tutkinnot_vuosi_tutkinto.csv
