In [2]:
import os
import time
import datetime
import urllib.request
import pandas as pd

# Inclusive date range to fetch; the loop below walks it month by month.
start_date = datetime.date(2016, 9, 1)
end_date = datetime.date(2023, 7, 26)

# Directory to store files.
# A raw string already keeps backslashes literal, so they must not be doubled
# (the doubled form put "\\\\" separators into every generated path).
dir_path = r"C:\Users\gbray\Desktop\python\project 2\data"

# URL pattern; {start} and {end} are filled with dates formatted as YYYYMMDD.
url_pattern = "https://www.pse.pl/getcsv/-/export/csv/PL_CENY_NIEZB_RB/data_od/{start}/data_do/{end}"

# Create the directory if needed. exist_ok avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(dir_path, exist_ok=True)

# List to store the per-month dataframes before they are concatenated.
df_list = []

# Function to download the file with retries on 403 error
def download_with_retry(url, file_path, max_retries=3, delay_between_retries=5):
    """Download *url* to *file_path*, retrying when the server answers 403.

    Args:
        url: Address to fetch.
        file_path: Local path the response body is written to.
        max_retries: Total number of attempts to make.
        delay_between_retries: Seconds to wait between two attempts.

    Returns:
        True once an attempt succeeds; False if every attempt got a 403
        (or if max_retries is not positive, in which case nothing is tried).

    Raises:
        urllib.error.HTTPError: For any HTTP error other than 403.
    """
    # Imported explicitly: the module-level `import urllib.request` only makes
    # urllib.error available as a side effect.
    from urllib.error import HTTPError

    for attempt in range(1, max_retries + 1):
        try:
            urllib.request.urlretrieve(url, file_path)
            return True
        except HTTPError as e:
            if e.code != 403:
                raise
            if attempt < max_retries:
                print("Received 403 error. Retrying after a delay...")
                time.sleep(delay_between_retries)
            else:
                # No attempt follows, so do not sleep or announce a retry.
                print(f"Received 403 error. Giving up after {max_retries} attempts.")
    return False

# Walk the range from start_date to end_date one calendar month at a time,
# download each month's CSV, then merge everything into a single file.
curr_date = start_date
while curr_date <= end_date:
    # First day of the following month. Anchoring on day 1 before adding 32 days
    # keeps a late-month start_date (e.g. Jan 31) from jumping over a month.
    next_date = (curr_date.replace(day=1) + datetime.timedelta(days=32)).replace(day=1)
    if next_date > end_date:
        # Last chunk: stop at end_date (next_date is exclusive, hence +1 day).
        next_date = end_date + datetime.timedelta(days=1)

    # Create URL; the requested range is inclusive on both ends.
    url = url_pattern.format(
        start=curr_date.strftime('%Y%m%d'),
        end=(next_date - datetime.timedelta(days=1)).strftime('%Y%m%d'),
    )

    # Path for file
    file_path = os.path.join(dir_path, f"data_{curr_date.strftime('%Y%m')}.csv")

    # Report what actually happened instead of inferring success from the
    # file's existence (a file left by an earlier run would look like a
    # fresh download).
    if download_with_retry(url, file_path):
        print(f"Downloaded data for {curr_date.strftime('%Y-%m')}")
    elif os.path.exists(file_path):
        print(f"Download failed for {curr_date.strftime('%Y-%m')}; using file saved by an earlier run")
    else:
        print(f"Download failed for {curr_date.strftime('%Y-%m')}; month skipped")
        curr_date = next_date
        continue

    # Read downloaded data into DataFrame and add to list
    # NOTE(review): read_csv runs with default delimiter/encoding — confirm
    # these match the files the server returns.
    df_list.append(pd.read_csv(file_path))

    # Move to the next date
    curr_date = next_date

if df_list:
    # Concatenate all dataframes
    df_all = pd.concat(df_list, ignore_index=True)

    # Save the merged data to a new CSV file
    df_all.to_csv(os.path.join(dir_path, 'all_data.csv'), index=False)
    print('Merged all data into all_data.csv')
else:
    # pd.concat raises ValueError on an empty list; report the situation instead.
    print('No data was downloaded; all_data.csv was not written')


Downloaded data for 2016-09
Downloaded data for 2016-10
Received 403 error. Retrying after a delay...
Downloaded data for 2016-11
Downloaded data for 2016-12
Downloaded data for 2017-01
Received 403 error. Retrying after a delay...
Downloaded data for 2017-02
Downloaded data for 2017-03
Downloaded data for 2017-04
Downloaded data for 2017-05
Downloaded data for 2017-06
