In [1]:
import requests
import os
import zipfile

In [2]:
# URL to scrape for ZIP files
base_url = "https://s3.amazonaws.com/capitalbikeshare-data/"  # Replace with your target website
download_folder = "../Data/Raw"  # Folder to store downloaded files

# Create the download folder if it doesn't exist
os.makedirs(download_folder, exist_ok=True)

The CaBi data follows a common pattern. Example files:
https://s3.amazonaws.com/capitalbikeshare-data/202201-capitalbikeshare-tripdata.zip
https://s3.amazonaws.com/capitalbikeshare-data/202305-capitalbikeshare-tripdata.zip

All we need to do is update the year-month value.

In [3]:
links = []

for year in range(2022,2026):
    for month in range(1, 13):
        # Construct the URL for the specific month and year
        url = f"{base_url}{year}{month:02d}-capitalbikeshare-tripdata.zip"
        links.append(url)

links

['https://s3.amazonaws.com/capitalbikeshare-data/202201-capitalbikeshare-tripdata.zip',
 'https://s3.amazonaws.com/capitalbikeshare-data/202202-capitalbikeshare-tripdata.zip',
 'https://s3.amazonaws.com/capitalbikeshare-data/202203-capitalbikeshare-tripdata.zip',
 'https://s3.amazonaws.com/capitalbikeshare-data/202204-capitalbikeshare-tripdata.zip',
 'https://s3.amazonaws.com/capitalbikeshare-data/202205-capitalbikeshare-tripdata.zip',
 'https://s3.amazonaws.com/capitalbikeshare-data/202206-capitalbikeshare-tripdata.zip',
 'https://s3.amazonaws.com/capitalbikeshare-data/202207-capitalbikeshare-tripdata.zip',
 'https://s3.amazonaws.com/capitalbikeshare-data/202208-capitalbikeshare-tripdata.zip',
 'https://s3.amazonaws.com/capitalbikeshare-data/202209-capitalbikeshare-tripdata.zip',
 'https://s3.amazonaws.com/capitalbikeshare-data/202210-capitalbikeshare-tripdata.zip',
 'https://s3.amazonaws.com/capitalbikeshare-data/202211-capitalbikeshare-tripdata.zip',
 'https://s3.amazonaws.com/capit

In [4]:
def download_file(url, destination):
    """
    Download a file from a URL to a specified destination
    with progress tracking
    """
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        file_size = int(response.headers.get('content-length', 0))
        filename = os.path.basename(url)
        filepath = os.path.join(destination, filename)
        
        # Skip download if file already exists with same size
        if os.path.exists(filepath) and os.path.getsize(filepath) == file_size:
            print(f"File already exists: {filename}")
            return filepath
        
        # Download with progress bar
        with open(filepath, 'wb') as f:
            print(f"Downloading {filename}")
            f.write(response.content)
    else:
        print(f"Failed to download {url}: Status code {response.status_code}")
        return None


In [9]:
for link in links:
    # Download each file
    download_file(link, download_folder)

File already exists: 202201-capitalbikeshare-tripdata.zip
Downloading 202202-capitalbikeshare-tripdata.zip
Downloading 202203-capitalbikeshare-tripdata.zip
Downloading 202204-capitalbikeshare-tripdata.zip
Downloading 202205-capitalbikeshare-tripdata.zip
Downloading 202206-capitalbikeshare-tripdata.zip
Downloading 202207-capitalbikeshare-tripdata.zip


KeyboardInterrupt: 

In [6]:
def unzip_file(zip_path, extract_to=None):
    """
    Unzip a file to the specified directory
    If extract_to is None, extract to the same directory as the zip file
    """
    if extract_to is None:
        extract_to = os.path.dirname(zip_path)
    
    # Create extraction directory if it doesn't exist
    os.makedirs(extract_to, exist_ok=True)
    
    filename = os.path.basename(zip_path)
    print(f"Extracting {filename}...")
    
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # Get list of files to extract
        file_list = zip_ref.namelist()
        
        # Extract all files with progress
        for file in file_list:
            zip_ref.extract(file, extract_to)


In [12]:
# List all zip files in the Raw folder
zip_files = [f'{download_folder}/{f}' for f in os.listdir(download_folder) if f.endswith('.zip')]
zip_files

['../Data/Raw/202201-capitalbikeshare-tripdata.zip',
 '../Data/Raw/202202-capitalbikeshare-tripdata.zip',
 '../Data/Raw/202203-capitalbikeshare-tripdata.zip',
 '../Data/Raw/202204-capitalbikeshare-tripdata.zip',
 '../Data/Raw/202205-capitalbikeshare-tripdata.zip',
 '../Data/Raw/202206-capitalbikeshare-tripdata.zip',
 '../Data/Raw/202207-capitalbikeshare-tripdata.zip']

In [13]:
for file in zip_files:
    # Unzip each file
    unzip_file(file)

Extracting 202201-capitalbikeshare-tripdata.zip...
Extracting 202202-capitalbikeshare-tripdata.zip...
Extracting 202203-capitalbikeshare-tripdata.zip...
Extracting 202204-capitalbikeshare-tripdata.zip...
Extracting 202205-capitalbikeshare-tripdata.zip...
Extracting 202206-capitalbikeshare-tripdata.zip...
Extracting 202207-capitalbikeshare-tripdata.zip...


BadZipFile: File is not a zip file