In [2]:
import requests
import zipfile
import io
import os
import re

# Base URL for downloading data
base_url = "https://s3.amazonaws.com/tripdata/"

# Output directory
output_dir = "citibike_feb_data"
os.makedirs(output_dir, exist_ok=True)

def download_and_extract():
    # --- Download and Extract YEARLY Zip Files (2014-2023) ---
    for year in range(2014, 2024):  # 2014 up to (and including) 2023
        yearly_zip_filename = f"{year}-citibike-tripdata.zip"
        url = base_url + yearly_zip_filename
        print(f"Downloading {url}...")

        try:
            response = requests.get(url, stream=True)
            response.raise_for_status()

            with zipfile.ZipFile(io.BytesIO(response.content)) as outer_z:
                for outer_zip_info in outer_z.infolist():
                    # --- Check for Nested Structure (2020-2023) ---
                    if int(year) >= 2020:
                        if outer_zip_info.filename.endswith(".zip"):
                            with outer_z.open(outer_zip_info) as nested_zip_file:
                                try:
                                    with zipfile.ZipFile(nested_zip_file) as inner_z:
                                        for inner_zip_info in inner_z.infolist():
                                            if inner_zip_info.filename.endswith(".csv") and re.search(r"(?:/|^)(?:JC-)?20\d{2}02[-_.]?.*\.csv", inner_zip_info.filename, re.IGNORECASE):
                                                parts = inner_zip_info.filename.split("/")
                                                cleaned_filename = parts[-1]

                                                year_dir = os.path.join(output_dir, str(year))
                                                month_dir = os.path.join(year_dir, "02")
                                                os.makedirs(month_dir, exist_ok=True)

                                                output_path = os.path.join(month_dir, cleaned_filename)
                                                with inner_z.open(inner_zip_info) as inner_zf, open(output_path, "wb") as f:
                                                    f.write(inner_zf.read())

                                except zipfile.BadZipFile as e:
                                    print(f"  Error extracting nested zip file {outer_zip_info.filename}: {e}")

                    # --- Standard Extraction for 2014-2019 ---
                    else:
                        if outer_zip_info.filename.endswith(".csv") and re.search(r"(?:/|^)(?:JC-)?20\d{2}02[-_.]?.*\.csv", outer_zip_info.filename, re.IGNORECASE):
                            parts = outer_zip_info.filename.split("/")
                            cleaned_filename = parts[-1]

                            year_dir = os.path.join(output_dir, str(year))
                            month_dir = os.path.join(year_dir, "02")
                            os.makedirs(month_dir, exist_ok=True)

                            output_path = os.path.join(month_dir, cleaned_filename)
                            with outer_z.open(outer_zip_info) as zf, open(output_path, "wb") as f:
                                f.write(zf.read())

        except requests.exceptions.RequestException as e:
            print(f"  Error downloading {yearly_zip_filename}: {e}")
        except zipfile.BadZipFile as e:
            print(f"  Error extracting {yearly_zip_filename}: {e}")
        except Exception as e:
            print(f"  An unexpected error occurred with {yearly_zip_filename}: {e}")

    print("Download and extraction complete.")

download_and_extract()


Downloading https://s3.amazonaws.com/tripdata/2014-citibike-tripdata.zip...
Downloading https://s3.amazonaws.com/tripdata/2015-citibike-tripdata.zip...
Downloading https://s3.amazonaws.com/tripdata/2016-citibike-tripdata.zip...
Downloading https://s3.amazonaws.com/tripdata/2017-citibike-tripdata.zip...
Downloading https://s3.amazonaws.com/tripdata/2018-citibike-tripdata.zip...
Downloading https://s3.amazonaws.com/tripdata/2019-citibike-tripdata.zip...
Downloading https://s3.amazonaws.com/tripdata/2020-citibike-tripdata.zip...
Downloading https://s3.amazonaws.com/tripdata/2021-citibike-tripdata.zip...
  Error extracting nested zip file __MACOSX/2021-citibike-tripdata/._202107-citibike-tripdata.zip: File is not a zip file
  Error extracting nested zip file __MACOSX/2021-citibike-tripdata/._202105-citibike-tripdata.zip: File is not a zip file
  Error extracting nested zip file __MACOSX/2021-citibike-tripdata/._202103-citibike-tripdata.zip: File is not a zip file
  Error extracting nested 