In [10]:
from pathlib import Path
import requests
import zipfile
import io

def fetch_raw_data(year: int, month: int, timeout: float = 60.0) -> str:
    """Download one month of ``JC-*`` Citi Bike trip data and extract it.

    The zip archive is requested from the public ``tripdata`` S3 bucket,
    unpacked directly from memory into ``../data/raw`` (relative to the
    current working directory; created if missing), and the path of the
    extracted CSV is returned.

    Args:
        year: Year of the data set, e.g. ``2024``.
        month: Month number; it is zero-padded to two digits in the URL.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        Path of the extracted CSV file, as a string.

    Raises:
        Exception: If the server does not answer with HTTP 200, or the
            archive contains no files.
        zipfile.BadZipFile: If the response body is not a valid zip archive.
    """
    url = f"https://s3.amazonaws.com/tripdata/JC-{year}{month:02}-citibike-tripdata.csv.zip"
    # Without a timeout, requests would wait indefinitely on a stalled connection.
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        raise Exception(
            f"❌ Failed to fetch data from: {url} (HTTP {response.status_code})"
        )

    raw_dir = Path("..") / "data" / "raw"
    raw_dir.mkdir(parents=True, exist_ok=True)

    # The archive is unpacked straight from memory; no .zip file is written to disk.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(path=raw_dir)
        # Directory entries end with "/" and can never be the data file.
        members = [name for name in archive.namelist() if not name.endswith("/")]

    # Archives built on macOS may list "__MACOSX/._*" metadata entries before
    # the real file, so prefer a genuine CSV over "whatever comes first".
    csv_members = [
        name
        for name in members
        if name.lower().endswith(".csv") and not name.startswith("__MACOSX/")
    ]
    candidates = csv_members or members
    if not candidates:
        raise Exception(f"❌ No files found in archive: {url}")

    csv_path = raw_dir / candidates[0]
    print(f"✅ Successfully fetched and extracted: {csv_path}")
    return str(csv_path)


In [11]:
# Download every month of 2024; range(1, 13) yields months 1 through 12.
for month in range(1, 13):  # Months 1 to 12 (January through December)
    try:
        fetch_raw_data(2024, month)
    except Exception as e:
        # Report the failure and move on so one bad month does not stop the remaining downloads.
        print(f"❌ Error fetching data for 2024-{month:02}: {e}")


✅ Successfully fetched and extracted: ..\data\raw\JC-202401-citibike-tripdata.csv
✅ Successfully fetched and extracted: ..\data\raw\JC-202402-citibike-tripdata.csv
✅ Successfully fetched and extracted: ..\data\raw\JC-202403-citibike-tripdata.csv
✅ Successfully fetched and extracted: ..\data\raw\JC-202404-citibike-tripdata.csv
✅ Successfully fetched and extracted: ..\data\raw\JC-202405-citibike-tripdata.csv
✅ Successfully fetched and extracted: ..\data\raw\JC-202406-citibike-tripdata.csv
✅ Successfully fetched and extracted: ..\data\raw\JC-202407-citibike-tripdata.csv
✅ Successfully fetched and extracted: ..\data\raw\JC-202408-citibike-tripdata.csv
✅ Successfully fetched and extracted: ..\data\raw\JC-202409-citibike-tripdata.csv
✅ Successfully fetched and extracted: ..\data\raw\JC-202410-citibike-tripdata.csv
✅ Successfully fetched and extracted: ..\data\raw\JC-202411-citibike-tripdata.csv
✅ Successfully fetched and extracted: ..\data\raw\JC-202412-citibike-tripdata.csv


In [13]:
import pandas as pd

def show_columns_and_sample(file_path: str, num_records: int = 5):
    """Print the column index and the first rows of a CSV file.

    Args:
        file_path: Location of the CSV file to load with pandas.
        num_records: Number of leading rows to include in the sample
            (forwarded to ``DataFrame.head``).

    Returns:
        None. The report is written to standard output.
    """
    frame = pd.read_csv(file_path)
    preview = frame.head(num_records)
    # A single print with a newline separator emits the same four lines/blocks
    # as four consecutive print() calls would.
    print(
        "Columns in the dataset:",
        frame.columns,
        "\nSample records:",
        preview,
        sep="\n",
    )

# Example usage: preview the January 2024 CSV that the download cell extracted into ../data/raw.
file_path = "../data/raw/JC-202401-citibike-tripdata.csv"  # Relative to the notebook's working directory
show_columns_and_sample(file_path)


Columns in the dataset:
Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual'],
      dtype='object')

Sample records:
            ride_id  rideable_type           started_at             ended_at  \
0  0744109F13385D1D  electric_bike  2024-01-15 15:18:07  2024-01-15 15:32:44   
1  B1488BFEF9118000   classic_bike  2024-01-13 15:32:50  2024-01-13 15:36:18   
2  95A2FE8E51B4C836   classic_bike  2024-01-19 13:11:00  2024-01-19 13:14:44   
3  95D9AFF6A1652DC1   classic_bike  2024-01-23 07:03:49  2024-01-23 07:07:11   
4  5F7408988A83B1B3   classic_bike  2024-01-01 16:46:10  2024-01-01 16:50:31   

  start_station_name start_station_id end_station_name end_station_id  \
0       Morris Canal            JC072      Oakland Ave          JC022   
1  JC Medical Center            JC110    Grove St PATH          JC115   
2       M