In [1]:
!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m712.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.67.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Cell 1: Import required libraries and setup
import os
import requests
from pathlib import Path
from tqdm.notebook import tqdm
import pandas as pd
from datetime import datetime

print("Libraries imported successfully")


Libraries imported successfully


In [3]:

# Cell 2: Configuration
DATA_PATH = "/home/iceberg/notebooks/data"
YEAR = "2023"
BASE_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data"
MONTHS = [f"{i:02d}" for i in range(1, 13)]  # 01 through 12

# Create data directory if it doesn't exist
data_dir = Path(DATA_PATH)
data_dir.mkdir(exist_ok=True)

print(f"Data directory: {DATA_PATH}")
print(f"Year to download: {YEAR}")
print(f"Total months to process: {len(MONTHS)}")


Data directory: /home/iceberg/notebooks/data
Year to download: 2023
Total months to process: 12


In [4]:

# Cell 3: Download function
def download_file(url, filepath, description="Downloading"):
    """Download a file with progress bar"""
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        
        # Get file size for progress bar
        file_size = int(response.headers.get('content-length', 0))
        
        # Show download progress
        with filepath.open('wb') as f, tqdm(
            desc=description,
            total=file_size,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as pbar:
            for data in response.iter_content(chunk_size=8192):
                size = f.write(data)
                pbar.update(size)
                
        return True, None
    except Exception as e:
        return False, str(e)

# Cell 4: Download all files
download_summary = []

for month in MONTHS:
    filename = f"yellow_tripdata_{YEAR}-{month}.parquet"
    filepath = data_dir / filename
    url = f"{BASE_URL}/{filename}"
    
    print(f"\nProcessing {filename}")
    
    # Check if file already exists
    if filepath.exists():
        size_mb = filepath.stat().st_size / (1024 * 1024)
        print(f"File already exists ({size_mb:.1f} MB), skipping...")
        download_summary.append({
            'Month': month,
            'Status': 'Exists',
            'Size (MB)': f"{size_mb:.1f}",
            'Error': None
        })
        continue
    
    # Download file
    print(f"Downloading from {url}")
    success, error = download_file(url, filepath, description=f"Downloading {filename}")
    
    # Record result
    if success:
        size_mb = filepath.stat().st_size / (1024 * 1024)
        download_summary.append({
            'Month': month,
            'Status': 'Downloaded',
            'Size (MB)': f"{size_mb:.1f}",
            'Error': None
        })
    else:
        download_summary.append({
            'Month': month,
            'Status': 'Failed',
            'Size (MB)': 'N/A',
            'Error': error
        })

# Cell 5: Verify downloads
def verify_parquet_files():
    """Verify all downloaded parquet files"""
    verification_results = []
    
    files = sorted(data_dir.glob(f"yellow_tripdata_{YEAR}-*.parquet"))
    
    for file in files:
        size_mb = file.stat().st_size / (1024 * 1024)
        
        try:
            # Try to read the first row
            df = pd.read_parquet(file, rows=1)
            status = "Valid"
            error = None
        except Exception as e:
            status = "Invalid"
            error = str(e)
        
        verification_results.append({
            'File': file.name,
            'Size (MB)': f"{size_mb:.1f}",
            'Status': status,
            'Error': error
        })
    
    return pd.DataFrame(verification_results)

# Show download summary
print("\nDownload Summary:")
download_df = pd.DataFrame(download_summary)
print(download_df.to_string(index=False))

# Verify files
print("\nVerifying downloaded files:")
verification_df = verify_parquet_files()
print(verification_df.to_string(index=False))

# Show total size
total_size_gb = sum(
    file.stat().st_size for file in data_dir.glob(f"yellow_tripdata_{YEAR}-*.parquet")
) / (1024**3)
print(f"\nTotal size of downloaded files: {total_size_gb:.2f} GB")

# Cell 6: Preview data
def preview_data():
    """Preview the data from a downloaded file"""
    # Try to read the most recent file
    files = sorted(data_dir.glob(f"yellow_tripdata_{YEAR}-*.parquet"))
    if not files:
        print("No files found to preview")
        return
    
    latest_file = files[-1]
    print(f"\nPreviewing data from {latest_file.name}")
    
    try:
        df = pd.read_parquet(latest_file)
        print("\nDataset shape:", df.shape)
        print("\nColumns:", df.columns.tolist())
        print("\nFirst few rows:")
        print(df.head())
        print("\nData types:")
        print(df.dtypes)
    except Exception as e:
        print(f"Error previewing data: {str(e)}")

# Preview the data
preview_data()


Processing yellow_tripdata_2023-01.parquet
Downloading from https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet


Downloading yellow_tripdata_2023-01.parquet:   0%|          | 0.00/45.5M [00:00<?, ?iB/s]


Processing yellow_tripdata_2023-02.parquet
Downloading from https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet


Downloading yellow_tripdata_2023-02.parquet:   0%|          | 0.00/45.5M [00:00<?, ?iB/s]


Processing yellow_tripdata_2023-03.parquet
Downloading from https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet


Downloading yellow_tripdata_2023-03.parquet:   0%|          | 0.00/53.5M [00:00<?, ?iB/s]


Processing yellow_tripdata_2023-04.parquet
Downloading from https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-04.parquet


Downloading yellow_tripdata_2023-04.parquet:   0%|          | 0.00/51.7M [00:00<?, ?iB/s]


Processing yellow_tripdata_2023-05.parquet
Downloading from https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-05.parquet


Downloading yellow_tripdata_2023-05.parquet:   0%|          | 0.00/55.9M [00:00<?, ?iB/s]


Processing yellow_tripdata_2023-06.parquet
Downloading from https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-06.parquet


Downloading yellow_tripdata_2023-06.parquet:   0%|          | 0.00/52.5M [00:00<?, ?iB/s]


Processing yellow_tripdata_2023-07.parquet
Downloading from https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-07.parquet


Downloading yellow_tripdata_2023-07.parquet:   0%|          | 0.00/46.1M [00:00<?, ?iB/s]


Processing yellow_tripdata_2023-08.parquet
Downloading from https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-08.parquet


Downloading yellow_tripdata_2023-08.parquet:   0%|          | 0.00/45.9M [00:00<?, ?iB/s]


Processing yellow_tripdata_2023-09.parquet
Downloading from https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-09.parquet


Downloading yellow_tripdata_2023-09.parquet:   0%|          | 0.00/45.7M [00:00<?, ?iB/s]


Processing yellow_tripdata_2023-10.parquet
Downloading from https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-10.parquet


Downloading yellow_tripdata_2023-10.parquet:   0%|          | 0.00/56.3M [00:00<?, ?iB/s]


Processing yellow_tripdata_2023-11.parquet
Downloading from https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-11.parquet


Downloading yellow_tripdata_2023-11.parquet:   0%|          | 0.00/53.5M [00:00<?, ?iB/s]


Processing yellow_tripdata_2023-12.parquet
Downloading from https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-12.parquet


Downloading yellow_tripdata_2023-12.parquet:   0%|          | 0.00/54.2M [00:00<?, ?iB/s]


Download Summary:
Month     Status Size (MB) Error
   01 Downloaded      45.5  None
   02 Downloaded      45.5  None
   03 Downloaded      53.5  None
   04 Downloaded      51.7  None
   05 Downloaded      55.9  None
   06 Downloaded      52.5  None
   07 Downloaded      46.1  None
   08 Downloaded      45.9  None
   09 Downloaded      45.7  None
   10 Downloaded      56.3  None
   11 Downloaded      53.5  None
   12 Downloaded      54.2  None

Verifying downloaded files:
                           File Size (MB)  Status                                                  Error
yellow_tripdata_2023-01.parquet      45.5 Invalid read_table() got an unexpected keyword argument 'rows'
yellow_tripdata_2023-02.parquet      45.5 Invalid read_table() got an unexpected keyword argument 'rows'
yellow_tripdata_2023-03.parquet      53.5 Invalid read_table() got an unexpected keyword argument 'rows'
yellow_tripdata_2023-04.parquet      51.7 Invalid read_table() got an unexpected keyword argument 'rows