In [None]:
!pip install requests
!pip install bs4
!pip install fastparquet
!pip install pandas


Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2
Collecting fastparquet
  Downloading fastparquet-2024.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.8.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading fastparquet-2024.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cramjam-2.8.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cramjam, fastparquet
Successfully installed 

In [None]:
import os
import requests
from bs4 import BeautifulSoup
from time import sleep
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import pandas as pd
from google.colab import drive

# Page that lists the TLC trip-record files, and the Drive folder that receives them
scrape_url = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
drive_dir = "/content/drive/My Drive/nyc_taxi_data_2019"

# Drive must be mounted before the target folder can be created inside it
drive.mount('/content/drive')
os.makedirs(drive_dir, exist_ok=True)

# Shared HTTP session: up to 5 retries with backoff on throttling (429)
# and transient server errors (500/502/503/504), for both URL schemes
retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session = requests.Session()
for _scheme in ('http://', 'https://'):
    session.mount(_scheme, HTTPAdapter(max_retries=retries))

# Function to download a file with retries
def download_file(url, output_path, timeout=(10, 120)):
    """Stream `url` to `output_path` using the shared retrying `session`.

    The payload is first written to ``<output_path>.part`` and only renamed
    to `output_path` once the whole transfer has succeeded, so an interrupted
    download never leaves a truncated file under the final name.

    Args:
        url: Address of the file to fetch.
        output_path: Destination path of the downloaded file.
        timeout: Passed to `session.get`; a ``(connect, read)`` tuple in
            seconds (or a single number). Prevents a stalled connection
            from hanging the notebook indefinitely.

    Returns:
        True if the file was downloaded completely, False if a request
        error occurred (the error is printed, not raised).
    """
    tmp_path = f"{output_path}.part"
    try:
        # The context manager releases the connection even if streaming fails.
        with session.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an error for bad status codes
            with open(tmp_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        # Publish the file under its final name only after a complete transfer.
        os.replace(tmp_path, output_path)
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")
        return False
    finally:
        # After a successful rename the temp file is gone and this is a no-op;
        # on any failure it removes the partial download.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print(f"Successfully downloaded {output_path}")
    return True

# Scrape the website to find the correct URLs
response = session.get(scrape_url, timeout=30)
response.raise_for_status()  # Fail loudly instead of silently parsing an error page
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a', href=True)

# Filter and download the 2019 parquet files for the months listed in `months`
# (currently January only; extend the list, e.g. '02'..'06', to fetch more months)
parquet_files = []
months = ['01']
for link in links:
    # Strip stray whitespace around the href so the suffix check
    # and the derived filename are not thrown off by it
    href = link['href'].strip()
    if any(f'2019-{month}' in href for month in months) and href.endswith('.parquet'):
        filename = href.split('/')[-1]
        url = href
        output_path = os.path.join(drive_dir, filename)
        download_file(url, output_path)
        # Record only files that are actually present on disk, so a failed
        # download does not end up in the list of files to process
        if os.path.isfile(output_path):
            parquet_files.append(output_path)
        sleep(1)  # Sleep for a short time between requests to avoid overloading the server

print("Download completed.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Successfully downloaded /content/drive/My Drive/nyc_taxi_data_2019/yellow_tripdata_2019-01.parquet
Successfully downloaded /content/drive/My Drive/nyc_taxi_data_2019/green_tripdata_2019-01.parquet
Successfully downloaded /content/drive/My Drive/nyc_taxi_data_2019/fhv_tripdata_2019-01.parquet
Download completed.
