In [61]:
pip install beautifulsoup4 lxml

Note: you may need to restart the kernel to use updated packages.


In [62]:
import requests
from bs4 import BeautifulSoup
import re
import os

In [63]:
url = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

In [64]:
os.makedirs('data', exist_ok=True)

In [65]:
# 使用 requests 获取网页内容
response = requests.get(url)
if response.status_code != 200:
    print("请求失败，状态码:", response.status_code)
    exit()

In [66]:
# 使用 BeautifulSoup 解析网页内容
soup = BeautifulSoup(response.text, 'lxml')


Yellow_Taxi_pattern = re.compile(r'yellow_tripdata_202[0-4]-\d{2}.parquet', re.IGNORECASE)
HVFHV_pattern = re.compile(r'fhvhv_tripdata_202[0-4]-\d{2}.parquet', re.IGNORECASE)

# 查找所有符合 Yellow Taxi 的链接
Yellow_Taxi_links = [link.get('href') for link in soup.find_all('a', href=Yellow_Taxi_pattern)]
HVFHV_links = [link.get('href') for link in soup.find_all('a', href=HVFHV_pattern)]

# 合并两个链接列表
all_links = Yellow_Taxi_links + HVFHV_links

# 下载每个 Parquet 文件
for link in all_links:
    # 如果链接是相对路径，则将其转换为完整 URL
    file_url = link if link.startswith('http') else url + link
    file_name = os.path.join('data', file_url.split('/')[-1])
    
    # 下载文件
    print(f"Downloading {file_name} ...")
    file_response = requests.get(file_url)
    
    # 将文件内容写入本地文件
    with open(file_name, 'wb') as file:
        file.write(file_response.content)
    print(f"{file_name} 下载完成！")

Downloading data\yellow_tripdata_2024-01.parquet  ...
data\yellow_tripdata_2024-01.parquet  下载完成！
Downloading data\yellow_tripdata_2024-02.parquet  ...
data\yellow_tripdata_2024-02.parquet  下载完成！
Downloading data\yellow_tripdata_2024-03.parquet  ...
data\yellow_tripdata_2024-03.parquet  下载完成！
Downloading data\yellow_tripdata_2024-04.parquet ...
data\yellow_tripdata_2024-04.parquet 下载完成！
Downloading data\yellow_tripdata_2024-05.parquet ...
data\yellow_tripdata_2024-05.parquet 下载完成！
Downloading data\yellow_tripdata_2024-06.parquet ...
data\yellow_tripdata_2024-06.parquet 下载完成！
Downloading data\yellow_tripdata_2024-07.parquet ...
data\yellow_tripdata_2024-07.parquet 下载完成！
Downloading data\yellow_tripdata_2024-08.parquet ...
data\yellow_tripdata_2024-08.parquet 下载完成！
Downloading data\yellow_tripdata_2023-01.parquet ...
data\yellow_tripdata_2023-01.parquet 下载完成！
Downloading data\yellow_tripdata_2023-02.parquet ...
data\yellow_tripdata_2023-02.parquet 下载完成！
Downloading data\yellow_tripdata_2

In [67]:
pip install fastparquet

Note: you may need to restart the kernel to use updated packages.


In [68]:
pip install pyarrow




In [69]:
from pathlib import Path

# 获取文件的绝对路径
file_path = Path("your_file.txt").resolve()
print(file_path)

# 获取当前工作目录
current_directory = Path.cwd()
print(current_directory)


C:\Users\Lenovo\作业\4501\your_file.txt
C:\Users\Lenovo\作业\4501


In [70]:
import pandas as pd

try:
    df = pd.read_parquet(r'C:\Users\Lenovo\作业\4501\data\yellow_tripdata_2020-12.parquet', engine='pyarrow')
    print(df.head())
except Exception as e:
    print(f"读取 Parquet 文件失败: {e}")

   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         1  2020-12-01 00:07:13   2020-12-01 00:18:12              1.0   
1         1  2020-12-01 00:41:19   2020-12-01 00:49:45              1.0   
2         2  2020-12-01 00:33:40   2020-12-01 01:00:35              1.0   
3         2  2020-12-01 00:02:15   2020-12-01 00:13:09              1.0   
4         2  2020-12-01 00:37:42   2020-12-01 00:45:11              1.0   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0           7.60         1.0                  N           138           263   
1           1.60         1.0                  N           140           263   
2          16.74         2.0                  N           132           164   
3           4.16         1.0                  N           238            48   
4           2.22         1.0                  N           238            41   

   payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \


In [71]:
import pandas as pd

try:
    df = pd.read_parquet(r'C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_2020-12.parquet', engine='pyarrow')
    print(df.head())
except Exception as e:
    print(f"读取 Parquet 文件失败: {e}")

  hvfhs_license_num dispatching_base_num originating_base_num  \
0            HV0003               B02764               B02764   
1            HV0003               B02764               B02764   
2            HV0005               B02510                 None   
3            HV0003               B02883               B02883   
4            HV0003               B02883               B02883   

     request_datetime   on_scene_datetime     pickup_datetime  \
0 2020-12-01 00:08:29 2020-12-01 00:11:29 2020-12-01 00:13:22   
1 2020-12-01 00:44:34 2020-12-01 00:47:00 2020-12-01 00:47:19   
2 2020-12-01 00:10:54                 NaT 2020-12-01 00:17:14   
3 2020-11-30 23:58:37 2020-12-01 00:00:01 2020-12-01 00:01:16   
4 2020-12-01 00:25:44 2020-12-01 00:30:41 2020-12-01 00:32:03   

     dropoff_datetime  PULocationID  DOLocationID  trip_miles  ...  sales_tax  \
0 2020-12-01 00:33:53            94            75        6.90  ...       1.79   
1 2020-12-01 00:57:01            75           164       

In [72]:
import pandas as pd
import glob

# 定义 Uber 标识符
uber_identifiers = ['HV0003']

# 获取所有 Parquet 文件路径
file_paths = glob.glob(r'C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_202[0-4]-*.parquet')

# 用于存储筛选后的数据
filtered_data = []

# 遍历文件路径并进行筛选
for file_path in file_paths:
    try:
        # 尝试读取文件
        df = pd.read_parquet(file_path, engine='pyarrow')
        
        # 筛选出包含 Uber 行程的数据
        uber_data = df[df['hvfhs_license_num'].isin(uber_identifiers)]
        
        # 将筛选后的数据写回原文件（覆盖）
        uber_data.to_parquet(file_path, index=False, engine='pyarrow')
        
        print(f"筛选后的 Uber 数据已写入文件 {file_path}，共 {len(uber_data)} 行数据。")
    
    except Exception as e:
        print(f"无法读取文件 {file_path}：{e}")


筛选后的 Uber 数据已写入文件 C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_2020-01.parquet，共 14582520 行数据。
筛选后的 Uber 数据已写入文件 C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_2020-02.parquet，共 15743610 行数据。
筛选后的 Uber 数据已写入文件 C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_2020-03.parquet，共 9836781 行数据。
筛选后的 Uber 数据已写入文件 C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_2020-04.parquet，共 3102835 行数据。
筛选后的 Uber 数据已写入文件 C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_2020-05.parquet，共 4359377 行数据。
筛选后的 Uber 数据已写入文件 C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_2020-06.parquet，共 5114308 行数据。
筛选后的 Uber 数据已写入文件 C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_2020-07.parquet，共 7081522 行数据。
筛选后的 Uber 数据已写入文件 C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_2020-08.parquet，共 7856499 行数据。
筛选后的 Uber 数据已写入文件 C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_2020-09.parquet，共 8847755 行数据。
筛选后的 Uber 数据已写入文件 C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_2020-10.parquet，共 9797775 行数据。
筛选后的 Uber 数据已写入文件 C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_2020-1

In [73]:
import pandas as pd
import glob
import os

# 获取所有 HVFHV 数据集的文件路径
file_paths = glob.glob(r'C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_202[0-4]-*.parquet')

# 创建新文件夹 'Sample_data'，如果不存在则创建
sample_folder = r'C:\Users\Lenovo\作业\4501\Sample_data'
os.makedirs(sample_folder, exist_ok=True)

# 抽样大小
sample_size = 385

# 遍历每个文件
for file_path in file_paths:
    try:
        # 读取文件
        df = pd.read_parquet(file_path, engine='pyarrow')
        
        # 筛选 Uber 行程数据
        uber_data = df[df['hvfhs_license_num'].isin(uber_identifiers)]
        
        # 随机抽样 385 行
        sample_data = uber_data.sample(n=sample_size, random_state=42)
        
        # 创建新文件路径
        file_name = os.path.basename(file_path)
        sample_file_path = os.path.join(sample_folder, file_name)
        
        # 将抽样数据写入新文件夹中的同名文件
        sample_data.to_parquet(sample_file_path, index=False, engine='pyarrow')
        
        print(f"{file_path} 中已抽取 {sample_size} 行数据，并保存至 {sample_file_path}")
    
    except Exception as e:
        # 跳过无法读取的文件，并打印错误信息
        print(f"无法处理文件 {file_path}：{e}")


C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_2020-01.parquet 中已抽取 385 行数据，并保存至 C:\Users\Lenovo\作业\4501\Sample_data\fhvhv_tripdata_2020-01.parquet
C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_2020-02.parquet 中已抽取 385 行数据，并保存至 C:\Users\Lenovo\作业\4501\Sample_data\fhvhv_tripdata_2020-02.parquet
C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_2020-03.parquet 中已抽取 385 行数据，并保存至 C:\Users\Lenovo\作业\4501\Sample_data\fhvhv_tripdata_2020-03.parquet
C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_2020-04.parquet 中已抽取 385 行数据，并保存至 C:\Users\Lenovo\作业\4501\Sample_data\fhvhv_tripdata_2020-04.parquet
C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_2020-05.parquet 中已抽取 385 行数据，并保存至 C:\Users\Lenovo\作业\4501\Sample_data\fhvhv_tripdata_2020-05.parquet
C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_2020-06.parquet 中已抽取 385 行数据，并保存至 C:\Users\Lenovo\作业\4501\Sample_data\fhvhv_tripdata_2020-06.parquet
C:\Users\Lenovo\作业\4501\data\fhvhv_tripdata_2020-07.parquet 中已抽取 385 行数据，并保存至 C:\Users\Lenovo\作业\4501\Sample_data\fhvhv_tripdata_2

In [59]:
import pandas as pd
import glob
import os


# 获取所有 HVFHV 数据集的文件路径
file_paths = glob.glob(r'C:\Users\Lenovo\作业\4501\data\yellow_tripdata_202[0-4]-*.parquet')

# 创建新文件夹 'Sample_data'，如果不存在则创建
sample_folder = r'C:\Users\Lenovo\作业\4501\Sample_data_yellow'
os.makedirs(sample_folder, exist_ok=True)

# 抽样大小
sample_size = 385

# 遍历每个文件
for file_path in file_paths:
    try:
        # 读取文件
        df = pd.read_parquet(file_path, engine='pyarrow')
        
        # 随机抽样 385 行
        sample_data = uber_data.sample(n=sample_size, random_state=42)
        
        # 创建新文件路径
        file_name = os.path.basename(file_path)
        sample_file_path = os.path.join(sample_folder, file_name)
        
        # 将抽样数据写入新文件夹中的同名文件
        sample_data.to_parquet(sample_file_path, index=False, engine='pyarrow')
        
        print(f"{file_path} 中已抽取 {sample_size} 行数据，并保存至 {sample_file_path}")
    
    except Exception as e:
        # 跳过无法读取的文件，并打印错误信息
        print(f"无法处理文件 {file_path}：{e}")

C:\Users\Lenovo\作业\4501\data\yellow_tripdata_2020-01.parquet 中已抽取 385 行数据，并保存至 C:\Users\Lenovo\作业\4501\Sample_data_yellow\yellow_tripdata_2020-01.parquet
C:\Users\Lenovo\作业\4501\data\yellow_tripdata_2020-02.parquet 中已抽取 385 行数据，并保存至 C:\Users\Lenovo\作业\4501\Sample_data_yellow\yellow_tripdata_2020-02.parquet
C:\Users\Lenovo\作业\4501\data\yellow_tripdata_2020-03.parquet 中已抽取 385 行数据，并保存至 C:\Users\Lenovo\作业\4501\Sample_data_yellow\yellow_tripdata_2020-03.parquet
C:\Users\Lenovo\作业\4501\data\yellow_tripdata_2020-04.parquet 中已抽取 385 行数据，并保存至 C:\Users\Lenovo\作业\4501\Sample_data_yellow\yellow_tripdata_2020-04.parquet
C:\Users\Lenovo\作业\4501\data\yellow_tripdata_2020-05.parquet 中已抽取 385 行数据，并保存至 C:\Users\Lenovo\作业\4501\Sample_data_yellow\yellow_tripdata_2020-05.parquet
C:\Users\Lenovo\作业\4501\data\yellow_tripdata_2020-06.parquet 中已抽取 385 行数据，并保存至 C:\Users\Lenovo\作业\4501\Sample_data_yellow\yellow_tripdata_2020-06.parquet
C:\Users\Lenovo\作业\4501\data\yellow_tripdata_2020-07.parquet 中已抽取 385 行数据，并保

In [60]:
try:
    df = pd.read_parquet(r'C:\Users\Lenovo\作业\4501\Sample_data_yellow\yellow_tripdata_2020-12.parquet', engine='pyarrow')
    print(df.head())
except Exception as e:
    print(f"读取 Parquet 文件失败: {e}")

  hvfhs_license_num dispatching_base_num originating_base_num  \
0            HV0003               B03404               B03404   
1            HV0003               B03404               B03404   
2            HV0003               B03404               B03404   
3            HV0003               B03404               B03404   
4            HV0003               B03404               B03404   

     request_datetime   on_scene_datetime     pickup_datetime  \
0 2024-08-26 19:27:31 2024-08-26 19:30:27 2024-08-26 19:30:47   
1 2024-08-10 14:01:58 2024-08-10 14:05:53 2024-08-10 14:06:14   
2 2024-08-25 09:11:10 2024-08-25 09:12:41 2024-08-25 09:14:29   
3 2024-08-01 22:01:46 2024-08-01 22:08:17 2024-08-01 22:08:33   
4 2024-08-15 11:56:40 2024-08-15 11:57:10 2024-08-15 11:58:30   

     dropoff_datetime  PULocationID  DOLocationID  trip_miles  ...  sales_tax  \
0 2024-08-26 19:33:49           123           123        0.54  ...       0.79   
1 2024-08-10 14:48:38           213           164       

In [43]:
# 加载包含区域信息的 Shapefile 文件
zone_gdf = gpd.read_file("taxi_zones.shp")

# 查看列名
print(zone_gdf.columns)


Index(['OBJECTID', 'Shape_Leng', 'Shape_Area', 'zone', 'LocationID', 'borough',
       'geometry'],
      dtype='object')


In [46]:
import pandas as pd
import geopandas as gpd
import glob
import os

# 加载包含区域信息的shapefile文件
zone_gdf = gpd.read_file("taxi_zones.shp")

# 计算每个区域的中心点，获得经纬度
zone_gdf['centroid'] = zone_gdf.geometry.centroid
zone_gdf['latitude'] = zone_gdf['centroid'].y
zone_gdf['longitude'] = zone_gdf['centroid'].x

# 选择需要的列：location_id, latitude, longitude
zone_gdf = zone_gdf[['LocationID', 'latitude', 'longitude']]


In [57]:
import pandas as pd
import geopandas as gpd
import glob
import os

# 指定包含所有 Parquet 文件的文件夹路径
folder_path = r'C:\Users\Lenovo\作业\4501\Sample_data'
all_files = glob.glob(os.path.join(folder_path, "*.parquet"))

# 加载 Shapefile 文件并计算每个区域的质心，获取经纬度
zone_gdf = gpd.read_file("nyc_zones.shp")
zone_gdf['centroid'] = zone_gdf.geometry.centroid
zone_gdf['latitude'] = zone_gdf['centroid'].y
zone_gdf['longitude'] = zone_gdf['centroid'].x

# 确保列名与 Parquet 文件中的一致
zone_gdf = zone_gdf[['LocationID', 'latitude', 'longitude']]

# 定义经纬度范围
lat_min, lon_min = 40.560445, -74.242330
lat_max, lon_max = 40.908524, -73.717047

for file in all_files:
    # 加载 Parquet 文件
    df = pd.read_parquet(file)
    
    # 将区域中心点与数据中的位置 ID 匹配，获取经纬度
    df = df.merge(zone_gdf, how='left', left_on='PULocationID', right_on='LocationID', suffixes=('', '_pickup'))
    df = df.merge(zone_gdf, how='left', left_on='DOLocationID', right_on='LocationID', suffixes=('_pickup', '_dropoff'))

    # 删除起点和终点相同且距离为零的记录
    df = df[~((df['PULocationID'] == df['DOLocationID']) & (df['trip_miles'] == 0))]

    # 保留必要的列
    relevant_columns = [
        'VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'trip_miles', 
        'fare_amount', 'latitude_pickup', 'longitude_pickup', 'latitude_dropoff', 'longitude_dropoff'
    ]
    df = df[relevant_columns]

    # 重命名列
    df.rename(columns={
        'tpep_pickup_datetime': 'PUDatetime',
        'tpep_dropoff_datetime': 'DODatetime',
        'trip_miles': 'trip_distance',
        'fare_amount': 'fare'
    }, inplace=True)

    # 设置每列合适的数据类型
    df['PUDatetime'] = pd.to_datetime(df['PUDatetime'])
    df['DODatetime'] = pd.to_datetime(df['DODatetime'])
    df['trip_distance'] = df['trip_distance'].astype(float)
    df['fare'] = df['fare'].astype(float)

    # 过滤掉不在指定经纬度范围内的记录
    df = df[
        (df['latitude_pickup'].between(lat_min, lat_max)) & 
        (df['longitude_pickup'].between(lon_min, lon_max)) &
        (df['latitude_dropoff'].between(lat_min, lat_max)) &
        (df['longitude_dropoff'].between(lon_min, lon_max))
    ]

    # 将清理后的数据保存为新的 Parquet 文件，文件名前加上前缀
    output_file = os.path.join(folder_path, "cleaned_" + os.path.basename(file))
    df.to_parquet(output_file, index=False)
    print(f"清理完成并保存: {output_file}")


KeyError: "['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'fare_amount'] not in index"

In [None]:
weather_df = pd.read_csv("daily_weather_data.csv")  # 加载每日天气数据

weather_df['date'] = pd.to_datetime(weather_df['date'])

weather_df.set_index('date', inplace=True)
hourly_weather_df = weather_df.resample('H').interpolate(method='linear')

hourly_weather_df.to_csv("hourly_weather_data.csv")
