In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import glob

# 获取所有 CSV 文件的路径
file_paths = glob.glob('../data/2019-citibike-tripdata/10_October/201910-citibike-tripdata_*.csv')

# 读取并合并所有 CSV 文件
dataframes = [pd.read_csv(file) for file in file_paths]
citibike_df = pd.concat(dataframes, ignore_index=True)
print(citibike_df.columns)

Index(['tripduration', 'starttime', 'stoptime', 'start station id',
       'start station name', 'start station latitude',
       'start station longitude', 'end station id', 'end station name',
       'end station latitude', 'end station longitude', 'bikeid', 'usertype',
       'birth year', 'gender'],
      dtype='object')


In [2]:
# 计算每个站点的开始和结束骑行次数
start_station_counts = citibike_df.groupby(['start station id', 'start station name']).size().reset_index(name='ride start count')
end_station_counts = citibike_df.groupby(['end station id', 'end station name']).size().reset_index(name='ride end count')

# 合并开始和结束骑行次数
station_activity = pd.merge(start_station_counts, end_station_counts,
                            left_on='start station id', right_on='end station id',
                            how='outer').fillna(0)

# 计算总骑行活动
station_activity['ride activity'] = (station_activity['ride start count'] + station_activity['ride end count']).astype(int)
print(station_activity.head())

   start station id             start station name  ride start count  \
0              72.0               W 52 St & 11 Ave            3518.0   
1              79.0       Franklin St & W Broadway            2703.0   
2              82.0         St James Pl & Pearl St            1460.0   
3              83.0  Atlantic Ave & Fort Greene Pl            2016.0   
4             116.0                W 17 St & 8 Ave            7721.0   

   end station id               end station name  ride end count  \
0              72               W 52 St & 11 Ave            3421   
1              79       Franklin St & W Broadway            2746   
2              82         St James Pl & Pearl St            1450   
3              83  Atlantic Ave & Fort Greene Pl            2071   
4             116                W 17 St & 8 Ave            7839   

   ride activity  
0           6939  
1           5449  
2           2910  
3           4087  
4          15560  


In [3]:
# 统计唯一的起始站点和结束站点的个数
unique_start_stations = station_activity['start station id'].nunique()
unique_end_stations = station_activity['end station id'].nunique()

# 打印结果
print(f"Unique start stations: {unique_start_stations}")
print(f"Unique end stations: {unique_end_stations}")

# 或者，如果你想统计所有站点的总数（不考虑起始或结束），可以合并后去重
all_stations = pd.concat([station_activity['start station id'], station_activity['end station id']]).nunique()
print(f"Total unique stations: {all_stations}")

Unique start stations: 838
Unique end stations: 848
Total unique stations: 849


In [4]:

# 读取 Shapefile
shapefile_path = "../data/nyc2020_census/nyct2020.shp"
census_gdf = gpd.read_file(shapefile_path)
census_gdf

In [5]:
# 确保普查区和站点的 CRS 一致
census_gdf = census_gdf.to_crs(start_station_gdf.crs)

# 执行空间连接，找到每个起始站点所属的普查区
start_stations_in_census = gpd.sjoin(start_station_gdf, census_gdf, how="inner", op="within")

# 执行空间连接，找到每个终点站点所属的普查区
end_stations_in_census = gpd.sjoin(end_station_gdf, census_gdf, how="inner", op="within")

# 计算每个站点的开始和结束骑行次数
start_station_counts = citibike_df.groupby(['start station id', 'start station name']).size().reset_index(name='ride start count')
end_station_counts = citibike_df.groupby(['end station id', 'end station name']).size().reset_index(name='ride end count')

# 合并开始和结束骑行次数
station_activity = pd.merge(start_station_counts, end_station_counts,
                            left_on='start station id', right_on='end station id',
                            how='outer').fillna(0)

# 计算总骑行活动
station_activity['ride activity'] = (station_activity['ride start count'] + station_activity['ride end count']).astype(int)

# 将起始站点的活动与普查区关联，并计算普查区的骑行活动和站点数量
start_stations_with_activity = pd.merge(
    start_stations_in_census,
    station_activity[['start station id', 'ride activity']],
    left_on='start station id',
    right_on='start station id',
    how='left'
)

end_stations_with_activity = pd.merge(
    end_stations_in_census,
    station_activity[['end station id', 'ride activity']],
    left_on='end station id',
    right_on='end station id',
    how='left'
)

# 合并起始站点和终点站点的活动
all_stations_with_activity = pd.concat([
    start_stations_with_activity[['BoroCT2020', 'start station id', 'ride activity']],
    end_stations_with_activity[['BoroCT2020', 'end station id', 'ride activity']]
])

# 统计每个普查区的站点数量
station_count = all_stations_with_activity[['BoroCT2020', 'start station id']].drop_duplicates().groupby('BoroCT2020').size().reset_index(name='station count')

# 按普查区汇总骑行活动
census_ride_activity = all_stations_with_activity.groupby('BoroCT2020')['ride activity'].sum().reset_index()

# 将 shape_area 和 station count 添加到结果中
census_ride_activity = pd.merge(
    census_ride_activity,
    census_gdf[['BoroCT2020', 'Shape_Area']],
    left_on='BoroCT2020',
    right_on='BoroCT2020',
    how='left'
)

census_ride_activity = pd.merge(
    census_ride_activity,
    station_count,
    left_on='BoroCT2020',
    right_on='BoroCT2020',
    how='left'
)

# 输出结果
print(census_ride_activity.head())


NameError: name 'start_station_gdf' is not defined

In [41]:
# 加载人口统计数据
population_data = pd.read_csv("../data/nyc_censusdata_2020.csv")

# 检查数据加载情况
print(population_data.head())
# 去除逗号并将人口列转换为整数
population_data['Pop1'] = population_data['Pop1'].str.replace(',', '').astype(int)
# 将 BCT2020 字段转换为字符串类型
population_data['BCT2020'] = population_data['BCT2020'].astype(str)
# 检查数据格式
print(population_data.dtypes)


   Year GeoType Borough        GeoID  BCT2020   Pop1  Male P  FemP PopU5  \
0  2020  CT2020   Bronx  36005000100  2000100  3,772    94.4   5.6     3   
1  2020  CT2020   Bronx  36005000200  2000200  4,779    47.7  52.3   234   
2  2020  CT2020   Bronx  36005000400  2000400  6,272    47.3  52.7   275   
3  2020  CT2020   Bronx  36005001600  2001600  5,795    43.4  56.6   240   
4  2020  CT2020   Bronx  36005001901  2001901  2,292    50.4  49.6   158   

  Pop5t9  ... Pop80t84 Pop85pl MdAge PopU18 Pop65pl GQClgHsg    Fam  HUnits  \
0      1  ...        6       1  32.8     12      47        0      0       1   
1    273  ...       61      96  37.2  1,065     599        0  1,133   1,594   
2    374  ...       69      64  38.5  1,337     770        0  1,606   2,200   
3    366  ...      131     143  40.2  1,251     986        0  1,380   2,129   
4    152  ...        9       6  32.7    567      79        0    501   1,049   

   OcHU_1P  AvgHHSz  
0      NaN      NaN  
1     95.2     3.15  
2 

In [42]:
census_ride_activity.rename(columns={'BoroCT2020': 'BCT2020'}, inplace=True)
census_ride_activity['BCT2020'] = census_ride_activity['BCT2020'].astype(str)

# 合并骑行活动数据和人口统计数据
merged_data = pd.merge(population_data, census_ride_activity, on='BCT2020', how='left')
# 检查合并后的数据
print(merged_data.head())

   Year GeoType Borough        GeoID  BCT2020  Pop1  Male P  FemP PopU5  \
0  2020  CT2020   Bronx  36005000100  2000100  3772    94.4   5.6     3   
1  2020  CT2020   Bronx  36005000200  2000200  4779    47.7  52.3   234   
2  2020  CT2020   Bronx  36005000400  2000400  6272    47.3  52.7   275   
3  2020  CT2020   Bronx  36005001600  2001600  5795    43.4  56.6   240   
4  2020  CT2020   Bronx  36005001901  2001901  2292    50.4  49.6   158   

  Pop5t9  ... PopU18 Pop65pl GQClgHsg    Fam HUnits OcHU_1P  AvgHHSz  \
0      1  ...     12      47        0      0      1     NaN      NaN   
1    273  ...  1,065     599        0  1,133  1,594    95.2     3.15   
2    374  ...  1,337     770        0  1,606  2,200    95.9     2.97   
3    366  ...  1,251     986        0  1,380  2,129    95.9     2.73   
4    152  ...    567      79        0    501  1,049    94.2     2.29   

   ride activity  Shape_Area  station count  
0            NaN         NaN            NaN  
1            NaN        

In [43]:
merged_data.to_csv("merged_census_ride_activity.csv", index=False)

In [44]:
filtered_data = merged_data.dropna(subset=['ride activity','Shape_Area'])
filtered_data.to_csv("filtered_data_ride_activity.csv", index=False)

In [52]:
# 创建副本以避免 SettingWithCopyWarning
filtered_data = filtered_data.copy()

# 将可能包含字符串的数值列转换为数字类型
columns_to_convert = [
    'Pop15t19', 'Pop20t24', 'Pop25t29', 'Pop30t34', 'Pop35t39', 'Pop40t44',
    'Pop45t49', 'Pop50t54', 'Pop55t59', 'Pop60t64', 'PopU18', 'Pop65pl',
    'HUnits', 'Fam', 'AvgHHSz', 'GQClgHsg', 'ride activity', 'Shape_Area'
]

# 去除逗号并转换为数字
for col in columns_to_convert:
    filtered_data[col] = (
        filtered_data[col]
        .replace({',': ''}, regex=True)  # 去除逗号
        .astype(float)  # 转换为浮点数
    )


# 计算 pop19to64
filtered_data['pop19to64'] = (
    filtered_data['Pop15t19'] + filtered_data['Pop20t24'] + filtered_data['Pop25t29'] +
    filtered_data['Pop30t34'] + filtered_data['Pop35t39'] + filtered_data['Pop40t44'] +
    filtered_data['Pop45t49'] + filtered_data['Pop50t54'] + filtered_data['Pop55t59'] +
    filtered_data['Pop60t64']
)

# 将 station count 列重命名为 num_stations（如果存在）
filtered_data.rename(columns={'station count': 'num_stations'}, inplace=True)

# 准备自变量和因变量
variables = [
    'PopU18', 'pop19to64', 'Pop65pl', 'HUnits', 'Fam',
    'AvgHHSz', 'GQClgHsg', 'Shape_Area', 'num_stations'
]
target = 'ride activity'

# 创建一个新的 DataFrame，只包含自变量和因变量
final_data = filtered_data[variables + [target]]
final_data.fillna(0, inplace=True)
# 保存到 CSV 文件
final_data.to_csv('linear_model_data.csv', index=False)

print("数据已保存到 'linear_model_data.csv'")


数据已保存到 'linear_model_data.csv'


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data.fillna(0, inplace=True)


In [53]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# 自变量和因变量
X = final_data[variables]  # 自变量
y = final_data[target]     # 因变量

# 初始化模型
model = LinearRegression()

# 拟合模型
model.fit(X, y)

# 输出模型系数和截距
print("截距项:", model.intercept_)
print("回归系数:", model.coef_)

# 预测
y_pred = model.predict(X)

# 计算 R² 和均方误差
print("R²:", r2_score(y, y_pred))
print("均方误差 (MSE):", mean_squared_error(y, y_pred))


截距项: -20143375.746277392
回归系数: [ 8.65145434e+04 -3.59147675e+04  8.53407467e+03  1.21548420e+05
 -2.60700830e+05 -2.59702870e+07  1.21212890e+05 -3.68619069e+00
  5.92091401e+07]
R²: 0.33814776659456725
均方误差 (MSE): 2.3408791326254436e+16


In [56]:
for idx, col in enumerate(model.coef_):
    print(variables[idx].ljust(20), col)


PopU18               86514.54338162004
pop19to64            -35914.76754649207
Pop65pl              8534.074670093802
HUnits               121548.41953447071
Fam                  -260700.83040967246
AvgHHSz              -25970286.95563022
GQClgHsg             121212.8895676617
Shape_Area           -3.6861906888816063
num_stations         59209140.07654531
