# In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# Pipeline: load the bike CSV, keep the rows with city == 1, binarise the
# hour, fit a linear regression on min-max scaled data and print the test RMSE.

# 1. Load the data.
file_path = r"C:\Users\Lenovo\Desktop\数据导论\bike.csv"
data = pd.read_csv(file_path)

# 2. Drop the unused "id" column.
data = data.drop(columns=["id"])

# 3. Keep only the rows with city == 1 (Shanghai, per the original notes),
#    then drop the now-constant "city" column.
data = data[data["city"] == 1].drop(columns=["city"])

# 4. Binarise "hour": 6..18 (inclusive) -> 1, every other hour -> 0.
#    `between` is inclusive on both ends by default and is vectorized, unlike
#    a row-wise `apply` with a Python lambda; missing values map to 0 either way.
data["hour"] = data["hour"].between(6, 18).astype(int)

# 5. Take the "y" column as the label, shaped as an (n, 1) column vector.
#    `pop` returns the column and removes it from the feature frame.
y = data.pop("y").to_numpy().reshape(-1, 1)

# 6. The remaining columns are the feature matrix.
X = data.to_numpy()

# 7. 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 8. Min-max normalisation. The scalers are fitted on the training split only
#    and then applied to the test split, so no test statistics leak into training.
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

y_train = scaler_y.fit_transform(y_train)
y_test = scaler_y.transform(y_test)

# 9. Fit the linear regression model.
model = LinearRegression()
model.fit(X_train, y_train)

# 10. Predict on the test split.
y_pred = model.predict(X_test)

# Map predictions and ground truth back to the original label scale so the
# error is reported in the label's own units.
y_pred_actual = scaler_y.inverse_transform(y_pred)
y_test_actual = scaler_y.inverse_transform(y_test)

# Root mean squared error on the original scale.
rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred_actual))
print(f"RMSE: {rmse}")



# Output: RMSE: 31.568478635113294
