In [1]:
import pandas as pd
import numpy as np

# Load the raw records from disk.
data = pd.read_csv("bike.csv")
# Preview the first rows to confirm the file was parsed as expected.
print(data.head())

# The 'id' column is not used as a feature, so discard it.
data = data.drop('id', axis=1)
# Confirm the column is gone.
print(data.head())

# Keep only the rows whose 'city' equals 1 (Shanghai, per the original note).
# Once filtered, 'city' is constant, so it is dropped from the new frame.
data_shanghai = data.loc[data['city'] == 1].drop('city', axis=1)
# Confirm the filter and the drop.
print(data_shanghai.head())

# Recode 'hour' as a binary flag: 1 when the hour lies in the closed
# interval [6, 18], otherwise 0.
data_shanghai['hour'] = data_shanghai['hour'].between(6, 18).astype('int64')
# Confirm the recoding.
print(data_shanghai.head())

# Pull the target column 'y' out as an (n, 1) NumPy column vector ...
y_values = data_shanghai['y'].to_numpy().reshape(-1, 1)
# ... and leave only the feature columns in the frame.
data_shanghai = data_shanghai.drop('y', axis=1)
# Confirm both the extraction and the removal.
print(data_shanghai.head())
print(y_values[:5])

# Convert the remaining feature frame into a NumPy array.
data_shanghai_array = data_shanghai.to_numpy()
print(data_shanghai_array[:5])

from sklearn.model_selection import train_test_split

# Features and target as prepared by the preceding steps.
X, Y = data_shanghai_array, y_values

# Hold out 20% of the rows for testing (80/20 split); the fixed seed keeps
# the partition reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each partition.
print("训练集特征大小：", X_train.shape)
print("测试集特征大小：", X_test.shape)
print("训练集目标变量大小：", Y_train.shape)
print("测试集目标变量大小：", Y_test.shape)

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Min-max scale features and target with SEPARATE scalers. Re-fitting a single
# scaler on the target would silently discard the feature fit, making it
# impossible to transform new feature rows later.
feature_scaler = MinMaxScaler()
# Fit on the training features only, then apply the same mapping to the test
# features so no test-set statistics leak into the fit.
X_train_scaled = feature_scaler.fit_transform(X_train)
X_test_scaled = feature_scaler.transform(X_test)

target_scaler = MinMaxScaler()
# Same procedure for the target column vector.
Y_train_scaled = target_scaler.fit_transform(Y_train)
Y_test_scaled = target_scaler.transform(Y_test)

# Backward compatibility: `scaler` used to end up fitted on the target, so keep
# that name bound to the target scaler for any later code that relies on it.
scaler = target_scaler

# Ordinary least-squares regression trained on the scaled training data.
linear_model = LinearRegression()
linear_model.fit(X_train_scaled, Y_train_scaled.flatten())
# Report the fitted coefficients and intercept (in scaled space).
print("模型系数：", linear_model.coef_)
print("模型截距：", linear_model.intercept_)

# Predict in scaled space, then map the predictions back to original units
# using the scaler that was fitted on the target.
Y_pred_scaled = linear_model.predict(X_test_scaled)
Y_pred = target_scaler.inverse_transform(Y_pred_scaled.reshape(-1, 1)).flatten()
# Evaluate against the test target in ORIGINAL units. The untouched Y_test is
# used directly (flattened to 1-D) rather than the scaled copy, so predictions
# and ground truth are on the same scale when the error is computed.
Y_test = Y_test.flatten()

# Root-mean-squared error (RMSE) in the target's original units.
mse = mean_squared_error(Y_test, Y_pred)
rmse = mse ** 0.5
print("均方根误差(RMSE):", rmse)


   id  city  hour  is_workday  weather  temp_air  temp_body  wind   y
0   1     0    22           1        2       3.0        0.7     0  15
1   2     0    10           1        1      21.0       24.9     3  48
2   3     0     0           1        1      25.3       27.4     0  21
3   4     0     7           0        1      15.7       16.2     0  11
4   5     1    10           1        1      21.1       25.0     2  39
   city  hour  is_workday  weather  temp_air  temp_body  wind   y
0     0    22           1        2       3.0        0.7     0  15
1     0    10           1        1      21.0       24.9     3  48
2     0     0           1        1      25.3       27.4     0  21
3     0     7           0        1      15.7       16.2     0  11
4     1    10           1        1      21.1       25.0     2  39
    hour  is_workday  weather  temp_air  temp_body  wind   y
4     10           1        1      21.1       25.0     2  39
5      0           1        1      20.4       18.2     0  12
9