In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# Load the bike-sharing dataset and discard the "id" column.
data = pd.read_csv("data/bike.csv").drop(columns="id")

# Keep only the rows whose city code is 1 (Shanghai, per the original notes),
# then drop the "city" column.
data_shanghai = data.loc[data["city"] == 1].drop(columns="city")

# Binarize "hour": 6..18 (inclusive) becomes 1, every other hour becomes 0.
data_shanghai["hour"] = np.where(data_shanghai["hour"].between(6, 18), 1, 0)

# Pull the target column "y" out of the frame (pop also removes it from
# data_shanghai) and shape it as an (n, 1) column vector.
y = data_shanghai.pop("y").to_numpy().reshape(-1, 1)

# The remaining columns form the feature matrix.
X = data_shanghai.to_numpy()

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Min-max scale features and target. Each scaler is fitted on the training
# split only and then applied to both splits.
scaler_X = MinMaxScaler().fit(X_train)
X_train, X_test = scaler_X.transform(X_train), scaler_X.transform(X_test)

scaler_y = MinMaxScaler().fit(y_train)
y_train, y_test = scaler_y.transform(y_train), scaler_y.transform(y_test)

# Fit a linear regression model on the scaled training data.
model = LinearRegression().fit(X_train, y_train)

# Predict on the scaled test features.
y_pred = model.predict(X_test)

# Root-mean-squared error between the scaled test targets and the predictions.
# NOTE: this value is in units of the min-max-scaled target, not the raw "y".
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print("均方根误差（RMSE）:", rmse)

均方根误差（RMSE）: 0.1652799928539963


In [3]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

# Load the iris dataset bundled with scikit-learn.
iris = load_iris()

# Split into training and test sets: test_size=0.2 holds out 20% of the samples
# for testing (the remaining 80% are used for training); random seed is 42.
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

# TODO: scatter plot with sepal length on the x-axis and sepal width on the
# y-axis (not implemented in this cell).

# TODO: PCA dimensionality reduction of the iris features and a visualization
# of the reduced data (not implemented in this cell).

# Train a k-nearest-neighbours classifier with K = 3.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)

# Predict labels for the test set.
y_pred = knn.predict(x_test)

# Classification accuracy = fraction of test samples predicted correctly.
# Computed from y_pred so the test set is not predicted a second time, which
# is what knn.score(x_test, y_test) would do; the value is the same.
accuracy = float((y_pred == y_test).mean())
print("分类准确率：", accuracy)

分类准确率： 1.0
