In [22]:
#工作综述：现在，我们拥有两份类似于表格的数据，我们称之为训练集与测试集。他们都记录了波士顿不同年份的房子信息，信息包括，房子的大小、售价、类型等等信息
#这些信息或有所缺漏、或有错误，但大体上是详尽且准确的。现在我们要做的是，利用训练集的数据，训练出一个房价预测模型，能够预测未来的房价
#而测试集的数据则用来评判训练出的模型的好坏
#基本的思路是，首先这是一个回归问题，我们罗列出经典的几类回归算法，通过简单的数据处理后，让他们各自经由训练集的训练得出相应的模型并在测试集中检验结果
#我们选出表现较好的模型，再定义一个计算最优参数的函数，计算能够使得该模型发挥最大作用的参数以此得到最优的测试结果



from sklearn import tree
from sklearn import linear_model
from sklearn import svm
from sklearn import neighbors
from sklearn import ensemble
import torch
from torch import nn
from sklearn.model_selection import cross_val_score
import numpy as np
#from import 语句就是经典的调用语句，sklearn是一个机器学习的库，里面有六大任务模块：分别是分类、回归、聚类、降维、模型选择和预处理方便我们对于不同的实际问题调用不同的深度学习算法。

#下面即是将本实验所需的算法进行罗列并将其放置到相应的变量中。以决策树回归为例，tree.DecisionTreeRegressor()即决策树回归算法，random_state=50为其中的参数，这里参数可以设定为任意值，之后的程序中会有对于最优参数的寻找，即在怎样的参数下，该模型能有最好的表现。此处我们先罗列常用的模型，找出在该类问题中表现较好的模型，之后再分析表现较好的模型的最优参数
#决策树回归
model_DT = tree.DecisionTreeRegressor(random_state=50)
#线性回归
model_Line = linear_model.LinearRegression()
#SVM回归
model_SVR = svm.SVR()
#knn回归
model_KNN = neighbors.KNeighborsRegressor()
#随机森林回归
model_RFR = ensemble.RandomForestRegressor(random_state=30)
#Adaboot回归
model_ABR = ensemble.AdaBoostRegressor(random_state=70)
#GBRT回归
model_GBR = ensemble.GradientBoostingRegressor(random_state=8)
#criterion='friedman_mse',learning_rate=1,max_depth=1,max_features='log2',min_samples_leaf=1,subsample=0.5,loss='ls',
#Bagging回归
model_BR = ensemble.BaggingRegressor(random_state=7)
#ExtraTree极端随机树回归
model_ETR = tree.ExtraTreeRegressor(random_state=33)

In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
loss = nn.MSELoss()#定义一个均方损失函数的计算方法
# def rmse(actual,predict):
#     predict = np.array(predict)
#     actual = np.array(actual)
#
#     distance = predict - actual
#
#     square_distance = distance ** 2
#
#     mean_square_distance = square_distance.mean()
#
#     score = np.sqrt(mean_square_distance)
#
#     return score

def rmse(actual, predict):

    # .clamp里面，predict为输入值，1为输出下限，float定义了输出的数据类型，为了在取对数时进一步稳定该值，将小于1的值设置为1
    clipped_preds = torch.clamp(predict, 1, float('inf'))
    rmse = torch.sqrt(loss(torch.log(clipped_preds),
                           torch.log(actual)))
    return rmse.item()

rmse_score = make_scorer(rmse,greater_is_better=True)

In [25]:
#此处定义的函数即为找寻到特定模型下最优参数的函数
def get_best_model_and_accuracy(model, params, X, y):
    grid = GridSearchCV(model, # 要搜索的模型
                        params, # 要尝试的参数
                        scoring='neg_mean_squared_log_error',n_jobs=6,#固定的格式，scoring：以均方差开根号的平均值为评判标准。n_jobs表示运用计算机的6核进行处理
                        error_score=0.) # 如果报错，结果是0
    grid.fit(X, y) # 拟合模型和参数
    #对最终的结果的各类指标进行输出
    # 经典的性能指标
    print("Best Accuracy: {}".format(np.sqrt(-grid.best_score_)))
    # 得到最佳准确率的最佳参数
    print("Best Parameters: {}".format(grid.best_params_))
    # 拟合的平均时间（秒）
    print("Average Time to Fit (s): {}".format(round(grid.cv_results_['mean_fit_time'].mean(), 3)))
    # 预测的平均时间（秒）
    # 从该指标可以看出模型在真实世界的性能
    print("Average Time to Score (s): {}".format(round(grid.cv_results_['mean_score_time'].mean(), 3)))

In [26]:
#下面即对于现有的数据集进行简单的处理
import numpy as np
import pandas as pd
#分别将训练数据集和测试数据集读入train_data与test_data中
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
#此处即为将数据集和测试集结合在一起。iloc为根据标签的所在位置，从0开始计数，先选取行再选取列，由于在拼接时.csv的第一列为ID我们不需要，所以列数从第二列也就是“1”开始计数。
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
# 若无法获得测试数据，则可根据训练数据计算均值和标准差。.index为建立索引列表。这部分什么意思呢，我们的数据集本身不可能是全部都是规范的数字的。就比方说，是否有阳台会影响房屋价格可是这个量仅能回答“是与否”，计算机不懂这样的语言，于是我们将这类的非数字类型的单列出来自成一列，“是”设为1，”否“设为0，以此形成正确输入。同时，因为每种数字数据的大小不同，就像“距离市中心的远近”这一指标可以达到数千，而房屋面积大多是几百，大的数据的输入肯定会对结果造成较大的影响，这样不同数据之间的影响程度就不一样了，于是我们将其全部减去同类的平均值再除以其平均数，进行-1到1的统一化。
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))
# 在标准化数据之后，所有均值消失，因此我们通过。fillna可以将缺失值设置为0
all_features[numeric_features] = all_features[numeric_features].fillna(0)
# “Dummy_na=True”将“na”（缺失值）视为有效的特征值，并为其创建指示符特征
all_features = pd.get_dummies(all_features, dummy_na=True)
#上面部分是将训练集与测试集合并起来一起处理，下面的操作即是将二者分开
#我们首先通过shape得到训练集的行数接着以此行数为标准对现有的大的数据集进行切割得到训练集与测试集
all_features.shape
n_train = train_data.shape[0]
train_features = all_features[:n_train].values
test_features = all_features[n_train:].values
train_labels = train_data.SalePrice.values.reshape(-1, 1)

In [126]:
#下面即是对最开始列出的几种常规模型进行五折交叉验证，即随机将数据集分为5堆；选取一堆作为测试集，另外四堆作为训练集进行训练与验证，如此往复五次，每次选取的训练集不同，以此得出该模型的最终得分。


score = cross_val_score(model_DT,train_features,train_labels,cv=5,scoring='neg_mean_squared_log_error')
np.sqrt(-score).mean()

0.20639236316034765

In [127]:
score = cross_val_score(model_Line,train_features,train_labels,cv=5,scoring='neg_mean_squared_log_error')
np.sqrt(-score).mean()

Traceback (most recent call last):
  File "D:\application\anaconda3\envs\d2l\lib\site-packages\sklearn\model_selection\_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "D:\application\anaconda3\envs\d2l\lib\site-packages\sklearn\metrics\_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "D:\application\anaconda3\envs\d2l\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "D:\application\anaconda3\envs\d2l\lib\site-packages\sklearn\metrics\_regression.py", line 506, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.

Traceback (most recent call last):
  File "D:\application\anaconda3\envs\d2l\lib\site-packages\sklearn\model_selection\_validation.py", line 762, in _score
    scores = scorer(estimator, X_

nan

In [128]:
score = cross_val_score(model_SVR,train_features,train_labels,cv=5,scoring='neg_mean_squared_log_error')
np.sqrt(-score).mean()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.3993968370871633

In [129]:
score = cross_val_score(model_KNN,train_features,train_labels,cv=5,scoring='neg_mean_squared_log_error')
np.sqrt(-score).mean()

0.17242272128404598

In [130]:
score = cross_val_score(model_RFR,train_features,train_labels,cv=5,scoring='neg_mean_squared_log_error')
np.sqrt(-score).mean()

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


0.1468865191224885

In [131]:
score = cross_val_score(model_ABR,train_features,train_labels,cv=5,scoring='neg_mean_squared_log_error')
np.sqrt(-score).mean()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.19971055178606142

In [132]:
score = cross_val_score(model_GBR,train_features,train_labels,cv=5,scoring='neg_mean_squared_log_error')
np.sqrt(-score).mean()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.13147763236668816

In [133]:
score = cross_val_score(model_BR,train_features,train_labels,cv=5,scoring='neg_mean_squared_log_error')
np.sqrt(-score).mean()

  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)
  return column_or_1d(y, warn=True)


0.15593678824631066

In [134]:
score = cross_val_score(model_ETR,train_features,train_labels,cv=5,scoring='neg_mean_squared_log_error')
np.sqrt(-score).mean()

0.23009804916085633

In [84]:
#找到了在该种情况下表现较好的模型之后，就可以进一步找到该模型的最优参数来进行准确率的进一步提高。为了方便，此处列写了所有模型的最优参数的寻找方式。
#以model_RFR为例，通过参考其操作手册，列写出其相应的指标tree_params在利用get_best_model_and_accuracy函数进行最优参数的寻找
tree_params = {'max_depth':[None, 1, 3, 5, 7,10]}

get_best_model_and_accuracy(model_RFR,tree_params,train_features,train_labels)
# line_params = {'fit_intercept':[True,False],'normalize':[True,False]}
# get_best_model_and_accuracy(model_Line,line_params,train_features,train_labels)
# # svr_params = {'kernel':['linear','poly','rbf','sigmoid','precomputed'],'degree':[1,3,5],'epsilon':[0.05,0.1,0.15],'shrinking':[True,False]}
# # get_best_model_and_accuracy(model_SVR,svr_params,train_features,train_labels)
# knn_params = {'n_neighbors':[2,4,5,7,9],'weights':['uniform','distance'],'leaf_size':[15,30,45]}
# get_best_model_and_accuracy(model_KNN,knn_params,train_features,train_labels)
# rfr_param= { "n_estimators"      : [250, 300,400],
#            "criterion"         : ["gini", "entropy"],
#            "max_features"      : [3, 5],
#            "max_depth"         : [3,5,8,10,15, 20],
#            "min_samples_split" : [2, 4,6] ,
#            "bootstrap": [True, False]}
# get_best_model_and_accuracy(model_RFR,rfr_param,train_features,train_labels)


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

Best Accuracy: 0.1470840596206003
Best Parameters: {'max_depth': None}
Average Time to Fit (s): 0.884
Average Time to Score (s): 0.008


In [85]:
line_params = {'fit_intercept':[True,False],'normalize':[True,False]}
get_best_model_and_accuracy(model_Line,line_params,train_features,train_labels)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


Traceback (most recent call last):
  File "D:\application\anaconda3\envs\d2l\lib\site-packages\sklearn\model_selection\_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "D:\application\anaconda3\envs\d2l\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "D:\application\anaconda3\envs\d2l\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  F

Best Accuracy: -0.0
Best Parameters: {'fit_intercept': True, 'normalize': True}
Average Time to Fit (s): 0.091
Average Time to Score (s): 0.001


In [89]:
svr_params = {'kernel':['linear','poly','rbf','sigmoid'],'degree':[1,3,5],'epsilon':[0.05,0.1,0.15],'shrinking':[True,False]}
get_best_model_and_accuracy(model_SVR,svr_params,train_features,train_labels)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best Accuracy: 0.3593221639228858
Best Parameters: {'degree': 1, 'epsilon': 0.15, 'kernel': 'linear', 'shrinking': True}
Average Time to Fit (s): 0.1
Average Time to Score (s): 0.03


  y = column_or_1d(y, warn=True)


In [87]:
knn_params = {'n_neighbors':[2,4,5,7,9],'weights':['uniform','distance'],'leaf_size':[15,30,45]}
get_best_model_and_accuracy(model_KNN,knn_params,train_features,train_labels)

Best Accuracy: 0.1698474854335671
Best Parameters: {'leaf_size': 15, 'n_neighbors': 9, 'weights': 'distance'}
Average Time to Fit (s): 0.002
Average Time to Score (s): 0.009


In [88]:
rfr_param= { "n_estimators"      : [250, 300,400],
           # "criterion"         : ["gini", "entropy"],
           "max_features"      : [3, 5],
           "max_depth"         : [3,5,8,10,15, 20],
           "min_samples_split" : [2, 4,6] ,
           "bootstrap": [True, False]}
get_best_model_and_accuracy(model_RFR,rfr_param,train_features,train_labels)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

Best Accuracy: 0.15939510687748476
Best Parameters: {'bootstrap': False, 'max_depth': 20, 'max_features': 5, 'min_samples_split': 2, 'n_estimators': 400}
Average Time to Fit (s): 0.291
Average Time to Score (s): 0.024


In [92]:
abr_param={'loss':['linear','square','exponential'],'n_estimators':[20,30,50,70,80],'learning_rate':[0.3,0.5,0.7,0.9,1]}
get_best_model_and_accuracy(model_ABR,abr_param,train_features,train_labels)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best Accuracy: 0.19565318659202202
Best Parameters: {'learning_rate': 0.5, 'loss': 'exponential', 'n_estimators': 70}
Average Time to Fit (s): 0.414
Average Time to Score (s): 0.008


In [14]:
gbr_param ={'loss':['ls','lad','huber','quantile'],'learning_rate':[0.001, 0.01, 0.1, 1],'subsample':[0.5, 0.6, 0.7, 0.8, 0.9, 1],'criterion':['friedman_mse','mse'],'max_depth':[3, 5, 8, 15, 25, 30, None],'min_samples_leaf':[1, 2, 5, 10],'max_features':['log2', 'sqrt', None]}
get_best_model_and_accuracy(model_GBR,gbr_param,train_features,train_labels)

Best Accuracy: -0.0
Best Parameters: {'criterion': 'friedman_mse', 'learning_rate': 1, 'loss': 'ls', 'max_depth': 3, 'max_features': 'log2', 'min_samples_leaf': 1, 'subsample': 0.5}
Average Time to Fit (s): 1.907
Average Time to Score (s): 0.002


  y = column_or_1d(y, warn=True)


In [27]:
model_GBR.fit(train_features,train_labels)
result = model_GBR.predict(test_features)

  y = column_or_1d(y, warn=True)


In [136]:
#此代码在找到最优模型与参数之后运行，以最优的模型与参数计算出预测的房价
# model_RFR.fit(train_features,train_labels)
# result = model_RFR.predict(test_features)
# model_Lin = linear_model.LinearRegression(normalize=True)
# model_Lin.fit(train_features,train_labels)
# result = model_Lin.predict(test_features)
# result





In [28]:
#最后，回归开始处理的数据集上来，将预测出的结果插入到表格中完成工作。
test_data['SalePrice'] = pd.Series(result.reshape(1, -1)[0])
submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
submission.to_csv('submission.csv', index=False)

array([127440.66, 154730.5 , 181318.03, ..., 152345.  , 112422.  ,
       224918.13])

In [1]:
!conda env list

# conda environments:
#
base                     D:\application\anaconda3
d2l                   *  D:\application\anaconda3\envs\d2l
pytorch                  D:\application\anaconda3\envs\pytorch
scrape                   D:\application\anaconda3\envs\scrape
venv                     D:\application\anaconda3\envs\venv

