# 数据概况
这是美国爱荷华州艾姆斯市（Ames, Iowa）的房价数据，本文的目的是用回归方法预测房价。数据字典见
https://ww2.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt 

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
# Load the tab-separated Ames housing file and preview its first five rows.
AmesHousing = pd.read_csv("AmesHousing.tsv", sep="\t")
AmesHousing.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


## 一 、先制作一个Demo模型

In [3]:
def transform_features(data):
    """Clean the data set (placeholder for the demo model).

    No cleaning is performed yet: the argument is handed back unchanged.

    Args:
        data: The data set to clean.

    Returns:
        The very same object that was passed in.
    """
    return data

In [4]:
def select_features(data):
    """Pick the columns used by the demo model.

    Args:
        data: DataFrame holding at least the ``"Gr Liv Area"`` and
            ``"SalePrice"`` columns.

    Returns:
        A DataFrame with exactly those two columns, in that order.
    """
    demo_columns = ["Gr Liv Area", "SalePrice"]
    return data[demo_columns]

In [5]:
def train_and_test(data, train_rows=1460):
    """Fit a linear regression on the leading rows and score it on the rest.

    Rows containing any missing value are dropped first. Every integer or
    float column except ``SalePrice`` is used as a feature. The first
    ``train_rows`` remaining rows form the training set and all later rows
    form the test set.

    Args:
        data: DataFrame that contains a ``SalePrice`` column.
        train_rows: Number of leading rows used for training.

    Returns:
        Root mean squared error (RMSE) of the predictions on the test rows.

    Raises:
        ValueError: If either the training or the test split is empty.
    """
    # Drop rows that contain missing values.
    data = data.dropna(axis=0)

    # Feature columns: numeric columns only, without the target itself.
    features = data.select_dtypes(include=["integer", "float"]).columns.drop("SalePrice")

    # Positional split into training and test sets.
    train = data.iloc[:train_rows]
    test = data.iloc[train_rows:]
    if train.empty or test.empty:
        raise ValueError(
            "train/test split is empty: %d usable rows, train_rows=%d"
            % (len(data), train_rows)
        )

    # Fit the model and predict the held-out prices.
    lr = linear_model.LinearRegression()
    lr.fit(train[features], train["SalePrice"])
    predictions = lr.predict(test[features])

    # scikit-learn documents the argument order as (y_true, y_pred).
    return mean_squared_error(test["SalePrice"], predictions) ** 0.5

In [6]:
# Run the demo pipeline end to end: pass-through cleaning, selection of the
# single demo feature plus the target, then fitting and scoring the model.
# NOTE(review): `transform_df` is also the name of a function defined later
# in this file; whichever statement runs last decides what the name refers to.
transform_df = transform_features(AmesHousing)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df)
# Bare expression: evaluates to the RMSE computed above.
rmse

57088.25161263909

## 二、调整模型

###### 1、特征工程
- 处理缺失值:
    - 所有的列:
        - 删除缺失值比例超过 5% 的列
    - 文本列:
        - 删除缺失值数量超过 1 个的列
    - 数字列:
        - 用该列中出现次数最多的数值（众数）填充该列的缺失值

In [8]:
def transform_df(data):
    """Clean the housing data and derive two age features.

    The steps are applied in order on a new frame (the argument itself is
    not modified):

    1. Drop every column whose missing-value count exceeds 5% of the rows.
    2. Drop every text (``object``) column with more than one missing value.
    3. Fill gaps in numeric columns with that column's most frequent value
       and gaps in ``"Electrical"`` with ``"SBrkr"``.
    4. Add ``"year sale - Year Remod"`` (``Yr Sold - Year Remod/Add``) and
       ``"year sale - year build"`` (``Yr Sold - Year Built``).
    5. Drop rows where either derived age is negative, since a house cannot
       be sold before it was built or remodelled.

    Args:
        data: DataFrame containing at least the ``"Yr Sold"``,
            ``"Year Remod/Add"`` and ``"Year Built"`` columns.

    Returns:
        The cleaned DataFrame with the two derived columns appended.
    """
    # ---- Drop columns with more than 5% missing values ----
    missing_per_column = data.isnull().sum()
    too_sparse = missing_per_column[missing_per_column > data.shape[0] * 0.05].index
    data = data.drop(too_sparse, axis=1)

    # ---- Drop text columns with more than one missing value ----
    text_missing = data.select_dtypes(include=["object"]).isnull().sum()
    data = data.drop(text_missing[text_missing > 1].index, axis=1)

    # ---- Fill the remaining gaps ----
    # Only columns that actually have gaps need a fill value; this also
    # avoids indexing into an empty value_counts() result.
    numeric_cols = data.select_dtypes(include="number").columns
    fill_values = {
        col: data[col].value_counts().index[0]
        for col in numeric_cols
        if data[col].isnull().any()
    }
    # Keys naming a column that is not present are ignored by fillna.
    fill_values["Electrical"] = "SBrkr"
    data = data.fillna(fill_values)

    # ---- Derived age features ----
    data["year sale - Year Remod"] = data["Yr Sold"] - data["Year Remod/Add"]
    data["year sale - year build"] = data["Yr Sold"] - data["Year Built"]

    # Negative ages are data errors; filter on both derived columns instead
    # of relying on a hard-coded row label.
    negative_age = (data["year sale - Year Remod"] < 0) | (data["year sale - year build"] < 0)
    return data[~negative_age]

###### 2、特征选择
- 数字列中选取相关系数最高的特征
- 将文字列转为虚拟列，然后选取相关系数最高的特征

In [73]:
# Clean the raw data, one-hot encode every text column, and rank all
# numeric and dummy columns by absolute correlation with SalePrice.
data = transform_df(AmesHousing)

da = data.select_dtypes(include="object")
t = pd.get_dummies(da)
data = pd.concat([data, t], axis=1)
# Correlation needs numeric input: keep the numeric and dummy columns only
# and cast them to float so boolean indicators are handled as well.
np.abs(data.select_dtypes(include=["number", "bool"]).astype(float).corr()["SalePrice"]).sort_values()

Foundation_Wood           0.000069
Roof Matl_Metal           0.000177
Sale Type_ConLI           0.000459
Condition 1_RRAn          0.001067
Condition 2_RRAe          0.002136
Sale Type_CWD             0.002787
House Style_2.5Unf        0.004106
Roof Style_Flat           0.004186
Roof Matl_ClyTile         0.004805
Roof Matl_Tar&Grv         0.004872
Lot Config_Corner         0.005558
BsmtFin SF 2              0.006127
Roof Style_Shed           0.006437
Neighborhood_Greens       0.008364
Neighborhood_SawyerW      0.008733
Utilities_NoSeWa          0.010011
Roof Matl_Roll            0.010126
Sale Type_VWD             0.010126
Neighborhood_Landmrk      0.010126
Condition 2_RRAn          0.010148
Roof Style_Mansard        0.011178
Exterior 2nd_BrkFace      0.011822
Lot Config_FR3            0.013769
Roof Matl_Membran         0.014052
Condition 1_RRNn          0.014083
Lot Config_FR2            0.014380
Exterior 1st_BrkFace      0.015505
Kitchen Qual_Po           0.016952
Heating_OthW        

In [9]:
def select_features(data, threshold=0.6):
    """Keep the columns that correlate strongly with ``SalePrice``.

    Only numeric columns take part in the correlation, so text columns in
    ``data`` are ignored rather than causing an error.

    Args:
        data: DataFrame containing a numeric ``SalePrice`` column.
        threshold: Minimum absolute correlation with ``SalePrice`` that a
            column needs in order to be kept.

    Returns:
        The selected columns of ``data``, ordered by increasing absolute
        correlation with ``SalePrice``.
    """
    numeric = data.select_dtypes(include="number")
    strength = numeric.corr()["SalePrice"].abs().sort_values()
    return data[strength[strength >= threshold].index]

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Area,Street,Lot Shape,Land Contour,Utilities,Lot Config,...,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice,year sale - Year Remod,year sale - year build
0,1,526301100,20,RL,31770,Pave,IR1,Lvl,AllPub,Corner,...,0,0,0,5,2010,WD,Normal,215000,50,50
1,2,526350040,20,RH,11622,Pave,Reg,Lvl,AllPub,Inside,...,120,0,0,6,2010,WD,Normal,105000,49,49
2,3,526351010,20,RL,14267,Pave,IR1,Lvl,AllPub,Corner,...,0,0,12500,6,2010,WD,Normal,172000,52,52
3,4,526353030,20,RL,11160,Pave,Reg,Lvl,AllPub,Corner,...,0,0,0,4,2010,WD,Normal,244000,42,42
4,5,527105010,60,RL,13830,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,3,2010,WD,Normal,189900,12,13


- 对于数字类型的列而言，生成相关系数矩阵图
- 将部分列转为分类类型
- 将部分数字列转为分类类型

In [40]:
def train_and_test(data, train_rows=1460):
    """Fit a linear regression on the leading rows and score it on the rest.

    Rows containing any missing value are dropped first. Every integer or
    float column except ``SalePrice`` is used as a feature. The first
    ``train_rows`` remaining rows form the training set and all later rows
    form the test set.

    Args:
        data: DataFrame that contains a ``SalePrice`` column.
        train_rows: Number of leading rows used for training.

    Returns:
        Root mean squared error (RMSE) of the predictions on the test rows.

    Raises:
        ValueError: If either the training or the test split is empty.
    """
    # Drop rows that contain missing values.
    data = data.dropna(axis=0)

    # Feature columns: numeric columns only, without the target itself.
    features = data.select_dtypes(include=["integer", "float"]).columns.drop("SalePrice")

    # Positional split into training and test sets.
    train = data.iloc[:train_rows]
    test = data.iloc[train_rows:]
    if train.empty or test.empty:
        raise ValueError(
            "train/test split is empty: %d usable rows, train_rows=%d"
            % (len(data), train_rows)
        )

    # Fit the model and predict the held-out prices.
    lr = linear_model.LinearRegression()
    lr.fit(train[features], train["SalePrice"])
    predictions = lr.predict(test[features])

    # scikit-learn documents the argument order as (y_true, y_pred).
    return mean_squared_error(test["SalePrice"], predictions) ** 0.5

In [47]:
def select_features(data, threshold=0.6):
    """Keep the columns that correlate strongly with ``SalePrice``.

    Only numeric columns take part in the correlation, so text columns in
    ``data`` are ignored rather than causing an error.

    Args:
        data: DataFrame containing a numeric ``SalePrice`` column.
        threshold: Minimum absolute correlation with ``SalePrice`` that a
            column needs in order to be kept.

    Returns:
        The selected columns of ``data``, ordered by increasing absolute
        correlation with ``SalePrice``.
    """
    numeric = data.select_dtypes(include="number")
    strength = numeric.corr()["SalePrice"].abs().sort_values()
    return data[strength[strength >= threshold].index]

# 训练模型

In [48]:
# Full pipeline: clean the raw data, keep the strongly correlated columns,
# then fit the model and report its RMSE.
data = select_features(transform_df(AmesHousing))
train_and_test(data)

38360.024509138195