# 数据概况
这是美国爱荷华州艾姆斯市（Ames, Iowa）的房价数据，目标是用回归方法预测房价。数据字典见：
https://ww2.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt 

In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
# Load the tab-separated Ames housing file and preview its first five rows.
AmesHousing = pd.read_csv("AmesHousing.tsv", sep="\t")
AmesHousing.head(n=5)

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [38]:
def transform_features(data):
    """Clean the data set.

    Currently a placeholder: the input is handed back untouched (the very
    same object, no copy is made).

    Args:
        data: The data set to clean.

    Returns:
        The cleaned data set.
    """
    cleaned = data
    return cleaned

In [39]:
def select_features(data):
    """Select the columns used for modelling.

    Args:
        data: The data set; it is indexed with a list holding the
            "Gr Liv Area" and "SalePrice" labels.

    Returns:
        The result of that two-column selection, in that order.
    """
    selected_columns = ["Gr Liv Area", "SalePrice"]
    return data[selected_columns]

In [40]:
def train_and_test(data, train_size=1460):
    """Fit a linear regression on the numeric features and return the RMSE.

    Rows containing any missing value are dropped first. The first
    ``train_size`` remaining rows form the training set and the rest form
    the test set. Every integer/float column other than "SalePrice" is used
    as a feature; "SalePrice" is the target.

    Args:
        data: DataFrame with a "SalePrice" column plus the feature columns.
        train_size: Number of leading rows (after dropping incomplete rows)
            used for training. Defaults to 1460.

    Returns:
        Root mean squared error of the predictions on the test rows.

    Raises:
        ValueError: If the training or the test split has no rows.
    """
    # Drop rows that contain missing values.
    data = data.dropna(axis=0)

    # Feature columns: numeric dtypes only, excluding the target itself.
    features = data.select_dtypes(include=["integer", "float"]).columns.drop("SalePrice")

    # Positional split; .iloc makes it explicit that rows are taken by
    # position, since dropna leaves gaps in the index labels.
    train = data.iloc[:train_size]
    test = data.iloc[train_size:]
    if len(train) == 0 or len(test) == 0:
        raise ValueError(
            "train_and_test needs non-empty train and test splits; got "
            f"{len(train)} training rows and {len(test)} test rows"
        )

    # Fit the model and predict on the held-out rows.
    lr = linear_model.LinearRegression()
    lr.fit(train[features], train["SalePrice"])
    predictions = lr.predict(test[features])

    # scikit-learn's signature is (y_true, y_pred). A float exponent avoids
    # relying on true division for the square root.
    return mean_squared_error(test["SalePrice"], predictions) ** 0.5

In [41]:
# Run the pipeline end to end: clean the raw data, select the feature
# columns, then train/test and display the resulting RMSE.
transform_df = transform_features(AmesHousing)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df)
rmse

57088.25161263909

# 特征工程
- 处理缺失值:
    - 所有的列:
        - 删除缺失值超过5%的列
    - 文本列:
        - 删除含有 1 个或更多缺失值的列
    - 数字列:
        - 用该列中出现次数最多的值（众数）来填充该列的缺失值

删除缺失值超过5%的列

In [42]:
# Total number of rows in the data set.
AmesHousing_rows_num = len(AmesHousing)

# Series mapping each column name to its count of missing values.
miss_values_rows_num_serise = AmesHousing.isna().sum()

# Drop every column whose missing-value count exceeds 5% of the rows.
cols = miss_values_rows_num_serise.loc[
    miss_values_rows_num_serise.gt(AmesHousing_rows_num * 0.05)
].index
t = AmesHousing.drop(columns=cols)

t.shape

(2930, 71)

删除文本列中含有 1 个或更多缺失值的列

# 特征选择

# 训练模型