In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 載入資料集

In [2]:
# Load in the train datasets
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')

In [4]:
print(train.shape)
train.head(1)

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500


In [5]:
print(test.shape)
test.head(1)

(1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal


In [6]:
# 組合train和test做前處理
# data[data.Id>1460]分割train和test
data = pd.concat([train, test], axis=0, sort=False)
data['SalePrice'] = data['SalePrice'].fillna(0.0) #隨便補結果，不要是nan
data = data.reset_index() #重新設定index，以免產生錯誤
data = data.drop(columns=['index'])#刪除index欄位
data.shape

(2919, 81)

# Data clean - 缺值處理

In [7]:
#觀察缺值狀況
data.isnull().sum().sort_values(ascending=False).head(3)

PoolQC         2909
MiscFeature    2814
Alley          2721
dtype: int64

In [8]:
#超過一定比例則刪除
drop_feature = 0.8
data = data.dropna(thresh=drop_feature*data.shape[0],axis=1)
data.shape

(2919, 76)

In [9]:
#觀察缺值狀況
data.isnull().sum().sort_values(ascending=False).head(3)

LotFrontage     486
GarageYrBlt     159
GarageFinish    159
dtype: int64

In [10]:
#觀查單筆資料缺失狀況
#num_na_data = pd.DataFrame({ 'num': [data.isnull()[i:i+1].sum().sum() for i in range(len(data))] })
#num_na_data['num'].value_counts()

In [11]:
#補眾數
for i in range(data.shape[1]):
    data[data.keys()[i]] = data[data.keys()[i]].fillna(data[data.keys()[i]].mode()[0])

In [12]:
#觀查單筆資料缺失狀況
#num_na_data = pd.DataFrame({ 'num': [data.isnull()[i:i+1].sum().sum() for i in range(len(data))] })
#num_na_data['num'].value_counts()

# One hot

In [13]:
#剩餘的特徵數量
num_feature = data.shape[1]
feature = data.keys().copy()
num_feature

76

In [14]:
#當資料是文字，進行one-hot-encoder
for i in range(num_feature):
    if isinstance(data.values[0,i], str): #type是str時 one-hot-encoder
        df = pd.get_dummies(data.values[:,i])
        data = pd.concat([data, df], axis=1, sort=False)
        df.iloc[0:0]

In [15]:
#觀查one-hot後的結果
data.shape

(2919, 310)

In [16]:
#刪除文字特徵
i = 0
while i < num_feature:
    if isinstance(data.values[0,i], str): #type是str時 one-hot-encoder
        data = data.drop(columns=[data.keys()[i]])
    else:
        i = i + 1;

In [17]:
#觀查刪除後的結果
data.shape

(2919, 272)

In [18]:
#再切分為train和test
train = data[data.Id<=1460]
test = data[data.Id>1460].drop(columns=['SalePrice']) #清除亂補的解答

# 分割資料+套入模型

In [19]:
#分割資料
X = train.drop(columns=['SalePrice'])
y = train['SalePrice']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=100)

In [20]:
# svm
#from sklearn.svm import SVR
#model = SVR(kernel='linear', C=1)
#model = SVR(kernel='poly', C=0.1, degree=2)

In [21]:
# xgboost
import xgboost
model = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)

In [22]:
#訓練+預測資料
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

ValueError: could not convert string to float: 'Normal'

In [None]:
#觀察結果
from sklearn import metrics
mae = metrics.mean_absolute_error(y_pred, y_test)
mse = metrics.mean_squared_error(y_pred, y_test)
print('mae = ', mae)
print('mse = ', mse)
print('rms = ', mse**0.5)
print('R2:%.2f'%metrics.r2_score(y_pred, y_test))

# 產生預測資料

In [None]:
pred = model.predict(test.values)

In [None]:
submission = pd.read_csv('Data/submission.csv')
pd.DataFrame({ 'Id': submission.Id, 'SalePrice':pred })
StackingSubmission = pd.DataFrame({ 'Id': submission.Id, 'SalePrice': pred })
StackingSubmission.to_csv("Data/submission.csv", index=False)