In [1]:
# このノートブックでは、baselineとの違いを分かりやすくするため、変更した点にのみ注釈を付けている

import pandas as pd
train = pd.read_csv("../../data/kaggle/house_prices/train.csv")
test = pd.read_csv("../../data/kaggle/house_prices/test.csv")
sample =  pd.read_csv("../../data/kaggle/house_prices/sample_submission.csv")

In [2]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
sample.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [4]:
numeric_train = train.select_dtypes(include=["int64", "float64"])

# improvedでは、数字列のみではなく、ダミー変数を使用し数字列に変換できる文字列も使用
# そのために文字のパターンが限られる(数字列に変換しやすい文字列)を特定する

object_train = train.select_dtypes(include=["object"])
object_train.nunique().sort_values()

Street            2
Alley             2
Utilities         2
CentralAir        2
MasVnrType        3
LandSlope         3
PavedDrive        3
GarageFinish      3
PoolQC            3
ExterQual         4
BsmtQual          4
BsmtExposure      4
BsmtCond          4
KitchenQual       4
LandContour       4
LotShape          4
Fence             4
MiscFeature       4
ExterCond         5
Electrical        5
HeatingQC         5
BldgType          5
MSZoning          5
LotConfig         5
GarageQual        5
GarageCond        5
FireplaceQu       5
Heating           6
BsmtFinType1      6
BsmtFinType2      6
RoofStyle         6
Foundation        6
GarageType        6
SaleCondition     6
Functional        7
HouseStyle        8
RoofMatl          8
Condition2        8
SaleType          9
Condition1        9
Exterior1st      15
Exterior2nd      16
Neighborhood     25
dtype: int64

In [5]:
row_pattern = object_train.nunique()[object_train.nunique() < 10].index
# .indexはindexの名前を取り出す

# 何故必要？

# row_pattern =object_train.nunique()[object_train.nunique() < 10].index
# のrow_pattern = object_train.nunique()[object_train.nunique() < 10]はindexに列名があるだけではなく、数値を持つSeries。
# よって.idexが無しだとdummy_object = pd.get_dummies(object_train[row_pattern],drop_first=True,dtype=int)
# のobject_train[row_pattern]では、列名＋数値の、列名として解釈できないSeriesを渡されてしまい、エラーが起こる。

# よって、.idexを付ける事で、
# row_pattern = object_train.nunique()[object_train.nunique() < 10]から、
# indexの列名のみを取り出し(つまり実質的に数値の方を排除し)列名を参照させている

dummy_object = pd.get_dummies(object_train[row_pattern],drop_first=True,dtype=int)
# 文字列を0,1で表現
# ダミー変数を1列減らす(→一列無くても判断は出来るので)

# 上のコードは
# row_pattern = object_train.loc[:, object_train.nunique() < 10]
# dummy_object = pd.get_dummies(row_pattern, drop_first=True, dtype=int)
# のように書くことができ、可読性ではこちらの方が高い

# しかし上記の様に、変数にSeriesを代入し、pd.get_dummies(df[変数],…)とする方法には以下のメリットが存在する
# 再利用、再検証しやすい

#loc[行条件、列条件]   :は全て利用

numeric_train = pd.concat([numeric_train, dummy_object], axis=1)
# 今回はnumeric_trainとdummy_objectの行の数が等しいため、pd.mergeよりpd.concatの方が適切

In [6]:
numeric_train.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,0,1,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,0,1,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,0,1,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,0,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,0,1,0,0,0,1,0


In [7]:
print(len(numeric_train))
numeric_train.isnull().sum().sort_values(ascending=False)

1460


LotFrontage              259
GarageYrBlt               81
MasVnrArea                 8
Id                         0
LotArea                    0
                        ... 
SaleCondition_AdjLand      0
SaleCondition_Alloca       0
SaleCondition_Family       0
SaleCondition_Normal       0
SaleCondition_Partial      0
Length: 193, dtype: int64

In [8]:
# baselineでは1列ずつ欠損を中央値で補完
# しかし、この方法では欠損のある列が多い程、作業が増える
# よって今回は、数値列の欠損を一括で補完

numeric_train = numeric_train.fillna(numeric_train.median())
numeric_train.isnull().sum().sort_values(ascending=False)

Id                       0
MSSubClass               0
LotFrontage              0
LotArea                  0
OverallQual              0
                        ..
SaleCondition_AdjLand    0
SaleCondition_Alloca     0
SaleCondition_Family     0
SaleCondition_Normal     0
SaleCondition_Partial    0
Length: 193, dtype: int64

In [9]:
import numpy as np
from sklearn import linear_model
import sklearn.model_selection

X = numeric_train
y = np.log1p(numeric_train["SalePrice"])

del X["SalePrice"]

X_train,X_test,y_train,y_test = sklearn.model_selection.train_test_split(X,y,random_state=0)
model = linear_model.LinearRegression()
model.fit(X_train,y_train)

print(model.score(X_test,y_test))
print(model.score(X_train,y_train))
# 評価用での精度が約64%,学習用での精度が約95%
# → 過学習/baseline同様にチューニングを試みる

0.6397218198583694
0.945274262122169


In [10]:
from sklearn.linear_model import Ridge

model = Ridge(alpha=17.2)
model.fit(X_train, y_train)
# 係数が大きくなりすぎたら、ペナルティを与え過学習を抑制

print(model.score(X_test, y_test))
print(model.score(X_train, y_train))
# 評価用が約78%,学習用が約92%に　　alphaは17.2が最も精度が高かった

0.7817385960856161
0.9170165420846024


In [11]:
numeric_test = test.select_dtypes(include=["int64", "float64"])
object_test = test.select_dtypes(include=["object"])
dummy_object_test = pd.get_dummies(object_test[row_pattern],drop_first=True,dtype=int)
# 先程のrow_patternを再利用
numeric_test = pd.concat([numeric_test, dummy_object_test], axis=1)
numeric_test = numeric_test.fillna(numeric_test.median())
numeric_test.isnull().sum().sort_values(ascending=False)

Id                       0
MSSubClass               0
LotFrontage              0
LotArea                  0
OverallQual              0
                        ..
SaleCondition_AdjLand    0
SaleCondition_Alloca     0
SaleCondition_Family     0
SaleCondition_Normal     0
SaleCondition_Partial    0
Length: 177, dtype: int64

In [12]:
# Lenth(列の数)がtrainの時と不一致 → 合わせる

numeric_test= numeric_test.reindex(columns=X.columns, fill_value=0)
# numeric_testの列をXの列(つまりtrainの列)に合わせる

# fill_value=0 　ダミー変数では存在しない物は0で表すため、
# trainに合わせて増えた列(カテゴリの存在しない列)を0で埋めても意味は破綻しない

In [13]:
numeric_test = numeric_test.fillna(numeric_test.median())
numeric_test.isnull().sum().sort_values(ascending=False)

Id                       0
MSSubClass               0
LotFrontage              0
LotArea                  0
OverallQual              0
                        ..
SaleCondition_AdjLand    0
SaleCondition_Alloca     0
SaleCondition_Family     0
SaleCondition_Normal     0
SaleCondition_Partial    0
Length: 192, dtype: int64

In [14]:
pred_log = model.predict(numeric_test)
pred = np.expm1(pred_log)

In [15]:
submission_improved = pd.DataFrame({"Id": test["Id"],"SalePrice": pred})
submission_improved.to_csv("../../outputs/kaggle_house_price/submission_improved.csv", index=False)