# 事前環境設定

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import statsmodels.api as stat
import warnings
from sklearn.metrics import mean_squared_error
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.simplefilter("ignore")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the training data from the working directory; the cells below read its
# Neighborhood, SalePrice and MoSold columns.
df_train = pd.read_csv('./train.csv')

## 例：立地を元のモデルに追加してみる

# 可視化

In [None]:
# Median sale price per neighborhood, sorted ascending, as a horizontal bar chart.
# `kind` is passed by keyword: Series.plot() raises TypeError for positional
# arguments in pandas 1.0 and later.
df_train.groupby('Neighborhood')['SalePrice'].median().sort_values().plot(kind='barh')

In [None]:
# Summary statistics of the per-neighborhood median sale price
neighborhood_median = df_train.groupby('Neighborhood')['SalePrice'].median()
neighborhood_median.describe()

In [None]:
# Label every house 'A' (sale price at or above the threshold) or 'B' (below it)
PRICE_THRESHOLD = 179900
nb_bins = df_train[['Neighborhood', 'SalePrice']].copy()
at_or_above = nb_bins['SalePrice'] >= PRICE_THRESHOLD
below = nb_bins['SalePrice'] < PRICE_THRESHOLD
# Two separate masks (rather than a negation) so a missing price stays unlabelled.
nb_bins.loc[at_or_above, 'bins'] = 'A'
nb_bins.loc[below, 'bins'] = 'B'

In [None]:
# Build the neighborhood -> bin lookup table: one row per neighborhood (in order of
# first appearance in nb_bins) holding the most frequent 'A'/'B' label of its houses.
# A single groupby replaces the original per-row loop built on DataFrame.append,
# which repeated the same work for every house and was removed in pandas 2.0.
nb_list = (
    nb_bins.groupby('Neighborhood', sort=False)['bins']
    # value_counts() is ordered by frequency, so its first index entry is the most
    # common label -- the same value describe()['top'] reports.
    .agg(lambda labels: labels.value_counts().index[0])
    .rename('bin')
    .reset_index()
)
nb_list

In [None]:
# Sanity check: the lookup table has exactly one row per distinct neighborhood
df_train['Neighborhood'].nunique() == len(nb_list)

In [None]:
def preprocess(train_flg,nb_list):
    """Load the train or test CSV and build the feature matrix for the linear model.

    Parameters
    ----------
    train_flg : bool
        True reads './train.csv', removes rows with GrLivArea >= 4000 and returns
        the SalePrice target; False reads './test.csv' and returns the Id column.
    nb_list : pandas.DataFrame
        Lookup table with columns 'Neighborhood' and 'bin'. Each neighborhood name
        found in the table is replaced by its bin label before one-hot encoding;
        names missing from the table are left unchanged.

    Returns
    -------
    tuple
        (df_x, SalePrice) when train_flg is true, otherwise (df_x, Id). df_x holds
        GrLivArea, YearBuilt, OverallCond and one-hot columns for BsmtQual and the
        binned Neighborhood.
    """
    if train_flg:
        df = pd.read_csv('./train.csv')
        # Exclude houses with a living area of 4000 or more from the training rows.
        df = df.loc[~(df['GrLivArea'] >= 4000)]
        df_y = df['SalePrice']
    else:
        df = pd.read_csv('./test.csv')

    cols = ['GrLivArea', 'YearBuilt', 'OverallCond', 'BsmtQual', 'Neighborhood']
    df_x = df.loc[:, cols].copy()

    # Translate neighborhood names to their bin label in one pass.
    nb_to_bin = dict(zip(nb_list['Neighborhood'], nb_list['bin']))
    df_x['Neighborhood'] = df_x['Neighborhood'].replace(nb_to_bin)

    # A missing BsmtQual becomes its own 'NA' category.
    df_x['BsmtQual'] = df_x['BsmtQual'].fillna('NA')

    # Pin the dummy dtype to the historical uint8 default: pandas >= 2.0 returns bool
    # columns otherwise, and the resulting mixed frame converts to an object array.
    df_x = pd.get_dummies(df_x, dtype='uint8')

    if train_flg:
        return df_x, df_y
    return df_x, df['Id']

In [None]:
# Build the training feature matrix and target
df_x, df_y = preprocess(train_flg=True, nb_list=nb_list)

## 線形回帰

In [None]:
# Fit the features with statsmodels OLS and display its regression summary
clf = stat.OLS(endog=df_y, exog=df_x)
result = clf.fit()
result.summary()

In [None]:
# Fit a scikit-learn linear regression and inspect it on the training data
# (the RMSE below is in-sample, not a validation score).
clf=LinearRegression()
clf.fit(df_x,df_y)
# Predict once and reuse the result for both the score and the plot.
y_fit = clf.predict(df_x)
print('RMSE:{0}'.format(rmse(df_y, y_fit)))
# x/y are passed by keyword: seaborn 0.12 and later no longer accept them positionally.
sns.regplot(x=y_fit, y=df_y)
plt.title('pred vs real')
plt.show()
coef=pd.Series(clf.coef_, index = df_x.columns)
# `kind` by keyword: Series.plot() raises TypeError for positional arguments in pandas >= 1.0.
coef.sort_values(ascending=True).plot(kind='barh')
plt.title('Coefficients in the Linear Model')

# テストセットの予測

In [None]:
# Build the test feature matrix together with the Id column for the submission file
df_test_x, df_test_id = preprocess(train_flg=False, nb_list=nb_list)

In [None]:
# Predict prices for the test set with the linear model fitted above and write the CSV.
# Align the test features to the training columns (same set, same order) first:
# a dummy column absent from the test data is filled with 0, and a column the
# model was not trained on is dropped.
df_test_x = pd.get_dummies(df_test_x).reindex(columns=df_x.columns, fill_value=0)
y_pred=clf.predict(df_test_x)
pred_df=pd.DataFrame(y_pred, index=df_test_id, columns=["SalePrice"])
pred_df.to_csv('./output.csv', header=True, index_label='Id')

# Appendix　変換辞書の作成

## 例：販売月（MoSold）

In [None]:
# Number of sales per month as a bar chart. `kind` is passed by keyword because
# Series.plot() raises TypeError for positional arguments in pandas >= 1.0.
df_train.groupby('MoSold')['SalePrice'].count().plot(kind='bar')

In [None]:
# Month of sale (1-12) -> bin label (1, 2 or 3)
mo_bins = {
    1: 1, 2: 1,
    3: 2, 4: 2,
    5: 3, 6: 3, 7: 3,
    8: 2,
    9: 1, 10: 1, 11: 1, 12: 1,
}
def get_bins(x):
    """Return the bin label for month `x`; raises KeyError if `x` is not in mo_bins."""
    return mo_bins[x]
# Preview the binned month-of-sale values (df_train itself is not modified)
df_train['MoSold'].map(get_bins)