# 事前環境設定

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import statsmodels.api as stat
import warnings
from sklearn.metrics import mean_squared_error
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)
# Suppress every warning for the rest of the session.
warnings.simplefilter("ignore")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the training and test sets from the current working directory.
df_train, df_test = (pd.read_csv(path) for path in ('./train.csv', './test.csv'))

# 相関係数表

In [None]:
# Pairwise correlation matrix of the numeric columns.
# The frame is restricted to numeric dtypes explicitly: pandas 2.0+ no longer
# drops string columns silently inside corr() and raises on them instead.
corrmat = df_train.select_dtypes(include=[np.number]).corr()
corrmat

In [None]:
# Draw the correlation matrix as a heatmap.
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=0.8, square=True, ax=ax)
plt.show()

## 例：価格と強い相関のある変数（数値型）10個抽出、中の一つを前回のモデルに追加

In [None]:
# Rank the columns by the absolute value of their correlation with SalePrice.
# The absolute values are computed on the fly rather than written back into
# corrmat, so re-running the heatmap cell still shows the signed correlations.
# NOTE(review): the heading above mentions 10 variables but only the top 4 are
# shown here, and SalePrice itself (correlation 1.0) is one of them -- confirm
# the intended count.
ana_list = corrmat['SalePrice'].abs().sort_values().tail(4)
ana_list.plot(kind='barh')

# 可視化

In [None]:
# Scatter plot with SalePrice on the x-axis and GarageCars on the y-axis.
plt.scatter(x=df_train['SalePrice'], y=df_train['GarageCars'])
plt.show()

In [None]:
# Median SalePrice for each GarageCars value.
df_train.groupby('GarageCars', as_index=False)['SalePrice'].median()

In [None]:
# Median GarageArea for each GarageCars value.
df_train.groupby('GarageCars', as_index=False)['GarageArea'].median()

In [None]:
# Number of training rows whose GarageCars equals 4.
len(df_train[df_train['GarageCars'] == 4])

In [None]:
# Histogram of GarageCars in the training set.
plt.hist(df_train['GarageCars'])
plt.show()
# Mean GarageArea per GarageCars value, train and test drawn on the same axes.
plt.plot(df_train.groupby('GarageCars')['GarageArea'].mean())
plt.plot(df_test.groupby('GarageCars')['GarageArea'].mean())

In [None]:
# Remove the rows with GarageCars == 4 in place, then renumber the index from 0.
four_car_rows = df_train.index[df_train['GarageCars'] == 4]
df_train.drop(index=four_car_rows, inplace=True)
df_train.reset_index(drop=True, inplace=True)

In [None]:
# Remove the rows with GrLivArea >= 4000 from the training set.
df_train.drop(df_train[df_train['GrLivArea'] >= 4000].index, inplace=True)

# Take independent copies: previously df_x was just another name for
# df_train_shrinked, so deleting SalePrice from one removed it from both, and
# later assignments wrote into a slice of df_train.
feature_cols = ['GrLivArea', 'YearBuilt', 'OverallCond', 'BsmtQual', 'GarageCars']
df_train_shrinked = df_train.loc[:, feature_cols + ['SalePrice']].copy()
df_y = df_train_shrinked['SalePrice']
df_x = df_train_shrinked[feature_cols].copy()

# Print the name of every feature column that contains missing values.
for i in df_x.columns:
    if df_x[i].isnull().any():
        print(i)

# Treat a missing BsmtQual as its own category, then one-hot encode.
# dtype is pinned so the dummy columns are numeric 0/1 on every pandas version
# (pandas 2.0+ would otherwise produce bool columns).
df_x['BsmtQual'] = df_x['BsmtQual'].fillna('NA')
df_x = pd.get_dummies(df_x, dtype=np.uint8)
df_x.head(2)

In [None]:
def preprocess(train_flg):
    """Load one of the CSV files and build the feature matrix for the model.

    Parameters
    ----------
    train_flg : bool
        When true, read ``./train.csv``, drop the rows with
        ``GrLivArea >= 4000`` or ``GarageCars == 4``, renumber the index from 0
        and also return the ``SalePrice`` column.
        When false, read ``./test.csv`` and keep every row.

    Returns
    -------
    (DataFrame, Series) when ``train_flg`` is true: features and target.
    DataFrame when ``train_flg`` is false: features only.
    """
    if train_flg:
        df = pd.read_csv('./train.csv')
        # Rows where a comparison is NaN are kept, as they are not outliers.
        outlier = (df['GrLivArea'] >= 4000) | (df['GarageCars'] == 4)
        df = df.loc[~outlier].reset_index(drop=True)
        df_y = df['SalePrice']
    else:
        df = pd.read_csv('./test.csv')
    cols = ['GrLivArea', 'YearBuilt', 'OverallCond', 'BsmtQual', 'GarageCars']
    # Work on an explicit copy so the fill below never writes into a slice.
    df_x = df[cols].copy()
    # A missing BsmtQual becomes its own 'NA' category.
    df_x['BsmtQual'] = df_x['BsmtQual'].fillna('NA')
    # Pin the dummy dtype: pandas 2.0+ defaults to bool, which turns the mixed
    # frame into an object array for consumers such as statsmodels OLS.
    df_x = pd.get_dummies(df_x, dtype=np.uint8)
    if train_flg:
        return df_x, df_y
    return df_x

In [None]:
# Build the training features and target.
df_x, df_y = preprocess(train_flg=True)

In [None]:
# Fit an OLS model with statsmodels (target first, features second) and show
# its summary table.
clf = stat.OLS(endog=df_y, exog=df_x)
result = clf.fit()
result.summary()

In [None]:
# Fit a scikit-learn linear regression and inspect it on the training data.
clf = LinearRegression()
clf.fit(df_x, df_y)
# Predict once and reuse the result for both the metric and the plot.
train_pred = clf.predict(df_x)
# rmse expects (y_true, y_pred); the value is the same either way round.
print('RMSE:{0}'.format(rmse(df_y, train_pred)))
# x/y are passed by keyword: seaborn 0.12+ rejects them as positional arguments.
sns.regplot(x=train_pred, y=df_y)
plt.title('pred vs real')
plt.show()
coef = pd.Series(clf.coef_, index=df_x.columns)
# kind must be a keyword: Series.plot() raises TypeError on positional args.
coef.sort_values(ascending=True).plot(kind='barh')
plt.title('Coefficients in the Linear Model')

# テストセットの予測

In [None]:
# Build the test-set features.
df_test_x = preprocess(train_flg=False)

# テストセットの欠損値検出

In [None]:
# For each test feature with missing values, print its name, the number of
# missing entries and its dtype.
for col in df_test_x.columns:
    n_missing = df_test_x[col].isnull().sum()
    if n_missing > 0:
        print(col, n_missing, df_test_x[col].dtype)

In [None]:
# Show the test rows whose GarageCars is missing.
df_test_x[df_test_x['GarageCars'].isna()]

In [None]:
# Show the full raw test record with index label 1116.
# NOTE(review): presumably the row reported by the previous cell -- confirm.
df_test.loc[1116]

In [None]:
# Replace a missing GarageCars with 0.
df_test_x['GarageCars'] = df_test_x['GarageCars'].fillna(0)

In [None]:
# 個別の処理も統合処理関数に追加
def preprocess(train_flg):
    """Load one of the CSV files and build the feature matrix for the model.

    Missing ``BsmtQual`` values become the category ``'NA'`` and missing
    ``GarageCars`` values become 0 before one-hot encoding.

    Parameters
    ----------
    train_flg : bool
        When true, read ``./train.csv``, drop the rows with
        ``GrLivArea >= 4000`` or ``GarageCars == 4``, renumber the index from 0
        and return the ``SalePrice`` column alongside the features.
        When false, read ``./test.csv`` and keep every row.

    Returns
    -------
    (DataFrame, Series)
        The features plus ``SalePrice`` when ``train_flg`` is true, or the
        features plus the ``Id`` column when it is false.
    """
    if train_flg:
        df = pd.read_csv('./train.csv')
        # Rows where a comparison is NaN are kept, as they are not outliers.
        outlier = (df['GrLivArea'] >= 4000) | (df['GarageCars'] == 4)
        df = df.loc[~outlier].reset_index(drop=True)
        df_y = df['SalePrice']
    else:
        df = pd.read_csv('./test.csv')
    cols = ['GrLivArea', 'YearBuilt', 'OverallCond', 'BsmtQual', 'GarageCars']
    # Work on an explicit copy so the fills below never write into a slice.
    df_x = df[cols].copy()
    df_x['BsmtQual'] = df_x['BsmtQual'].fillna('NA')
    df_x['GarageCars'] = df_x['GarageCars'].fillna(0)
    # Pin the dummy dtype: pandas 2.0+ defaults to bool, which turns the mixed
    # frame into an object array for consumers such as statsmodels OLS.
    df_x = pd.get_dummies(df_x, dtype=np.uint8)
    if train_flg:
        return df_x, df_y
    return df_x, df['Id']

In [None]:
# Rebuild the test features, this time together with the Id column.
df_test_x, df_test_id = preprocess(train_flg=False)

In [None]:
# Predict prices with the linear model fitted above and write them to CSV.
# The test features are first aligned to the training columns: one-hot encoding
# train and test separately can yield different or differently ordered dummy
# columns. Columns the model never saw are dropped; missing ones are set to 0.
X_test = df_test_x.reindex(columns=df_x.columns, fill_value=0)
y_pred = clf.predict(X_test)
pred_df = pd.DataFrame(y_pred, index=df_test_id, columns=["SalePrice"])
pred_df.to_csv('./output.csv', header=True, index_label='Id')

#### 線形回帰以外にも、データによってはより精度の高いモデルがいくつかあります。
#### sklearnのRandomForestやGradientBoostingのほか、XGBoost、LightGBMなどもありますので、お時間のある方はぜひ試してみてください。
#### インストールとチューニングにかかる時間も含めて考えると、RandomForestとGradientBoostingがオススメです。
#### これらのモデルはモデルの中身がどうなっているかは明白ではありませんが、どの変数がモデル精度向上に貢献したかを出力することができます（.feature\_importances\_）。