ライブラリのインポート

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from scipy.stats import norm

%matplotlib inline

データの読み込み

In [2]:
# Load the training and test sets from the local data/ directory.
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

In [3]:
# Preview the first rows of the training data.
df_train.head()

In [4]:
# Preview the last rows of the training data.
df_train.tail()

In [5]:
# Preview the first rows of the test data.
df_test.head()

In [7]:
# List the column names of the training data.
df_train.columns

In [14]:
# Number of missing values per column in the training data.
df_train.isna().sum()

In [15]:
# Number of missing values per column in the test data.
df_test.isna().sum()

In [10]:
# Dtypes and non-null counts of the training data.
df_train.info()

In [11]:
# Dtypes and non-null counts of the test data.
df_test.info()

In [13]:
# Summary statistics restricted to the object-dtype columns.
df_train.describe(include='O')

データの前処理

In [7]:
# Summary statistics of the target column "SalePrice".
df_train["SalePrice"].describe()

In [9]:
# Distribution of the target: density histogram with a KDE overlay.
# sns.distplot is deprecated in seaborn; histplot(kde=True, stat="density")
# is the documented replacement that draws the same kind of figure.
sns.histplot(df_train["SalePrice"], kde=True, stat="density");

In [11]:
# Skewness and kurtosis of the target.
# The second label previously read 尤度 (likelihood); Series.kurt() returns
# kurtosis, which is 尖度.
print(f"歪度:{df_train['SalePrice'].skew()}")
print(f"尖度:{df_train['SalePrice'].kurt()}")

In [12]:
# Scatter plot of above-ground living area against sale price.
var = "GrLivArea"
data = df_train[["SalePrice", var]]
data.plot.scatter(x=var, y="SalePrice", ylim=(0, 800000));

In [13]:
# Scatter plot of total basement area against sale price.
var = "TotalBsmtSF"
data = df_train[["SalePrice", var]]
data.plot.scatter(x=var, y="SalePrice", ylim=(0, 800000));

- 整数型（int64）と浮動小数点型（float64）の列のみでヒートマップを作成

In [15]:
# Keep only the int64 / float64 columns of the training data.
df_train2 = df_train.select_dtypes(include=["int64", "float64"])

In [18]:
# Keep only the int64 / float64 columns of the test data.
df_test2 = df_test.select_dtypes(include=["int64", "float64"])

In [17]:
# Correlation heatmap of the numeric training columns.
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(df_train2.corr(), annot=True, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [20]:
# Correlation matrix of the numeric training columns, with rows sorted by
# their correlation coefficient with SalePrice (strongest first).
corr = df_train2.corr()
corr.sort_values("SalePrice", ascending=False)

In [19]:
# Correlation heatmap of the numeric test columns.
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(df_test2.corr(), annot=True, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [21]:
# Correlation matrix of the numeric test columns.
# The original cell computed corr_test but then displayed the *training*
# matrix `corr` again. The test set has no SalePrice column to sort by, so
# the test matrix is shown as-is.
corr_test = df_test2.corr()
corr_test

In [3]:
# Stack train and test so both receive identical preprocessing.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# both halves keep their own 0-based labels, so every label appears twice and
# any later label-based lookup or alignment would match two rows.
# Row order is unchanged: the first len(df_train) rows are the training rows.
connection_df = pd.concat([df_train, df_test], ignore_index=True)
connection_df.head()

In [4]:
# Preview the last rows of the combined frame (these come from df_test).
connection_df.tail()

In [5]:
# Dtypes and non-null counts of the combined frame.
connection_df.info()

In [6]:
# Missing values per column before any imputation.
connection_df.isnull().sum()

# 文字型の欠損値処理

In [7]:
# Fill missing values in these object columns with the literal string "None".
# (Presumably NaN here means the house lacks the feature - confirm against
# the data description.)
for col in ("PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu"):
    connection_df[col] = connection_df[col].fillna("None")

In [8]:
# Garage category columns: replace missing values with the string "None".
connection_df = connection_df.fillna(
    dict.fromkeys(("GarageType", "GarageFinish", "GarageQual", "GarageCond"), "None")
)

In [9]:
# Basement category columns: replace missing values with the string "None".
connection_df = connection_df.fillna(
    {
        "BsmtQual": "None",
        "BsmtCond": "None",
        "BsmtExposure": "None",
        "BsmtFinType1": "None",
        "BsmtFinType2": "None",
    }
)

In [10]:
# Masonry veneer type: replace missing values with the string "None".
connection_df["MasVnrType"] = connection_df["MasVnrType"].fillna("None")

In [11]:
# Re-check missing values after the "None" fills above.
connection_df.isnull().sum()

# 数値型の欠損値処理

In [12]:
# Basement numeric columns: replace missing values with 0.
connection_df = connection_df.fillna(
    dict.fromkeys(
        (
            "BsmtFinSF1",
            "BsmtFinSF2",
            "BsmtUnfSF",
            "BsmtFullBath",
            "BsmtHalfBath",
            "TotalBsmtSF",
        ),
        0,
    )
)

In [13]:
# Masonry veneer area and garage build year: replace missing values with 0.
for col in ("MasVnrArea", "GarageYrBlt"):
    connection_df[col] = connection_df[col].fillna(0)

In [14]:
# Garage capacity and area: replace missing values with 0.
connection_df = connection_df.fillna({"GarageCars": 0, "GarageArea": 0})

In [15]:
# Re-check missing values after the zero fills above.
connection_df.isnull().sum()

# 欠損値自体に意味のない文字型を最頻値で補完

In [16]:
# Frequency of each MSZoning category (used to judge the mode fill below).
connection_df["MSZoning"].value_counts()

In [17]:
# Fill missing MSZoning with its most frequent value.
connection_df["MSZoning"] = connection_df["MSZoning"].fillna(connection_df["MSZoning"].mode()[0])

In [18]:
# Frequency of each Functional category.
connection_df["Functional"].value_counts()

In [19]:
# Fill missing Functional with the hard-coded category "Typ".
# NOTE(review): presumably "Typ" is the most frequent value shown by the
# value_counts above - confirm, since the other columns compute mode() instead.
connection_df["Functional"] = connection_df["Functional"].fillna("Typ")

In [20]:
# Fill each of these categorical columns with its own most frequent value.
for col in ("Electrical", "KitchenQual", "Exterior1st", "Exterior2nd", "SaleType"):
    connection_df[col] = connection_df[col].fillna(connection_df[col].mode()[0])

In [21]:
# Frequency of each Utilities category (the column is dropped in the next cell).
connection_df["Utilities"].value_counts()

In [22]:
# Remove the Utilities column from the combined frame.
connection_df = connection_df.drop(columns="Utilities")

In [24]:
# Frequency of each LotFrontage value.
connection_df["LotFrontage"].value_counts()

In [25]:
# Fill missing LotFrontage with its most frequent value.
# NOTE(review): LotFrontage looks like a continuous numeric column; a median
# fill may be more appropriate than the mode - confirm the intent.
connection_df["LotFrontage"] = connection_df["LotFrontage"].fillna(connection_df["LotFrontage"].mode()[0])

In [26]:
# Check whether any missing values remain.
# SalePrice is still NaN on the rows that came from df_test, which has no target.
connection_df.isnull().sum()

In [27]:
# Display the combined frame after imputation.
connection_df

# エンコーディング

In [28]:
from sklearn.preprocessing import LabelEncoder
# Columns to be integer-encoded with LabelEncoder.
cols = (
    'FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
    'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual',
    'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure',
    'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street',
    'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 'YrSold', 'MoSold',
)

# A fresh encoder per column, fitted and applied on the combined train+test
# values in one step so both halves share the same code mapping.
for col in cols:
    lbl = LabelEncoder()
    connection_df[col] = lbl.fit_transform(list(connection_df[col].values))

In [29]:
# Preview the first ten rows of the label-encoded columns.
connection_df[list(cols)].head(10)

In [30]:
# One-hot encode the remaining non-numeric columns; drop_first=True drops the
# first level of each encoded column.
connection_df = pd.get_dummies(connection_df, drop_first=True)

# 学習データの分割

In [31]:
# Undo the earlier concatenation: the first len(df_train) rows are the
# training rows, the rest are the test rows.
train_df = connection_df.iloc[:len(df_train)]
test_df = connection_df.iloc[len(df_train):].drop(columns="SalePrice")

# Features: every training column except the target.
X_train = train_df.drop(columns="SalePrice")

# Target: the SalePrice column of the training rows.
y_train = train_df["SalePrice"]

# Test features: test_df, which already has the SalePrice column removed.
X_test = test_df

print(X_train.shape, y_train.shape, X_test.shape)

In [32]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

def run_cv(model, X=None, y=None, n_splits=3, random_state=42):
    """Evaluate ``model`` with shuffled K-fold cross-validation and report RMSE.

    Each fold trains its own deep copy of ``model``, so the returned list holds
    one independently fitted model per fold. The ``model`` object passed in is
    used only as a template and is left unfitted.

    :param model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
    :param X: feature DataFrame; defaults to the notebook-level ``X_train``.
    :param y: target Series; defaults to the notebook-level ``y_train``.
    :param n_splits: number of folds.
    :param random_state: seed for the fold shuffling.
    :return: list of fitted models, one per fold, in fold order.
    """
    import copy

    X = X_train if X is None else X
    y = y_train if y is None else y

    cv = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    rmse_results = []
    models = []

    for trn_index, val_index in cv.split(X):
        # KFold yields positional row numbers, so select with iloc; loc only
        # works when the index happens to be exactly 0..n-1.
        X_trn, X_val = X.iloc[trn_index], X.iloc[val_index]
        y_trn, y_val = y.iloc[trn_index], y.iloc[val_index]

        # Train a fresh copy: refitting one shared object would make every
        # entry of `models` point at the last fold's model.
        fold_model = copy.deepcopy(model)
        fold_model.fit(X_trn, y_trn)
        pred = fold_model.predict(X_val)

        # Fold accuracy (RMSE); collected so the average below is meaningful.
        rmse = np.sqrt(mean_squared_error(y_val, pred))
        print("RMSE:", rmse)
        rmse_results.append(rmse)
        models.append(fold_model)

    print(rmse_results)
    print("Average RMSE:", np.mean(rmse_results))
    return models

In [35]:
import lightgbm as lgb

lgb_params = {
    "objective": "regression",
    "metric": "rmse",
}

# Stop boosting once the validation RMSE has not improved for this many rounds.
EARLY_STOPPING_ROUNDS = 50

cv = KFold(n_splits=3, random_state=42, shuffle=True)
rmse_results = []
lgbm_models = []
# Accumulator for the fold-averaged predictions on the test data.
test_preds = np.zeros(len(X_test))

for trn_index, val_index in cv.split(X_train, y_train):
    # KFold yields positional row numbers, so select with iloc; loc only works
    # when the index happens to be exactly 0..n-1.
    X_trn, X_val = X_train.iloc[trn_index], X_train.iloc[val_index]
    y_trn, y_val = y_train.iloc[trn_index], y_train.iloc[val_index]

    train_lgb = lgb.Dataset(X_trn, y_trn)
    # reference= makes the validation set reuse the training set's feature binning.
    validation_lgb = lgb.Dataset(X_val, y_val, reference=train_lgb)
    model = lgb.train(
        lgb_params,
        train_lgb,
        num_boost_round=1000,
        valid_sets=[train_lgb, validation_lgb],
        valid_names=["train", "valid"],
        # Without early stopping all 1000 rounds are always trained.
        callbacks=[lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS)],
    )
    pred = model.predict(X_val, num_iteration=model.best_iteration)
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    print("RMSE:", rmse)
    rmse_results.append(rmse)
    lgbm_models.append(model)

    # Each fold contributes an equal share to the final test prediction.
    test_preds += model.predict(X_test, num_iteration=model.best_iteration) / cv.n_splits

print(rmse_results)
print("Average:", np.mean(rmse_results))

# データの提出

In [36]:
# Load the sample submission and set its SalePrice column to the
# fold-averaged predictions.
submission = pd.read_csv("sample_submission.csv").assign(SalePrice=test_preds)

# Write the submission file without the index column.
submission.to_csv("submission.csv", index=False)