In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# 아이오와 주 에임스(Ames)에 있는 주거용 주택을 설명하는 79가지 변수로 각 주택의 최종 가격을 예측합니다.

# 데이터 분석 처리 단계 

# 1단계 : 탐색적 자료 분석 (EDA : Exploratory Data Analysis)
- 시각화와 기술 통계(Descriptive statistics)를 통해서 데이터를 이해하는 단계이다.

## 1 데이터 읽어오기

### 경로지정

In [2]:
# Directory (relative to the notebook) that holds train.csv and test.csv.
dirname = "./data/ames_house_prices"

### 데이터 읽어오기

In [3]:
# Load the train and test CSV files from the dataset directory.
df_train, df_test = (
    pd.read_csv(f'{dirname}/{csv_name}') for csv_name in ('train.csv', 'test.csv')
)

### 데이터 형태 확인 

In [4]:
# (rows, columns) of the train and test frames, in that order.
tuple(frame.shape for frame in (df_train, df_test))

((1460, 81), (1459, 80))

### 데이터 정보 확인하기 

In [5]:
# Print each training column with its non-null count and dtype.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

# 2단계 : 전처리 (Preprocessing)

데이터를 정제하고 가공해서 머신 러닝 모델의 입력에 적합한 형태로 바꿔주는 단계이다.

- Data Cleaning
  - Deduplication
  - Outlier detection
  - Other cleaning techniques

- For model
  - Feature extraction
  - Feature scaling
  - Dummification
  - Dimensionality reduction

탐색적 자료 분석과 전처리는 순차적이라기보다 반복적인 관계이다. EDA를 통해 어떤 전처리가 필요한지 알 수 있고, 전처리를 통해 EDA를 수월하게 할 수 있다.

## 범주형 널값 처리 

In [6]:
# Columns treated as categorical whose missing values are replaced with the
# literal string 'None' in both the train and test frames.
cols_fillna = ['PoolQC','MiscFeature','Alley','Fence','MasVnrType','FireplaceQu',
               'GarageQual','GarageCond','GarageFinish','GarageType', 'Electrical',
               'KitchenQual', 'SaleType', 'Functional', 'Exterior2nd', 'Exterior1st',
               'BsmtExposure','BsmtCond','BsmtQual','BsmtFinType1','BsmtFinType2',
               'MSZoning', 'Utilities']

# Assign the filled columns back instead of calling
# `df[col].fillna(..., inplace=True)`: the in-place call operates on an
# intermediate Series (chained assignment), which raises a FutureWarning on
# pandas 2.x and leaves the frame untouched once Copy-on-Write is enabled.
df_train[cols_fillna] = df_train[cols_fillna].fillna('None')
df_test[cols_fillna] = df_test[cols_fillna].fillna('None')

## 수치형 널값 처리

In [7]:
# Count missing values per numeric column of the training frame.
(
    df_train
    .select_dtypes(include='number')
    .isna()
    .sum()
)

Id                 0
MSSubClass         0
LotFrontage      259
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         8
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       81
GarageCars         0
GarageArea         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
SalePrice          0
dtype: int64

In [8]:
# Count missing values per numeric column of the test frame.
(
    df_test
    .select_dtypes(include='number')
    .isna()
    .sum()
)

Id                 0
MSSubClass         0
LotFrontage      227
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea        15
BsmtFinSF1         1
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       2
BsmtHalfBath       2
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       78
GarageCars         1
GarageArea         1
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
dtype: int64

### 수치형 널값을 평균으로 대체

In [9]:
# Impute numeric NaNs with per-column means computed on the training set only,
# and reuse those same values for the test set. Filling the test frame with its
# own means (as before) preprocesses the two frames with different statistics.
train_num_means = df_train.select_dtypes(include='number').mean()
df_train = df_train.fillna(train_num_means)

# Keep only the means for columns that exist in the test frame
# (the test frame has no SalePrice column).
shared_num_cols = train_num_means.index.intersection(df_test.columns)
df_test = df_test.fillna(train_num_means[shared_num_cols])

In [10]:
# Keep only the numeric columns of each frame.
df_train_num, df_test_num = (
    frame.select_dtypes(include='number') for frame in (df_train, df_test)
)

In [11]:
# (rows, columns) of the numeric-only train and test frames.
tuple(frame.shape for frame in (df_train_num, df_test_num))

((1460, 38), (1459, 37))

## 파이캐럿 세팅 

In [12]:
# Work on an independent copy so the PyCaret experiment does not share data
# with df_train_num.
sg = df_train_num.copy(deep=True)

In [13]:
# Star-import that brings PyCaret's regression helpers used below
# (setup, compare_models) into the notebook namespace.
# NOTE(review): consider explicit imports in the first cell; kept as-is here
# because later cells may rely on other names this import provides.
from pycaret.regression import *

In [14]:
# Initialise the PyCaret regression experiment: SalePrice is the target and
# 80% of the rows go to the training split.
# session_id pins the random seed so the split and the model comparison are
# reproducible on Restart & Run All; 729 is the id recorded in the saved output.
# NOTE(review): the Id column is still passed in as a numeric feature —
# consider excluding it (e.g. via ignore_features) after confirming intent.
sup = setup(sg, target="SalePrice", train_size=0.8, session_id=729)

Unnamed: 0,Description,Value
0,Session id,729
1,Target,SalePrice
2,Target type,Regression
3,Original data shape,"(1460, 38)"
4,Transformed data shape,"(1460, 38)"
5,Transformed train set shape,"(1168, 38)"
6,Transformed test set shape,"(292, 38)"
7,Numeric features,37
8,Preprocess,True
9,Imputation type,simple


## 파이캐럿 모델 확인 

In [15]:
# Train the candidate regressors and rank them by RMSE; keep the result in
# `comp` for the model-selection step.
comp = compare_models(sort="RMSE")

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,17572.8819,911034231.4089,29164.7592,0.8452,0.1412,0.1006,0.245
gbr,Gradient Boosting Regressor,17200.0918,972186588.0239,29331.6217,0.823,0.1377,0.0982,0.206
rf,Random Forest Regressor,18147.3548,950438374.3108,29904.8733,0.8392,0.1467,0.1057,0.252
et,Extra Trees Regressor,17489.5141,990570557.3247,30061.5231,0.8326,0.1423,0.1007,0.212
xgboost,Extreme Gradient Boosting,18313.7397,1074531039.766,31217.5598,0.8261,0.1488,0.1047,0.209
llar,Lasso Least Angle Regression,21199.1617,1157025708.6939,33163.6471,0.8042,0.2003,0.1278,0.154
ridge,Ridge Regression,21219.9355,1158938622.6205,33190.5916,0.8037,0.2009,0.1279,0.156
lasso,Lasso Regression,21224.223,1159156904.368,33192.0734,0.8037,0.201,0.128,0.156
lr,Linear Regression,21224.9577,1159200802.1273,33192.5061,0.8037,0.201,0.128,0.439
en,Elastic Net,21443.7729,1202754252.7303,33970.1597,0.7966,0.1875,0.1265,0.161


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

## 모델 선택 

In [16]:
# Display the object returned by compare_models above.
comp