# 아이오와 주 에임스(Ames)에 있는 주거용 주택을 설명하는 79가지 변수로 각 주택의 최종 가격을 예측합니다.

# 데이터 분석 처리 단계 

# 1단계 : 탐색적 자료 분석 (EDA : Exploratory Data Analysis)
- 시각화와 기술 통계(Descriptive statistics)를 통해서 데이터를 이해하는 단계이다.

## 1-1 모듈 불러오기

In [1]:
import pandas as pd
import numpy as np

## 1-2 데이터 읽어오기

In [2]:
# Directory that holds the Ames house-price CSV files read by the next cell.
dirname = "./data/ames_house_prices"

In [3]:
# Load the training and test splits (train.csv / test.csv) from `dirname`.
df_train, df_test = (
    pd.read_csv(f'{dirname}/{split}.csv') for split in ('train', 'test')
)

In [4]:
# (rows, columns) of the training frame followed by the test frame.
tuple(frame.shape for frame in (df_train, df_test))

((1460, 81), (1459, 80))

In [5]:
# Print the column names, non-null counts and dtypes of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [6]:
# Report the number of samples and features while 'Id' is still present.
print(f"The train data size before dropping Id feature is : {df_train.shape} ")
print(f"The test data size before dropping Id feature is : {df_test.shape} ")

# pop() returns the 'Id' column and removes it from the frame in place.
# The identifiers are kept aside; the column itself is unnecessary for the
# prediction process.
train_ID = df_train.pop("Id")
test_ID = df_test.pop("Id")

# Report the sizes again now that the 'Id' column is gone.
print(f"\nThe train data size after dropping Id feature is : {df_train.shape} ")
print(f"The test data size after dropping Id feature is : {df_test.shape} ")

The train data size before dropping Id feature is : (1460, 81) 
The test data size before dropping Id feature is : (1459, 80) 

The train data size after dropping Id feature is : (1460, 80) 
The test data size after dropping Id feature is : (1459, 79) 


# 2단계 : 전처리 (Preprocessing)

데이터를 정제하고 가공해서 머신 러닝 모델의 입력에 적합한 형태로 바꿔주는 단계이다.

- Data Cleaning
  - Deduplication
  - Outlier detection
  - Other cleaning techniques

- For model
  - Feature extraction
  - Feature scaling
  - Dummification
  - Dimensionality reduction

탐색적 자료 분석과 전처리는 순차적이라기보다 반복적인 관계이다. EDA를 통해 어떤 전처리가 필요한지 알 수 있고 전처리를 통해 EDA를 수월하게 할 수 있다.

In [7]:
# Categorical columns whose missing entries are replaced by the literal string
# 'None', turning "missing" into an explicit category.
# NOTE(review): presumably NaN means "feature not present" for columns such as
# PoolQC / Alley / Fence, while for others (e.g. MSZoning, Utilities,
# Electrical) it may be a genuinely unknown value; confirm against the data
# description before treating them all the same way.
cols_fillna = ['PoolQC','MiscFeature','Alley','Fence','MasVnrType','FireplaceQu',
               'GarageQual','GarageCond','GarageFinish','GarageType', 'Electrical',
               'KitchenQual', 'SaleType', 'Functional', 'Exterior2nd', 'Exterior1st',
               'BsmtExposure','BsmtCond','BsmtQual','BsmtFinType1','BsmtFinType2',
               'MSZoning', 'Utilities']

# Assign the filled column back instead of calling fillna(inplace=True) on the
# df[col] selection. The in-place call on a selection is chained assignment:
# pandas warns about it, and with Copy-on-Write enabled it leaves the parent
# frame unchanged.
for col in cols_fillna:
    df_train[col] = df_train[col].fillna('None')
    df_test[col] = df_test[col].fillna('None')

In [8]:
# Number of missing values in each numeric column of the training frame.
df_train.select_dtypes(include='number').isna().sum()

MSSubClass         0
LotFrontage      259
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         8
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       81
GarageCars         0
GarageArea         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
SalePrice          0
dtype: int64

In [9]:
# Number of missing values in each numeric column of the test frame.
df_test.select_dtypes(include='number').isna().sum()

MSSubClass         0
LotFrontage      227
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea        15
BsmtFinSF1         1
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       2
BsmtHalfBath       2
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       78
GarageCars         1
GarageArea         1
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
dtype: int64

In [10]:
# Impute missing numeric values with the column means of the TRAINING frame.
# The test frame is filled with the same training statistics (not its own
# means) so that both splits go through an identical transformation.
train_num_means = df_train.select_dtypes(include='number').mean()
df_train.fillna(train_num_means, inplace=True)

# Only pass the means of columns that exist in the test frame (it has no
# SalePrice column).
shared_cols = train_num_means.index.intersection(df_test.columns)
df_test.fillna(train_num_means[shared_cols], inplace=True)

In [11]:
# Keep only the numeric columns of each frame.
df_train_num, df_test_num = (
    frame.select_dtypes(include='number') for frame in (df_train, df_test)
)

In [12]:
# (rows, columns) of the numeric-only training and test frames.
tuple(frame.shape for frame in (df_train_num, df_test_num))

((1460, 37), (1459, 36))

In [13]:
## 로그값으로 처리 (TODO: 이 셀에는 실행 코드가 없으며, 아래에 보이는 셀들에서는 로그 변환이 적용되지 않는다)

In [14]:
# Independent (deep) copy of the numeric training frame, used as the
# modelling input below.
sg = df_train_num.copy(deep=True)

In [15]:
from pycaret.regression import *

In [16]:
# Initialise the PyCaret regression experiment on the numeric training data:
# SalePrice is the target and 80% of the rows form the training split.
# session_id pins the random seed so the split and the model comparison are
# reproducible; 4733 is the seed reported by the recorded run of this cell.
sup = setup(sg, target="SalePrice", train_size=0.8, session_id=4733)

Unnamed: 0,Description,Value
0,Session id,4733
1,Target,SalePrice
2,Target type,Regression
3,Original data shape,"(1460, 37)"
4,Transformed data shape,"(1460, 37)"
5,Transformed train set shape,"(1168, 37)"
6,Transformed test set shape,"(292, 37)"
7,Numeric features,36
8,Preprocess,True
9,Imputation type,simple


In [17]:
# Compare the candidate regressors and rank the result table by RMSE.
comp = compare_models(sort="RMSE")

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,16801.8744,797375964.369,27575.2857,0.858,0.1363,0.0975,0.055
et,Extra Trees Regressor,17457.0543,819943784.618,28263.7355,0.8576,0.1435,0.1027,0.071
rf,Random Forest Regressor,18038.639,922242605.0745,29657.3918,0.8354,0.1479,0.1062,0.092
lightgbm,Light Gradient Boosting Machine,17681.2987,927454379.5839,30040.8627,0.841,0.142,0.1001,0.079
xgboost,Extreme Gradient Boosting,18913.1776,961112596.8013,30541.1432,0.8338,0.1479,0.1079,0.052
ada,AdaBoost Regressor,24402.6169,1293177226.4162,35661.9824,0.7784,0.2036,0.1613,0.034
llar,Lasso Least Angle Regression,22903.7252,1625729091.2452,38047.2398,0.7038,0.2159,0.1368,0.014
ridge,Ridge Regression,22933.4672,1628741765.7012,38073.1105,0.7031,0.2171,0.1371,0.014
lasso,Lasso Regression,22942.5896,1629412792.8488,38080.0628,0.703,0.2173,0.1371,0.014
lr,Linear Regression,22943.3592,1629439061.4498,38080.5319,0.703,0.2173,0.1371,0.248


Processing:   0%|          | 0/81 [00:00<?, ?it/s]