In [363]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, shuffle=True, random_state=2021)
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[id_name, target])
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[id_name, target])
    return X_train, X_test, y_train, y_test 
    
df = pd.read_csv("data/house/train.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='SalePrice', id_name='Id')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1168, 79), (292, 79), (1168, 2), (292, 2))

## 데이터 분석

In [366]:
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
81,120,RM,32.0,4500,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,3,2006,WD,Normal
1418,20,RL,71.0,9204,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2008,COD,Normal
1212,30,RL,50.0,9340,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2009,WD,Normal
588,20,RL,65.0,25095,Pave,,IR1,Low,AllPub,Inside,...,60,0,,,,0,6,2009,WD,Partial
251,120,RM,44.0,4750,Pave,,IR1,HLS,AllPub,Inside,...,153,0,,,,0,12,2007,WD,Family


In [368]:
import pandas as pd
X_train.shape, X_test.shape

((1168, 79), (292, 79))

In [370]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1168 entries, 81 to 1140
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1168 non-null   int64  
 1   MSZoning       1168 non-null   object 
 2   LotFrontage    956 non-null    float64
 3   LotArea        1168 non-null   int64  
 4   Street         1168 non-null   object 
 5   Alley          70 non-null     object 
 6   LotShape       1168 non-null   object 
 7   LandContour    1168 non-null   object 
 8   Utilities      1168 non-null   object 
 9   LotConfig      1168 non-null   object 
 10  LandSlope      1168 non-null   object 
 11  Neighborhood   1168 non-null   object 
 12  Condition1     1168 non-null   object 
 13  Condition2     1168 non-null   object 
 14  BldgType       1168 non-null   object 
 15  HouseStyle     1168 non-null   object 
 16  OverallQual    1168 non-null   int64  
 17  OverallCond    1168 non-null   int64  
 18  YearBuilt   

In [372]:
X_train.isnull().sum().sort_values(ascending=False)

PoolQC           1163
MiscFeature      1124
Alley            1098
Fence             937
MasVnrType        703
                 ... 
Heating             0
MSZoning            0
CentralAir          0
1stFlrSF            0
SaleCondition       0
Length: 79, dtype: int64

In [374]:
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
81,120,RM,32.0,4500,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,3,2006,WD,Normal
1418,20,RL,71.0,9204,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2008,COD,Normal
1212,30,RL,50.0,9340,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2009,WD,Normal
588,20,RL,65.0,25095,Pave,,IR1,Low,AllPub,Inside,...,60,0,,,,0,6,2009,WD,Partial
251,120,RM,44.0,4750,Pave,,IR1,HLS,AllPub,Inside,...,153,0,,,,0,12,2007,WD,Family


In [376]:
y_train.head()

Unnamed: 0,Id,SalePrice
81,82,153500
1418,1419,124000
1212,1213,113000
588,589,143000
251,252,235000


In [378]:
X_train.drop(['PoolQC','MiscFeature','Alley'],axis=1,inplace=True)

In [380]:
X_train.shape,X_test.shape

((1168, 76), (292, 79))

In [382]:
X_test.drop(['PoolQC','MiscFeature','Alley'],axis=1,inplace=True)

In [384]:
num_cols=X_train.select_dtypes(['float64','int64']).columns
cat_cols=X_train.select_dtypes(['O']).columns

In [389]:
X_train.shape,X_test.shape

((1168, 76), (292, 76))

In [391]:
for col in num_cols:
    X_train[col]= X_train[col].fillna(X_train[col].median())
    X_test[col]= X_test[col].fillna(X_test[col].median())

In [393]:
for col in cat_cols:
    X_train[col]= X_train[col].fillna(X_train[col].mode()[0])
    X_test[col]= X_test[col].fillna(X_test[col].mode()[0])

In [395]:
X_train.isnull().sum().sum()

0

In [397]:
# 결측치처리완료.
# 범주형변수 원핫인코딩 후 정렬필요
X_train.shape,X_test.shape

((1168, 76), (292, 76))

In [399]:
X_train= pd.get_dummies(X_train)
X_test=pd.get_dummies(X_test)

In [401]:
X_train.shape,X_test.shape

((1168, 275), (292, 246))

In [403]:
X_train.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
81,120,32.0,4500,6,5,1998,1998,443.0,1201,0,...,False,False,False,True,False,False,False,False,True,False
1418,20,71.0,9204,5,5,1963,1963,0.0,25,872,...,False,False,False,False,False,False,False,False,True,False
1212,30,50.0,9340,4,6,1941,1950,0.0,344,0,...,False,False,False,True,False,False,False,False,True,False
588,20,65.0,25095,5,8,1968,2003,0.0,1324,0,...,False,False,False,True,False,False,False,False,False,True
251,120,44.0,4750,8,5,2006,2007,481.0,1573,0,...,False,False,False,True,False,False,False,True,False,False


In [405]:
X_train, X_test= X_train.align(X_test,join='left',axis=1, fill_value=0)

In [357]:
help(X_train.align)

Help on method align in module pandas.core.generic:

align(other: 'NDFrameT', join: 'AlignJoin' = 'outer', axis: 'Axis | None' = None, level: 'Level | None' = None, copy: 'bool_t | None' = None, fill_value: 'Hashable | None' = None, method: 'FillnaOptions | None | lib.NoDefault' = <no_default>, limit: 'int | None | lib.NoDefault' = <no_default>, fill_axis: 'Axis | lib.NoDefault' = <no_default>, broadcast_axis: 'Axis | None | lib.NoDefault' = <no_default>) -> 'tuple[Self, NDFrameT]' method of pandas.core.frame.DataFrame instance
    Align two objects on their axes with the specified join method.

    Join method is specified for each axis Index.

    Parameters
    ----------
    other : DataFrame or Series
    join : {'outer', 'inner', 'left', 'right'}, default 'outer'
        Type of alignment to be performed.

        * left: use only keys from left frame, preserve key order.
        * right: use only keys from right frame, preserve key order.
        * outer: use union of keys from 

In [407]:
X_train.shape,X_test.shape

((1168, 275), (292, 275))

In [413]:
y_train.columns

Index(['Id', 'SalePrice'], dtype='object')

In [409]:
# 학습 준비
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [417]:
rf=RandomForestRegressor(random_state=42,n_estimators=500)
rf.fit(X_train,y_train['SalePrice'])
pred=rf.predict(X_test)

In [422]:
mse=mean_squared_error(y_test['SalePrice'],pred,squared=False)



In [426]:
r2= r2_score(y_test['SalePrice'],pred)
r2

0.8935629483411527