In [171]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, shuffle=True, random_state=2021)
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[id_name, target])
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[id_name, target])
    return X_train, X_test, y_train, y_test 
    
df = pd.read_csv("data/house/train.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='SalePrice', id_name='Id')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1168, 79), (292, 79), (1168, 2), (292, 2))

## 데이터 분석

In [173]:
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
81,120,RM,32.0,4500,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,3,2006,WD,Normal
1418,20,RL,71.0,9204,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2008,COD,Normal
1212,30,RL,50.0,9340,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2009,WD,Normal
588,20,RL,65.0,25095,Pave,,IR1,Low,AllPub,Inside,...,60,0,,,,0,6,2009,WD,Partial
251,120,RM,44.0,4750,Pave,,IR1,HLS,AllPub,Inside,...,153,0,,,,0,12,2007,WD,Family


In [175]:
import pandas as pd
X_train.shape, X_test.shape

((1168, 79), (292, 79))

In [177]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1168 entries, 81 to 1140
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1168 non-null   int64  
 1   MSZoning       1168 non-null   object 
 2   LotFrontage    956 non-null    float64
 3   LotArea        1168 non-null   int64  
 4   Street         1168 non-null   object 
 5   Alley          70 non-null     object 
 6   LotShape       1168 non-null   object 
 7   LandContour    1168 non-null   object 
 8   Utilities      1168 non-null   object 
 9   LotConfig      1168 non-null   object 
 10  LandSlope      1168 non-null   object 
 11  Neighborhood   1168 non-null   object 
 12  Condition1     1168 non-null   object 
 13  Condition2     1168 non-null   object 
 14  BldgType       1168 non-null   object 
 15  HouseStyle     1168 non-null   object 
 16  OverallQual    1168 non-null   int64  
 17  OverallCond    1168 non-null   int64  
 18  YearBuilt   

In [179]:
X_train.isnull().sum().sort_values().tail(3)

Alley          1098
MiscFeature    1124
PoolQC         1163
dtype: int64

## 심각한 결측치 제거

In [182]:
X_train.drop(columns=['PoolQC','MiscFeature','Alley'],inplace=True)
X_test.drop(columns=['PoolQC','MiscFeature','Alley'],inplace=True)

In [184]:
X_train.shape, X_test.shape

((1168, 76), (292, 76))

## 결측치 보간

In [187]:
num_cols=X_train.select_dtypes(['float64','int64']).columns
cat_cols=X_train.select_dtypes(['object']).columns

In [189]:
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleType,SaleCondition
81,120,RM,32.0,4500,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,,0,3,2006,WD,Normal
1418,20,RL,71.0,9204,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,,0,8,2008,COD,Normal
1212,30,RL,50.0,9340,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,,0,8,2009,WD,Normal
588,20,RL,65.0,25095,Pave,IR1,Low,AllPub,Inside,Sev,...,0,0,60,0,,0,6,2009,WD,Partial
251,120,RM,44.0,4750,Pave,IR1,HLS,AllPub,Inside,Mod,...,0,0,153,0,,0,12,2007,WD,Family


In [191]:
for col in num_cols:
    X_train[col]=X_train[col].fillna(X_train[col].mean())
    X_test[col]=X_test[col].fillna(X_test[col].mean())

In [193]:
X_train.isnull().sum().sum(), X_test.isnull().sum().sum()

(2584, 669)

In [195]:
for col in cat_cols:
    X_train[col]=X_train[col].fillna(X_train[col].mode()[0])
    X_test[col]=X_test[col].fillna(X_test[col].mode()[0])

In [197]:
X_train.isnull().sum().sum(), X_test.isnull().sum().sum()

(0, 0)

## 범주형 변수 원핫인코딩

In [199]:
X_train.shape, X_test.shape

((1168, 76), (292, 76))

In [201]:
X_train= pd.get_dummies(X_train)
X_test= pd.get_dummies(X_test)

In [203]:
X_train.shape, X_test.shape

((1168, 275), (292, 246))

## train test align 작업

In [206]:
X_train, X_test = X_train.align(X_test,join='left',fill_value=0,axis=1)

In [208]:
X_train.shape, X_test.shape

((1168, 275), (292, 275))

## 랜덤포레스트 학습

In [210]:
from sklearn.ensemble import RandomForestRegressor

In [212]:
rf = RandomForestRegressor(random_state=42, n_estimators=500)

In [220]:
y_train.head()

Unnamed: 0,Id,SalePrice
81,82,153500
1418,1419,124000
1212,1213,113000
588,589,143000
251,252,235000


In [216]:
# y_train.pop('Id')

In [222]:
rf.fit(X_train,y_train['SalePrice'])
pred= rf.predict(X_test)

In [225]:
from sklearn.metrics import mean_squared_error, r2_score

In [227]:
rmse= mean_squared_error(y_test['SalePrice'],pred,squared=False)
r2= r2_score(y_test['SalePrice'],pred)



In [233]:
print("RMSE:", rmse)
print("R²:", r2)

RMSE: 24844.269928350943
R²: 0.8942271397027238
