In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# for visualization
import matplotlib.pyplot as plt

# for evaluation
from sklearn.metrics import accuracy_score, f1_score

# K-fold cross validation
from sklearn.model_selection import KFold

In [42]:
# Read the training split from disk and print its per-column dtype / non-null summary.
df_train = pd.read_csv(filepath_or_buffer="./data/train.csv")
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [43]:
# Read both splits from disk (the training frame is re-read here as well),
# then print the per-column dtype / non-null summary of the test split.
df_train = pd.read_csv(filepath_or_buffer="./data/train.csv")
df_test = pd.read_csv(filepath_or_buffer="./data/test.csv")
df_test.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [35]:
# "Alley", "MasVnrType", "PoolQC", "Fence", "MiscFeature" 열 삭제
columns_to_drop = [
    "Alley",
    "MasVnrType",
    "PoolQC",
    "Fence",
    "MiscFeature",
]
# `columns=` already targets the column axis, so no separate `axis` argument is passed.
# The result is bound back to `df_train`; re-running this cell on the already
# reduced frame raises KeyError because the labels are gone.
df_train = df_train.drop(columns=columns_to_drop)


In [36]:
from sklearn.preprocessing import LabelEncoder

# Create the LabelEncoder instance. The same object is re-fitted for every
# column in the loop below, so the mapping learned for one column is discarded
# as soon as the next column is fitted.
label_encoder = LabelEncoder()

# Columns to label-encode (selected as the object-typed columns).
# Each column is listed exactly once: the original list repeated 'SaleType' and
# 'SaleCondition', which re-encoded their already-encoded integer codes.
# NOTE(review): 'YrSold' looks like a numeric year rather than an object
# column - confirm it is meant to be encoded here.
columns_to_encode = [
    'MSZoning', 'Street', 'LotShape',
    'LandContour', 'Utilities', 'LotConfig',
    'SaleType', 'SaleCondition', 'LandSlope',
    'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'ExterQual',
    'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
    'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
    'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PavedDrive',
    'YrSold',
]

# Replace each listed column of the training frame with its integer label codes.
for column in columns_to_encode:
    df_train[column] = label_encoder.fit_transform(df_train[column])

In [37]:
# Pairwise correlation of the columns of the encoded training frame
# (DataFrame.corr with its default settings).
correlation_matrix = df_train.corr()
# Keep only the magnitude of each column's correlation with the target.
corr = correlation_matrix["SalePrice"].abs()
# Show the 26 largest values; the first entry is SalePrice against itself.
corr.sort_values(ascending=False).head(26)

SalePrice       1.000000
OverallQual     0.790982
GrLivArea       0.708624
GarageCars      0.640409
ExterQual       0.636884
GarageArea      0.623431
BsmtQual        0.620886
TotalBsmtSF     0.613581
1stFlrSF        0.605852
KitchenQual     0.589189
FullBath        0.560664
GarageFinish    0.549247
TotRmsAbvGrd    0.533723
YearBuilt       0.522897
YearRemodAdd    0.507101
GarageYrBlt     0.486362
MasVnrArea      0.477493
Fireplaces      0.466929
FireplaceQu     0.459605
GarageType      0.415283
HeatingQC       0.400178
BsmtFinSF1      0.386420
Foundation      0.382479
LotFrontage     0.351799
WoodDeckSF      0.324413
2ndFlrSF        0.319334
Name: SalePrice, dtype: float64

In [38]:
# Column labels ordered from the strongest to the weakest absolute correlation.
corr.sort_values(ascending=False).index

Index(['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'ExterQual',
       'GarageArea', 'BsmtQual', 'TotalBsmtSF', '1stFlrSF', 'KitchenQual',
       'FullBath', 'GarageFinish', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd',
       'GarageYrBlt', 'MasVnrArea', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'HeatingQC', 'BsmtFinSF1', 'Foundation', 'LotFrontage', 'WoodDeckSF',
       '2ndFlrSF', 'OpenPorchSF', 'BsmtExposure', 'HalfBath', 'LotArea',
       'LotShape', 'CentralAir', 'Electrical', 'PavedDrive', 'BsmtFullBath',
       'RoofStyle', 'BsmtUnfSF', 'SaleCondition', 'Neighborhood', 'HouseStyle',
       'BedroomAbvGr', 'MSZoning', 'KitchenAbvGr', 'RoofMatl', 'EnclosedPorch',
       'ExterCond', 'Functional', 'ScreenPorch', 'Exterior2nd', 'Exterior1st',
       'BsmtFinType1', 'Heating', 'PoolArea', 'Condition1', 'BldgType',
       'MSSubClass', 'OverallCond', 'LotConfig', 'SaleType', 'LandSlope',
       'MoSold', '3SsnPorch', 'Street', 'YrSold', 'LowQualFinSF', 'GarageCond',

In [39]:
  
from preprocess import get_X, get_y
from utils import CustomDataset
import torch
from torch import nn
from torch.utils.data import DataLoader

# Start again from the raw CSV files; the frames modified in earlier cells
# are replaced here.
df_train = pd.read_csv("./data/train.csv")
df_test = pd.read_csv("./data/test.csv")

# Input columns handed to get_X for both splits.
feature_list = [
    'OverallQual',
    'GrLivArea',
    'GarageCars',
    'ExterQual',
    'GarageArea',
    'TotalBsmtSF',
    '1stFlrSF',
    'KitchenQual',
    'FullBath',
    'TotRmsAbvGrd',
    'YearBuilt',
    'YearRemodAdd',
    'Fireplaces',
    'HeatingQC',
    'BsmtFinSF1',
    'Foundation',
    'WoodDeckSF',
    '2ndFlrSF',
]

# get_X / get_y are project helpers; their preprocessing is not visible here.
X_trn = get_X(df_train, features=feature_list)
# Index with a new trailing axis (None is np.newaxis).
# Presumably get_y returns a 1-D array, making this a column - TODO confirm.
y_trn = get_y(df_train, feature="SalePrice")[:, None]
X_tst = get_X(df_test, features=feature_list)

# Training data is served in shuffled mini-batches of 32.
ds = CustomDataset(X_trn, y_trn)
dl = DataLoader(dataset=ds, batch_size=32, shuffle=True)

# Test data has no targets and keeps its original row order (no shuffling).
ds_tst = CustomDataset(X_tst)
dl_tst = DataLoader(dataset=ds_tst, batch_size=32)

In [17]:
import torchmetrics
from typing import Optional, List
def evaluate(
  model:nn.Module,
  criterion:callable,
  data_loader:DataLoader,
  device:str,
  metric:Optional["torchmetrics.metric.Metric"]=None,
  multi_metrics:Optional[List["torchmetrics.metric.Metric"]]=None
) -> float:
  '''Run ``model`` over ``data_loader`` without gradients and return the averaged loss.

  The model is switched to eval mode. For every ``(X, y)`` batch the loss
  ``criterion(output, y)`` is weighted by the batch length and accumulated;
  the sum is finally divided by ``len(data_loader.dataset)``.

  Metrics are only fed through ``update(output, y)``; this function neither
  resets them nor reads their value, so the caller retrieves the results
  from the metric objects afterwards.

  Args:
      model: model to evaluate
      criterion: loss function called as ``criterion(output, y)``; its result
        must support ``.item()``
      data_loader: data loader yielding ``(X, y)`` batches
      device: device the batches are moved to before the forward pass
      metric: optional single metric updated on every batch
      multi_metrics: optional list of additional metrics, each updated on
        every batch

  Returns:
      Accumulated batch-weighted loss divided by the dataset length.
  '''
  model.eval()
  total_loss = 0.
  with torch.inference_mode():
    for X, y in data_loader:
      X, y = X.to(device), y.to(device)
      output = model(X)
      total_loss += criterion(output, y).item() * len(y)
      if metric is not None:
        metric.update(output, y)
      if multi_metrics is not None:
        # The loop variable must not be called `metric`: rebinding the
        # parameter would stop the caller's `metric` from being updated after
        # the first batch and update the last list entry twice per batch.
        for extra_metric in multi_metrics:
          extra_metric.update(output, y)
  return total_loss/len(data_loader.dataset)

# 결과 분석
- model_1: {'MSE': 126.5949478149414, 'MAE': 9.202899551391601, 'RMSE': 11.228694915771484}
    - deep model (4 linear layers), with dropout
- model_2: {'MSE': 20.10561414082845, 'MAE': 2.4160611412525177, 'RMSE': 4.328095411618551}
    - deep model (4 linear layers), without dropout
- model_3: {'MSE': 17.09292084312439, 'MAE': 2.4224738261699676, 'RMSE': 4.039131463209788}
    - shallow model (2 linear layers), without dropout

In [25]:
from nn import ANN
print(X_trn.shape)
device = torch.device("cpu")

# Every checkpoint is loaded with map_location=device so that weights saved
# from a GPU process are remapped onto `device` instead of failing to
# deserialize on a machine without CUDA.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.

# model_1: four hidden sizes, dropout enabled.
model_1 = ANN(X_trn.shape[-1], hidden_dim=[128, 128, 64, 32], activation="relu", use_dropout=True, drop_ratio = 0.5).to(device)
model_1.load_state_dict(torch.load("./pretrained_weight/dense_use_dropout.pth", map_location=device))
model_1.eval()

# model_2: same hidden sizes, dropout disabled.
model_2 = ANN(X_trn.shape[-1], hidden_dim=[128, 128, 64, 32], activation="relu", use_dropout=False, drop_ratio = 0.5).to(device)
model_2.load_state_dict(torch.load("./pretrained_weight/dense_wo_dropout.pth", map_location=device))
model_2.eval()

# model_3: two hidden sizes, dropout enabled.
# NOTE(review): the results summary in this notebook describes model_3 as
# trained without dropout, while this cell builds it with use_dropout=True and
# loads "shallow_use_dropout.pth" - confirm which one is intended.
model_3 = ANN(X_trn.shape[-1], hidden_dim=[64, 32], activation="relu", use_dropout=True, drop_ratio = 0.5).to(device)
model_3.load_state_dict(torch.load("./pretrained_weight/shallow_use_dropout.pth", map_location=device))
# Last expression of the cell: eval() returns the module, so its repr is displayed.
model_3.eval()


(1460, 33)


ANN(
  (dropout): Dropout(p=0.5, inplace=False)
  (identity): Identity()
  (activation): ReLU()
  (relu): ReLU()
  (module_list): ModuleList(
    (0): Linear(in_features=33, out_features=64, bias=True)
    (1): Dropout(p=0.5, inplace=False)
    (2): ReLU()
    (3): Linear(in_features=64, out_features=32, bias=True)
    (4): Dropout(p=0.5, inplace=False)
    (5): ReLU()
    (6): Linear(in_features=32, out_features=1, bias=True)
    (7): ReLU()
  )
)

In [40]:
# Run model_3 over the test loader and collect one prediction per test row.
result = []
with torch.inference_mode():
    for X in dl_tst:
        # Each batch is a sequence whose first element holds the features.
        X = X[0].to(device)
        # reshape(-1) always yields a 1-D tensor. A bare squeeze() would turn a
        # batch of size 1 into a 0-d tensor, whose tolist() is a plain float
        # that list.extend() cannot consume.
        # NOTE(review): the factor 10000 presumably undoes a target scaling
        # applied during preprocessing - confirm against get_y.
        output = (model_3(X) * 10000).reshape(-1).tolist()
        result.extend(output)

[126455.9453125, 166153.28125, 175623.796875, 186396.203125, 181911.09375, 171928.640625, 167871.6875, 152497.390625, 185757.46875, 134829.71875, 216294.953125, 96830.375, 94521.2734375, 158273.34375, 113650.7421875, 349249.875, 239836.84375, 279653.25, 288997.84375, 525616.3125, 307710.34375, 198709.1875, 178992.328125, 168820.796875, 170480.1875, 198145.40625, 321055.3125, 254406.109375, 176229.484375, 220419.328125, 189865.875, 111560.7421875]
[178437.671875, 289244.96875, 304357.1875, 230274.984375, 199077.828125, 162483.078125, 163711.359375, 163463.9375, 181872.34375, 149941.890625, 257877.734375, 230091.703125, 231139.53125, 158805.796875, 291013.4375, 193938.640625, 150389.84375, 147808.40625, 155676.65625, 181502.15625, 140419.28125, 152428.34375, 191578.75, 149878.546875, 156245.9375, 149327.90625, 231597.265625, 135715.390625, 142326.171875, 176375.828125, 107518.109375, 122897.8125]
[124292.5, 124565.1171875, 89580.3515625, 129642.0546875, 151649.421875, 175793.46875, 14444

In [32]:
# Pair every test-set Id with its prediction and write the result to CSV.
test_id = df_test.Id.tolist()
col_name = ['Id', 'SalePrice']
# zip() stops at the shorter input, so a stale or partial `result` (easy to
# get when cells are run out of order) would silently produce a truncated
# file. Fail loudly instead.
if len(test_id) != len(result):
    raise ValueError(
        f"expected {len(test_id)} predictions (one per test row), got {len(result)}"
    )
list_df = pd.DataFrame(zip(test_id, result), columns=col_name)
list_df.to_csv("Result.csv", index=False)