In [None]:
!pip install pytorch-tabular

#역자 추가 코드
!pip install scikit_learn



In [None]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models import (
    FTTransformerConfig,
    TabNetModelConfig,
    TabTransformerConfig
)
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig

# Classification

## Download the dataset
The "Adult" dataset, also known as the "Census Income" or "adult.data" dataset, is widely used in machine learning for tasks that involve classifying two different categories. It was created by Barry Becker from data collected by the United States Census Bureau in 1994. The main goal with this data is to predict if a person's income is over $50,000 a year based on various other pieces of information.


In [None]:
import pandas as pd

# Location of the raw "Adult" data file on the UCI repository.
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Column names are passed explicitly through `names`, so pandas does not take
# them from the file.
column_names = ['age', 'workclass', 'fnlwgt', 'education',
                'education-num', 'marital-status', 'occupation',
                'relationship', 'race', 'sex', 'capital-gain',
                'capital-loss', 'hours-per-week', 'native-country', 'income']

# Fields in this file are separated by ", " (comma followed by a space).
# Without skipinitialspace=True every string value keeps a leading blank
# (e.g. " >50K"), which then shows up in category levels and in prediction
# column names such as "income_ <=50K_probability".
data = pd.read_csv(url, names=column_names, skipinitialspace=True)

# Save the dataframe `data` as a CSV file (without the index column).
data.to_csv('adult.csv', index=False)


In [None]:
### Simple check version of the code above
# NOTE: everything between the triple quotes is a string literal, not executed
# code; it holds a copy of the previous cell with the URL wrapped over two lines.

"""
import pandas as pd

url = (
    "http://archive.ics.uci.edu/ml/machine-learning-databases"
    "/adult/adult.data"
)


column_names = ['age', 'workclass', 'fnlwgt', 'education',
                'education-num', 'marital-status', 'occupation',
                'relationship', 'race', 'sex', 'capital-gain',
                'capital-loss', 'hours-per-week', 'native-country', 'income']
data = pd.read_csv(url, names=column_names)

# Save the dataframe into a CSV file
data.to_csv('adult.csv', index=False)
"""

## Create train, test split

In [None]:
# Split the data into training and test sets: a reproducible 80% sample
# (random_state=0) is used for training, and every row whose index label was
# not sampled becomes the test set.
train = data.sample(frac=0.8, random_state=0)
test = data.drop(index=train.index)

# Names of the categorical columns, the numerical columns and the target column.
cat_col_names = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]
num_col_names = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]
target_col_name = ["income"]


## Set up the configurations
This is a critical step in the procedure. You'll need to supply four configurations (most of them come with sensible default values), which will guide the rest of the process.

1. DataConfig - This is where you specify the names of the target, categorical, and numerical columns, as well as any transformations that need to be done.

2. ModelConfig - Each model has its own specific configuration. This config not only determines the model we'll train but also allows you to set the model's hyperparameters.

3. TrainerConfig - This config allows you to tailor the training process by setting parameters such as batch size, number of epochs, early stopping criteria, etc. Most of these parameters are taken directly from PyTorch Lightning and are passed to the underlying Trainer object during the training process.

4. OptimizerConfig - This configuration allows you to define and use various optimizers and learning rate schedulers. Standard PyTorch optimizers and learning rate schedulers are supported. If you want to use a custom optimizer, you can override this configuration by passing the optimizer through the `optimizer` parameter of the `fit` method. Remember, the custom optimizer must be compatible with PyTorch.

In [None]:
# 데이터 설정
data_config = DataConfig(
    target=target_col_name,
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
    continuous_feature_transform="quantile_normal",
    normalize_continuous_features=True
)

# Trainer 설정
trainer_config = TrainerConfig(
    auto_lr_find=True,
    batch_size=256,
    max_epochs=100,
    early_stopping="valid_loss",
    early_stopping_mode="min",
    early_stopping_patience=5,
    checkpoints="valid_loss",
    load_best=True
)

# 옵티마이저 설정
optimizer_config = OptimizerConfig()

# 모델 설정
head_config = LinearHeadConfig(
    layers="",
    dropout=0.1,
    initialization="kaiming"
).__dict__

In the following section, we will train our classifier with three models: TabTransformer, FT Transformer, and TabNet.

# TabTransformer

In [None]:
# TabTransformer configured for classification with the shared linear head.
# The trainer is created with auto_lr_find=True, so `learning_rate` is only the
# value supplied before the learning-rate search runs.
model_config = TabTransformerConfig(
    task="classification",
    learning_rate=1e-3,
    head="LinearHead",        # linear head
    head_config=head_config,  # linear head settings
)

In [None]:
# Bundle the four configurations into a single TabularModel.
tabular_model = TabularModel(
    model_config=model_config,
    data_config=data_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)

INFO:pytorch_tabular.tabular_model:Experiment Tracking is turned off


In [None]:
# Runtime: about 2 minutes
# Train on the training split only (no explicit validation frame is passed),
# then evaluate on the held-out test split; the metrics returned by
# evaluate() are the cell output.
tabular_model.fit(train=train)
tabular_model.evaluate(test)

INFO:lightning_fabric.utilities.seed:Seed set to 42
INFO:pytorch_tabular.tabular_model:Preparing the DataLoaders
INFO:pytorch_tabular.tabular_datamodule:Setting up the datamodule for classification task
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

INFO:pytorch_lightning.tuner.lr_finder:LR finder stopped early after 96 steps due to diverging loss.
INFO:pytorch_lightning.tuner.lr_finder:Learning rate set to 0.0005248074602497723
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/.lr_find_d09f7089-93c5-4d04-80b1-311a1dd71abb.ckpt
INFO:pytorch_lightning.utilities.rank_zero:Restored all states from the checkpoint at /content/.lr_find_d09f7089-93c5-4d04-80b1-311a1dd71abb.ckpt
INFO:pytorch_tabular.tabular_model:Suggested LR: 0.0005248074602497723. For plot and detailed analysis, use `find_learning_rate` method.
INFO:pytorch_tabular.tabular_model:Training Started
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

INFO:pytorch_tabular.tabular_model:Training the model completed
INFO:pytorch_tabular.tabular_model:Loading the best model
  return torch.load(f, map_location=map_location)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded

Output()

[{'test_loss_0': 0.33141422271728516,
  'test_loss': 0.33141422271728516,
  'test_accuracy': 0.8430589437484741}]

In [None]:
# Predict on the held-out test rows and show the first five results
# (per-class probabilities plus the predicted income label).
prediction=tabular_model.predict(test)
prediction.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

Unnamed: 0,income_ <=50K_probability,income_ >50K_probability,income_prediction
10,0.321736,0.678264,>50K
13,0.887247,0.112753,<=50K
19,0.90389,0.09611,<=50K
28,0.672418,0.327582,<=50K
40,0.962929,0.037071,<=50K


# FT Transformer

In [None]:
# Runtime: about 1 minute 30 seconds
# FT-Transformer configured for classification with the shared linear head.
model_config = FTTransformerConfig(
    task="classification",
    head="LinearHead",        # linear head
    head_config=head_config,  # linear head settings
    learning_rate=1e-3,
)

# Reuse the data / trainer / optimizer configurations defined earlier.
tabular_model = TabularModel(
    model_config=model_config,
    data_config=data_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)

# Train on the training split, then report metrics on the test split.
tabular_model.fit(train=train)
tabular_model.evaluate(test)

INFO:pytorch_tabular.tabular_model:Experiment Tracking is turned off
INFO:lightning_fabric.utilities.seed:Seed set to 42
INFO:pytorch_tabular.tabular_model:Preparing the DataLoaders
INFO:pytorch_tabular.tabular_datamodule:Setting up the datamodule for classification task
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = d

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_steps=100` reached.
INFO:pytorch_lightning.tuner.lr_finder:Learning rate set to 0.003981071705534969
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/.lr_find_2feffb40-37cc-42a6-9edb-b43fbf8ce89c.ckpt
INFO:pytorch_lightning.utilities.rank_zero:Restored all states from the checkpoint at /content/.lr_find_2feffb40-37cc-42a6-9edb-b43fbf8ce89c.ckpt
INFO:pytorch_tabular.tabular_model:Suggested LR: 0.003981071705534969. For plot and detailed analysis, use `find_learning_rate` method.
INFO:pytorch_tabular.tabular_model:Training Started
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

INFO:pytorch_tabular.tabular_model:Training the model completed
INFO:pytorch_tabular.tabular_model:Loading the best model
  return torch.load(f, map_location=map_location)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded

Output()

[{'test_loss_0': 0.3286218047142029,
  'test_loss': 0.3286218047142029,
  'test_accuracy': 0.849662184715271}]

In [None]:
# Predict on the held-out test rows with the FT-Transformer model and show
# the first five results.
prediction=tabular_model.predict(test)
prediction.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

Unnamed: 0,income_ <=50K_probability,income_ >50K_probability,income_prediction
10,0.410153,0.589847,>50K
13,0.850288,0.149712,<=50K
19,0.765744,0.234256,<=50K
28,0.696519,0.303481,<=50K
40,0.880957,0.119043,<=50K


In [None]:
# TabNet

In [None]:
# Runtime: about 2 minutes
# TabNet configured for classification with the shared linear head.
model_config = TabNetModelConfig(
    task="classification",
    head="LinearHead",        # linear head
    head_config=head_config,  # linear head settings
    learning_rate=1e-3,
)

# Reuse the data / trainer / optimizer configurations defined earlier.
tabular_model = TabularModel(
    model_config=model_config,
    data_config=data_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)

# Train on the training split, then report metrics on the test split.
tabular_model.fit(train=train)
tabular_model.evaluate(test)

INFO:pytorch_tabular.tabular_model:Experiment Tracking is turned off
INFO:lightning_fabric.utilities.seed:Seed set to 42
INFO:pytorch_tabular.tabular_model:Preparing the DataLoaders
INFO:pytorch_tabular.tabular_datamodule:Setting up the datamodule for classification task
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = d

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_steps=100` reached.
INFO:pytorch_lightning.tuner.lr_finder:Learning rate set to 0.02089296130854041
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/.lr_find_080979e0-efa2-482f-8622-f6b0907862fa.ckpt
INFO:pytorch_lightning.utilities.rank_zero:Restored all states from the checkpoint at /content/.lr_find_080979e0-efa2-482f-8622-f6b0907862fa.ckpt
INFO:pytorch_tabular.tabular_model:Suggested LR: 0.02089296130854041. For plot and detailed analysis, use `find_learning_rate` method.
INFO:pytorch_tabular.tabular_model:Training Started
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

INFO:pytorch_tabular.tabular_model:Training the model completed
INFO:pytorch_tabular.tabular_model:Loading the best model
  return torch.load(f, map_location=map_location)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded

Output()

[{'test_loss_0': 0.32562941312789917,
  'test_loss': 0.32562941312789917,
  'test_accuracy': 0.8475123047828674}]

In [None]:
# Predict on the held-out test rows with the TabNet model and show the first
# five results.
prediction=tabular_model.predict(test)
prediction.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

Unnamed: 0,income_ <=50K_probability,income_ >50K_probability,income_prediction
10,0.435515,0.564485,>50K
13,0.905861,0.094139,<=50K
19,0.793829,0.206171,<=50K
28,0.777998,0.222002,<=50K
40,0.916356,0.083644,<=50K


# Regression Problem

The Ames Housing dataset describes the sale of individual residential properties in Ames, Iowa from 2006 to 2010. It contains a large number of explanatory variables (over 80) involved in assessing home values, offering a rich set of variables for predictive modeling.

The variables involved cover a wide range of aspects, including:

1. General characteristics of the property, such as the type of dwelling, the zone where it is located, its proximity to various amenities and roads, and the overall shape and layout of the property and lot.
2. Specific features of the house, such as the type of roof, exterior, masonry, and foundation.
3. The overall quality and condition of various aspects of the house, from the exterior to the heating.
4. Information about various areas of the house, like the basement, garage, and porch, and the presence of a pool.
5. The number and quality of rooms, bedrooms, kitchens, and bathrooms.
6. Information about the sale, such as the type of sale, the condition of sale, and the month and year of the sale.

The target variable is the final price at which the property was sold. This makes it a regression problem if we want to build a machine learning model to predict the sale price based on the rest of the variables.

## Download the dataset

In [None]:
import pandas as pd

# Read the Ames housing CSV straight from GitHub into a DataFrame.
# Note that `url` re-binds the name used earlier for the Adult data file.
url = "https://raw.githubusercontent.com/wblakecannon/ames/master/data/housing.csv"
ames_df = pd.read_csv(url)



In [None]:
# Disabled copy of the previous cell with the URL wrapped over two lines; the
# triple-quoted text below is a string literal and is not executed as code.
"""
import pandas as pd

url = (
    "https://raw.githubusercontent.com/wblakecannon/ames/master/data/"
    "housing.csv"
)
ames_df = pd.read_csv(url)
"""

## Specify the continuous and categorical variables
 Note: You could further optimize it.

In [None]:
# 범주형 컬럼(변수) 및 숫자형 컬럼(변수) 리스트
cat_cols = ['Garage Yr Blt', 'Mo Sold', 'Yr Sold','Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch','Wood Deck SF','Fireplaces','Year Remod/Add','Year Built','Overall Cond','Overall Qual','MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC', 'Central Air', 'Electrical', 'Kitchen Qual', 'Functional', 'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual', 'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence', 'Misc Feature', 'Sale Type', 'Sale Condition']
num_cols = ['Lot Frontage', 'Lot Area',   'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd',   'Garage Cars', 'Garage Area',   'Pool Area', 'Misc Val']
target_col = ['SalePrice']

## Perform Null Value Imputation
1. Replace with the mode for categorical variables
2. Replace with the median for continuous variables

Note: You could further optimize this

> Indented block



In [None]:
# Impute missing values column by column.
# The filled Series is assigned back to the column instead of calling
# `ames_df[col].fillna(..., inplace=True)`: the chained in-place form operates
# on an intermediate object, raises a pandas warning, and stops updating
# `ames_df` from pandas 3.0 onwards.

# Categorical columns: replace NaN with the most frequent value (mode).
for col in cat_cols:
    ames_df[col] = ames_df[col].fillna(ames_df[col].mode()[0])

# Continuous columns (and the target): replace NaN with the median.
for col in num_cols + target_col:
    ames_df[col] = ames_df[col].fillna(ames_df[col].median())

# Drop any rows that still contain NaN in columns not imputed above.
ames_df = ames_df.dropna()

# Check the shape and the first few rows.
print(ames_df.shape)
print(ames_df.head())

(2930, 83)
   Unnamed: 0  Order        PID  MS SubClass MS Zoning  Lot Frontage  \
0           0      1  526301100           20        RL         141.0   
1           1      2  526350040           20        RH          80.0   
2           2      3  526351010           20        RL          81.0   
3           3      4  526353030           20        RL          93.0   
4           4      5  527105010           60        RL          74.0   

   Lot Area Street Alley Lot Shape  ... Pool Area Pool QC  Fence Misc Feature  \
0     31770   Pave  Grvl       IR1  ...         0      Ex  MnPrv         Shed   
1     11622   Pave  Grvl       Reg  ...         0      Ex  MnPrv         Shed   
2     14267   Pave  Grvl       IR1  ...         0      Ex  MnPrv         Gar2   
3     11160   Pave  Grvl       Reg  ...         0      Ex  MnPrv         Shed   
4     13830   Pave  Grvl       IR1  ...         0      Ex  MnPrv         Shed   

  Misc Val Mo Sold Yr Sold Sale Type Sale Condition  SalePrice  
0   

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ames_df[col].fillna(ames_df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ames_df[col].fillna(ames_df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which

## Perform min-max scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns to rescale: every numerical feature plus the target, so the model
# is trained on (and predicts) the scaled SalePrice.
cols_to_scale = num_cols + target_col

# Fit the scaler on those columns of the dataframe ...
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split, so test rows contribute to the min/max statistics.
scaler = MinMaxScaler().fit(ames_df[cols_to_scale])

# ... and overwrite the columns with their transformed values.
ames_df[cols_to_scale] = scaler.transform(ames_df[cols_to_scale])

In [None]:
# Print the three column lists, one per line, for a quick visual check.
for column_group in (cat_cols, num_cols, target_col):
    print(column_group)

['Garage Yr Blt', 'Mo Sold', 'Yr Sold', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch', 'Wood Deck SF', 'Fireplaces', 'Year Remod/Add', 'Year Built', 'Overall Cond', 'Overall Qual', 'MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC', 'Central Air', 'Electrical', 'Kitchen Qual', 'Functional', 'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual', 'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence', 'Misc Feature', 'Sale Type', 'Sale Condition']
['Lot Frontage', 'Lot Area', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area',

## Train, Test split

In [None]:
# Reproducible 80/20 split: sample 80% of the rows for training and keep the
# rows whose index labels were not sampled as the test set.
train = ames_df.sample(frac=0.8, random_state=0)
test = ames_df.drop(index=train.index)

## Define Model Configuration

In [None]:
# 데이터 설정
data_config = DataConfig(
    target=target_col,
    continuous_cols=num_cols,
    categorical_cols=cat_cols,
    continuous_feature_transform="quantile_normal",
    normalize_continuous_features=True
)

# Trainer 설정
trainer_config = TrainerConfig(
    auto_lr_find=True,
    batch_size=256,
    max_epochs=100,
    early_stopping="valid_loss",
    early_stopping_mode="min",
    early_stopping_patience=5,
    checkpoints="valid_loss",
    load_best=True
)

# 옵티마이저 설정
optimizer_config = OptimizerConfig()

# 모델 설정
head_config = LinearHeadConfig(
    layers="",
    dropout=0.1,
    initialization="kaiming"
).__dict__


In [None]:
# Runtime: about 30 seconds
# FT-Transformer configured for regression with the shared linear head.
model_config = FTTransformerConfig(
    task="regression",
    head="LinearHead",        # linear head
    head_config=head_config,  # linear head settings
    learning_rate=1e-3,
)

# Combine the regression data / trainer / optimizer configurations.
tabular_model = TabularModel(
    model_config=model_config,
    data_config=data_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)

# Train on the training split, then report metrics on the test split.
tabular_model.fit(train=train)
tabular_model.evaluate(test)

INFO:pytorch_tabular.tabular_model:Experiment Tracking is turned off
INFO:lightning_fabric.utilities.seed:Seed set to 42
INFO:pytorch_tabular.tabular_model:Preparing the DataLoaders
INFO:pytorch_tabular.tabular_datamodule:Setting up the datamodule for regression task
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[co

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_steps=100` reached.
INFO:pytorch_lightning.tuner.lr_finder:Learning rate set to 0.0002511886431509582
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/.lr_find_80a12da3-cba0-4540-954a-9f6ad25d1751.ckpt
INFO:pytorch_lightning.utilities.rank_zero:Restored all states from the checkpoint at /content/.lr_find_80a12da3-cba0-4540-954a-9f6ad25d1751.ckpt
INFO:pytorch_tabular.tabular_model:Suggested LR: 0.0002511886431509582. For plot and detailed analysis, use `find_learning_rate` method.
INFO:pytorch_tabular.tabular_model:Training Started
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

INFO:pytorch_tabular.tabular_model:Training the model completed
INFO:pytorch_tabular.tabular_model:Loading the best model
  return torch.load(f, map_location=map_location)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded

Output()

[{'test_loss': 0.0031051933765411377,
  'test_mean_squared_error': 0.0031051933765411377}]

In [None]:
# Predict on the held-out test rows and show the first five results.
# SalePrice was min-max scaled before training, so the predictions are on
# that scaled range rather than in dollars.
prediction=tabular_model.predict(test)
prediction.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

Unnamed: 0,SalePrice_prediction
0,0.235486
3,0.416103
7,0.237009
21,0.223465
24,0.223221


In [None]:
# Show only the first two prediction rows.
prediction.head(2)

Unnamed: 0,SalePrice_prediction
0,0.235486
3,0.416103


In [None]:
# List the column labels of the test split.
test.columns

Index(['Unnamed: 0', 'Order', 'PID', 'MS SubClass', 'MS Zoning',
       'Lot Frontage', 'Lot Area', 'Street', 'Alley', 'Lot Shape',
       'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood',
       'Condition 1', 'Condition 2', 'Bldg Type', 'House Style',
       'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add',
       'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd',
       'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual', 'Exter Cond',
       'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2',
       'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air',
       'Electrical', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF',
       'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath',
       'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual',
       'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Fireplace Qu',
       'Garage Type', 'G

In [None]:
from sklearn.metrics import r2_score

# Compare the (scaled) true sale prices of the test split with the model's
# predictions using the R2 score.
y_true = test['SalePrice']
y_pred = prediction['SalePrice_prediction']
r2 = r2_score(y_true, y_pred)

print(f"R2 Score: {r2}")

R2 Score: 0.7278051923572773
