In [1]:
!pip install tab-transformer-pytorch

Collecting tab-transformer-pytorch
  Downloading tab_transformer_pytorch-0.3.0-py3-none-any.whl.metadata (690 bytes)
Downloading tab_transformer_pytorch-0.3.0-py3-none-any.whl (6.9 kB)
Installing collected packages: tab-transformer-pytorch
Successfully installed tab-transformer-pytorch-0.3.0


In [18]:
import torch
import torch.nn as nn
from tab_transformer_pytorch import TabTransformer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Read the training CSV, then discard its first column and keep the rest.
df = pd.read_csv('train.csv')
df = df.iloc[:, 1:]
df

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,F,N,N,2,225000.0,State servant,Secondary / secondary special,Married,House / apartment,-12079,-1984,1,0,0,0,Core staff,4.0,-2.0,1.0
26453,F,N,Y,1,180000.0,Working,Higher education,Separated,House / apartment,-15291,-2475,1,0,0,0,,2.0,-47.0,2.0
26454,F,Y,N,0,292500.0,Working,Secondary / secondary special,Civil marriage,With parents,-10082,-2015,1,0,0,0,Core staff,2.0,-25.0,2.0
26455,M,N,Y,0,171000.0,Working,Incomplete higher,Single / not married,House / apartment,-10145,-107,1,0,0,0,Laborers,1.0,-59.0,2.0


In [19]:
# Show the column labels that remain after the first CSV column was sliced off.
df.columns

Index(['gender', 'car', 'reality', 'child_num', 'income_total', 'income_type',
       'edu_type', 'family_type', 'house_type', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'FLAG_MOBIL', 'work_phone', 'phone', 'email', 'occyp_type',
       'family_size', 'begin_month', 'credit'],
      dtype='object')

In [20]:
# Column order of `df`; every index computed below is a position in this list.
# Taken from the frame itself so it cannot drift from the loaded data.
columns = list(df.columns)

# Categorical features: every column pandas loaded with the object dtype.
# This relies on the raw string columns, so it has to run before any cell
# that replaces them with numeric codes.
categ_selected_columns = df.select_dtypes(include="object").columns
categ_indices = [columns.index(col) for col in categ_selected_columns]
print(categ_indices)

# Continuous features: every non-object column except the 'credit' target.
# The target is excluded by name instead of by slicing off the last entry,
# so the result stays correct even if the column order changes.
con_selected_columns = df.select_dtypes(exclude="object").columns
con_indices = [
    columns.index(col) for col in con_selected_columns if col != 'credit'
]
print(con_indices)

[0, 1, 2, 5, 6, 7, 8, 15]
[3, 4, 9, 10, 11, 12, 13, 14, 16, 17]


In [21]:
# Number of distinct (non-missing) values in each categorical column,
# keyed by column name.
unique_values_count = dict(
    (name, df[name].nunique()) for name in categ_selected_columns
)
unique_values_count.values()

dict_values([2, 2, 2, 5, 5, 5, 6, 18])

In [22]:
# Remove every row that contains at least one missing value.
df.dropna(inplace=True)

# One LabelEncoder per categorical column, kept in a dict keyed by column
# name so the fitted encoders remain available after this cell.
label_encoders = {name: LabelEncoder() for name in categ_selected_columns}

# Replace each categorical column with the integer codes from its encoder.
for col in categ_selected_columns:
    encoded = label_encoders[col].fit_transform(df[col])
    df[col] = encoded



In [23]:
# Feature matrix: every column except the 'credit' target, as a NumPy array.
X = df.drop(columns='credit').to_numpy()
# Target vector: the 'credit' column on its own.
y = df['credit'].to_numpy()

In [24]:
# Sanity check: (number of rows, number of feature columns) of the feature matrix.
X.shape

(18286, 18)

In [25]:
# Peek at the target vector taken from the 'credit' column.
y

array([1., 2., 0., ..., 2., 2., 2.])

In [26]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

# Standardise the continuous columns using statistics learned from the
# training partition only, then apply the same transform to the test rows.
scaler = StandardScaler()
scaler.fit(X_train[:, con_indices])
X_train[:, con_indices] = scaler.transform(X_train[:, con_indices])
X_test[:, con_indices] = scaler.transform(X_test[:, con_indices])

# Categorical codes go into int64 tensors.
X_train_categ = torch.tensor(X_train[:, categ_indices], dtype=torch.int64)
X_test_categ = torch.tensor(X_test[:, categ_indices], dtype=torch.int64)

# Continuous features go into float32 tensors.
X_train_cont = torch.tensor(X_train[:, con_indices], dtype=torch.float32)
X_test_cont = torch.tensor(X_test[:, con_indices], dtype=torch.float32)

# Class labels as int64 (torch.long) tensors.
y_train_tensor, y_test_tensor = (
    torch.tensor(labels, dtype=torch.long) for labels in (y_train, y_test)
)



In [None]:
# Seed PyTorch so weight initialisation and dropout are repeatable when the
# notebook is re-run from a fresh kernel.
torch.manual_seed(42)

# Number of distinct values per categorical column, derived from the encoded
# data instead of being typed in by hand. The codes are contiguous integers
# starting at 0, so (largest code across train + test) + 1 is the count, and
# every index fed to the model is guaranteed to fit its embedding table.
categories = tuple(
    (torch.cat([X_train_categ, X_test_categ]).max(dim=0).values + 1).tolist()
)

# Define the TabTransformer model
model = TabTransformer(
    categories=categories,  # distinct values per categorical column
    num_continuous=X_train_cont.shape[1],  # number of continuous features
    dim=4,  # model (embedding) dimension
    dim_out=3,  # one logit per class; this is a three-class problem, not binary
    depth=3,  # number of transformer layers
    heads=8,  # number of attention heads
    attn_dropout=0.1,  # attention dropout
    ff_dropout=0.1,  # feed-forward dropout
    mlp_hidden_mults=(4, 2),  # hidden-layer size multipliers of the final MLP
    mlp_act=nn.ReLU(),  # activation function of the final MLP
)

# Loss and optimiser
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop: full-batch gradient descent, one optimiser step per epoch.
num_epochs = 50
model.train()  # nothing switches the mode inside the loop, so set it once
for epoch in range(num_epochs):
    optimizer.zero_grad()

    # Forward pass: categorical and continuous inputs are passed separately.
    y_pred = model(X_train_categ, X_train_cont)
    loss = criterion(y_pred, y_train_tensor)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

    if (epoch+1) % 5 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}')

# Evaluation on the held-out split
model.eval()
with torch.no_grad():
    logits = model(X_test_categ, X_test_cont)
    predictions = torch.argmax(logits, dim=1)  # class with the highest logit

    # Compare against the label *tensor*: comparing with the NumPy array
    # y_test does not yield a torch tensor, so the .float() call would fail.
    accuracy = (predictions == y_test_tensor).float().mean()
    print(f'Accuracy: {accuracy.item()}')



Epoch 5/50, Loss: 0.9026
