In [16]:
import numpy as np
import pandas as pd

from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score, classification_report

In [17]:
data = pd.read_csv("income.csv")
data

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income_>50K
0,67,Private,366425,Doctorate,16,Divorced,Exec-managerial,Not-in-family,White,Male,99999,0,60,United-States,1
1,17,Private,244602,12th,8,Never-married,Other-service,Own-child,White,Male,0,0,15,United-States,0
2,31,Private,174201,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
3,58,State-gov,110199,7th-8th,4,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,0
4,25,State-gov,149248,Some-college,10,Never-married,Other-service,Not-in-family,Black,Male,0,0,40,United-States,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43952,52,Private,68982,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States,1
43953,19,Private,116562,HS-grad,9,Never-married,Other-service,Own-child,White,Female,0,0,40,United-States,0
43954,30,Private,197947,Some-college,10,Divorced,Sales,Not-in-family,White,Male,0,0,58,United-States,0
43955,46,Private,97883,Bachelors,13,Never-married,Sales,Not-in-family,White,Female,0,0,35,United-States,0


In [18]:
# Menghapus baris dengan data duplikat
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)

In [19]:
for column in data.select_dtypes(exclude=[np.number]).columns: 
    data = pd.concat([data, pd.get_dummies(data[column], prefix=column, drop_first=True)], axis=1)
    data.drop(column, axis=1, inplace=True)

In [20]:
X = data.drop('income_>50K', axis=1)
y = data['income_>50K']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

mm = MinMaxScaler()
X_train = mm.fit_transform(X_train)
X_test = mm.transform(X_test)

clf = TabNetClassifier()
clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], patience=10, max_epochs=20)



epoch 0  | loss: 0.53888 | val_0_auc: 0.78697 | val_1_auc: 0.78718 |  0:00:05s
epoch 1  | loss: 0.42545 | val_0_auc: 0.81084 | val_1_auc: 0.81984 |  0:00:11s
epoch 2  | loss: 0.39365 | val_0_auc: 0.8385  | val_1_auc: 0.84956 |  0:00:17s
epoch 3  | loss: 0.38451 | val_0_auc: 0.8334  | val_1_auc: 0.84111 |  0:00:23s
epoch 4  | loss: 0.37971 | val_0_auc: 0.86385 | val_1_auc: 0.86987 |  0:00:29s
epoch 5  | loss: 0.37724 | val_0_auc: 0.86417 | val_1_auc: 0.8708  |  0:00:34s
epoch 6  | loss: 0.3715  | val_0_auc: 0.87279 | val_1_auc: 0.8803  |  0:00:40s
epoch 7  | loss: 0.36252 | val_0_auc: 0.87439 | val_1_auc: 0.88244 |  0:00:46s
epoch 8  | loss: 0.35536 | val_0_auc: 0.87737 | val_1_auc: 0.88525 |  0:00:51s
epoch 9  | loss: 0.35334 | val_0_auc: 0.87977 | val_1_auc: 0.88691 |  0:00:57s
epoch 10 | loss: 0.3514  | val_0_auc: 0.88866 | val_1_auc: 0.89555 |  0:01:02s
epoch 11 | loss: 0.34706 | val_0_auc: 0.88745 | val_1_auc: 0.89046 |  0:01:08s
epoch 12 | loss: 0.34605 | val_0_auc: 0.88899 | val_



In [22]:
predictions = clf.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.85


In [23]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.94      0.90      6137
           1       0.76      0.56      0.65      2001

    accuracy                           0.85      8138
   macro avg       0.81      0.75      0.77      8138
weighted avg       0.84      0.85      0.84      8138



In [24]:
prediction = clf.predict(X_test)
comparison_df = pd.DataFrame({'True Label': y_test, 'Predicted Label': prediction})
print(comparison_df.head(10)) 

       True Label  Predicted Label
14093           0                0
43864           0                0
30514           1                0
4709            0                0
26857           0                0
1497            0                0
15002           0                0
11034           0                0
2682            0                0
21779           0                1
