# **Import Library**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# **Preprocessing**

**Dataset Source**: https://archive.ics.uci.edu/dataset/2/adult

**Data Loading**

In [2]:
!pip install ucimlrepo



In [3]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
adult = fetch_ucirepo(id=2)

# data (as pandas dataframes)
X = adult.data.features
y = adult.data.targets
#melakukan pengabungan variabel target dan fitur
df = pd.concat([X, y], axis=1)
#menampilkan 5 baris pertama menggunakan fungsi df.head
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
 14  income          48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [5]:
df.describe(include='all')

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
count,48842.0,47879,48842.0,48842,48842.0,48842,47876,48842,48842,48842,48842.0,48842.0,48842.0,48568,48842
unique,,9,,16,,7,15,6,5,2,,,,42,4
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,33906,,15784,,22379,6172,19716,41762,32650,,,,43832,24720
mean,38.643585,,189664.1,,10.078089,,,,,,1079.067626,87.502314,40.422382,,
std,13.71051,,105604.0,,2.570973,,,,,,7452.019058,403.004552,12.391444,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117550.5,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178144.5,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237642.0,,12.0,,,,,,0.0,0.0,45.0,,


In [6]:
#mengecek jumlah missing value
df.isnull().sum()

age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
income              0
dtype: int64

In [7]:
# Hapus baris yang memiliki missing value
print("Rows before dropping any with missing values:", df.shape[0])
print("Rows with missing values:", df.isnull().any(axis=1).sum())
df.dropna(inplace=True)
print("Rows after dropping any with missing values:", df.shape[0])


Rows before dropping any with missing values: 48842
Rows with missing values: 1221
Rows after dropping any with missing values: 47621


In [8]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [9]:
# pengecekan duplikasi data
print("Number of duplicate rows:", df.duplicated().sum())

Number of duplicate rows: 29


In [10]:
# Menghapus duplikasi data
df.drop_duplicates(inplace=True)

# Menampilkan jumlah baris setelah menghapus duplikasi
print("Number of rows after removing duplicates:", df.shape[0])

Number of rows after removing duplicates: 47592


In [11]:
from sklearn.preprocessing import LabelEncoder

# Buat encoder
le = LabelEncoder()

# Encode hanya kolom target 'income'
df['income'] = le.fit_transform(df['income'])

print("DataFrame after Label Encoding on target [income]:")
df.head()


DataFrame after Label Encoding on target [income]:


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [12]:
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income
count,47592.0,47592.0,47592.0,47592.0,47592.0,47592.0,47592.0
mean,38.644541,189731.0,10.091444,1091.80253,87.907022,40.602895,0.801248
std,13.557307,105571.7,2.56758,7489.46073,404.127864,12.259716,0.977323
min,17.0,12285.0,1.0,0.0,0.0,1.0,0.0
25%,28.0,117584.8,9.0,0.0,0.0,40.0,0.0
50%,37.0,178282.0,10.0,0.0,0.0,40.0,0.0
75%,48.0,237713.0,12.0,0.0,0.0,45.0,1.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0,3.0


In [13]:
print("Unique values in the 'income' column before encoding:")
display(adult.data.targets['income'].unique())

Unique values in the 'income' column before encoding:


array(['<=50K', '>50K', '<=50K.', '>50K.'], dtype=object)

Disini variabel target(income) berisi  0 - 3 ('<=50K', '>50K', '<=50K.', '>50K.' )


In [14]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = df.drop('income', axis=1)
y = df['income']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (33314, 14)
Shape of X_test: (14278, 14)
Shape of y_train: (33314,)
Shape of y_test: (14278,)


In [15]:
# Label encoding hanya untuk kolom bertipe object di X_train
from sklearn.preprocessing import LabelEncoder

encoders = {}

# Pilih kolom kategorikal dari X_train
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Fit encoder pada train dan transform train + test
for col in categorical_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])  # Encode X_train
    X_test[col] = le.transform(X_test[col])  # Transform X_test menggunakan encoder yang sama
    encoders[col] = le  # Simpan encoder untuk referensi

print("X_train after Label Encoding:")
print(X_train.head())

print("\nX_test after Label Encoding:")
print(X_test.head())

X_train after Label Encoding:
       age  workclass  fnlwgt  education  education-num  marital-status  \
28343   50          4  196193          9             13               2   
15046   34          4  248754          1              7               4   
12281   55          4  158641         11              9               6   
43947   19          4  136758         15             10               4   
19202   46          4   99086         15             10               2   

       occupation  relationship  race  sex  capital-gain  capital-loss  \
28343           3             0     4    1             0             0   
15046           7             1     4    1             0             0   
12281           1             1     4    0             0             0   
43947          13             3     4    0             0             0   
19202           1             5     4    0             0             0   

       hours-per-week  native-country  
28343              40             

In [16]:
print("Class distribution in the training set (y_train):")
display(y_train.value_counts())

Class distribution in the training set (y_train):


income
0    17305
1     7990
2     5447
3     2572
Name: count, dtype: int64

In [17]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Shape of X_train after SMOTE:", X_train_resampled.shape)
print("Shape of y_train after SMOTE:", y_train_resampled.shape)
print("\nClass distribution in the training set after SMOTE:")
display(y_train_resampled.value_counts())

Shape of X_train after SMOTE: (69220, 14)
Shape of y_train after SMOTE: (69220,)

Class distribution in the training set after SMOTE:


income
0    17305
1    17305
2    17305
3    17305
Name: count, dtype: int64

In [18]:
from sklearn.preprocessing import StandardScaler

# Apply StandardScaler to the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)

print("Shape of scaled training data:", X_train_scaled.shape)
print("\nFirst 5 rows of scaled training data:")
display(X_train_scaled[:5])

Shape of scaled training data: (69220, 14)

First 5 rows of scaled training data:


array([[ 0.82127481,  0.1040366 ,  0.06482522, -0.39527689,  1.09886607,
        -0.26111664, -0.99647419, -0.72672615,  0.41423727,  0.72895957,
        -0.19670309, -0.25131591, -0.19306661, -5.89064372],
       [-0.52644424,  0.1040366 ,  0.57544864, -2.83387461, -1.4348388 ,
         1.32415669,  0.07727772, -0.04026348,  0.41423727,  0.72895957,
        -0.19670309, -0.25131591, -0.19306661,  0.28950969],
       [ 1.24243702,  0.1040366 , -0.29998768,  0.21437254, -0.59027051,
         2.90943003, -1.53335014, -0.04026348,  0.41423727, -1.3718182 ,
        -0.19670309, -0.25131591, -0.19306661,  0.28950969],
       [-1.78993084,  0.1040366 , -0.51257823,  1.4336714 , -0.16798637,
         1.32415669,  1.68790558,  1.33266185,  0.41423727, -1.3718182 ,
        -0.19670309, -0.25131591, -2.90913868,  0.28950969],
       [ 0.48434505,  0.1040366 , -0.87855692,  1.4336714 , -0.16798637,
        -0.26111664, -1.53335014,  2.70558718,  0.41423727, -1.3718182 ,
        -0.19670309, -0.25

Saya hanya menerapkan metode tersebut pada data train, karena menurut saya SMOTE atau teknik pembuatan data sintetis hanya boleh dilakukan di tahap training agar tidak terjadi data leakage. Hal yang sama berlaku untuk StandardScaler, di mana proses scaling juga hanya boleh dilakukan di data latih. Alasannya sederhana: tidak boleh ada kebocoran data (no data leakage), sehingga model tidak memiliki akses ke informasi dari data validasi atau test set selama pelatihan. Selain itu, pendekatan ini juga lebih merepresentasikan kondisi nyata, di mana model nantinya akan menghadapi data baru yang belum pernah dilihat sebelumnya.