# **Import Library**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# **Preprocessing**

**Dataset Source**: https://archive.ics.uci.edu/dataset/2/adult

**Data Loading**

In [2]:
!pip install ucimlrepo



In [3]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
adult = fetch_ucirepo(id=2)

# data (as pandas dataframes)
X = adult.data.features
y = adult.data.targets
#melakukan pengabungan variabel target dan fitur
df = pd.concat([X, y], axis=1)
#menampilkan 5 baris pertama menggunakan fungsi df.head
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
 14  income          48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [5]:
df.describe(include='all')

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
count,48842.0,47879,48842.0,48842,48842.0,48842,47876,48842,48842,48842,48842.0,48842.0,48842.0,48568,48842
unique,,9,,16,,7,15,6,5,2,,,,42,4
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,33906,,15784,,22379,6172,19716,41762,32650,,,,43832,24720
mean,38.643585,,189664.1,,10.078089,,,,,,1079.067626,87.502314,40.422382,,
std,13.71051,,105604.0,,2.570973,,,,,,7452.019058,403.004552,12.391444,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117550.5,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178144.5,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237642.0,,12.0,,,,,,0.0,0.0,45.0,,


In [6]:
#mengecek jumlah missing value
df.isnull().sum()

Unnamed: 0,0
age,0
workclass,963
fnlwgt,0
education,0
education-num,0
marital-status,0
occupation,966
relationship,0
race,0
sex,0


In [7]:
# Imputasi: mean untuk numerik, modus (most frequent) untuk kategorikal
print("Rows before imputation:", df.shape[0])
print("Missing per column (before):")
print(df.isnull().sum()[df.isnull().sum() > 0])

# mean untuk kolom numerik
num_cols = df.select_dtypes(include=[np.number]).columns
if len(num_cols):
    df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

# modus untuk kolom kategorikal
cat_cols = df.select_dtypes(exclude=[np.number]).columns
for col in cat_cols:
    if df[col].isnull().any():
        mode = df[col].mode(dropna=True)
        if not mode.empty:
            df[col].fillna(mode.iloc[0], inplace=True)

print("Missing per column (after):")
print(df.isnull().sum()[df.isnull().sum() > 0] if df.isnull().any().any() else "No missing values remaining.")

Rows before imputation: 48842
Missing per column (before):
workclass         963
occupation        966
native-country    274
dtype: int64
Missing per column (after):
No missing values remaining.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode.iloc[0], inplace=True)


In [8]:
# pengecekan duplikasi data
print("Number of duplicate rows:", df.duplicated().sum())

Number of duplicate rows: 29


In [9]:
# Menghapus duplikasi data
df.drop_duplicates(inplace=True)

# Menampilkan jumlah baris setelah menghapus duplikasi
print("Number of rows after removing duplicates:", df.shape[0])

Number of rows after removing duplicates: 48813


In [11]:
from sklearn.preprocessing import LabelEncoder

# Pilih kolom kategorikal
categorical_cols = df.select_dtypes(include=['object']).columns

# Terapkan Label Encoding
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

print("DataFrame after Label Encoding:")
df.head()

DataFrame after Label Encoding:


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0


In [17]:
df.describe()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
count,48813.0,48813.0,48813.0,48813.0,48813.0,48813.0,48813.0,48813.0,48813.0,48813.0,48813.0,48813.0,48813.0,48813.0,48813.0
mean,38.647348,3.949214,189667.9,10.288243,10.078688,2.618462,6.775797,1.443427,3.667998,0.66849,1079.708705,87.554299,40.425051,36.969865,0.812202
std,13.709005,1.357832,105606.2,3.874402,2.570257,1.507746,4.15187,1.602395,0.846049,0.470761,7454.185982,403.118605,12.390954,7.268302,0.972871
min,17.0,0.0,12285.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,28.0,4.0,117555.0,9.0,9.0,2.0,3.0,0.0,4.0,0.0,0.0,0.0,40.0,39.0,0.0
50%,37.0,4.0,178140.0,11.0,10.0,2.0,7.0,1.0,4.0,1.0,0.0,0.0,40.0,39.0,0.0
75%,48.0,4.0,237620.0,12.0,12.0,4.0,10.0,3.0,4.0,1.0,0.0,0.0,45.0,39.0,1.0
max,90.0,8.0,1490400.0,15.0,16.0,6.0,14.0,5.0,4.0,1.0,99999.0,4356.0,99.0,41.0,3.0


In [10]:
print("Unique values in the 'income' column before encoding:")
display(adult.data.targets['income'].unique())

Unique values in the 'income' column before encoding:


array(['<=50K', '>50K', '<=50K.', '>50K.'], dtype=object)

Disini variabel target(income) berisi  0 - 3 ('<=50K', '>50K', '<=50K.', '>50K.' )


In [12]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = df.drop('income', axis=1)
y = df['income']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (34169, 14)
Shape of X_test: (14644, 14)
Shape of y_train: (34169,)
Shape of y_test: (14644,)


In [13]:
print("Class distribution in the training set (y_train):")
display(y_train.value_counts())

Class distribution in the training set (y_train):


Unnamed: 0_level_0,count
income,Unnamed: 1_level_1
0,17303
1,8747
2,5451
3,2668


In [14]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Shape of X_train after SMOTE:", X_train_resampled.shape)
print("Shape of y_train after SMOTE:", y_train_resampled.shape)
print("\nClass distribution in the training set after SMOTE:")
display(y_train_resampled.value_counts())

Shape of X_train after SMOTE: (69212, 14)
Shape of y_train after SMOTE: (69212,)

Class distribution in the training set after SMOTE:


Unnamed: 0_level_0,count
income,Unnamed: 1_level_1
0,17303
2,17303
3,17303
1,17303


In [15]:
from sklearn.preprocessing import StandardScaler

# Apply StandardScaler to the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)

print("Shape of scaled training data:", X_train_scaled.shape)
print("\nFirst 5 rows of scaled training data:")
display(X_train_scaled[:5])

Shape of scaled training data: (69212, 14)

First 5 rows of scaled training data:


array([[ 2.36543561,  0.10100954, -0.62177078, -1.89389701, -3.10651061,
        -0.27824876,  0.05562554, -0.73163988,  0.41669856,  0.72319938,
        -0.19220588,  4.75493661, -0.16715887,  0.28827147],
       [ 0.13453517, -2.38632636, -1.49632443, -0.38217625,  1.09783697,
        -0.27824876,  0.86285679, -0.73163988,  0.41669856,  0.72319938,
        -0.19220588, -0.25310875, -0.16715887,  0.28827147],
       [ 0.21716111, -1.5572144 , -1.46376631,  0.5248562 ,  1.51827172,
        -0.27824876,  0.86285679, -0.73163988,  0.41669856,  0.72319938,
        -0.19220588, -0.25310875, -0.16715887,  0.28827147],
       [ 1.29129836, -1.5572144 , -0.8124889 ,  0.22251205, -0.58390207,
        -0.27824876,  1.93916512, -0.73163988,  0.41669856,  0.72319938,
        -0.19220588, -0.25310875, -0.16715887,  0.28827147],
       [-0.03071672,  0.10100954,  0.00563577,  0.22251205, -0.58390207,
        -0.27824876, -1.02068279, -0.73163988,  0.41669856,  0.72319938,
        -0.19220588, -0.25

Saya hanya menerapkan metode tersebut pada data train, karena menurut saya SMOTE atau teknik pembuatan data sintetis hanya boleh dilakukan di tahap training agar tidak terjadi data leakage. Hal yang sama berlaku untuk StandardScaler, di mana proses scaling juga hanya boleh dilakukan di data latih. Alasannya sederhana: tidak boleh ada kebocoran data (no data leakage), sehingga model tidak memiliki akses ke informasi dari data validasi atau test set selama pelatihan. Selain itu, pendekatan ini juga lebih merepresentasikan kondisi nyata, di mana model nantinya akan menghadapi data baru yang belum pernah dilihat sebelumnya.