# 3. Transformação

In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [52]:
# Read the processed Adult CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="./data/processed/adult_processed.csv")

# Report the (rows, columns) shape, then show the first five rows as the
# cell's rich output.
print(f"Shape do dataset: {df.shape}")
df.head(5)

Shape do dataset: (45194, 13)


Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [53]:
# Feature matrix: every column except the "income" target.
X = df.drop("income", axis=1)
# Binary target: 1 where income is exactly ">50K", 0 for any other value.
y = df["income"].eq(">50K").astype(int)

print(f"Shape de X: {X.shape}")
print(f"Shape de y: {y.shape}")

# Absolute class counts followed by relative class frequencies.
print("\nDistribuição da variável target:")
print(y.value_counts(normalize=False))
print("\nProporção:")
print(y.value_counts(normalize=True))

Shape de X: (45194, 12)
Shape de y: (45194,)

Distribuição da variável target:
income
0    33988
1    11206
Name: count, dtype: int64

Proporção:
income
0    0.752047
1    0.247953
Name: proportion, dtype: float64


In [54]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, 64-bit integer/float columns as numerical.
categorical_cols = list(X.select_dtypes(include=["object"]).columns)
numerical_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)

print(f"Variáveis numéricas ({len(numerical_cols)}):")
print(numerical_cols)
print(f"\nVariáveis categóricas ({len(categorical_cols)}):")
print(categorical_cols)

Variáveis numéricas (4):
['age', 'capital-gain', 'capital-loss', 'hours-per-week']

Variáveis categóricas (8):
['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']


In [55]:
# Hold out 20% of the rows for testing. The split is stratified on y so both
# partitions keep the same class proportions, and random_state is fixed so
# the split is reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f"Shape de X_train: {X_train.shape}")
print(f"Shape de X_test: {X_test.shape}")
print(f"Shape de y_train: {y_train.shape}")
print(f"Shape de y_test: {y_test.shape}")

# Relative class frequencies per partition, to confirm the stratification.
train_distribution = y_train.value_counts(normalize=True)
test_distribution = y_test.value_counts(normalize=True)

print("\nDistribuição em treino:")
print(train_distribution)
print("\nDistribuição em teste:")
print(test_distribution)

Shape de X_train: (36155, 12)
Shape de X_test: (9039, 12)
Shape de y_train: (36155,)
Shape de y_test: (9039,)

Distribuição em treino:
income
0    0.75204
1    0.24796
Name: proportion, dtype: float64

Distribuição em teste:
income
0    0.752074
1    0.247926
Name: proportion, dtype: float64


In [56]:
print("Aplicando One-Hot Encoding...")

# Encode each partition independently, keeping every dummy level (no
# reference level is dropped). Because the partitions are encoded separately,
# their resulting column sets can differ when a category is absent from one
# of them.
encoding_options = {"columns": categorical_cols, "drop_first": False}
X_train_encoded = pd.get_dummies(X_train, **encoding_options)
X_test_encoded = pd.get_dummies(X_test, **encoding_options)

print("\nShape após encoding:")
print(f"X_train_encoded: {X_train_encoded.shape}")
print(f"X_test_encoded: {X_test_encoded.shape}")

Aplicando One-Hot Encoding...

Shape após encoding:
X_train_encoded: (36155, 102)
X_test_encoded: (9039, 101)


In [57]:
print("Alinhando colunas entre treino e teste...")

# Use the training columns as the reference schema (join="left"): dummy
# columns that exist only in the test frame are dropped, and training columns
# missing from the test frame are created there and filled with 0.
X_train_encoded, X_test_encoded = X_train_encoded.align(
    X_test_encoded,
    join="left",
    axis=1,
    fill_value=0,
)

# Columns created by the fill take their dtype from the fill value (integer
# 0), which can differ from the dtype get_dummies gave the same column in the
# training frame (bool on pandas >= 2.0). Cast the test frame to the training
# dtypes so both frames, and the CSVs later written from them, share one
# schema instead of mixing True/False with 0.
X_test_encoded = X_test_encoded.astype(X_train_encoded.dtypes.to_dict())

print("\nShape após alinhamento:")
print(f"X_train_encoded: {X_train_encoded.shape}")
print(f"X_test_encoded: {X_test_encoded.shape}")
print(f"\nColunas idênticas: {X_train_encoded.columns.equals(X_test_encoded.columns)}")

Alinhando colunas entre treino e teste...

Shape após alinhamento:
X_train_encoded: (36155, 102)
X_test_encoded: (9039, 102)

Colunas idênticas: True


In [58]:
print("Colunas numéricas a serem normalizadas:")
print(numerical_cols)

# Keep, in their original order, only the numerical columns that are still
# present in the encoded training frame.
existing_num_cols = []
for col in numerical_cols:
    if col in X_train_encoded.columns:
        existing_num_cols.append(col)

print("\nColunas numéricas presentes:")
print(existing_num_cols)

Colunas numéricas a serem normalizadas:
['age', 'capital-gain', 'capital-loss', 'hours-per-week']

Colunas numéricas presentes:
['age', 'capital-gain', 'capital-loss', 'hours-per-week']


In [59]:
# Scaler selection flag: True -> StandardScaler, False -> MinMaxScaler.
standard = True

scaler = StandardScaler() if standard else MinMaxScaler()

# Fit on the training partition only and reuse the fitted parameters for the
# test partition, so no test-set statistics influence the scaling.
#
# The unscaled values are read from X_train / X_test rather than from the
# encoded frames being overwritten. Those frames hold the same rows in the
# same order (get_dummies and the column-wise align leave the rows untouched),
# so the first run yields the same result as scaling the encoded frames
# directly, and re-running this cell no longer rescales already-scaled
# columns.
X_train_encoded[existing_num_cols] = scaler.fit_transform(X_train[existing_num_cols])
X_test_encoded[existing_num_cols] = scaler.transform(X_test[existing_num_cols])

print("\nEstatísticas das variáveis numéricas no treino:")
print(X_train_encoded[existing_num_cols].describe())


Estatísticas das variáveis numéricas no treino:
                age  capital-gain  capital-loss  hours-per-week
count  3.615500e+04  3.615500e+04  3.615500e+04    3.615500e+04
mean   1.662617e-16  2.358322e-18 -1.660652e-17   -5.581362e-17
std    1.000014e+00  1.000014e+00  1.000014e+00    1.000014e+00
min   -1.629085e+00 -1.468068e-01 -2.194940e-01   -3.310955e+00
25%   -7.969652e-01 -1.468068e-01 -2.194940e-01   -7.714343e-02
50%   -1.161397e-01 -1.468068e-01 -2.194940e-01   -7.714343e-02
75%    6.403332e-01 -1.468068e-01 -2.194940e-01    3.374478e-01
max    3.893166e+00  1.312316e+01  9.067077e+00    4.815033e+00


In [60]:
output_dir = "./data/processed/"

# Write the transformed features and the targets as CSV files, without the
# row index and with a header row.
csv_options = {"index": False, "header": True}
for frame, path in (
    (X_train_encoded, f"{output_dir}X_train.csv"),
    (X_test_encoded, f"{output_dir}X_test.csv"),
    (y_train, f"{output_dir}y_train.csv"),
    (y_test, f"{output_dir}y_test.csv"),
):
    frame.to_csv(path, **csv_options)