In [130]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [131]:
df_training = pd.read_csv("../data/processed/training_set_cleaned.csv")
X_test = pd.read_csv("../data/processed/test_set_cleaned.csv")

In [132]:
y_train = df_training[["h1n1_vaccine", "seasonal_vaccine"]]
y_train_1 = y_train["h1n1_vaccine"]
y_train_2 = y_train["seasonal_vaccine"]
X_train = df_training.drop(columns=["h1n1_vaccine", "seasonal_vaccine"])

print(f"Shape de X: {X_train.shape}")
print(f"Shape de y_1: {y_train_1.shape}")
print(f"Shape de y_2: {y_train_2.shape}")
print("\nDistribuição da variável target (h1n1_vaccine):")
print(y_train_1.value_counts())
print("\nProporção:")
print(y_train_1.value_counts(normalize=True))
print("\nDistribuição da variável target (seasonal_vaccine):")
print(y_train_2.value_counts())
print("\nProporção:")
print(y_train_2.value_counts(normalize=True))

Shape de X: (6437, 35)
Shape de y_1: (6437,)
Shape de y_2: (6437,)

Distribuição da variável target (h1n1_vaccine):
h1n1_vaccine
0    4502
1    1935
Name: count, dtype: int64

Proporção:
h1n1_vaccine
0    0.699394
1    0.300606
Name: proportion, dtype: float64

Distribuição da variável target (seasonal_vaccine):
seasonal_vaccine
0    3491
1    2946
Name: count, dtype: int64

Proporção:
seasonal_vaccine
0    0.542333
1    0.457667
Name: proportion, dtype: float64


In [133]:
numerical_cols = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_cols = X_train.select_dtypes(include=["object"]).columns.tolist()

print(f"Variáveis numéricas ({len(numerical_cols)}):")
print(numerical_cols)
print(f"\nVariáveis categóricas ({len(categorical_cols)}):")
print(categorical_cols)

Variáveis numéricas (23):
['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults', 'household_children']

Variáveis categóricas (12):
['age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation']


In [134]:
print("Aplicando One-Hot Encoding...")

X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols, drop_first=False)
X_test_encoded = pd.get_dummies(X_test, columns=categorical_cols, drop_first=False)

print("\nShape após encoding:")
print(f"X_train_encoded: {X_train_encoded.shape}")
print(f"X_test_encoded: {X_test_encoded.shape}")

Aplicando One-Hot Encoding...

Shape após encoding:
X_train_encoded: (6437, 103)
X_test_encoded: (26708, 105)


In [135]:
print("Alinhando colunas entre treino e teste...")

X_train_encoded, X_test_encoded = X_train_encoded.align(
    X_test_encoded,
    join="right",
    axis=1,
    fill_value=0,
)

print("\nShape após alinhamento:")
print(f"X_train_encoded: {X_train_encoded.shape}")
print(f"X_test_encoded: {X_test_encoded.shape}")
print(f"\nColunas idênticas: {X_train_encoded.columns.equals(X_test_encoded.columns)}")

Alinhando colunas entre treino e teste...

Shape após alinhamento:
X_train_encoded: (6437, 105)
X_test_encoded: (26708, 105)

Colunas idênticas: True


In [136]:
print("Colunas numéricas a serem normalizadas:")
print(numerical_cols)

existing_num_cols = [col for col in numerical_cols if col in X_train_encoded.columns]
print("\nColunas numéricas presentes:")
print(existing_num_cols)

Colunas numéricas a serem normalizadas:
['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults', 'household_children']

Colunas numéricas presentes:
['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance', 'opinion_h1n1_vacc_eff

In [137]:
standard = True
scaler = StandardScaler() if standard else MinMaxScaler()

X_train_encoded[existing_num_cols] = scaler.fit_transform(X_train_encoded[existing_num_cols])
X_test_encoded[existing_num_cols] = scaler.transform(X_test_encoded[existing_num_cols])

print("\nEstatísticas das variáveis numéricas no treino:")
print(X_train_encoded[existing_num_cols].describe())


Estatísticas das variáveis numéricas no treino:
       h1n1_concern  h1n1_knowledge  behavioral_antiviral_meds  \
count  6.437000e+03    6.437000e+03               6.437000e+03   
mean   8.830732e-18    7.506122e-17               6.015936e-17   
std    1.000078e+00    1.000078e+00               1.000078e+00   
min   -1.724224e+00   -2.323563e+00              -2.412363e-01   
25%   -5.571546e-01   -5.978475e-01              -2.412363e-01   
50%    6.099148e-01   -5.978475e-01              -2.412363e-01   
75%    6.099148e-01    1.127868e+00              -2.412363e-01   
max    1.776984e+00    1.127868e+00               4.145313e+00   

       behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
count          6.437000e+03          6.437000e+03           6.437000e+03   
mean          -1.131437e-16          4.636134e-17           1.170072e-16   
std            1.000078e+00          1.000078e+00           1.000078e+00   
min           -1.597632e+00         -2.556867e-01   

In [138]:
X_train_encoded.to_csv("../data/processed/X_train_transformed.csv", index=False)
X_test_encoded.to_csv("../data/processed/X_test_transformed.csv", index=False)
y_train.to_csv("../data/processed/y_train.csv")