In [1]:
import numpy as np
import pandas as pd

path = "../files/adult_outcome/salary.csv"
df: pd.DataFrame = pd.read_csv(path)

df = df[["age", "workclass", "education", "race", "sex", "native-country", "salary"]]

df.head(5)

Unnamed: 0,age,workclass,education,race,sex,native-country,salary
0,39,State-gov,Bachelors,White,Male,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,White,Male,United-States,<=50K
2,38,Private,HS-grad,White,Male,United-States,<=50K
3,53,Private,11th,Black,Male,United-States,<=50K
4,28,Private,Bachelors,Black,Female,Cuba,<=50K


In [2]:
########################################

freq = df["workclass"].value_counts(normalize=True).sort_values(ascending=False)

raras = freq[freq < 0.01].index
df["workclass"] = df["workclass"].where(~df["workclass"].isin(raras), "others")

df["workclass"] = (df["workclass"]
           .astype(str).str.strip().str.lower()
           .replace({"others": "others", "?": "others"})
           .replace({"local-gov": "gov", "state-gov": "gov", "federal-gov": "gov"})
           .replace({"self-emp-not-inc": "self-employed", "self-emp-inc": "self-employed"}))

map_workclass = {'gov': 0, 'self-employed': 1, 'private': 2, 'others': 3}
df['workclass'] = df['workclass'].map(map_workclass)

########################################

map_sex = {' Male': 0, ' Female': 1}
df['sex'] = df['sex'].map(map_sex)

########################################

age_column = df["age"].clip(lower=1, upper=100)

start = ((age_column - 1) // 10) * 10 + 1
bins = list(range(1, 101, 10)) + [101] 
labels = [f"{start}-{start+9}" for start in range(1, 100, 10)]

df["age"] = pd.cut(age_column, bins=bins, right=False, labels=labels, ordered=True)

df["age"] = (df["age"]
           .astype(str).str.strip().str.lower()
           .replace({"1-10": "11-20", "91-100": "71-80", "81-90": "71-80", "71-80": "61-70"}))

map_age = {'11-20': 0, '21-30': 1, '31-40': 2, '41-50': 3, '51-60': 4, '61-70': 5}
df['age'] = df['age'].map(map_age)

df["age"].value_counts()

########################################

df["race"] = (df["race"]
           .astype(str).str.strip().str.lower()
           .replace({"asian-pac-islander": "others", "amer-indian-eskimo": "others", "other": "others"}))

map_race = {'white': 0, 'black': 1, 'others': 2}
df['race'] = df['race'].map(map_race)

########################################

df["native-country"] = (df["native-country"]
           .astype(str).str.strip().str.lower()
           .replace({"mexico": "others", "?": "others", "philippines": "others", "china": "others", "japan": "others", "india": "others", "germany": "others", 
                    "canada": "others", "puerto-rico": "others", "el-salvador": "others", "cuba": "others", "england": "others", "jamaica": "others", "south": "others", 
                    "italy": "others", "dominican-republic": "others", "vietnam": "others", "guatemala": "others", "poland": "others", "columbia": "others", "taiwan": "others", 
                    "haiti": "others", "iran": "others", "holand-netherlands": "others", "scotland": "others", "hungary": "others", "honduras": "others", 
                    "yugoslavia": "others", "laos": "others", "thailand": "others", "cambodia": "others", "trinadad&tobago": "others", "hong": "others", "ireland": "others", "ecuador": "others",
                    "greece": "others", "france": "others", "peru": "others", "nicaragua": "others", "outlying-us(guam-usvi-etc)": "others",}))

map_native_country = {'united-states': 0, 'others': 1}
df['native-country'] = df['native-country'].map(map_native_country)

########################################

df["education"] = (df["education"]
           .astype(str).str.strip().str.lower()
           .replace({"11th": "school", "10th": "school", "7th-8th": "school", "9th": "school", "12th": "school", "1st-4th": "school", "5th-6th": "school"})
           .replace({"assoc-voc": "assoc", "assoc-acdm": "assoc"})
           .replace({"preschool": "school", "prof-school": "school"}))


map_education = {'hs-grad': 0, 'some-college': 1, 'bachelors': 2, 'school': 3, 'assoc': 4, 'masters': 5, 'doctorate': 6}
df['education'] = df['education'].map(map_education)

########################################

map_salary = {' <=50K': 0, ' >50K': 1}
df['salary'] = df['salary'].map(map_salary)

########################################

df = df.astype("float64")

df.head()

Unnamed: 0,age,workclass,education,race,sex,native-country,salary
0,2.0,0.0,2.0,0.0,0.0,0.0,0.0
1,3.0,1.0,2.0,0.0,0.0,0.0,0.0
2,2.0,2.0,0.0,0.0,0.0,0.0,0.0
3,4.0,2.0,3.0,1.0,0.0,0.0,0.0
4,1.0,2.0,2.0,1.0,1.0,1.0,0.0


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

X = df.drop("salary", axis = 1)
y = df["salary"]

X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, test_size = 0.2, random_state = 42)

print(f"Shape de X_treino :{X_treino.shape}")
print(f"Shape de X_teste: {X_teste.shape}")
print(f"Shape de y_treino: {y_treino.shape}")    
print(f"Shape de y_teste: {y_teste.shape}")

Shape de X_treino :(26048, 6)
Shape de X_teste: (6513, 6)
Shape de y_treino: (26048,)
Shape de y_teste: (6513,)


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from scipy.sparse import issparse

import joblib

cat_cols = X.columns.tolist()

preproc = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)]
)

ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
ohe.fit(X_treino[cat_cols])

pipe = Pipeline([
    ("prep", preproc),
    ("clf", LogisticRegression(max_iter=200, class_weight="balanced"))
])

pipe.fit(X_treino, y_treino)
y_pred = pipe.predict(X_teste)



print("=== Logistic Regression (com OHE) ===")
print("Accuracy:", accuracy_score(y_teste, y_pred))
print("F1-macro:", f1_score(y_teste, y_pred, average="macro"))
print(confusion_matrix(y_teste, y_pred))
print(classification_report(y_teste, y_pred, digits=3))

ohe = pipe.named_steps["prep"].named_transformers_["cat"]
feature_names_out = ohe.get_feature_names_out(cat_cols)

artifact = {
    "ohe": ohe,
    "input_cols": cat_cols,
    "ohe_feature_names": feature_names_out
}

joblib.dump(artifact, "encoder_ohe.joblib")


art = joblib.load("encoder_ohe.joblib")
ohe = art["ohe"]
input_cols = art["input_cols"]
ohe_feature_names = art["ohe_feature_names"]

# exemplo de registro (valores já nos mesmos mapeamentos inteiros que você usou)
novo = {
    "age": 2,
    "workclass": 2,
    "education": 2,
    "race": 0,
    "sex": 0,
    "native-country": 0
}

X_new = pd.DataFrame([novo], columns=input_cols)
X_new_ohe = ohe.transform(X_new)


print("Shape pós OHE:", X_new_ohe.shape)



# (Opcional) ver como DataFrame esparso
if issparse(X_new_ohe):
    X_new_ohe_df = pd.DataFrame.sparse.from_spmatrix(X_new_ohe, columns=ohe_feature_names)
else:
    X_new_ohe_df = pd.DataFrame(X_new_ohe, columns=ohe_feature_names)

X_new_ohe_df.head()

ohe.get_feature_names_out(cols)

# rf = RandomForestClassifier(
#     n_estimators=300,
#     random_state=42,
#     class_weight="balanced_subsample"
# )

# rf.fit(X_treino, y_treino)
# y_pred_rf = rf.predict(X_teste)

# print("\n=== Random Forest (sem OHE) ===")
# print("Accuracy:", accuracy_score(y_teste, y_pred_rf))
# print("F1-macro:", f1_score(y_teste, y_pred_rf, average="macro"))
# print(confusion_matrix(y_teste, y_pred_rf))
# print(classification_report(y_teste, y_pred_rf, digits=3))


=== Logistic Regression (com OHE) ===
Accuracy: 0.7116536158452326
F1-macro: 0.6745542094353575
[[3417 1525]
 [ 353 1218]]
              precision    recall  f1-score   support

         0.0      0.906     0.691     0.784      4942
         1.0      0.444     0.775     0.565      1571

    accuracy                          0.712      6513
   macro avg      0.675     0.733     0.675      6513
weighted avg      0.795     0.712     0.731      6513

Shape pós OHE: (1, 26)


Unnamed: 0,age_0.0,age_1.0,age_2.0,age_3.0,age_4.0,age_5.0,age_nan,workclass_0.0,workclass_1.0,workclass_2.0,...,education_5.0,education_6.0,race_0.0,race_1.0,race_2.0,sex_0.0,sex_1.0,native-country_0.0,native-country_1.0,native-country_nan
0,0,0,1.0,0,0,0,0,0,0,1.0,...,0,0,1.0,0,0,1.0,0,1.0,0,0


In [28]:
novo = {
    "age": 3,                # 31-40 anos → mapeado como 2
    "workclass": 1,          # private
    "education": 2,          # bachelors
    "race": 0,               # white
    "sex": 0,                # male
    "native-country": 0,     # united-states
    "occupation": 5          # supondo que vc mapeou occupations
}

X_new = pd.DataFrame([novo])

# Predição
classe = pipe.predict(X_new)[0]  # 0 = <=50K, 1 = >50K
proba = pipe.predict_proba(X_new)[0,1]  # probabilidade de >50K

faixa = "<=50K" if classe == 0 else ">50K"
print(f"Faixa salarial prevista: {faixa} (prob. {proba:.2f})")


set(y)                     # deve ser {0, 1}
type(pipe.named_steps["clf"])

Faixa salarial prevista: >50K (prob. 0.88)


sklearn.linear_model._logistic.LogisticRegression