In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import impute
from sklearn.datasets import make_regression
from sklearn import model_selection
import seaborn as sns
import pandas as pd
import numpy as np
import datasets
from matplotlib import pyplot as plt

In [None]:
dataset = datasets.load_dataset("scikit-learn/adult-census-income", split="train", cache_dir="../../.datasets")
dataset.info

In [None]:
df = dataset.to_pandas()

In [None]:
cols_with_int64_dtype = [cols for cols in df.columns if df[cols].dtype == 'int64']
cols_with_int64_dtype

In [None]:
fig, axes = plt.subplots(6, 1, figsize=(20, 20))
for col, ax in zip(cols_with_int64_dtype, axes):
  sns.kdeplot(df, x=col, ax=ax)

In [None]:
df["income"].unique()

In [None]:
numeric_columns = df.select_dtypes(include=['int64', 'float64', 'float32']).columns
categoric_columns = df.select_dtypes(include=['object']).columns.drop("income")

imputer = impute.SimpleImputer(strategy='most_frequent')
df[categoric_columns] = imputer.fit_transform(df[categoric_columns])

scalar = preprocessing.MinMaxScaler()
onehotencoder = preprocessing.OneHotEncoder(drop='first')

df[numeric_columns] = scalar.fit_transform(df[numeric_columns])
encoded = onehotencoder.fit_transform(df[categoric_columns])
encoded_dense = encoded.toarray()
# Create new column names
encoded_cols = onehotencoder.get_feature_names_out(categoric_columns)
# Convert to DataFrame
encoded_df = pd.DataFrame(encoded_dense, columns=encoded_cols, index=df.index)
# Drop original categorical columns and concat
df = df.drop(categoric_columns, axis=1)
df = pd.concat([df, encoded_df], axis=1)

df["income"] = df["income"].map({"<=50K": 0, ">50K": 1})

df = df.dropna()

X = df.drop("income", axis=1)
y = df["income"]

In [None]:
df

In [None]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=55)

In [None]:
logistic_reg = LogisticRegression()
logistic_reg.fit(X_train, y_train)

In [93]:
y_pred = logistic_reg.predict(X_test)
print(y_pred[:10])
print(y_test.values[:10])   # use .values to get NumPy array

accuracy = sum([1 for i, label in enumerate(y_pred) if label == y_test.values[i]])
print(accuracy/len(y_test.values))


[1 0 1 0 0 0 0 0 1 0]
[1 0 0 0 1 0 1 0 1 0]
0.8532391771568928
