In [2]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.3-py3-none-any.whl (7.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.3


In [18]:
from ucimlrepo import fetch_ucirepo

def load_dataset():
  """Fetch UCI repository dataset id=2 and return its parts.

  Side effect: prints the dataset metadata followed by the variable table.

  Returns:
    A tuple (features, targets, variables) taken from the fetched dataset's
    `data.features`, `data.targets` and `variables` attributes.
  """
  dataset = fetch_ucirepo(id=2)

  # Pull both data tables off the fetched object before any printing.
  payload = dataset.data
  feature_frame, target_frame = payload.features, payload.targets

  # Echo the descriptive information so it shows up in the cell output.
  print(dataset.metadata)
  var_table = dataset.variables
  print(var_table)

  return feature_frame, target_frame, var_table

In [106]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

def preprocess(features, labels, var_info):
  """Drop rows with missing feature values and label-encode categorical columns.

  The inputs are never modified; new frames are returned.

  Args:
    features: DataFrame of raw feature columns.
    labels: DataFrame of targets. Must contain an 'income' column and share
      its index with `features` so both can be filtered with the same mask.
    var_info: DataFrame describing the variables, with at least the columns
      'name', 'type' and 'missing_values'.

  Returns:
    A tuple (X, y, encoders):
      X: copy of `features` without the rows that have NaN in any column
        whose `var_info['missing_values']` is 'yes', and with every column
        whose `var_info['type']` is 'Categorical' replaced by integer codes.
      y: copy of `labels` restricted to the same rows, with 'income'
        replaced by integer codes.
      encoders: DataFrame with columns 'name' and 'encoders' holding the
        fitted LabelEncoder for each encoded column, in encoding order
        (the 'income' encoder is the last row), so codes can be decoded later.

  Raises:
    KeyError: if a variable selected from `var_info` is not a column of
      `features`, or if `labels` has no 'income' column.
  """
  # Only NaN counts as missing here, and only in columns the variable table
  # flags as containing missing values.
  flagged = var_info.loc[var_info['missing_values'] == 'yes', 'name'].tolist()
  keep = ~features[flagged].isna().any(axis=1)

  # Filter first, then copy: the copies are independent of the raw inputs and
  # safe to assign into below.
  X = features.loc[keep].copy()
  y = labels.loc[keep].copy()

  # Encode categorical feature values and keep each fitted encoder so the
  # integer codes can be mapped back to the original categories.
  categorical_names = var_info.loc[var_info['type'] == 'Categorical', 'name']
  records = []
  for feature_name in categorical_names:
    encoder = LabelEncoder()
    X[feature_name] = encoder.fit_transform(X[feature_name])
    records.append({'name': feature_name, 'encoders': encoder})

  target_encoder = LabelEncoder()
  y['income'] = target_encoder.fit_transform(y['income'])
  records.append({'name': 'income', 'encoders': target_encoder})

  # Build the lookup table once instead of growing a DataFrame row by row.
  encoders = pd.DataFrame(records, columns=['name', 'encoders'])

  return X, y, encoders


In [107]:
# Fetch the raw features, targets and variable table. load_dataset() also
# prints the dataset metadata and the variable table to the cell output.
X_raw, y_raw, var_info = load_dataset()
# Build cleaned, label-encoded frames plus the fitted encoders. preprocess()
# works on copies, so X_raw / y_raw stay available unmodified.
X, y, encoders = preprocess(X_raw, y_raw, var_info)

{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Mon Aug 07 2023', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': 'Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the following conditions: ((AAG