In [44]:
import kagglehub
import pandas as pd
import os
from IPython.display import display

# Download latest version
path = kagglehub.dataset_download("uciml/adult-census-income")

print("Path to dataset files:", path)

Path to dataset files: /Users/olaoluwatunmise/.cache/kagglehub/datasets/uciml/adult-census-income/versions/3


In [45]:


# Use the path from the previous cell
dataset_path = os.path.join(path, "adult.csv")
df = pd.read_csv(dataset_path)

# Display basic information about the dataset
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
print(df.info())
print("\nColumn names:")
print(df.columns.tolist())

Dataset shape: (32561, 15)

First few rows:
   age workclass  fnlwgt     education  education.num marital.status  \
0   90         ?   77053       HS-grad              9        Widowed   
1   82   Private  132870       HS-grad              9        Widowed   
2   66         ?  186061  Some-college             10        Widowed   
3   54   Private  140359       7th-8th              4       Divorced   
4   41   Private  264663  Some-college             10      Separated   

       occupation   relationship   race     sex  capital.gain  capital.loss  \
0               ?  Not-in-family  White  Female             0          4356   
1  Exec-manage...  Not-in-family  White  Female             0          4356   
2               ?      Unmarried  Black  Female             0          4356   
3  Machine-op-...      Unmarried  White  Female             0          3900   
4  Prof-specialty      Own-child  White  Female             0          3900   

   hours.per.week native.country income  
0     

In [46]:
# Set pandas display options for better formatting


# Set display options to show more columns and prevent line wrapping
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)        # Don't wrap to new lines
pd.set_option('display.max_colwidth', 15)   # Limit column width for readability

# # Display the first few rows in a more compact format
print("First few rows (compact display):")
print(df.head())

# Alternative: Use display() function for even better formatting in Jupyter
from IPython.display import display
print("\nUsing display() function:")
display(df.head())

First few rows (compact display):
   age workclass  fnlwgt     education  education.num marital.status  \
0   90         ?   77053       HS-grad              9        Widowed   
1   82   Private  132870       HS-grad              9        Widowed   
2   66         ?  186061  Some-college             10        Widowed   
3   54   Private  140359       7th-8th              4       Divorced   
4   41   Private  264663  Some-college             10      Separated   

       occupation   relationship   race     sex  capital.gain  capital.loss  \
0               ?  Not-in-family  White  Female             0          4356   
1  Exec-manage...  Not-in-family  White  Female             0          4356   
2               ?      Unmarried  Black  Female             0          4356   
3  Machine-op-...      Unmarried  White  Female             0          3900   
4  Prof-specialty      Own-child  White  Female             0          3900   

   hours.per.week native.country income  
0              4

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-manage...,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-...,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


### Cleaning column names

In [47]:

# Remove dots and replace with underscores, convert to lowercase
df.columns = df.columns.str.replace('.', '_').str.lower()

# Display the cleaned column names
print("Cleaned column names:")
print(df.columns.tolist())

# Display first few rows to confirm the changes
print("\nFirst few rows with cleaned column names:")
print(df.head())

# Show dataset info again
print("\nDataset info with cleaned columns:")
print(df.info())

Cleaned column names:
['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income']

First few rows with cleaned column names:
   age workclass  fnlwgt     education  education_num marital_status  \
0   90         ?   77053       HS-grad              9        Widowed   
1   82   Private  132870       HS-grad              9        Widowed   
2   66         ?  186061  Some-college             10        Widowed   
3   54   Private  140359       7th-8th              4       Divorced   
4   41   Private  264663  Some-college             10      Separated   

       occupation   relationship   race     sex  capital_gain  capital_loss  \
0               ?  Not-in-family  White  Female             0          4356   
1  Exec-manage...  Not-in-family  White  Female             0          4356   
2               ?      Unmarried  Black  Female             0  

### Handling missing values

In [48]:
import numpy as np 

# First, let's see how many missing values we have in each column
print("Missing values count before cleaning:")
print(df.isnull().sum())

Missing values count before cleaning:
age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [49]:
# Replace '?' with NaN
df = df.replace('?', np.nan)

In [50]:
print("\nMissing values count after replacing '?' with NaN:")
print(df.isnull().sum())


Missing values count after replacing '?' with NaN:
age                  0
workclass         1836
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     583
income               0
dtype: int64


In [51]:
# Display the data types to understand which columns are categorical vs numerical
print("\nData types:")
print(df.dtypes)


Data types:
age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object


In [52]:
# Drop rows with missing values
print("Dataset shape before dropping missing values:", df.shape)
print("Total missing values:", df.isnull().sum().sum())

Dataset shape before dropping missing values: (32561, 15)
Total missing values: 4262


In [53]:
# Drop rows with any missing values
df_clean = df.dropna()

print("\nDataset shape after dropping missing values:", df_clean.shape)
print("Total missing values after cleaning:", df_clean.isnull().sum().sum())


Dataset shape after dropping missing values: (30162, 15)
Total missing values after cleaning: 0


In [54]:
# Calculate percentage of data retained
percentage_retained = (len(df_clean) / len(df)) * 100
print(f"Percentage of data retained: {percentage_retained:.2f}%")

Percentage of data retained: 92.63%


In [55]:
# Display first few rows of cleaned dataset
print("\nFirst few rows of cleaned dataset:")
display(df_clean.head())


First few rows of cleaned dataset:


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
1,82,Private,132870,HS-grad,9,Widowed,Exec-manage...,Not-in-family,White,Female,0,4356,18,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-...,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K


In [59]:
# Show summary statistics of cleaned dataset
print("\nSummary statistics of cleaned dataset:")
display(df_clean.describe())




Summary statistics of cleaned dataset:


Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0
mean,38.437902,189793.8,10.121312,1092.007858,88.372489,40.931238
std,13.134665,105653.0,2.549995,7406.346497,404.29837,11.979984
min,17.0,13769.0,1.0,0.0,0.0,1.0
25%,28.0,117627.2,9.0,0.0,0.0,40.0
50%,37.0,178425.0,10.0,0.0,0.0,40.0
75%,47.0,237628.5,13.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [58]:
# Verify no missing values remain
print("\nMissing values check:")
print(df_clean.isnull().sum())


Missing values check:
age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [60]:
# Your dataset categorical features
categorical_features = ['workclass', 'education', 'marital_status', 'occupation', 
                       'relationship', 'race', 'sex', 'native_country', 'income']

print("Cardinality Analysis:")
print("=" * 40)

for feature in categorical_features:
    unique_count = df_clean[feature].nunique()
    unique_values = df_clean[feature].unique()
    
    print(f"\n{feature}:")
    print(f"  Unique values: {unique_count}")
    print(f"  Values: {list(unique_values)}")
    
    if unique_count > 15:
        print("  → HIGH CARDINALITY (Use Label Encoding)")
    else:
        print("  → LOW CARDINALITY (Use One-Hot Encoding)")

Cardinality Analysis:

workclass:
  Unique values: 7
  Values: ['Private', 'State-gov', 'Federal-gov', 'Self-emp-not-inc', 'Self-emp-inc', 'Local-gov', 'Without-pay']
  → LOW CARDINALITY (Use One-Hot Encoding)

education:
  Unique values: 16
  Values: ['HS-grad', '7th-8th', 'Some-college', '10th', 'Doctorate', 'Prof-school', 'Bachelors', 'Masters', '11th', 'Assoc-voc', '1st-4th', '5th-6th', 'Assoc-acdm', '12th', '9th', 'Preschool']
  → HIGH CARDINALITY (Use Label Encoding)

marital_status:
  Unique values: 7
  Values: ['Widowed', 'Divorced', 'Separated', 'Never-married', 'Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse']
  → LOW CARDINALITY (Use One-Hot Encoding)

occupation:
  Unique values: 14
  Values: ['Exec-managerial', 'Machine-op-inspct', 'Prof-specialty', 'Other-service', 'Adm-clerical', 'Transport-moving', 'Sales', 'Craft-repair', 'Farming-fishing', 'Tech-support', 'Protective-serv', 'Handlers-cleaners', 'Armed-Forces', 'Priv-house-serv']
  → LOW CARDINALITY (

In [61]:
# Encode categorical variables
from sklearn.preprocessing import LabelEncoder

In [62]:
# First, let's check the cardinality of each categorical variable
categorical_cols = ['workclass', 'education', 'marital_status', 'occupation', 
                   'relationship', 'race', 'sex', 'native_country', 'income']

print("Cardinality of categorical variables:")
for col in categorical_cols:
    unique_count = df_clean[col].nunique()
    print(f"{col}: {unique_count} unique values")


# Define high-cardinality and low-cardinality features
high_cardinality = ['occupation', 'native_country', 'education']  # >10 unique values
low_cardinality = ['workclass', 'marital_status', 'relationship', 'race', 'sex', 'income']  # ≤10 unique values

print(f"\nHigh-cardinality features (will use label encoding): {high_cardinality}")
print(f"Low-cardinality features (will use one-hot encoding): {low_cardinality}")

Cardinality of categorical variables:
workclass: 7 unique values
education: 16 unique values
marital_status: 7 unique values
occupation: 14 unique values
relationship: 6 unique values
race: 5 unique values
sex: 2 unique values
native_country: 41 unique values
income: 2 unique values

High-cardinality features (will use label encoding): ['occupation', 'native_country', 'education']
Low-cardinality features (will use one-hot encoding): ['workclass', 'marital_status', 'relationship', 'race', 'sex', 'income']


In [63]:
# Create a copy of the cleaned dataset for encoding
df_encoded = df_clean.copy()

# 1. Label Encoding for high-cardinality features
label_encoders = {}
for col in high_cardinality:
    le = LabelEncoder()
    df_encoded[col + '_encoded'] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le
    print(f"Label encoded {col}: {len(le.classes_)} classes")



Label encoded occupation: 14 classes
Label encoded native_country: 41 classes
Label encoded education: 16 classes


In [64]:
# 2. One-hot encoding for low-cardinality features
df_encoded = pd.get_dummies(df_encoded, columns=low_cardinality, prefix=low_cardinality)

# Display the encoded dataset info
print(f"\nDataset shape after encoding: {df_encoded.shape}")
print(f"Number of features after encoding: {df_encoded.shape[1]}")

# Show the new column names
print("\nNew column names after encoding:")
print(df_encoded.columns.tolist())

# Display first few rows of encoded dataset
print("\nFirst few rows of encoded dataset:")
display(df_encoded.head())



Dataset shape after encoding: (30162, 41)
Number of features after encoding: 41

New column names after encoding:
['age', 'fnlwgt', 'education', 'education_num', 'occupation', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'occupation_encoded', 'native_country_encoded', 'education_encoded', 'workclass_Federal-gov', 'workclass_Local-gov', 'workclass_Private', 'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc', 'workclass_State-gov', 'workclass_Without-pay', 'marital_status_Divorced', 'marital_status_Married-AF-spouse', 'marital_status_Married-civ-spouse', 'marital_status_Married-spouse-absent', 'marital_status_Never-married', 'marital_status_Separated', 'marital_status_Widowed', 'relationship_Husband', 'relationship_Not-in-family', 'relationship_Other-relative', 'relationship_Own-child', 'relationship_Unmarried', 'relationship_Wife', 'race_Amer-Indian-Eskimo', 'race_Asian-Pac-Islander', 'race_Black', 'race_Other', 'race_White', 'sex_Female', 'sex_Male', 'income

Unnamed: 0,age,fnlwgt,education,education_num,occupation,capital_gain,capital_loss,hours_per_week,native_country,occupation_encoded,native_country_encoded,education_encoded,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,marital_status_Divorced,marital_status_Married-AF-spouse,marital_status_Married-civ-spouse,marital_status_Married-spouse-absent,marital_status_Never-married,marital_status_Separated,marital_status_Widowed,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,income_<=50K,income_>50K
1,82,132870,HS-grad,9,Exec-manage...,0,4356,18,United-States,3,38,11,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,True,True,False,True,False
3,54,140359,7th-8th,4,Machine-op-...,0,3900,40,United-States,6,38,5,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,True,False,True,False
4,41,264663,Some-college,10,Prof-specialty,0,3900,40,United-States,9,38,15,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,True,False,True,False
5,34,216864,HS-grad,9,Other-service,0,3770,45,United-States,7,38,11,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,True,False,True,False
6,38,150601,10th,6,Adm-clerical,0,3770,40,United-States,0,38,0,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,True,True,False


In [65]:
# Show summary of the encoding
print("\nEncoding Summary:")
print("=" * 50)
print("Label Encoded Features:")
for col in high_cardinality:
    print(f"  {col}: {len(label_encoders[col].classes_)} classes")
    print(f"    Classes: {list(label_encoders[col].classes_)}")

print("\nOne-Hot Encoded Features:")
for col in low_cardinality:
    encoded_cols = [c for c in df_encoded.columns if c.startswith(col + '_')]
    print(f"  {col}: {len(encoded_cols)} dummy variables")
    print(f"    Dummy variables: {encoded_cols}")

# Show data types of encoded dataset
print("\nData types after encoding:")
print(df_encoded.dtypes.value_counts())


Encoding Summary:
Label Encoded Features:
  occupation: 14 classes
    Classes: ['Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial', 'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct', 'Other-service', 'Priv-house-serv', 'Prof-specialty', 'Protective-serv', 'Sales', 'Tech-support', 'Transport-moving']
  native_country: 41 classes
    Classes: ['Cambodia', 'Canada', 'China', 'Columbia', 'Cuba', 'Dominican-Republic', 'Ecuador', 'El-Salvador', 'England', 'France', 'Germany', 'Greece', 'Guatemala', 'Haiti', 'Holand-Netherlands', 'Honduras', 'Hong', 'Hungary', 'India', 'Iran', 'Ireland', 'Italy', 'Jamaica', 'Japan', 'Laos', 'Mexico', 'Nicaragua', 'Outlying-US(Guam-USVI-etc)', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Puerto-Rico', 'Scotland', 'South', 'Taiwan', 'Thailand', 'Trinadad&Tobago', 'United-States', 'Vietnam', 'Yugoslavia']
  education: 16 classes
    Classes: ['10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th', 'Assoc-acdm', 'Assoc-voc', 'B

In [66]:
df_encoded

Unnamed: 0,age,fnlwgt,education,education_num,occupation,capital_gain,capital_loss,hours_per_week,native_country,occupation_encoded,native_country_encoded,education_encoded,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,marital_status_Divorced,marital_status_Married-AF-spouse,marital_status_Married-civ-spouse,marital_status_Married-spouse-absent,marital_status_Never-married,marital_status_Separated,marital_status_Widowed,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,income_<=50K,income_>50K
1,82,132870,HS-grad,9,Exec-manage...,0,4356,18,United-States,3,38,11,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,True,True,False,True,False
3,54,140359,7th-8th,4,Machine-op-...,0,3900,40,United-States,6,38,5,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,True,False,True,False
4,41,264663,Some-college,10,Prof-specialty,0,3900,40,United-States,9,38,15,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,True,False,True,False
5,34,216864,HS-grad,9,Other-service,0,3770,45,United-States,7,38,11,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,True,False,True,False
6,38,150601,10th,6,Adm-clerical,0,3770,40,United-States,0,38,0,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,310152,Some-college,10,Protective-...,0,0,40,United-States,10,38,15,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,True,False
32557,27,257302,Assoc-acdm,12,Tech-support,0,0,38,United-States,12,38,7,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,True,False,True,False
32558,40,154374,HS-grad,9,Machine-op-...,0,0,40,United-States,6,38,11,False,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,True,False,True
32559,58,151910,HS-grad,9,Adm-clerical,0,0,40,United-States,0,38,11,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,False
