In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Location of the raw CSV, relative to the notebook's working directory.
DATA_PATH = "data/adult.csv"

# Read the file into the DataFrame that the following cells work on.
df = pd.read_csv(DATA_PATH)

In [5]:
# Preview the first rows to check that the file was parsed into the expected columns.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [6]:
# List every column with its dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [8]:
# Count NaN entries per column. Only real NaN values are counted here;
# placeholder strings such as "?" are ordinary text and are not included.
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

Replace the "?" placeholder with np.nan so that pandas treats those entries as missing values

In [9]:
# Swap every "?" cell for NaN so the missing-value tools (isnull, fillna, ...)
# can see these entries.
df = df.replace(to_replace="?", value=np.nan)

Check how many missing values each column contains

In [10]:
# Number of NaN entries in each column.
df.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

Impute the missing values in each affected column with that column's mode (its most frequent value)

In [12]:
# Columns to impute, each filled with its own most frequent value (mode).
columns_to_impute = ['workclass', 'occupation', 'native.country']

# mode() returns a Series of the most frequent value(s); [0] takes the first.
workclass_mode, occupation_mode, country_mode = [
    df[column].mode()[0] for column in columns_to_impute
]

print(f"Imputing workclass with: {workclass_mode}")
print(f"Imputing occupation with: {occupation_mode}")
print(f"Imputing native.country with: {country_mode}")

# Build the column -> fill value mapping and apply it in one fillna call.
fill_values = dict(
    zip(columns_to_impute, [workclass_mode, occupation_mode, country_mode])
)
df = df.fillna(fill_values)

# Show the per-column NaN counts after imputation.
print("\nRemaining missing values:\n")
print(df.isna().sum())


Imputing workclass with: Private
Imputing occupation with: Prof-specialty
Imputing native.country with: United-States

Remaining missing values:

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64


In [13]:
# Encode sex as a binary indicator: Male -> 1, Female -> 0.
sex_codes = {'Male': 1, 'Female': 0}

# Series.map turns every value that is missing from the mapping into NaN, so
# running this cell a second time (when the column already holds 0/1) would
# silently wipe the whole column. Skip the encoding if it was already applied.
if not df['sex'].isin(list(sex_codes.values())).all():
    sex_encoded = df['sex'].map(sex_codes)

    # Labels that were present in the data but have no code in the mapping.
    unmapped_mask = sex_encoded.isna() & df['sex'].notna()
    if unmapped_mask.any():
        # Fail loudly instead of letting NaN leak into the feature column.
        unexpected = list(df.loc[unmapped_mask, 'sex'].unique())
        raise ValueError(f"Unexpected values in 'sex': {unexpected}")

    df['sex'] = sex_encoded

In [14]:
# Encode the target as a binary label: '>50K' -> 1, '<=50K' -> 0.
income_codes = {'>50K': 1, '<=50K': 0}

# Series.map turns every value that is missing from the mapping into NaN, so
# running this cell a second time (when the column already holds 0/1) would
# silently wipe the target. Skip the encoding if it was already applied.
if not df['income'].isin(list(income_codes.values())).all():
    income_encoded = df['income'].map(income_codes)

    # Labels that were present in the data but have no code in the mapping.
    unmapped_mask = income_encoded.isna() & df['income'].notna()
    if unmapped_mask.any():
        # Fail loudly instead of letting NaN leak into the target column.
        unexpected = list(df.loc[unmapped_mask, 'income'].unique())
        raise ValueError(f"Unexpected values in 'income': {unexpected}")

    df['income'] = income_encoded

In [15]:
# Sanity check: print the distinct values currently stored in the income column.
print(df['income'].unique())

[0 1]


In [16]:
# Categorical features to expand into indicator (dummy) columns.
categorical_columns = [
    'workclass',
    'education',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'native.country',
]

# drop_first=True omits the first level of each feature, so a feature with
# k categories produces k-1 indicator columns.
df = pd.get_dummies(data=df, columns=categorical_columns, drop_first=True)

In [17]:
# Print the first rows and then the (rows, columns) shape, one per line.
print(df.head(), df.shape, sep="\n")


   age  fnlwgt  education.num  sex  capital.gain  capital.loss  \
0   90   77053              9    0             0          4356   
1   82  132870              9    0             0          4356   
2   66  186061             10    0             0          4356   
3   54  140359              4    0             0          3900   
4   41  264663             10    0             0          3900   

   hours.per.week  income  workclass_Local-gov  workclass_Never-worked  ...  \
0              40       0                False                   False  ...   
1              18       0                False                   False  ...   
2              40       0                False                   False  ...   
3              40       0                False                   False  ...   
4              40       0                False                   False  ...   

   native.country_Portugal  native.country_Puerto-Rico  \
0                    False                       False   
1           