# 1. Data Exploration and Preprocessing

In [2]:
import pandas as pd

In [3]:
# Load the dataset from a CSV file in the current working directory into a DataFrame.
df = pd.read_csv("adult_with_headers.csv")

In [4]:
# Preview the first rows to sanity-check the column names and values.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Summarise the frame: row count, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
# Count the null (NaN/None) entries in every column.
# NOTE(review): only genuine nulls are counted here; if the file marks unknown
# values with a placeholder string (e.g. '?'), they will not show up in this
# check — confirm against the raw data.
missing_per_column = df.isna().sum()
print(missing_per_column)

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [7]:
# Names of all numeric columns; reused later for scaling and outlier detection.
numerical_column = df.select_dtypes(include='number').columns
print(numerical_column)

Index(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
       'hours_per_week'],
      dtype='object')


In [8]:
# Names of all object-dtype (string/categorical) columns.
cater_column = df.select_dtypes(include='object').columns
print(cater_column)

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'native_country', 'income'],
      dtype='object')


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [11]:
# dtype of every column (the same information is also part of the df.info() output).
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object

In [15]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Columns to scale: every numeric column found during exploration.
numerical_features = numerical_column

# Standardisation: rescale each numeric column to zero mean and unit variance.
# The work is done on a copy so the raw frame `df` keeps its original values.
standard_scaler = StandardScaler()
df_standard_scaled = df.copy()
standardized_values = standard_scaler.fit_transform(df_standard_scaled[numerical_features])
df_standard_scaled[numerical_features] = standardized_values
df_standard_scaled[numerical_features].head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.837109,-1.008707,1.134739,-0.14592,-0.21666,-2.222153
2,-0.042642,0.245079,-0.42006,-0.14592,-0.21666,-0.035429
3,1.057047,0.425801,-1.197459,-0.14592,-0.21666,-0.035429
4,-0.775768,1.408176,1.134739,-0.14592,-0.21666,-0.035429


In [16]:
# Min-max normalisation: rescale each numeric column to the [0, 1] range.
# As with standardisation, the result goes into a separate copy of `df`.
minmax_scaler = MinMaxScaler()
df_minmax_scaled = df.copy()
normalized_values = minmax_scaler.fit_transform(df_minmax_scaled[numerical_features])
df_minmax_scaled[numerical_features] = normalized_values
df_minmax_scaled[numerical_features].head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.30137,0.044302,0.8,0.02174,0.0,0.397959
1,0.452055,0.048238,0.8,0.0,0.0,0.122449
2,0.287671,0.138113,0.533333,0.0,0.0,0.397959
3,0.493151,0.151068,0.4,0.0,0.0,0.397959
4,0.150685,0.221488,0.8,0.0,0.0,0.397959


# 2. Encoding Techniques

In [19]:
from sklearn.preprocessing import LabelEncoder

# One-hot encode the low-cardinality categorical columns.
# The result is a new frame (`df_ohe`); `df` itself is left unchanged.
one_hot_columns = ['sex', 'race']
df_ohe = pd.get_dummies(df, columns=one_hot_columns)


In [21]:
# Label-encode the higher-cardinality categorical columns in place.
# A separate fitted encoder is kept per column in `label_encoders`, so the
# integer codes can be mapped back to the original labels later, e.g.
# label_encoders['occupation'].inverse_transform(codes). Refitting one shared
# encoder would keep only the mapping of the last column.
label_encoded_columns = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'native_country']
label_encoders = {}
for col in label_encoded_columns:
    # `label_enc` ends up bound to the encoder of the last column.
    label_enc = LabelEncoder()
    df[col] = label_enc.fit_transform(df[col])
    label_encoders[col] = label_enc

df.head()  # Check the encoded data

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,7,77516,9,13,4,1,1,White,Male,2174,0,40,39,<=50K
1,50,6,83311,9,13,2,4,0,White,Male,0,0,13,39,<=50K
2,38,4,215646,11,9,0,6,1,White,Male,0,0,40,39,<=50K
3,53,4,234721,1,7,2,6,0,Black,Male,0,0,40,39,<=50K
4,28,4,338409,9,13,2,10,5,Black,Female,0,0,40,5,<=50K


# 3. Feature Engineering

In [23]:
# Derived features:
#   capital_diff        - net capital change (capital gain minus capital loss)
#   work_hours_per_year - weekly hours multiplied by 52
net_capital = df['capital_gain'].sub(df['capital_loss'])
yearly_hours = df['hours_per_week'].mul(52)
df['capital_diff'] = net_capital
df['work_hours_per_year'] = yearly_hours

df.loc[:, ['capital_diff', 'work_hours_per_year']].head()

Unnamed: 0,capital_diff,work_hours_per_year
0,2174,2080
1,0,676
2,0,2080
3,0,2080
4,0,2080


In [25]:
import numpy as np

# 'capital_gain' is heavily skewed, so add a log-transformed version of it.
# log1p computes log(1 + x), which keeps the zero entries finite (they map to 0).
capital_gain = df['capital_gain']
df['log_capital_gain'] = np.log1p(capital_gain)

# Show the original and transformed values side by side.
df.loc[:, ['capital_gain', 'log_capital_gain']].head()


Unnamed: 0,capital_gain,log_capital_gain
0,2174,7.684784
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0


# 4. Feature Selection

In [28]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# can hide real problems (deprecations, convergence issues). Consider narrowing
# it to the specific warning category that is actually noisy.
warnings.filterwarnings("ignore")
from sklearn.ensemble import IsolationForest

# Isolation Forest to detect outliers in the numeric columns.
# contamination is the expected share of outliers (1% here); adjust as needed.
# random_state is fixed so the same rows are flagged on every run; without it
# the outlier set, and everything derived from it, changes between executions.
iso_forest = IsolationForest(contamination=0.01, random_state=42)
# fit_predict labels each row: 1 for inliers, -1 for outliers.
outliers = iso_forest.fit_predict(df[numerical_features])

In [29]:
# Add the outlier flag to the dataframe (1 = inlier, -1 = outlier).
df['outliers'] = outliers

# Keep only the non-outlier rows. The explicit .copy() makes the result an
# independent frame, so later column assignments on it neither raise pandas'
# chained-assignment warning nor depend on `df`.
df_outliers_removed = df[df['outliers'] == 1].copy()

print(df_outliers_removed.shape)  # Compare the size of the dataset


(32235, 19)


In [34]:
import ppscore as pps

# Compute the Predictive Power Score for every (x, y) column pair of `df`,
# then reshape the long-format result into a matrix with the predictor x as
# columns and the target y as rows. The 'income' row holds the score of each
# feature as a predictor of income.
pps_long = pps.matrix(df)[['x', 'y', 'ppscore']]
pps_matrix = pps_long.pivot(columns='x', index='y', values='ppscore')

# Display the PPS matrix
pps_matrix

x,age,capital_diff,capital_gain,capital_loss,education,education_num,fnlwgt,hours_per_week,income,log_capital_gain,marital_status,native_country,occupation,outliers,race,relationship,sex,work_hours_per_year,workclass
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
age,1.0,0.007082,0.003296,0.0,0.01692639,0.01692639,0.0,0.002377,0.03416143,0.003711,0.2015819,0.0,0.009147658,0.0,0.0,0.1373713,0.0,0.002377,0.019258
capital_diff,0.0,1.0,0.845595,0.0,0.0,0.0,0.0,0.0,0.0,0.845392,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
capital_gain,0.0,0.996354,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.996114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
capital_loss,0.0,0.996118,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
education,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
education_num,0.0,0.009724,0.012555,0.0,1.0,1.0,0.0,0.0,0.02805543,0.012885,0.0,0.0,0.1551207,0.0,0.0,0.0,0.0,0.0,0.0
fnlwgt,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hours_per_week,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.999451,0.0
income,0.005415335,0.400876,0.297123,0.141755,0.2431351,0.2431351,0.0,0.047278,1.0,0.297578,0.0,0.009409,0.09240967,0.04719413,0.0,0.0,0.0,0.047278,0.094056
log_capital_gain,0.0,0.998032,0.998032,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Compare with the (Pearson) correlation matrix.
# `df` still contains string columns (e.g. race, sex, income). Recent pandas
# versions no longer drop those silently in DataFrame.corr() and raise instead,
# so restrict the computation to the numeric columns explicitly.
correlation_matrix = df.select_dtypes(include='number').corr()
correlation_matrix

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,capital_gain,capital_loss,hours_per_week,native_country,capital_diff,work_hours_per_year,log_capital_gain,outliers
age,1.0,0.003787,-0.076646,-0.010508,0.036527,-0.266288,-0.020947,-0.263698,0.077674,0.057775,0.068756,-0.001151,0.074284,0.068756,0.124183,-0.092257
workclass,0.003787,1.0,-0.016656,0.023513,0.052085,-0.064731,0.254892,-0.090461,0.033835,0.012216,0.138962,-0.00769,0.033062,0.138962,0.022672,-0.021769
fnlwgt,-0.076646,-0.016656,1.0,-0.028145,-0.043195,0.028153,0.001597,0.008931,0.000432,-0.010252,-0.018768,-0.051966,0.000988,-0.018768,-0.004414,-0.015952
education,-0.010508,0.023513,-0.028145,1.0,0.359153,-0.038407,-0.02126,-0.010876,0.030046,0.016746,0.05551,0.064288,0.029039,0.05551,0.024955,-0.000541
education_num,0.036527,0.052085,-0.043195,0.359153,1.0,-0.069304,0.109697,-0.094153,0.12263,0.079923,0.148123,0.05084,0.117891,0.148123,0.129135,-0.064713
marital_status,-0.266288,-0.064731,0.028153,-0.038407,-0.069304,1.0,-0.009654,0.185451,-0.043393,-0.034187,-0.190519,-0.023819,-0.041395,-0.190519,-0.066595,0.01443
occupation,-0.020947,0.254892,0.001597,-0.02126,0.109697,-0.009654,1.0,-0.075607,0.025505,0.017987,0.080383,-0.012543,0.024445,0.080383,0.019852,-0.016726
relationship,-0.263698,-0.090461,0.008931,-0.010876,-0.094153,0.185451,-0.075607,1.0,-0.057919,-0.061062,-0.248974,-0.005507,-0.054413,-0.248974,-0.083402,0.043681
capital_gain,0.077674,0.033835,0.000432,0.030046,0.12263,-0.043393,0.025505,-0.057919,1.0,-0.031615,0.078409,-0.001982,0.998521,0.078409,0.56452,-0.675249
capital_loss,0.057775,0.012216,-0.010252,0.016746,0.079923,-0.034187,0.017987,-0.061062,-0.031615,1.0,0.054256,0.000419,-0.085902,0.054256,-0.06484,-0.180888
