In [17]:
# Import libraries and load dataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Load dataset
data = pd.read_csv('/content/adult_with_headers.csv')
data

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [3]:
# 1. Data Exploration and Preprocessing
# Checking for any missing values
missing_values = data.isnull().sum()
print('Missing values per column:')
print(missing_values)

# Display basic statistics
print('Summary statistics:')
print(data.describe())

# Data types
print('Data Types:', data.dtypes)

Missing values per column:
age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64
Summary statistics:
                age        fnlwgt  education_num  capital_gain  capital_loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   


In [4]:
# Scaling numerical columns
num_cols = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

# Standard Scaling
scaler_standard = StandardScaler()
data_scaled_standard = data.copy()
data_scaled_standard[num_cols] = scaler_standard.fit_transform(data[num_cols])
print('Data after Standard Scaling:')
data_scaled_standard.head()

Data after Standard Scaling:


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.030671,State-gov,-1.063611,Bachelors,1.134739,Never-married,Adm-clerical,Not-in-family,White,Male,0.148453,-0.21666,-0.035429,United-States,<=50K
1,0.837109,Self-emp-not-inc,-1.008707,Bachelors,1.134739,Married-civ-spouse,Exec-managerial,Husband,White,Male,-0.14592,-0.21666,-2.222153,United-States,<=50K
2,-0.042642,Private,0.245079,HS-grad,-0.42006,Divorced,Handlers-cleaners,Not-in-family,White,Male,-0.14592,-0.21666,-0.035429,United-States,<=50K
3,1.057047,Private,0.425801,11th,-1.197459,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,-0.14592,-0.21666,-0.035429,United-States,<=50K
4,-0.775768,Private,1.408176,Bachelors,1.134739,Married-civ-spouse,Prof-specialty,Wife,Black,Female,-0.14592,-0.21666,-0.035429,Cuba,<=50K


In [5]:
# Min-Max Scaling
scaler_minmax = MinMaxScaler()
data_scaled_minmax = data.copy()
data_scaled_minmax[num_cols] = scaler_minmax.fit_transform(data[num_cols])
print('Data after Min-Max Scaling:')
data_scaled_minmax.head()

Data after Min-Max Scaling:


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.30137,State-gov,0.044302,Bachelors,0.8,Never-married,Adm-clerical,Not-in-family,White,Male,0.02174,0.0,0.397959,United-States,<=50K
1,0.452055,Self-emp-not-inc,0.048238,Bachelors,0.8,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,0.122449,United-States,<=50K
2,0.287671,Private,0.138113,HS-grad,0.533333,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,0.397959,United-States,<=50K
3,0.493151,Private,0.151068,11th,0.4,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,0.397959,United-States,<=50K
4,0.150685,Private,0.221488,Bachelors,0.8,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,0.397959,Cuba,<=50K


In [6]:
# Encoding Categorical Variables
# One-Hot Encoding for categories with < 5 unique values
categorical_columns = [col for col in data.select_dtypes(include=['object']).columns if data[col].nunique() < 5]
data_encoded_onehot = pd.get_dummies(data, columns=categorical_columns)
print('One-Hot Encoded Data:')
data_encoded_onehot.head()

One-Hot Encoded Data:


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Female,sex_ Male,income_ <=50K,income_ >50K
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,2174,0,40,United-States,0,1,1,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,0,0,13,United-States,0,1,1,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,0,0,40,United-States,0,1,1,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,0,0,40,United-States,0,1,1,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,40,Cuba,1,0,1,0


In [7]:
# Label Encoding for categories with >= 5 unique values
encoder_label = LabelEncoder()
for col in categorical_columns:
    data[col] = encoder_label.fit_transform(data[col])

encoder_label

In [8]:
# Feature Engineering: Create new features
# Feature 1: Age Group
data['age_group'] = pd.cut(data['age'], bins=[0, 25, 45, 65, 100], labels=['Young', 'Middle-Aged', 'Senior', 'Elderly'])

# Feature 2: Work Hours Category
data['hours_category'] = pd.cut(data['hours_per_week'], bins=[0, 20, 40, 60, 100], labels=['Part-time', 'Full-time', 'Over-time', 'Extreme'])

print('New Features:')
print(data[['age_group', 'hours_category']].head())


New Features:
     age_group hours_category
0  Middle-Aged      Full-time
1       Senior      Part-time
2  Middle-Aged      Full-time
3       Senior      Full-time
4  Middle-Aged      Full-time


In [9]:
# Isolation Forest to detect and remove outliers
iso_forest = IsolationForest(contamination=0.05, random_state=42)
outliers = iso_forest.fit_predict(data[num_cols])
iso_forest

In [10]:
# Marking the outliers
data['outliers'] = outliers
outliers

array([1, 1, 1, ..., 1, 1, 1])

In [11]:
# Removing the outliers
data_cleaned = data[data['outliers'] == 1]
print('Data with no outliers:')
data_cleaned.head()

Data with no outliers:


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,age_group,hours_category,outliers
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,United-States,0,Middle-Aged,Full-time,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,United-States,0,Senior,Part-time,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,0,0,40,United-States,0,Middle-Aged,Full-time,1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,0,0,40,United-States,0,Senior,Full-time,1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,0,40,Cuba,0,Middle-Aged,Full-time,1


In [12]:
print('Data after outlier removal:')
print(data_cleaned.shape)

Data after outlier removal:
(30933, 18)


In [2]:
!pip install virtualenv
!virtualenv my_env
!source my_env/bin/activate
!pip install --upgrade pip setuptools wheel
!pip install pandas --only-binary :all:
!pip install ppscore

Collecting virtualenv
  Downloading virtualenv-20.27.0-py3-none-any.whl.metadata (4.5 kB)
Collecting distlib<1,>=0.3.7 (from virtualenv)
  Downloading distlib-0.3.9-py2.py3-none-any.whl.metadata (5.2 kB)
Downloading virtualenv-20.27.0-py3-none-any.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading distlib-0.3.9-py2.py3-none-any.whl (468 kB)
Installing collected packages: distlib, virtualenv
Successfully installed distlib-0.3.9 virtualenv-20.27.0
created virtual environment CPython3.10.12.final.0-64 in 1760ms
  creator CPython3Posix(dest=/content/my_env, clear=False, no_vcs_ignore=False, global=False)
  seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/root/.local/share/virtualenv)
    added seed packages: pip==24.2, setuptools==75.2.0, wheel==0.44.0
  activators BashActivator,CShellActivator,FishActivator,NushellActivator,PowerShellActiv

In [5]:
!pip install --force-reinstall pandas==1.5.3 numpy==1.24.3
!pip install --upgrade ppscore

Collecting pandas==1.5.3
  Using cached pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting numpy==1.24.3
  Downloading numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting python-dateutil>=2.8.1 (from pandas==1.5.3)
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pytz>=2020.1 (from pandas==1.5.3)
  Using cached pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting six>=1.5 (from python-dateutil>=2.8.1->pandas==1.5.3)
  Using cached six-1.16.0-py2.py3-none-any.whl.metadata (1.8 kB)
Using cached pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
Downloading numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached python_dateutil-2.9.0.post0-py2.py3-no



In [18]:
# Apply PPS (Predictive Power Score):

import ppscore as pps
pps_matrix = pps.matrix(data)
pps_matrix

Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
0,age,age,1.000000,predict_itself,True,,0.000000,1.000000,
1,age,workclass,0.011232,classification,True,weighted F1,0.579088,0.583816,DecisionTreeClassifier()
2,age,fnlwgt,0.000000,regression,True,mean absolute error,75872.186200,77535.141544,DecisionTreeRegressor()
3,age,education,0.052315,classification,True,weighted F1,0.201200,0.242989,DecisionTreeClassifier()
4,age,education_num,0.000000,regression,True,mean absolute error,1.853000,1.898306,DecisionTreeRegressor()
...,...,...,...,...,...,...,...,...,...
220,income,capital_gain,0.000000,regression,True,mean absolute error,1093.884000,1760.682115,DecisionTreeRegressor()
221,income,capital_loss,0.000000,regression,True,mean absolute error,94.942600,176.261353,DecisionTreeRegressor()
222,income,hours_per_week,0.000000,regression,True,mean absolute error,7.656400,8.097596,DecisionTreeRegressor()
223,income,native_country,0.000000,classification,True,weighted F1,0.841082,0.841082,DecisionTreeClassifier()


In [19]:
correlation_matrix = data.corr()
correlation_matrix

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
age,1.0,-0.076646,0.036527,0.077674,0.057775,0.068756
fnlwgt,-0.076646,1.0,-0.043195,0.000432,-0.010252,-0.018768
education_num,0.036527,-0.043195,1.0,0.12263,0.079923,0.148123
capital_gain,0.077674,0.000432,0.12263,1.0,-0.031615,0.078409
capital_loss,0.057775,-0.010252,0.079923,-0.031615,1.0,0.054256
hours_per_week,0.068756,-0.018768,0.148123,0.078409,0.054256,1.0
