## Data Preprocessing and Feature Engineering on Adult Dataset

### 1. Data Exploration and Preprocessing

#### 1.1 Load the Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [44]:
df=pd.read_csv('adult_with_headers.csv')
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


#### 1.2 Data Exploration

In [45]:
# Basic info about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [47]:
# Check for missing values
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [48]:
# Checking for duplicate values
df[df.duplicated()]

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
4881,25,Private,308144,Bachelors,13,Never-married,Craft-repair,Not-in-family,White,Male,0,0,40,Mexico,<=50K
5104,90,Private,52386,Some-college,10,Never-married,Other-service,Not-in-family,Asian-Pac-Islander,Male,0,0,35,United-States,<=50K
9171,21,Private,250051,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,10,United-States,<=50K
11631,20,Private,107658,Some-college,10,Never-married,Tech-support,Not-in-family,White,Female,0,0,10,United-States,<=50K
13084,25,Private,195994,1st-4th,2,Never-married,Priv-house-serv,Not-in-family,White,Female,0,0,40,Guatemala,<=50K
15059,21,Private,243368,Preschool,1,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,50,Mexico,<=50K
17040,46,Private,173243,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
18555,30,Private,144593,HS-grad,9,Never-married,Other-service,Not-in-family,Black,Male,0,0,40,?,<=50K
18698,19,Private,97261,HS-grad,9,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,40,United-States,<=50K
21318,19,Private,138153,Some-college,10,Never-married,Adm-clerical,Own-child,White,Female,0,0,10,United-States,<=50K


In [49]:
# Removing duplicate values
df.drop_duplicates(keep='first',inplace=True,ignore_index=True)
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32532,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32533,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32534,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32535,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [8]:
# Define function for detecting outliers using IQR for all the numerical columns
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Apply outlier removal to all numerical columns
def remove_outliers_all(df):
    for column in df.select_dtypes(include=['float64', 'int64']).columns:
        df = remove_outliers(df, column)
    return df

# Apply the function to the entire dataframe
df_cleaned = remove_outliers_all(df)
df=df_cleaned

# Display the cleaned dataframe
df


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,53,Private,321865,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K
32531,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32532,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32533,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K


In [9]:
# Get statistical summary
summary_stats = df.describe()

# Display summary statistics
summary_stats

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,18991.0,18991.0,18991.0,18991.0,18991.0,18991.0
mean,38.050498,180715.467432,10.246327,0.0,0.0,41.470486
std,12.026915,87085.715528,2.152306,0.0,0.0,3.950531
min,17.0,14878.0,5.0,0.0,0.0,33.0
25%,28.0,117499.0,9.0,0.0,0.0,40.0
50%,37.0,176839.0,10.0,0.0,0.0,40.0
75%,47.0,228910.0,12.0,0.0,0.0,40.0
max,78.0,416415.0,16.0,0.0,0.0,52.0


#### 1.3 Scaling Techniques

In [50]:
# Segregate numerical columns
numerical_columns = df.select_dtypes(include=['number']).columns.tolist()
numerical_columns

['age',
 'fnlwgt',
 'education_num',
 'capital_gain',
 'capital_loss',
 'hours_per_week']

In [51]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OneHotEncoder,LabelEncoder

##### Standard Scaling

In [12]:
std=StandardScaler()

In [13]:
df[['fnlwgt']]=std.fit_transform(df[['fnlwgt']])
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
2,38,Private,0.401116,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,0.620159,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,1.810833,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,1.192725,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
7,52,Self-emp-not-inc,0.332170,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,53,Private,1.620854,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K
32531,22,Private,1.486351,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32532,27,Private,0.879462,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32533,40,Private,-0.302485,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K


##### MinMax Scaling

In [14]:
minmax=MinMaxScaler()

In [15]:
df[['education_num','hours_per_week']]=minmax.fit_transform(df[['education_num','hours_per_week']])
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
2,38,Private,0.401116,HS-grad,0.363636,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,0.368421,United-States,<=50K
3,53,Private,0.620159,11th,0.181818,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,0.368421,United-States,<=50K
4,28,Private,1.810833,Bachelors,0.727273,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,0.368421,Cuba,<=50K
5,37,Private,1.192725,Masters,0.818182,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,0.368421,United-States,<=50K
7,52,Self-emp-not-inc,0.332170,HS-grad,0.363636,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0.631579,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,53,Private,1.620854,Masters,0.818182,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0.368421,United-States,>50K
32531,22,Private,1.486351,Some-college,0.454545,Never-married,Protective-serv,Not-in-family,White,Male,0,0,0.368421,United-States,<=50K
32532,27,Private,0.879462,Assoc-acdm,0.636364,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,0.263158,United-States,<=50K
32533,40,Private,-0.302485,HS-grad,0.363636,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,0.368421,United-States,>50K


### **Standard Scaling**:
- **Description**: Scales the data to have a mean of 0 and a standard deviation of 1.
- **Preferred when**: The data follows a normal distribution or when you're using algorithms that assume normally distributed features (e.g., Linear Regression, Logistic Regression, SVM, PCA).
- **Why**: It centers the data and adjusts for different units or scales, making it easier for these models to converge and perform well.

### **Min-Max Scaling**:
- **Description**: Scales the data to a fixed range (usually 0 to 1).
- **Preferred when**: The data doesn't follow a normal distribution, or you want to maintain the relationships between features but normalize the scale (e.g., Neural Networks, K-Nearest Neighbors).
- **Why**: Min-Max scaling preserves the relative relationships between values, which can be important for distance-based models and models sensitive to feature magnitude.

### 2. Encoding Techniques

In [16]:
categorical_columns = df.select_dtypes(include=['object']).columns
for column in categorical_columns:
    num_categories = df[column].nunique()  
    print(f"The column '{column}' has {num_categories} unique categories.")
for column in categorical_columns:
    unique_categories = df[column].unique()  # Get unique values in the column
    print(f"The column '{column}' has the following categories:")
    print(unique_categories)
    print("\n")  # Newline for better readability

The column 'workclass' has 9 unique categories.
The column 'education' has 12 unique categories.
The column 'marital_status' has 7 unique categories.
The column 'occupation' has 15 unique categories.
The column 'relationship' has 6 unique categories.
The column 'race' has 5 unique categories.
The column 'sex' has 2 unique categories.
The column 'native_country' has 41 unique categories.
The column 'income' has 2 unique categories.
The column 'workclass' has the following categories:
[' Private' ' Self-emp-not-inc' ' State-gov' ' Federal-gov' ' Local-gov'
 ' Self-emp-inc' ' ?' ' Never-worked' ' Without-pay']


The column 'education' has the following categories:
[' HS-grad' ' 11th' ' Bachelors' ' Masters' ' Assoc-acdm' ' Assoc-voc'
 ' 9th' ' Some-college' ' Doctorate' ' Prof-school' ' 10th' ' 12th']


The column 'marital_status' has the following categories:
[' Divorced' ' Married-civ-spouse' ' Never-married' ' Separated'
 ' Widowed' ' Married-spouse-absent' ' Married-AF-spouse']


The 

#### 2.1 One-Hot Encoding( for variables with <5 categories)

In [17]:
onehot=OneHotEncoder()

In [18]:
df1=pd.DataFrame(onehot.fit_transform(df[['race']]).toarray())
df1

Unnamed: 0,0,1,2,3,4
0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...
18986,0.0,0.0,0.0,0.0,1.0
18987,0.0,0.0,0.0,0.0,1.0
18988,0.0,0.0,0.0,0.0,1.0
18989,0.0,0.0,0.0,0.0,1.0


In [19]:
df2=df.join(df1)
df2

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,0,1,2,3,4
2,38,Private,0.401116,HS-grad,0.363636,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,0.368421,United-States,<=50K,0.0,0.0,1.0,0.0,0.0
3,53,Private,0.620159,11th,0.181818,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,0.368421,United-States,<=50K,0.0,0.0,0.0,0.0,1.0
4,28,Private,1.810833,Bachelors,0.727273,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,0.368421,Cuba,<=50K,0.0,0.0,0.0,0.0,1.0
5,37,Private,1.192725,Masters,0.818182,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,0.368421,United-States,<=50K,0.0,1.0,0.0,0.0,0.0
7,52,Self-emp-not-inc,0.332170,HS-grad,0.363636,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0.631579,United-States,>50K,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,53,Private,1.620854,Masters,0.818182,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0.368421,United-States,>50K,,,,,
32531,22,Private,1.486351,Some-college,0.454545,Never-married,Protective-serv,Not-in-family,White,Male,0,0,0.368421,United-States,<=50K,,,,,
32532,27,Private,0.879462,Assoc-acdm,0.636364,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,0.263158,United-States,<=50K,,,,,
32533,40,Private,-0.302485,HS-grad,0.363636,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,0.368421,United-States,>50K,,,,,


In [20]:
df2.drop('race',axis=1,inplace=True)
df2

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,sex,capital_gain,capital_loss,hours_per_week,native_country,income,0,1,2,3,4
2,38,Private,0.401116,HS-grad,0.363636,Divorced,Handlers-cleaners,Not-in-family,Male,0,0,0.368421,United-States,<=50K,0.0,0.0,1.0,0.0,0.0
3,53,Private,0.620159,11th,0.181818,Married-civ-spouse,Handlers-cleaners,Husband,Male,0,0,0.368421,United-States,<=50K,0.0,0.0,0.0,0.0,1.0
4,28,Private,1.810833,Bachelors,0.727273,Married-civ-spouse,Prof-specialty,Wife,Female,0,0,0.368421,Cuba,<=50K,0.0,0.0,0.0,0.0,1.0
5,37,Private,1.192725,Masters,0.818182,Married-civ-spouse,Exec-managerial,Wife,Female,0,0,0.368421,United-States,<=50K,0.0,1.0,0.0,0.0,0.0
7,52,Self-emp-not-inc,0.332170,HS-grad,0.363636,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,0.631579,United-States,>50K,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,53,Private,1.620854,Masters,0.818182,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,0.368421,United-States,>50K,,,,,
32531,22,Private,1.486351,Some-college,0.454545,Never-married,Protective-serv,Not-in-family,Male,0,0,0.368421,United-States,<=50K,,,,,
32532,27,Private,0.879462,Assoc-acdm,0.636364,Married-civ-spouse,Tech-support,Wife,Female,0,0,0.263158,United-States,<=50K,,,,,
32533,40,Private,-0.302485,HS-grad,0.363636,Married-civ-spouse,Machine-op-inspct,Husband,Male,0,0,0.368421,United-States,>50K,,,,,


In [21]:
df2.rename(columns={0:'White',1:'Black',2:'Asian-Pac-Islander',3:'Other',4:'Amer-Indian-Eskimo'},inplace=True)

In [22]:
df3=pd.DataFrame(onehot.fit_transform(df2[['sex','income']]).toarray())
df3

Unnamed: 0,0,1,2,3
0,0.0,1.0,1.0,0.0
1,0.0,1.0,1.0,0.0
2,1.0,0.0,1.0,0.0
3,1.0,0.0,1.0,0.0
4,0.0,1.0,0.0,1.0
...,...,...,...,...
18986,0.0,1.0,0.0,1.0
18987,0.0,1.0,1.0,0.0
18988,1.0,0.0,1.0,0.0
18989,0.0,1.0,0.0,1.0


In [23]:
df4=df2.join(df3)

In [24]:
df4

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,sex,capital_gain,...,income,White,Black,Asian-Pac-Islander,Other,Amer-Indian-Eskimo,0,1,2,3
2,38,Private,0.401116,HS-grad,0.363636,Divorced,Handlers-cleaners,Not-in-family,Male,0,...,<=50K,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,53,Private,0.620159,11th,0.181818,Married-civ-spouse,Handlers-cleaners,Husband,Male,0,...,<=50K,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,28,Private,1.810833,Bachelors,0.727273,Married-civ-spouse,Prof-specialty,Wife,Female,0,...,<=50K,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
5,37,Private,1.192725,Masters,0.818182,Married-civ-spouse,Exec-managerial,Wife,Female,0,...,<=50K,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
7,52,Self-emp-not-inc,0.332170,HS-grad,0.363636,Married-civ-spouse,Exec-managerial,Husband,Male,0,...,>50K,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,53,Private,1.620854,Masters,0.818182,Married-civ-spouse,Exec-managerial,Husband,Male,0,...,>50K,,,,,,,,,
32531,22,Private,1.486351,Some-college,0.454545,Never-married,Protective-serv,Not-in-family,Male,0,...,<=50K,,,,,,,,,
32532,27,Private,0.879462,Assoc-acdm,0.636364,Married-civ-spouse,Tech-support,Wife,Female,0,...,<=50K,,,,,,,,,
32533,40,Private,-0.302485,HS-grad,0.363636,Married-civ-spouse,Machine-op-inspct,Husband,Male,0,...,>50K,,,,,,,,,


In [25]:
df4.drop(['sex','income'],axis=1,inplace=True)
df4

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,capital_gain,capital_loss,...,native_country,White,Black,Asian-Pac-Islander,Other,Amer-Indian-Eskimo,0,1,2,3
2,38,Private,0.401116,HS-grad,0.363636,Divorced,Handlers-cleaners,Not-in-family,0,0,...,United-States,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,53,Private,0.620159,11th,0.181818,Married-civ-spouse,Handlers-cleaners,Husband,0,0,...,United-States,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,28,Private,1.810833,Bachelors,0.727273,Married-civ-spouse,Prof-specialty,Wife,0,0,...,Cuba,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
5,37,Private,1.192725,Masters,0.818182,Married-civ-spouse,Exec-managerial,Wife,0,0,...,United-States,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
7,52,Self-emp-not-inc,0.332170,HS-grad,0.363636,Married-civ-spouse,Exec-managerial,Husband,0,0,...,United-States,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,53,Private,1.620854,Masters,0.818182,Married-civ-spouse,Exec-managerial,Husband,0,0,...,United-States,,,,,,,,,
32531,22,Private,1.486351,Some-college,0.454545,Never-married,Protective-serv,Not-in-family,0,0,...,United-States,,,,,,,,,
32532,27,Private,0.879462,Assoc-acdm,0.636364,Married-civ-spouse,Tech-support,Wife,0,0,...,United-States,,,,,,,,,
32533,40,Private,-0.302485,HS-grad,0.363636,Married-civ-spouse,Machine-op-inspct,Husband,0,0,...,United-States,,,,,,,,,


In [26]:
df4.rename(columns={0:'Male',1:'Female',2:'<=50K',3:'>50K'},inplace=True)
df4

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,capital_gain,capital_loss,...,native_country,White,Black,Asian-Pac-Islander,Other,Amer-Indian-Eskimo,Male,Female,<=50K,>50K
2,38,Private,0.401116,HS-grad,0.363636,Divorced,Handlers-cleaners,Not-in-family,0,0,...,United-States,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,53,Private,0.620159,11th,0.181818,Married-civ-spouse,Handlers-cleaners,Husband,0,0,...,United-States,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,28,Private,1.810833,Bachelors,0.727273,Married-civ-spouse,Prof-specialty,Wife,0,0,...,Cuba,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
5,37,Private,1.192725,Masters,0.818182,Married-civ-spouse,Exec-managerial,Wife,0,0,...,United-States,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
7,52,Self-emp-not-inc,0.332170,HS-grad,0.363636,Married-civ-spouse,Exec-managerial,Husband,0,0,...,United-States,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,53,Private,1.620854,Masters,0.818182,Married-civ-spouse,Exec-managerial,Husband,0,0,...,United-States,,,,,,,,,
32531,22,Private,1.486351,Some-college,0.454545,Never-married,Protective-serv,Not-in-family,0,0,...,United-States,,,,,,,,,
32532,27,Private,0.879462,Assoc-acdm,0.636364,Married-civ-spouse,Tech-support,Wife,0,0,...,United-States,,,,,,,,,
32533,40,Private,-0.302485,HS-grad,0.363636,Married-civ-spouse,Machine-op-inspct,Husband,0,0,...,United-States,,,,,,,,,


In [27]:
data=df4

In [28]:
data

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,capital_gain,capital_loss,...,native_country,White,Black,Asian-Pac-Islander,Other,Amer-Indian-Eskimo,Male,Female,<=50K,>50K
2,38,Private,0.401116,HS-grad,0.363636,Divorced,Handlers-cleaners,Not-in-family,0,0,...,United-States,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,53,Private,0.620159,11th,0.181818,Married-civ-spouse,Handlers-cleaners,Husband,0,0,...,United-States,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,28,Private,1.810833,Bachelors,0.727273,Married-civ-spouse,Prof-specialty,Wife,0,0,...,Cuba,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
5,37,Private,1.192725,Masters,0.818182,Married-civ-spouse,Exec-managerial,Wife,0,0,...,United-States,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
7,52,Self-emp-not-inc,0.332170,HS-grad,0.363636,Married-civ-spouse,Exec-managerial,Husband,0,0,...,United-States,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,53,Private,1.620854,Masters,0.818182,Married-civ-spouse,Exec-managerial,Husband,0,0,...,United-States,,,,,,,,,
32531,22,Private,1.486351,Some-college,0.454545,Never-married,Protective-serv,Not-in-family,0,0,...,United-States,,,,,,,,,
32532,27,Private,0.879462,Assoc-acdm,0.636364,Married-civ-spouse,Tech-support,Wife,0,0,...,United-States,,,,,,,,,
32533,40,Private,-0.302485,HS-grad,0.363636,Married-civ-spouse,Machine-op-inspct,Husband,0,0,...,United-States,,,,,,,,,


#### 2.2 Label Encoding (for variables with > 5 categories)

In [29]:
columns_to_encode = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'native_country']

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Apply Label Encoding to each specified column individually
for column in columns_to_encode:
    data[column] = label_encoder.fit_transform(data[column].astype(str))  # Convert to string to avoid issues
    print(f"Label encoding applied to '{column}'")

# Display the encoded data
print(data[columns_to_encode].head())

Label encoding applied to 'workclass'
Label encoding applied to 'education'
Label encoding applied to 'marital_status'
Label encoding applied to 'occupation'
Label encoding applied to 'relationship'
Label encoding applied to 'native_country'
   workclass  education  marital_status  occupation  relationship  \
2          4          8               0           6             1   
3          4          1               2           6             0   
4          4          6               2          10             5   
5          4          9               2           4             5   
7          6          8               2           4             0   

   native_country  
2              38  
3              38  
4               5  
5              38  
7              38  


In [30]:
data

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,capital_gain,capital_loss,...,native_country,White,Black,Asian-Pac-Islander,Other,Amer-Indian-Eskimo,Male,Female,<=50K,>50K
2,38,4,0.401116,8,0.363636,0,6,1,0,0,...,38,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,53,4,0.620159,1,0.181818,2,6,0,0,0,...,38,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,28,4,1.810833,6,0.727273,2,10,5,0,0,...,5,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
5,37,4,1.192725,9,0.818182,2,4,5,0,0,...,38,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
7,52,6,0.332170,8,0.363636,2,4,0,0,0,...,38,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,53,4,1.620854,9,0.818182,2,4,0,0,0,...,38,,,,,,,,,
32531,22,4,1.486351,11,0.454545,4,11,1,0,0,...,38,,,,,,,,,
32532,27,4,0.879462,4,0.636364,2,13,5,0,0,...,38,,,,,,,,,
32533,40,4,-0.302485,8,0.363636,2,7,0,0,0,...,38,,,,,,,,,


### **One-Hot Encoder:**

**Pros:**
- **No Assumptions:** Does not impose any ordinal relationship between categories, making it ideal for nominal variables (e.g., colors, gender).
- **Works with Many ML Models:** Helps in handling categorical data in machine learning models that expect numerical input.

**Cons:**
- **High Dimensionality:** Can create many new features if there are many categories, leading to a "curse of dimensionality" issue.
- **Sparse Representation:** Leads to sparse matrices, which can increase memory usage and computation time.

---

### **Label Encoder:**

**Pros:**
- **Simple & Efficient:** Converts categorical variables into numerical form without creating multiple new columns, saving space.
- **Useful for Ordinal Data:** Suitable for variables with an inherent order (e.g., rankings).

**Cons:**
- **Ordinal Assumption:** Imposes an artificial ordinal relationship between categories, which may be misleading for nominal data.
- **Misinterpreted Relationships:** Some machine learning models might assume a linear relationship between encoded values that doesn't exist.

### 3.Feature Engineering

#### 3.1 Create new Features

#### We create two new features:

 1. **Age Group**: Binning age into categories.
 2. **Work/Week Ratio**: Ratio of hours_per_week to the maximum hours worked in the dataset.

#### Reason: 
 Age Group helps to categorize individuals into broader age brackets, which may help the model capture patterns more effectively.
 Work/Week Ratio can normalize the work hours based on the maximum work hours and provide a relative comparison.

In [31]:
# Age Grouping (Young: <30, Middle-aged: 30-60, Elderly: >60)
data['age-group'] = pd.cut(data['age'], bins=[0, 30, 60, 100], labels=['Young', 'Middle-aged', 'Elderly'])

# Categorizing work hours (Full-Time: >=40 hours, Part-Time: <40 hours)
data['work-hours'] = ['Full-Time' if x >= 40 else 'Part-Time' for x in data['hours_per_week']]

# Display the new features
print(data[['age', 'age-group', 'hours_per_week', 'work-hours']].head())


   age    age-group  hours_per_week work-hours
2   38  Middle-aged        0.368421  Part-Time
3   53  Middle-aged        0.368421  Part-Time
4   28        Young        0.368421  Part-Time
5   37  Middle-aged        0.368421  Part-Time
7   52  Middle-aged        0.631579  Part-Time


In [32]:
data.drop(['age','hours_per_week'],axis=1,inplace=True)
data

Unnamed: 0,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,capital_gain,capital_loss,native_country,...,Black,Asian-Pac-Islander,Other,Amer-Indian-Eskimo,Male,Female,<=50K,>50K,age-group,work-hours
2,4,0.401116,8,0.363636,0,6,1,0,0,38,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,Middle-aged,Part-Time
3,4,0.620159,1,0.181818,2,6,0,0,0,38,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,Middle-aged,Part-Time
4,4,1.810833,6,0.727273,2,10,5,0,0,5,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,Young,Part-Time
5,4,1.192725,9,0.818182,2,4,5,0,0,38,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Middle-aged,Part-Time
7,6,0.332170,8,0.363636,2,4,0,0,0,38,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Middle-aged,Part-Time
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,4,1.620854,9,0.818182,2,4,0,0,0,38,...,,,,,,,,,Middle-aged,Part-Time
32531,4,1.486351,11,0.454545,4,11,1,0,0,38,...,,,,,,,,,Young,Part-Time
32532,4,0.879462,4,0.636364,2,13,5,0,0,38,...,,,,,,,,,Young,Part-Time
32533,4,-0.302485,8,0.363636,2,7,0,0,0,38,...,,,,,,,,,Middle-aged,Part-Time


#### 3.2 Apply Transformation (Log Transformation)

Log transformation helps to reduce skewness and makes features more normally distributed, improving model performance.



In [33]:
data['capital_gain_log'] = data['capital_gain'].apply(lambda x: np.log(x + 1))  # log(1+x) to avoid log(0) errors

# Display the transformed feature
print(data[['capital_gain', 'capital_gain_log']].head())

   capital_gain  capital_gain_log
2             0               0.0
3             0               0.0
4             0               0.0
5             0               0.0
7             0               0.0


In [34]:
data

Unnamed: 0,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,capital_gain,capital_loss,native_country,...,Asian-Pac-Islander,Other,Amer-Indian-Eskimo,Male,Female,<=50K,>50K,age-group,work-hours,capital_gain_log
2,4,0.401116,8,0.363636,0,6,1,0,0,38,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,Middle-aged,Part-Time,0.0
3,4,0.620159,1,0.181818,2,6,0,0,0,38,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,Middle-aged,Part-Time,0.0
4,4,1.810833,6,0.727273,2,10,5,0,0,5,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,Young,Part-Time,0.0
5,4,1.192725,9,0.818182,2,4,5,0,0,38,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Middle-aged,Part-Time,0.0
7,6,0.332170,8,0.363636,2,4,0,0,0,38,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Middle-aged,Part-Time,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,4,1.620854,9,0.818182,2,4,0,0,0,38,...,,,,,,,,Middle-aged,Part-Time,0.0
32531,4,1.486351,11,0.454545,4,11,1,0,0,38,...,,,,,,,,Young,Part-Time,0.0
32532,4,0.879462,4,0.636364,2,13,5,0,0,38,...,,,,,,,,Young,Part-Time,0.0
32533,4,-0.302485,8,0.363636,2,7,0,0,0,38,...,,,,,,,,Middle-aged,Part-Time,0.0


### 4. Feature Selection

In [35]:
numerical_features = data.select_dtypes(include=['number']).columns.tolist()
numerical_features

['workclass',
 'fnlwgt',
 'education',
 'education_num',
 'marital_status',
 'occupation',
 'relationship',
 'capital_gain',
 'capital_loss',
 'native_country',
 'White',
 'Black',
 'Asian-Pac-Islander',
 'Other',
 'Amer-Indian-Eskimo',
 'Male',
 'Female',
 '<=50K',
 '>50K',
 'capital_gain_log']

In [36]:
data.isnull().sum()

workclass                0
fnlwgt                   0
education                0
education_num            0
marital_status           0
occupation               0
relationship             0
capital_gain             0
capital_loss             0
native_country           0
White                 7867
Black                 7867
Asian-Pac-Islander    7867
Other                 7867
Amer-Indian-Eskimo    7867
Male                  7867
Female                7867
<=50K                 7867
>50K                  7867
age-group                0
work-hours               0
capital_gain_log         0
dtype: int64

In [37]:
data.dropna(inplace=True)
data

Unnamed: 0,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,capital_gain,capital_loss,native_country,...,Asian-Pac-Islander,Other,Amer-Indian-Eskimo,Male,Female,<=50K,>50K,age-group,work-hours,capital_gain_log
2,4,0.401116,8,0.363636,0,6,1,0,0,38,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,Middle-aged,Part-Time,0.0
3,4,0.620159,1,0.181818,2,6,0,0,0,38,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,Middle-aged,Part-Time,0.0
4,4,1.810833,6,0.727273,2,10,5,0,0,5,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,Young,Part-Time,0.0
5,4,1.192725,9,0.818182,2,4,5,0,0,38,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Middle-aged,Part-Time,0.0
7,6,0.332170,8,0.363636,2,4,0,0,0,38,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Middle-aged,Part-Time,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18984,2,-1.694829,6,0.727273,4,10,1,0,0,38,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,Middle-aged,Part-Time,0.0
18986,4,-0.110474,4,0.636364,0,12,4,0,0,38,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,Middle-aged,Part-Time,0.0
18987,4,0.213158,6,0.727273,0,4,1,0,0,38,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,Middle-aged,Part-Time,0.0
18988,4,0.625716,8,0.363636,6,8,4,0,0,38,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,Middle-aged,Part-Time,0.0


#### 4.1 Outlier Detection with Isolation Forest

Outliers can skew the distribution of features, leading to poor model performance. Removing them helps the model focus on the core patterns in the data.

In [38]:
from sklearn.ensemble import IsolationForest

# Fit Isolation Forest model
isolation_forest = IsolationForest(contamination=0.01, random_state=42)
outliers = isolation_forest.fit_predict(data[numerical_features])

# Removing outliers (where the prediction is -1)
data_cleaned = data[outliers != -1]

# Display the cleaned data without outliers
print(data_cleaned.shape)


(11012, 22)


### 4.2 Predictive Power Score (PPS) Analysis

PPS helps to find predictive relationships between features, especially when correlations are non-linear. This offers an alternative to the correlation matrix, which only captures linear relationships.

In [39]:
import ppscore as pps
pps_matrix = pps.matrix(data)[['x', 'y', 'ppscore']].sort_values(by='ppscore', ascending=False)
print(pps_matrix)
correlation_matrix = data.corr()
print(correlation_matrix)

                  x               y  ppscore
0         workclass       workclass      1.0
299           Other           Other      1.0
69    education_num   education_num      1.0
92   marital_status  marital_status      1.0
115      occupation      occupation      1.0
..              ...             ...      ...
18        workclass            >50K      0.0
183    capital_loss    capital_gain      0.0
182    capital_loss    relationship      0.0
181    capital_loss      occupation      0.0
242           Black       workclass      0.0

[484 rows x 3 columns]
                    workclass    fnlwgt  education  education_num  \
workclass            1.000000 -0.012295  -0.012070       0.026655   
fnlwgt              -0.012295  1.000000  -0.021561      -0.024501   
education           -0.012070 -0.021561   1.000000       0.177890   
education_num        0.026655 -0.024501   0.177890       1.000000   
marital_status      -0.040749  0.040908  -0.020832      -0.005805   
occupation           0