In [45]:
# Load necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ppscore as pps
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import IsolationForest

In [46]:
# Show every column when a DataFrame is displayed (no column truncation).
# `pd.set_option` is the documented entry point; `pd.pandas` only resolved
# through an accidental re-export of the package inside its own namespace.
pd.set_option('display.max_columns', None)

# Load the dataset from the working directory and preview the first 10 rows
df = pd.read_csv('adult_with_headers.csv')
df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


## Data Exploration and Preprocessing

In [47]:
# Summary statistics for every column: count/unique/top/freq for the
# categorical ones, mean/std/quantiles for the numeric ones
summary_stats = df.describe(include='all')
summary_stats

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
count,32561.0,32561,32561.0,32561,32561.0,32561,32561,32561,32561,32561,32561.0,32561.0,32561.0,32561,32561
unique,,9,,16,,7,15,6,5,2,,,,42,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,22696,,10501,,14976,4140,13193,27816,21790,,,,29170,24720
mean,38.581647,,189778.4,,10.080679,,,,,,1077.648844,87.30383,40.437456,,
std,13.640433,,105550.0,,2.57272,,,,,,7385.292085,402.960219,12.347429,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117827.0,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178356.0,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237051.0,,12.0,,,,,,0.0,0.0,45.0,,


In [48]:
# Count the missing (NaN/None) entries in each column
missing_per_column = df.isna().sum()
missing_per_column

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [49]:
# Inspect the dtype pandas inferred for each column
column_dtypes = df.dtypes
print(column_dtypes)

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object


In [50]:
# Names of the numeric (int64) columns used for scaling and outlier detection
numerical_columns = [
    'age',
    'fnlwgt',
    'education_num',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
]

In [51]:
# Standard scaling: fit on the numeric columns and transform them in one pass
standard_scaler = StandardScaler()
standardized_values = standard_scaler.fit_transform(df[numerical_columns])

# Write the scaled values into a copy so the raw frame stays untouched
df_standard_scaled = df.copy()
df_standard_scaled[numerical_columns] = standardized_values
df_standard_scaled.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.030671,State-gov,-1.063611,Bachelors,1.134739,Never-married,Adm-clerical,Not-in-family,White,Male,0.148453,-0.21666,-0.035429,United-States,<=50K
1,0.837109,Self-emp-not-inc,-1.008707,Bachelors,1.134739,Married-civ-spouse,Exec-managerial,Husband,White,Male,-0.14592,-0.21666,-2.222153,United-States,<=50K
2,-0.042642,Private,0.245079,HS-grad,-0.42006,Divorced,Handlers-cleaners,Not-in-family,White,Male,-0.14592,-0.21666,-0.035429,United-States,<=50K
3,1.057047,Private,0.425801,11th,-1.197459,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,-0.14592,-0.21666,-0.035429,United-States,<=50K
4,-0.775768,Private,1.408176,Bachelors,1.134739,Married-civ-spouse,Prof-specialty,Wife,Black,Female,-0.14592,-0.21666,-0.035429,Cuba,<=50K


In [52]:
# Min-max scaling: fit on the numeric columns and transform them in one pass
minmax_scaler = MinMaxScaler()
rescaled_values = minmax_scaler.fit_transform(df[numerical_columns])

# Write the rescaled values into a copy so the raw frame stays untouched
df_minmax_scaled = df.copy()
df_minmax_scaled[numerical_columns] = rescaled_values
df_minmax_scaled.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.30137,State-gov,0.044302,Bachelors,0.8,Never-married,Adm-clerical,Not-in-family,White,Male,0.02174,0.0,0.397959,United-States,<=50K
1,0.452055,Self-emp-not-inc,0.048238,Bachelors,0.8,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,0.122449,United-States,<=50K
2,0.287671,Private,0.138113,HS-grad,0.533333,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,0.397959,United-States,<=50K
3,0.493151,Private,0.151068,11th,0.4,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,0.397959,United-States,<=50K
4,0.150685,Private,0.221488,Bachelors,0.8,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,0.397959,Cuba,<=50K


#### Discuss the scenarios where each scaling technique is preferred and why
- Standard Scaling (StandardScaler): This technique transforms each feature to have a mean of 0 and a standard deviation of 1. It is preferred when the features are approximately normally distributed, or when the model is sensitive to feature scale and works best with centred inputs of comparable variance (e.g., regularized linear regression, logistic regression).

- Min-Max Scaling (MinMaxScaler): This technique scales the data to a fixed range, usually [0, 1]. It is preferred when the data does not necessarily follow a normal distribution and you want to preserve the relationships of the original data. It is also useful when using algorithms that do not assume any specific distribution of the data, such as k-nearest neighbors and neural networks.


## Encoding Techniques

In [53]:
# Categorical features are the columns pandas stored with object dtype
object_typed = df.select_dtypes(include='object')
categorical_columns = object_typed.columns
categorical_columns

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'native_country', 'income'],
      dtype='object')

In [54]:
# Apply one-hot encoding to categorical variables with fewer than 5 categories
category_counts = df[categorical_columns].nunique()
one_hot_columns = category_counts[category_counts < 5].index.tolist()

# drop_first=True drops one dummy per variable so the rest are not redundant.
# NOTE(review): the resulting names ('sex_ Male', 'income_ >50K') contain a
# space after the underscore, which suggests the raw string values carry a
# leading space - confirm against the CSV before relying on exact labels.
df_one_hot_encoded = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)
df_one_hot_encoded.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Male,income_ >50K
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,2174,0,40,United-States,1,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,0,0,13,United-States,1,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,0,0,40,United-States,1,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,0,0,40,United-States,1,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,40,Cuba,0,0


In [55]:
# Identify categorical (object-dtype) columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Label-encode the categorical variables with 5 or more categories
# (the complement of the `< 5` threshold used for one-hot encoding)
label_encode_columns = [col for col in categorical_columns if df[col].nunique() >= 5]

# Keep one fitted encoder per column. Re-fitting a single shared encoder
# overwrites its `classes_` on every iteration, so only the last column's
# category -> integer mapping would survive and the other columns could not
# be decoded with `inverse_transform`.
label_encoders = {}

# `label_encoder` stays defined (and ends up fitted on the last encoded
# column, as before) for any later cell that refers to it by that name.
label_encoder = LabelEncoder()

# NOTE: this overwrites the columns of `df` in place; later cells read the
# encoded `df`, so that behaviour is intentionally preserved.
for col in label_encode_columns:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

# Display the first few rows of the encoded dataframe
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,7,77516,9,13,4,1,1,4,Male,2174,0,40,39,<=50K
1,50,6,83311,9,13,2,4,0,4,Male,0,0,13,39,<=50K
2,38,4,215646,11,9,0,6,1,4,Male,0,0,40,39,<=50K
3,53,4,234721,1,7,2,6,0,2,Male,0,0,40,39,<=50K
4,28,4,338409,9,13,2,10,5,2,Female,0,0,40,5,<=50K


#### Discuss the pros and cons of One-Hot Encoding and Label Encoding.
##### One-Hot Encoding
- Pros:
    - No Ordinal Relationship Assumption:
        - One-Hot Encoding does not assume any ordinal relationship between the categories.
        - It treats each category as an independent entity.
        - Suitable for nominal categorical variables (e.g., color, gender).
    - Avoids Arbitrary Ranking:
        - Prevents assigning arbitrary ranking to categories, which could mislead some algorithms (e.g., linear regression).
- Cons:
    - Increased Dimensionality:
        - Increases the number of features, especially for categorical variables with many categories. This can lead to the curse of dimensionality, making the model more complex and computationally expensive.
    - Sparse Matrix:
        - Results in a sparse matrix, where many values are zeros, which can consume more memory and slow down the computation.

##### Label Encoding
- Pros:
    - Simplicity:
        - Simple and straightforward to implement.
        - Converts categories to integers, which can be easily interpreted by most algorithms.
    - No Increased Dimensionality:
        - Does not increase the dimensionality of the dataset, keeping it compact.
- Cons:
    - Assumes Ordinal Relationship:
        - Assumes an ordinal relationship between the categories, which may not be true for nominal categorical variables. This can mislead algorithms into interpreting these numerical values as having some sort of ranking or order.
    - Potential Bias:
        - Some algorithms might interpret the encoded integers as having inherent ordinal importance, which can introduce bias and affect the model’s performance.

## Feature Engineering

In [56]:
# Create new feature: 'age_group'
# Age groups: 0-18 (Child), 19-35 (Young Adult), 36-60 (Adult), 61+ (Senior)
# pd.cut builds right-closed intervals by default: (0, 18], (18, 35], (35, 60], (60, inf).
# The last edge is open-ended so that "61+" really covers every age above 60;
# with a hard cap of 100, any larger age would silently become NaN.
df_one_hot_encoded['age_group'] = pd.cut(
    df['age'],
    bins=[0, 18, 35, 60, np.inf],
    labels=['Child', 'Young Adult', 'Adult', 'Senior'],
)

In [57]:
# New feature 'capital_diff': net capital change, i.e. capital gain minus capital loss
df_one_hot_encoded['capital_diff'] = df['capital_gain'].sub(df['capital_loss'])

In [58]:
# 'capital_gain' is heavily right-skewed, so compress it with log(1 + x);
# log1p keeps the zero entries at exactly 0
df_one_hot_encoded['log_capital_gain'] = df['capital_gain'].pipe(np.log1p)

In [59]:
# Compare the skewness of the raw and the log-transformed capital gain
skew_before = df['capital_gain'].skew()
skew_after = df_one_hot_encoded['log_capital_gain'].skew()

print("Skewness of 'capital_gain' before transformation:", skew_before)
print("Skewness of 'log_capital_gain' after transformation:", skew_after)

Skewness of 'capital_gain' before transformation: 11.953847687699799
Skewness of 'log_capital_gain' after transformation: 3.096143524467517


In [60]:
# Preview the frame with the engineered features appended
df_one_hot_encoded.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Male,income_ >50K,age_group,capital_diff,log_capital_gain
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,2174,0,40,United-States,1,0,Adult,2174,7.684784
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,0,0,13,United-States,1,0,Adult,0,0.0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,0,0,40,United-States,1,0,Adult,0,0.0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,0,0,40,United-States,1,0,Adult,0,0.0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,40,Cuba,0,0,Young Adult,0,0.0


## Feature Selection

In [61]:
# Isolation Forest for outlier detection; `contamination` is the proportion
# of rows expected to be outliers, `random_state` pins the result
clf = IsolationForest(contamination=0.01, random_state=42)

# Fit on the numerical features and label every row in a single step
numeric_features = df[numerical_columns]
outliers = clf.fit_predict(numeric_features)

# Rows labelled -1 are the outliers; keep everything else
inlier_mask = outliers != -1
df_cleaned = df[inlier_mask]

# Report how many rows were dropped
print("Original dataset shape:", df.shape)
print("Cleaned dataset shape:", df_cleaned.shape)

Original dataset shape: (32561, 15)
Cleaned dataset shape: (32235, 15)


In [62]:
# Pairwise Pearson correlation between the numerical features of the cleaned data
correlation_matrix = df_cleaned[numerical_columns].corr()

# Squaring the coefficients gives r^2, the share of variance one feature
# linearly explains in another.
# NOTE: this is NOT the Predictive Power Score computed by the `ppscore`
# package (imported as `pps` but unused here). The `pps_matrix` name is kept
# only so that later references to it keep working.
# TODO(review): if a real PPS is required, compute it with `pps.matrix(df_cleaned)`.
#
# A vectorised power replaces the element-wise `applymap` (deprecated since
# pandas 2.1); the former abs() was redundant because squaring drops the sign.
pps_matrix = correlation_matrix ** 2

print(pps_matrix)

                     age    fnlwgt  education_num  capital_gain  capital_loss  \
age             1.000000  0.005976       0.000988      0.011704      0.001659   
fnlwgt          0.005976  1.000000       0.001922      0.000067      0.000176   
education_num   0.000988  0.001922       1.000000      0.020163      0.006228   
capital_gain    0.011704  0.000067       0.020163      1.000000      0.002273   
capital_loss    0.001659  0.000176       0.006228      0.002273      1.000000   
hours_per_week  0.005061  0.000467       0.020770      0.006858      0.002087   

                hours_per_week  
age                   0.005061  
fnlwgt                0.000467  
education_num         0.020770  
capital_gain          0.006858  
capital_loss          0.002087  
hours_per_week        1.000000  
