In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest

In [2]:
# Load the Adult dataset (CSV with a header row) and preview the first rows.
# NOTE(review): the one-hot column names produced later (e.g. 'sex_ Male')
# suggest the string values carry a leading space — consider stripping on load.
csv_path = 'adult_with_headers.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Descriptive statistics; describe() reports only the numeric columns here.
print('Summary Statistics')
numeric_summary = data.describe()
numeric_summary

Summary Statistics


Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [4]:
# Count null entries per column.
# NOTE(review): this only counts real NaN/None values; placeholder strings
# (if the file uses any) would not be detected — verify against the raw data.
print('Missing Values')
data.isna().sum()

Missing Values


age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [5]:
# Inspect column dtypes to tell numeric (int64) from categorical (object) columns.
print('data dtypes')
column_dtypes = data.dtypes
column_dtypes

data dtypes


age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object

In [6]:
# Mean imputer: NaN entries in a column are replaced by that column's mean.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [7]:
# Fit the mean imputer on 'age' and write the imputed values back.
# The imputer returns floats, so 'age' is float-valued afterwards.
age_imputed = imputer.fit_transform(data[['age']])
data['age'] = age_imputed

In [8]:
# Standard scaling: centre every numeric column to zero mean and unit variance.
Scaler_standard = StandardScaler()
numeric_features = data.select_dtypes(include=[np.number])
standardized_values = Scaler_standard.fit_transform(numeric_features)
data_Scaler_standard = pd.DataFrame(standardized_values, columns=numeric_features.columns)

In [9]:
# Min-max scaling: rescale every numeric column to the [0, 1] range.
Scaler_Minmax = MinMaxScaler()
numeric_features_minmax = data.select_dtypes(include=[np.number])
minmax_values = Scaler_Minmax.fit_transform(numeric_features_minmax)
data_Scaler_Minmax = pd.DataFrame(minmax_values, columns=numeric_features_minmax.columns)

In [10]:
# Preview the first five rows of the standardized numeric columns.
data_Scaler_standard.head(5)

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.837109,-1.008707,1.134739,-0.14592,-0.21666,-2.222153
2,-0.042642,0.245079,-0.42006,-0.14592,-0.21666,-0.035429
3,1.057047,0.425801,-1.197459,-0.14592,-0.21666,-0.035429
4,-0.775768,1.408176,1.134739,-0.14592,-0.21666,-0.035429


In [11]:
# Preview the first five rows of the min-max scaled numeric columns.
data_Scaler_Minmax.head(5)

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.30137,0.044302,0.8,0.02174,0.0,0.397959
1,0.452055,0.048238,0.8,0.0,0.0,0.122449
2,0.287671,0.138113,0.533333,0.0,0.0,0.397959
3,0.493151,0.151068,0.4,0.0,0.0,0.397959
4,0.150685,0.221488,0.8,0.0,0.0,0.397959


### Encoding Techniques 

In [12]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

In [13]:
# Names of the categorical (object-dtype) columns.
object_frame = data.select_dtypes(include=['object'])
categorical_columns = object_frame.columns

In [14]:
# One-hot encoder for low-cardinality categoricals (fewer than 5 categories).
# drop='first' removes one dummy column per feature; sparse_output=False
# makes the encoder return a dense array.
one_hot_encoder = OneHotEncoder(
    sparse_output=False,
    drop='first',
)

In [15]:
# Categorical columns with fewer than 5 distinct values (one-hot candidates).
category_counts = data[categorical_columns].nunique()
categorical_small = list(category_counts[category_counts < 5].index)

In [16]:
# One-hot encode the low-cardinality columns into a dense frame whose column
# names are generated by the encoder ('<column>_<category>').
# index=data.index keeps the encoded rows aligned with `data`, so a later
# column-wise concat cannot misalign rows if `data` has a non-default index.
encoded_small = one_hot_encoder.fit_transform(data[categorical_small])
data_one_hot_encoder = pd.DataFrame(
    encoded_small,
    columns=one_hot_encoder.get_feature_names_out(categorical_small),
    index=data.index,
)

In [17]:
# Label encoding for categorical variables with 5 or more categories
# (the threshold used when selecting these columns is `>= 5`).
label_encoder = LabelEncoder()

In [18]:
# Categorical columns with 5 or more distinct values (label-encoding candidates).
category_counts_large = data[categorical_columns].nunique()
categorical_large = list(category_counts_large[category_counts_large >= 5].index)

In [19]:
# Replace each high-cardinality categorical column with integer codes.
# A fresh encoder is fitted per column and kept in `label_encoders`, so every
# column's category <-> code mapping stays available (e.g. for
# inverse_transform). Reusing one encoder would keep only the last mapping.
# NOTE(review): integer codes impose an arbitrary order on nominal categories.
label_encoders = {}
for col in categorical_large:
    label_encoder = LabelEncoder()
    data[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder

In [20]:
# Remove the original low-cardinality columns; their one-hot encoded
# replacements are appended to the frame in the next step.
data = data.drop(categorical_small, axis=1)

In [21]:
# Append the one-hot encoded columns side by side with the remaining columns.
frames_to_join = [data, data_one_hot_encoder]
data = pd.concat(frames_to_join, axis='columns')

In [22]:
# Preview the first five rows of the encoded dataframe.
data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Male,income_ >50K
0,39.0,7,77516,9,13,4,1,1,4,2174,0,40,39,1.0,0.0
1,50.0,6,83311,9,13,2,4,0,4,0,0,13,39,1.0,0.0
2,38.0,4,215646,11,9,0,6,1,4,0,0,40,39,1.0,0.0
3,53.0,4,234721,1,7,2,6,0,2,0,0,40,39,1.0,0.0
4,28.0,4,338409,9,13,2,10,5,2,0,0,40,5,0.0,0.0


#### FEATURE ENGINEERING

In [23]:
# New feature: bucket age into four right-inclusive bands
# (0, 25], (25, 45], (45, 65], (65, inf).
age_bin_edges = [0, 25, 45, 65, np.inf]
age_bin_labels = ['Young', 'Adult', 'Middle-aged', 'Senior']
data['age_group'] = pd.cut(data['age'], bins=age_bin_edges, labels=age_bin_labels)


In [24]:
# New feature: net capital change (gains minus losses).
data['captial_gain_loss'] = data['capital_gain'].sub(data['capital_loss'])

In [25]:
# log(1 + x) transform of capital_gain; log1p is defined at 0, which is the
# value for most rows, and compresses the large positive values.
capital_gain = data['capital_gain']
data['captial_gain_log'] = np.log1p(capital_gain)

In [26]:
# Preview the engineered columns.
print('New features')
engineered_columns = ['age_group', 'captial_gain_loss', 'captial_gain_log']
data[engineered_columns].head()

New features


Unnamed: 0,age_group,captial_gain_loss,captial_gain_log
0,Adult,2174,7.684784
1,Middle-aged,0,0.0
2,Adult,0,0.0
3,Middle-aged,0,0.0
4,Adult,0,0.0


In [27]:
# Isolation Forest for outlier detection; contamination=0.1 sets the expected
# share of outliers to 10% of the rows.
# random_state is fixed so the randomly built trees — and therefore the set of
# flagged rows — are reproducible across kernel restarts.
isolation_forest = IsolationForest(contamination=0.1, random_state=42)

In [28]:
# Fit on the numeric columns and label every row: 1 = inlier, -1 = outlier.
# A previously stored 'outliers' flag is excluded so that re-running this cell
# does not feed the old predictions back in as a feature.
outlier_features = data.select_dtypes(include=[np.number]).drop(columns=['outliers'], errors='ignore')
outliers = isolation_forest.fit_predict(outlier_features)

In [29]:
# Store the Isolation Forest labels (1 = inlier, -1 = outlier) next to the data.
data = data.assign(outliers=outliers)

In [30]:
# Keep only the rows labelled as inliers (label 1).
inlier_mask = data['outliers'].eq(1)
data_no_outliers = data.loc[inlier_mask]

In [31]:
# Preview the rows kept after outlier removal.
print('data without outliers')
data_no_outliers.head(5)

data without outliers


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Male,income_ >50K,age_group,captial_gain_loss,captial_gain_log,outliers
0,39.0,7,77516,9,13,4,1,1,4,2174,0,40,39,1.0,0.0,Adult,2174,7.684784,1
1,50.0,6,83311,9,13,2,4,0,4,0,0,13,39,1.0,0.0,Middle-aged,0,0.0,1
2,38.0,4,215646,11,9,0,6,1,4,0,0,40,39,1.0,0.0,Adult,0,0.0,1
3,53.0,4,234721,1,7,2,6,0,2,0,0,40,39,1.0,0.0,Middle-aged,0,0.0,1
5,37.0,4,284582,12,14,2,4,5,4,0,0,40,39,0.0,0.0,Adult,0,0.0,1


In [32]:
import ppscore as pps

In [33]:
# Predictive Power Score for every ordered (feature, target) column pair.
# The 'outliers' flag is constant (always 1) after filtering, so it carries no
# information; it is dropped to avoid meaningless pairs and wasted model fits.
# NOTE(review): label-encoded categoricals are integers here, so ppscore scores
# them as regression targets — confirm that this is intended.
pps_input = data_no_outliers.drop(columns=['outliers'], errors='ignore')
pps_matrix = pps.matrix(pps_input)

In [34]:
# Preview the first five (feature, target) rows of the PPS result.
print('pps_matrix')
pps_matrix.head(5)

pps_matrix


Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
0,age,age,1.0,predict_itself,True,,0.0,1.0,
1,age,workclass,0.0,regression,True,mean absolute error,0.6808,0.81348,DecisionTreeRegressor()
2,age,fnlwgt,0.0,regression,True,mean absolute error,75323.688,76392.409535,DecisionTreeRegressor()
3,age,education,0.0,regression,True,mean absolute error,2.6122,2.66917,DecisionTreeRegressor()
4,age,education_num,0.0,regression,True,mean absolute error,1.7462,1.793512,DecisionTreeRegressor()


## Conclusion

In this assignment, we focused on essential steps for preparing the "Adult" dataset for machine learning:



1. **Data Exploration and Preprocessing:**

- **Summary Statistics & Missing Values:** Conducted basic exploration. `isnull()` reported no missing values; a mean imputer was still applied to `age` as a safeguard.
- **Scaling:**
  - **Standard Scaling:** Preferred for approximately normally distributed data.
  - **Min-Max Scaling:** Rescales features to the [0, 1] range while preserving the shape of the original distribution.



2. **Encoding Techniques:**

- **One-Hot Encoding:** Used for categorical variables with fewer than 5 categories to avoid implying ordinal relationships.
- **Label Encoding:** Applied to categorical variables with 5 or more categories for simplicity.


3. **Feature Engineering:**



- Created two new features and applied log transformation to a skewed numerical feature to normalize its distribution.


4. **Feature Selection:**

- **Isolation Forest:** Identified and removed outliers to improve model performance.
- **PPS Score:** Analyzed relationships between features, providing a more nuanced view than a linear correlation matrix would.


These preprocessing steps are crucial for building effective and efficient machine learning models, ensuring the data is clean, well-scaled, and features are appropriately engineered and selected.

