# DATA PREPROCESSING AND FEATURE ENGINEERING IN MACHINE LEARNING

Task 1: Data Exploration and Preprocessing

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [3]:
# Load the dataset.
# skipinitialspace=True strips the blank that follows each comma in the CSV;
# without it every categorical value keeps a leading space (e.g. ' Private'),
# so equality checks against the clean label silently fail.
data = pd.read_csv("adult_with_headers.csv", skipinitialspace=True)

In [4]:
# Basic data exploration: preview the first five rows.
# A bare last expression uses the notebook's rich table rendering, which stays
# readable for wide frames instead of wrapping columns the way print() does.
data.head()

   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0          2174             0              40   United-States   <=50

In [5]:
# Summary statistics of the numeric columns, shown as a rich table
# (bare last expression instead of print()).
data.describe()

                age        fnlwgt  education_num  capital_gain  capital_loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

       hours_per_week  
count    32561.000000  
mean        40.437456  
std         12.347429  
min          1.000000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         99.000000  


In [6]:
# Column dtypes and non-null counts.
# info() writes its report itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None


In [7]:
# Handling missing values
# Drop rows that contain NaN and renumber the index, so that later column-wise
# joins with freshly built frames (pd.concat(..., axis=1)) stay row-aligned.
# Rebinding `data` instead of using inplace=True keeps the step chainable.
# NOTE(review): the info() report shows no null entries, so this currently
# removes nothing; if missing values are stored as a placeholder string such
# as "?", they are not caught here — confirm against the raw file.
data = data.dropna().reset_index(drop=True)

In [8]:
# Scaling techniques: one scaler instance per strategy.
#   scaler_standard -> standardisation (zero mean, unit variance)
#   scaler_minmax   -> min-max rescaling (default output range [0, 1])
scaler_standard, scaler_minmax = StandardScaler(), MinMaxScaler()

In [9]:
# Scale the numerical 'age' feature with both strategies.
# Only 'age' is scaled here; each scaler is fitted on the same single-column
# frame and its output is stored under its own column name.
age_values = data[['age']]
for target_col, scaler in (
    ('age_scaled_standard', scaler_standard),
    ('age_scaled_minmax', scaler_minmax),
):
    data[target_col] = scaler.fit_transform(age_values)

Task 2: Encoding Techniques

In [10]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [12]:
# One-Hot Encoding
# Each candidate column with fewer than `max_onehot_categories` distinct values
# is expanded into indicator columns; the first category is dropped to avoid a
# redundant (perfectly collinear) column.
# NOTE(review): in the recorded run no listed column met this threshold (the
# frame displayed afterwards contains no encoded columns) — confirm whether
# the threshold or the column list is what was intended.
onehot_cols = ['workclass', 'marital_status', 'occupation', 'relationship', 'race']
max_onehot_categories = 5  # exclusive upper bound on distinct values
encoded_frames = []
for col in onehot_cols:
    if data[col].nunique(dropna=False) < max_onehot_categories:
        onehot_encoder = OneHotEncoder(drop='first')
        encoded = onehot_encoder.fit_transform(data[[col]]).toarray()
        # drop='first' removes the first fitted category, so the output
        # columns correspond to categories_[0][1:], in that order. Naming them
        # after the category keeps the features interpretable.
        kept_categories = onehot_encoder.categories_[0][1:]
        encoded_frames.append(
            pd.DataFrame(
                encoded,
                columns=[col + '_' + str(cat).strip() for cat in kept_categories],
                # Reuse data's index: a default RangeIndex would misalign rows
                # on concat whenever data's index is not 0..n-1.
                index=data.index,
            )
        )
# Join everything in one concat instead of growing the frame inside the loop.
if encoded_frames:
    data = pd.concat([data] + encoded_frames, axis=1)

In [13]:
# Display the list of columns that were considered for one-hot encoding.
onehot_cols

['workclass', 'marital_status', 'occupation', 'relationship', 'race']

In [14]:
# Display the frame to inspect which columns it holds at this point.
data

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,age_scaled_standard,age_scaled_minmax
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0.030671,0.301370
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0.837109,0.452055
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,-0.042642,0.287671
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,1.057047,0.493151
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,-0.775768,0.150685
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K,-0.849080,0.136986
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K,0.103983,0.315068
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K,1.423610,0.561644
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K,-1.215643,0.068493


Task 3: Feature Engineering

In [17]:
# Create new features
# Net capital change: gains minus losses, row by row.
net_capital = data['capital_gain'].sub(data['capital_loss'])
data['capital-gain-minus-loss'] = net_capital
# Quadratic age term.
data['age_squared'] = data['age'].pow(2)

In [19]:
# Log-transform the skewed 'capital_gain' feature.
# describe() reports a median of 0 and a maximum of 99999 for this column, so
# log1p (log(1 + x)) is used: it compresses the long tail and is defined at 0.
import numpy as np

capital_gain = data['capital_gain']
data['capital-gain_log'] = np.log1p(capital_gain)

Task 4: Feature Selection

In [None]:
from sklearn.ensemble import IsolationForest
import ppscore