In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import mutual_info_classif
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

In [2]:
print("=== ADULT DATASET ANALYSIS ===\n")

=== ADULT DATASET ANALYSIS ===



In [3]:
#LOAD & EXPLORE DATA
df = pd.read_csv('/content/sample_data/adult_with_headers.csv')
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
              'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
              'hours_per_week', 'native_country', 'income']

In [4]:
print(f"Shape: {df.shape}")
print(f"Missing (?): {df.isin(['?']).sum().sum()}")

Shape: (32561, 15)
Missing (?): 0


In [5]:
#Clean categorical columns
cat_cols = df.select_dtypes('object').columns
for col in cat_cols:
    df[col] = df[col].str.strip().replace('?', np.nan)

In [6]:
#Impute missing with mode
for col in ['workclass', 'occupation', 'native_country']:
    df[col] = df[col].fillna(df[col].mode()[0])

In [7]:
print(f"Clean shape: {df.shape}\n")

Clean shape: (32561, 15)



In [8]:
#SCALING TECHNIQUES
num_cols = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
scaler_std = StandardScaler()
scaler_mm = MinMaxScaler()

In [9]:
df_std = df[num_cols].copy()
df_std[num_cols] = scaler_std.fit_transform(df[num_cols])
df_mm = df[num_cols].copy()
df_mm[num_cols] = scaler_mm.fit_transform(df[num_cols])

In [10]:
print("StandardScaler (mean=0, std=1):")
print(df_std[num_cols].describe())
print("\nMinMaxScaler (0-1 range):")
print(df_mm[num_cols].describe())
print()

StandardScaler (mean=0, std=1):
                age        fnlwgt  education_num  capital_gain  capital_loss  \
count  3.256100e+04  3.256100e+04   3.256100e+04  3.256100e+04  3.256100e+04   
mean  -2.705915e-17 -1.001625e-16   1.471887e-16  1.309314e-17  1.016900e-16   
std    1.000015e+00  1.000015e+00   1.000015e+00  1.000015e+00  1.000015e+00   
min   -1.582206e+00 -1.681631e+00  -3.529656e+00 -1.459205e-01 -2.166595e-01   
25%   -7.757679e-01 -6.816910e-01  -4.200596e-01 -1.459205e-01 -2.166595e-01   
50%   -1.159546e-01 -1.082193e-01  -3.136003e-02 -1.459205e-01 -2.166595e-01   
75%    6.904838e-01  4.478765e-01   7.460392e-01 -1.459205e-01 -2.166595e-01   
max    3.769612e+00  1.226856e+01   2.300838e+00  1.339458e+01  1.059351e+01   

       hours_per_week  
count    3.256100e+04  
mean    -1.549355e-17  
std      1.000015e+00  
min     -3.194030e+00  
25%     -3.542945e-02  
50%     -3.542945e-02  
75%      3.695194e-01  
max      4.742967e+00  

MinMaxScaler (0-1 range):
    

In [11]:
#ENCODING
cat_low = ['race', 'sex']
cat_high = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'native_country']

In [12]:
df_encoded = pd.get_dummies(df, columns=cat_low)
for col in cat_high:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))

In [13]:
print(f"Encoded shape: {df_encoded.shape}\n")

Encoded shape: (32561, 20)



In [14]:
#FEATURE ENGINEERING
df_eng = df_encoded.copy()
df_eng['age_edu'] = df_eng['age'] * df_eng['education_num']
df_eng['capital_net'] = df_eng['capital_gain'] - df_eng['capital_loss']
df_eng['capital_gain_log'] = np.log1p(df_eng['capital_gain'])

In [15]:
print("New features created:")
print("- age_edu: age × education interaction")
print("- capital_net: gain - loss")
print("- capital_gain_log: log transform (skew: {:.1f}→{:.1f})".format(
    stats.skew(df['capital_gain']), stats.skew(df_eng['capital_gain_log'])))
print()

New features created:
- age_edu: age × education interaction
- capital_net: gain - loss
- capital_gain_log: log transform (skew: 12.0→3.1)



In [16]:
#OUTLIER REMOVAL
iso = IsolationForest(contamination=0.1, random_state=42)
outliers = iso.fit_predict(df_eng[num_cols])
df_clean = df_eng[outliers == 1]
print(f"Removed {len(df_eng)-len(df_clean)} outliers (10%)\n")

Removed 3256 outliers (10%)



In [17]:
#FEATURE SELECTION
X_num = df_clean.select_dtypes('number').drop('income', axis=1, errors='ignore')
y = (df_clean['income'] == '>50K').astype(int)

In [18]:
mi_scores = mutual_info_classif(X_num, y, random_state=42)
mi_df = pd.DataFrame({'feature': X_num.columns, 'mi_score': mi_scores}).sort_values('mi_score', ascending=False)

In [19]:
num_corr = df_clean[num_cols + ['age_edu', 'capital_net']].copy()
num_corr['target'] = y
corr_target = num_corr.corr()['target'].sort_values(ascending=False)

In [20]:
print("TOP 5 MUTUAL INFO:")
print(mi_df.head())
print("\nTOP 5 CORRELATION:")
print(corr_target.head())
print()

TOP 5 MUTUAL INFO:
           feature  mi_score
7     relationship  0.114019
5   marital_status  0.105600
12         age_edu  0.095959
0              age  0.063541
13     capital_net  0.057477

TOP 5 CORRELATION:
target           1.000000
age_edu          0.370617
education_num    0.297777
capital_gain     0.272243
capital_net      0.272182
Name: target, dtype: float64



In [22]:
#SUMMARY
print("=== FINAL SUMMARY ===")
print(f"Dataset: {len(df_clean):,} rows after cleaning")


=== FINAL SUMMARY ===
Dataset: 29,305 rows after cleaning
