**Final Project Task 1 -Census Data Preprocess**

Requirements

Encode data

Handle missing values if any

Correct errors, inconsistencies, remove duplicates if any

Outlier detection and treatment if any

Normalization / Standardization if necesarry

Feature engineering

Train test split, save it.
Others?

Deliverable:

Notebook code with no errors.
Preprocessed data as csv.

In [31]:
#Importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [32]:
#Data load
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]

data = pd.read_csv(data_url, header=None, names=columns, na_values="?", skipinitialspace=True)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [33]:
#Columns managing
numerical_columns = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
categorical_columns = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country", "income"
]

# Replace missing values with median/mode
for col in numerical_columns:
    data[col] = data[col].fillna(data[col].median())

for col in categorical_columns:
    data[col] = data[col].fillna(data[col].mode()[0])

In [34]:
#Remove duplicates
data = data.drop_duplicates()

In [35]:
print(data.head())

   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=50K  
1             0             0             

In [36]:
#Outlier treatment
Q1 = data[numerical_columns].quantile(0.25)
Q3 = data[numerical_columns].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

data[numerical_columns] = data[numerical_columns].clip(lower_bound, upper_bound, axis=1)

In [37]:
#Encode target variable
target_column = "income"
label_encoder = LabelEncoder()
data[target_column] = label_encoder.fit_transform(data[target_column])

In [38]:
#Scaling and encoding
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_columns),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns[:-1])  
    ]
)
# Exclude 'income'

In [39]:
#Feature Engineering
features = data.drop(target_column, axis=1)  #predictori, aici avem toate coloanele
target = data[target_column] # ceea ce se vrea a fi prezis
features_transformed = preprocessor.fit_transform(features) #aplicarea transformarilor

In [40]:
# Convert to DataFrame
feature_names = (
    numerical_columns + list(preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_columns[:-1]))
)
features_transformed_df = pd.DataFrame(features_transformed.toarray(), columns=feature_names)  

In [41]:
#Train test
X_train, X_test, y_train, y_test = train_test_split(
    features_transformed_df, target, test_size=0.2, random_state=42
)

In [42]:
#Save data to csv
data.to_csv("processed_census_data.csv", index=False)

In [43]:
print("Before Preprocessing:\n", data[numerical_columns].describe())

Before Preprocessing:
                 age         fnlwgt  education-num  capital-gain  capital-loss  \
count  32537.000000   32537.000000   32537.000000       32537.0       32537.0   
mean      38.559855  186824.961736      10.125165           0.0           0.0   
std       13.554847   95118.115529       2.459436           0.0           0.0   
min       17.000000   12285.000000       4.500000           0.0           0.0   
25%       28.000000  117827.000000       9.000000           0.0           0.0   
50%       37.000000  178356.000000      10.000000           0.0           0.0   
75%       48.000000  236993.000000      12.000000           0.0           0.0   
max       78.000000  415742.000000      16.000000           0.0           0.0   

       hours-per-week  
count    32537.000000  
mean        41.203246  
std          6.187352  
min         32.500000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         52.500000  


In [44]:
print("After Preprocessing:\n", features_transformed_df.describe())

After Preprocessing:
                 age        fnlwgt  education-num  capital-gain  capital-loss  \
count  3.253700e+04  3.253700e+04   3.253700e+04       32537.0       32537.0   
mean   1.489351e-16  5.568688e-18   3.173060e-16           0.0           0.0   
std    1.000015e+00  1.000015e+00   1.000015e+00           0.0           0.0   
min   -1.590589e+00 -1.835009e+00  -2.287212e+00           0.0           0.0   
25%   -7.790584e-01 -7.254036e-01  -4.574962e-01           0.0           0.0   
50%   -1.150790e-01 -8.903763e-02  -5.089262e-02           0.0           0.0   
75%    6.964512e-01  5.274370e-01   7.623146e-01           0.0           0.0   
max    2.909716e+00  2.406698e+00   2.388729e+00           0.0           0.0   

       hours-per-week  workclass_Federal-gov  workclass_Local-gov  \
count    3.253700e+04           32537.000000         32537.000000   
mean    -2.292989e-16               0.029505             0.064327   
std      1.000015e+00               0.169219      

In [45]:
print("Data preprocessing complete. Processed data saved to 'processed_census_data.csv'.")


Data preprocessing complete. Processed data saved to 'processed_census_data.csv'.
