# **Final Project Task 1 - Census Data Preprocess**

Requirements

- Target variable specification:
    - The target variable for this project is hours-per-week. 
    - Ensure all preprocessing steps are designed to support regression analysis on this target variable.
- Encode data  **3p**
- Handle missing values if any **1p**
- Correct errors, inconsistencies, remove duplicates if any **1p**
- Outlier detection and treatment if any **1p**
- Normalization / Standardization if necessary **1p**
- Feature engineering **3p**
- Train test split, save it.
- Others?


Deliverable:

- Notebook code with no errors.
- Preprocessed data as csv.

In [1]:
import pandas as pd

In [2]:
# Location and schema of the UCI Adult ("Census Income") data set; the raw file
# has no header row, so the column names are supplied explicitly.
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]

# skipinitialspace=True removes the blank after each comma *before* values are
# compared against na_values, so the missing-value marker must be given as "?"
# (without a leading space) for it to be parsed as NaN.
data = pd.read_csv(data_url, header=None, names=columns, na_values="?", skipinitialspace=True)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load dataset
# The raw UCI Adult file has no header row, so column names are supplied here.
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]

# skipinitialspace=True drops the blank that follows every comma in the file.
df = pd.read_csv(data_url, header=None, names=columns, skipinitialspace=True)

# --- Target Variable Specification ---
target = 'hours-per-week'

# --- Correct Errors and Inconsistencies ---
# Strip stray whitespace first, so that padded values such as '? ' are still
# recognised as missing and padded rows are still recognised as duplicates.
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = df[col].str.strip()

# --- Handle Missing Values ---
# The data set marks missing entries with a literal '?'.
missing_value_replacement = {'?': np.nan}
df.replace(missing_value_replacement, inplace=True)

df.fillna(df.median(numeric_only=True), inplace=True)  # Fill numeric columns with median
for col in categorical_cols:
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection: the chained in-place form does not reliably update df
    # (it warns in pandas 2.x and is a no-op under copy-on-write).
    df[col] = df[col].fillna(df[col].mode()[0])  # Fill categorical columns with mode

# --- Remove Duplicates ---
df.drop_duplicates(inplace=True)
# --- Outlier Detection and Treatment ---
def handle_outliers(column, method='IQR', frame=None):
    """Clip one numeric column to its 1.5 * IQR fences, modifying the frame in place.

    Values below ``Q1 - 1.5 * IQR`` are raised to that bound and values above
    ``Q3 + 1.5 * IQR`` are lowered to that bound. The clipped column is stored
    as float.

    A column whose inter-quartile range is zero is left untouched: both fences
    would collapse onto a single value and clipping would overwrite every
    entry with that constant, erasing the feature.

    Args:
        column: Name of the numeric column to treat.
        method: Accepted for interface compatibility; only the IQR rule is
            implemented and the value is not inspected.
        frame: DataFrame to modify. Defaults to the module-level ``df``.

    Returns:
        None. The frame is updated in place.
    """
    data = df if frame is None else frame

    q1 = data[column].quantile(0.25)
    q3 = data[column].quantile(0.75)
    iqr = q3 - q1
    if iqr == 0:
        # Degenerate distribution (at least half the values are identical):
        # the fences carry no information, so skip the column.
        return

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    data[column] = np.clip(data[column].to_numpy(dtype=float), lower_bound, upper_bound)

# Treat outliers in the numeric *features* only. The regression target is
# deliberately excluded: clipping it would alter the very values the model is
# supposed to predict and be evaluated against.
numerical_cols = [col for col in df.select_dtypes(include=['number']).columns if col != target]
for col in numerical_cols:
    handle_outliers(col)

# --- Feature Engineering ---
# Replace less frequent native-country values with 'Others'
top_10_countries = df['native-country'].value_counts().nlargest(10).index
df['native-country'] = df['native-country'].where(
    df['native-country'].isin(top_10_countries), 'Others'
)

# Create age buckets (left-closed intervals: [0, 20), [20, 30), ...)
bins = [0, 20, 30, 40, 50, 60, float('inf')]
labels = ['<20', '20-30', '30-40', '40-50', '50-60', '60+']
df['age_bucket'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

# --- Encoding Categorical Data ---
# Every numeric column except the target is scaled; every non-numeric column is
# one-hot encoded. pd.cut produces a 'category' column rather than 'object', so
# selecting by "not numeric" picks up age_bucket without special-casing it.
numerical_cols = [col for col in df.select_dtypes(include=['number']).columns if col != target]
categorical_cols = [col for col in df.select_dtypes(exclude=['number']).columns if col != target]

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name; fall back only for older installations.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', one_hot_encoder, categorical_cols)
    ],
    remainder='drop'  # Drop unprocessed columns
)

# --- Train-Test Split ---
X = df.drop(columns=[target])
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply preprocessing pipeline: fit on the training split only, then reuse the
# fitted scaler/encoder on the test split.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Convert preprocessed data to DataFrame
# Column order follows the transformer order above: scaled numerics first,
# then the one-hot columns.
preprocessed_columns = (
    numerical_cols + list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols))
)
X_train_preprocessed_df = pd.DataFrame(X_train_preprocessed, columns=preprocessed_columns, index=X_train.index)
X_test_preprocessed_df = pd.DataFrame(X_test_preprocessed, columns=preprocessed_columns, index=X_test.index)

# Combine preprocessed features with the target (aligned on the original index)
train_data = pd.concat([X_train_preprocessed_df, y_train], axis=1)
test_data = pd.concat([X_test_preprocessed_df, y_test], axis=1)

# Save to CSV
# Paths are relative to the working directory so the notebook runs on any
# machine, not only one that has a particular absolute drive/folder layout.
train_data.to_csv('Train_Preprocessed.csv', index=False)
test_data.to_csv('Test_Preprocessed.csv', index=False)

print("Preprocessing complete. Train and test datasets saved as CSV.")
