# Final Project Task 1: Census Data Preprocessing

Requirements

- Encode data
- Handle missing values if any
- Correct errors, inconsistencies, remove duplicates if any
- Outlier detection and treatment if any
- Normalization / Standardization if necesarry
- Feature engineering
- Train test split, save it.
- Others?


Deliverable:

- Notebook code with no errors.
- Preprocessed data as csv.

In [None]:
# task_1_preprocessing.ipynb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Load the provided database file
database_path = "Initial_Database.csv"  # Ensure the correct path
df = pd.read_csv(database_path)

# Check for null values
null_values = df.isnull().sum()
print("Null Values in Each Column:\n", null_values)

# Check for duplicates
duplicate_rows = df.duplicated().sum()
print("\nNumber of Duplicate Rows:", duplicate_rows)

# Remove duplicate rows if they exist
if duplicate_rows > 0:
    df_cleaned = df.drop_duplicates()
    print("\nDuplicates removed. Updated dataset shape:", df_cleaned.shape)
else:
    df_cleaned = df.copy()
    print("\nNo duplicates found.")

# Save the cleaned dataset
cleaned_database_path = "cleaned_database.csv"
df_cleaned.to_csv(cleaned_database_path, index=False)

print("\nPreprocessing complete. Cleaned dataset saved as 'cleaned_database.csv'.")


In [19]:

# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Load dataset 
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]

data = pd.read_csv(data_url, header=None, names=columns, na_values=" ?", skipinitialspace=True)


# Display dataset info
print("Dataset Info:")
#print(data.info())

data.isna().any()

data.drop_duplicates()


data.head(10)
#separating the numerical and categorical columns in two separate categories
numerical_cols = data.select_dtypes(include=['int64']).columns
categorical_cols = data.select_dtypes(include=['object']).columns
print (numerical_cols)
print (categorical_cols)




Dataset Info:
Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')
Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country', 'income'],
      dtype='object')


In [None]:


data[numerical_cols] = num_imputer.fit_transform(data[numerical_cols])
data[categorical_cols] = cat_imputer.fit_transform(data[categorical_cols])

# Encoding categorical variables
encoder = OneHotEncoder(sparse=False, drop='first')
encoded_cols = encoder.fit_transform(data[categorical_cols])
encoded_df = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names_out(categorical_cols))

# Combine numerical and encoded categorical data
data = pd.concat([data[numerical_cols], encoded_df], axis=1)

# Outlier detection and removal using IQR
Q1 = data[numerical_cols].quantile(0.25)
Q3 = data[numerical_cols].quantile(0.75)
IQR = Q3 - Q1
data = data[~((data[numerical_cols] < (Q1 - 1.5 * IQR)) | (data[numerical_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]

# Normalize numerical data
scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Train-test split
train, test = train_test_split(data, test_size=0.2, random_state=42)
train.to_csv('train_preprocessed.csv', index=False)
test.to_csv('test_preprocessed.csv', index=False)

print("Preprocessing completed. Preprocessed data saved as CSV.")
