# **Final Project Task 1 - Census Data Preprocess**

Requirements

- Target variable specification:
    - The target variable for this project is hours-per-week. 
    - Ensure all preprocessing steps are designed to support regression analysis on this target variable.
- Encode data  **3p**
- Handle missing values if any **1p**
- Correct errors, inconsistencies, remove duplicates if any **1p**
- Outlier detection and treatment if any **1p**
- Normalization / Standardization if necesarry **1p**
- Feature engineering **3p**
- Train test split, save it.
- Others?


Deliverable:

- Notebook code with no errors.
- Preprocessed data as csv.

In [71]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

In [72]:
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]

data = pd.read_csv(data_url, header=None, names=columns, na_values=" ?", skipinitialspace=True)
data.head(20)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [73]:
data.tail(20)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
32541,41,?,202822,HS-grad,9,Separated,?,Not-in-family,Black,Female,0,0,32,United-States,<=50K
32542,72,?,129912,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,25,United-States,<=50K
32543,45,Local-gov,119199,Assoc-acdm,12,Divorced,Prof-specialty,Unmarried,White,Female,0,0,48,United-States,<=50K
32544,31,Private,199655,Masters,14,Divorced,Other-service,Not-in-family,Other,Female,0,0,30,United-States,<=50K
32545,39,Local-gov,111499,Assoc-acdm,12,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,20,United-States,>50K
32546,37,Private,198216,Assoc-acdm,12,Divorced,Tech-support,Not-in-family,White,Female,0,0,40,United-States,<=50K
32547,43,Private,260761,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,Mexico,<=50K
32548,65,Self-emp-not-inc,99359,Prof-school,15,Never-married,Prof-specialty,Not-in-family,White,Male,1086,0,60,United-States,<=50K
32549,43,State-gov,255835,Some-college,10,Divorced,Adm-clerical,Other-relative,White,Female,0,0,40,United-States,<=50K
32550,43,Self-emp-not-inc,27242,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States,<=50K


# Encode data

In [74]:
#  Label ecoding for income
data['income_encode'] = data['income'].map({'<=50K': 0, '>50K': 1})


In [75]:
# Encode sex as binary variable
data['sex_encode'] = data['sex'].map({'Female': 0, 'Male': 1})

In [None]:
# Feature engineering, turning native-country into a binary feature
data['usa_yes_no'] = np.where(data['native-country'] == 'United-States', 1, 0)


In [77]:
data.head(15)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,income_encode,sex_encode,usa_yes_no
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0,1,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0,1,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0,1,1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0,1,1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0,0,0
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K,0,0,1
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K,0,0,0
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K,1,1,1
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K,1,0,1
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K,1,1,1


# Handling missing values

In [78]:
missing_values = data.isnull().sum()
missing_values

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
income_encode     0
sex_encode        0
usa_yes_no        0
dtype: int64

In [79]:
# Replacing ? with nan 
data.replace({'?': np.nan}, inplace=True)

In [80]:
missing_values = data.isnull().sum()
missing_values

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
income_encode        0
sex_encode           0
usa_yes_no           0
dtype: int64

In [81]:
# Filling missing values with mode imputation
cols_missing = ['workclass', 'occupation', 'native-country']
for col in cols_missing:
    mode_value = data[col].mode()[0]
    data[col].fillna(mode_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(mode_value, inplace=True)


In [82]:
# Verifying duplicates
duplicates = data[data.duplicated(keep=False)]
duplicates_sorted = duplicates.sort_values(by=list(data.columns))
print(duplicates_sorted.head(10))

       age workclass  fnlwgt     education  education-num marital-status  \
17673   19   Private   97261       HS-grad              9  Never-married   
18698   19   Private   97261       HS-grad              9  Never-married   
6990    19   Private  138153  Some-college             10  Never-married   
21318   19   Private  138153  Some-college             10  Never-married   
15189   19   Private  146679  Some-college             10  Never-married   
21490   19   Private  146679  Some-college             10  Never-married   
3917    19   Private  251579  Some-college             10  Never-married   
31993   19   Private  251579  Some-college             10  Never-married   
5805    20   Private  107658  Some-college             10  Never-married   
11631   20   Private  107658  Some-college             10  Never-married   

            occupation   relationship   race     sex  capital-gain  \
17673  Farming-fishing  Not-in-family  White    Male             0   
18698  Farming-fishing 

In [83]:
# Removing duplicates
data = data.drop_duplicates()

# One hot encoding

In [84]:
# One hot encoding
cols = ['workclass', 'marital-status', 'occupation', 'relationship', 'race']
data = pd.get_dummies(data, columns=cols, drop_first=True)


In [85]:
data.head(5)

Unnamed: 0,age,fnlwgt,education,education-num,sex,capital-gain,capital-loss,hours-per-week,native-country,income,...,occupation_Transport-moving,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White
0,39,77516,Bachelors,13,Male,2174,0,40,United-States,<=50K,...,False,True,False,False,False,False,False,False,False,True
1,50,83311,Bachelors,13,Male,0,0,13,United-States,<=50K,...,False,False,False,False,False,False,False,False,False,True
2,38,215646,HS-grad,9,Male,0,0,40,United-States,<=50K,...,False,True,False,False,False,False,False,False,False,True
3,53,234721,11th,7,Male,0,0,40,United-States,<=50K,...,False,False,False,False,False,False,False,True,False,False
4,28,338409,Bachelors,13,Female,0,0,40,Cuba,<=50K,...,False,False,False,False,False,True,False,True,False,False


In [87]:
# Defining correct order for education
order_education = ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th', 'HS-grad', 'Some-college', 'Assoc-voc', 'Assoc-acdm', 'Bachelors',
                   'Masters', 'Prof-school', 'Doctorate' ]
order_education = OrdinalEncoder(categories=[order_education])
data['education'] = order_education.fit_transform(data[['education']])

In [88]:
data.head(5)

Unnamed: 0,age,fnlwgt,education,education-num,sex,capital-gain,capital-loss,hours-per-week,native-country,income,...,occupation_Transport-moving,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White
0,39,77516,12.0,13,Male,2174,0,40,United-States,<=50K,...,False,True,False,False,False,False,False,False,False,True
1,50,83311,12.0,13,Male,0,0,13,United-States,<=50K,...,False,False,False,False,False,False,False,False,False,True
2,38,215646,8.0,9,Male,0,0,40,United-States,<=50K,...,False,True,False,False,False,False,False,False,False,True
3,53,234721,6.0,7,Male,0,0,40,United-States,<=50K,...,False,False,False,False,False,False,False,True,False,False
4,28,338409,12.0,13,Female,0,0,40,Cuba,<=50K,...,False,False,False,False,False,True,False,True,False,False


In [91]:
# Droping some columns that we don't need anymore because we have the encoded version of them
columns_to_drop = ['education', 'native-country', 'sex', 'income']
data = data.drop(columns=columns_to_drop)

In [92]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
Index: 32537 entries, 0 to 32560
Data columns (total 44 columns):
 #   Column                                Non-Null Count  Dtype
---  ------                                --------------  -----
 0   age                                   32537 non-null  int64
 1   fnlwgt                                32537 non-null  int64
 2   education-num                         32537 non-null  int64
 3   capital-gain                          32537 non-null  int64
 4   capital-loss                          32537 non-null  int64
 5   hours-per-week                        32537 non-null  int64
 6   income_encode                         32537 non-null  int64
 7   sex_encode                            32537 non-null  int64
 8   usa_yes_no                            32537 non-null  int64
 9   workclass_Local-gov                   32537 non-null  bool 
 10  workclass_Never-worked                32537 non-null  bool 
 11  workclass_Private                     32537 no

# Checking inconsistencies

In [94]:
# if never-worked is true, then hours-per-week should be 0
never_worked = data[(data['workclass_Never-worked'] == 1) & (data['hours-per-week'] > 0)]
print(f'Persoane care nu au lucrat niciodata, dar au ore pe saptamana > 0: {len(never_worked)}')

Persoane care nu au lucrat niciodata, dar au ore pe saptamana > 0: 7


In [101]:
# husband = male, if sex_encode = 0 (female) = error
relationship_cols = [col for col in data.columns if 'relationship_' in col]
is_husband = (data[relationship_cols].sum(axis=1) == 0)
is_female = (data['sex_encode'] == 0)

In [None]:
# spotting the error 
error_gender = data[is_husband & is_female]
print(f"Persoane care sunt sot dar la sex apar female:' {len(error_gender)}")

2. Persoane care sunt sot dar la sex apar female:' 1


In [112]:
# Verifying outliers
outliers = data[data['capital-gain'] == 99999]
print(f"Numarul de outlieri: {len(outliers)}")

Numarul de outlieri: 159


# Deleting inconsistencies and outliers

In [114]:
i_count = len(data)
i_count

32537

In [117]:
# Stergerea erorilor never-worked
data = data[~((data['workclass_Never-worked'] == 1) & (data['hours-per-week'] > 0))]
# Stergerea erorilor de gen
data = data[~(is_husband & is_female)]
# Stergerea outlierilor
data = data[data['capital-gain'] != 99999]

  data = data[~(is_husband & is_female)]


In [120]:
# Verificari finale
c_count = len(data)
c_count
d_count = i_count - c_count
print(f'Randuri inainte de corectare: {i_count}')
print(f'Randuri dupa corectare: {c_count}')
print(f'Randuri eliminate: {d_count}')

Randuri inainte de corectare: 32537
Randuri dupa corectare: 32370
Randuri eliminate: 167


# Standardization

In [122]:
# scaling the numerical columns
from sklearn.preprocessing import StandardScaler
cols_scale = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss']

In [123]:
scaler = StandardScaler()

In [124]:
data[cols_scale] = scaler.fit_transform(data[cols_scale])

In [125]:
data.head(5)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income_encode,sex_encode,usa_yes_no,workclass_Local-gov,...,occupation_Transport-moving,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White
0,0.032893,-1.063131,1.143255,0.617308,-0.217328,40,0,1,1,False,...,False,True,False,False,False,False,False,False,False,True
1,0.839461,-1.008242,1.143255,-0.231441,-0.217328,13,0,1,1,False,...,False,False,False,False,False,False,False,False,False,True
2,-0.040431,0.245208,-0.416698,-0.231441,-0.217328,40,0,1,1,False,...,False,True,False,False,False,False,False,False,False,True
3,1.059434,0.425882,-1.196675,-0.231441,-0.217328,40,0,1,1,False,...,False,False,False,False,False,False,False,True,False,False
4,-0.773675,1.407994,1.143255,-0.231441,-0.217328,40,0,0,0,False,...,False,False,False,False,False,True,False,True,False,False


In [126]:
# Saving the clean data
data.to_csv('clean_data.csv', index=False)


In [128]:
df = pd.read_csv('clean_data.csv')
df.head(5)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income_encode,sex_encode,usa_yes_no,workclass_Local-gov,...,occupation_Transport-moving,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White
0,0.032893,-1.063131,1.143255,0.617308,-0.217328,40,0,1,1,False,...,False,True,False,False,False,False,False,False,False,True
1,0.839461,-1.008242,1.143255,-0.231441,-0.217328,13,0,1,1,False,...,False,False,False,False,False,False,False,False,False,True
2,-0.040431,0.245208,-0.416698,-0.231441,-0.217328,40,0,1,1,False,...,False,True,False,False,False,False,False,False,False,True
3,1.059434,0.425882,-1.196675,-0.231441,-0.217328,40,0,1,1,False,...,False,False,False,False,False,False,False,True,False,False
4,-0.773675,1.407994,1.143255,-0.231441,-0.217328,40,0,0,0,False,...,False,False,False,False,False,True,False,True,False,False


In [136]:
from sklearn.model_selection import train_test_split
# Defining features (x) and target (y)
X = df.drop(columns=['hours-per-week'])
y = df['hours-per-week']

In [None]:
# Performing the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [137]:
# Saving the results
X_train.to_csv('X_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)