# **Final Project Task 1 - Census Data Preprocess**

Requirements

- Target variable specification:
    - The target variable for this project is hours-per-week. 
    - Ensure all preprocessing steps are designed to support regression analysis on this target variable.
- Encode data  **3p**
- Handle missing values if any **1p**
- Correct errors, inconsistencies, remove duplicates if any **1p**
- Outlier detection and treatment if any **1p**
- Normalization / Standardization if necesarry **1p**
- Feature engineering **3p**
- Train test split, save it.
- Others?


Deliverable:

- Notebook code with no errors.
- Preprocessed data as csv.

In [40]:
import pandas as pd

In [41]:
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]

data = pd.read_csv(data_url, header=None, names=columns, na_values=" ?", skipinitialspace=True)
data.head()

data.to_csv("adult_data", index=False)
adult_data = data

In [42]:
adult_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [43]:
adult_data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [44]:
print("Dimensiune dataframe:", adult_data.shape)
print("\nTipuri de date:")
print(adult_data.dtypes)

Dimensiune dataframe: (32561, 15)

Tipuri de date:
age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income            object
dtype: object


In [45]:
adult_data["sex"].value_counts()


sex
Male      21790
Female    10771
Name: count, dtype: int64

In [46]:
print("\nProcent lipsuri pe coloană (%):")
print((adult_data.isna().mean() * 100).round(2))


Procent lipsuri pe coloană (%):
age               0.0
workclass         0.0
fnlwgt            0.0
education         0.0
education-num     0.0
marital-status    0.0
occupation        0.0
relationship      0.0
race              0.0
sex               0.0
capital-gain      0.0
capital-loss      0.0
hours-per-week    0.0
native-country    0.0
income            0.0
dtype: float64


In [47]:
for col in adult_data:
    print(adult_data[col].unique())

[39 50 38 53 28 37 49 52 31 42 30 23 32 40 34 25 43 54 35 59 56 19 20 45
 22 48 21 24 57 44 41 29 18 47 46 36 79 27 67 33 76 17 55 61 70 64 71 68
 66 51 58 26 60 90 75 65 77 62 63 80 72 74 69 73 81 78 88 82 83 84 85 86
 87]
['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' '?'
 'Self-emp-inc' 'Without-pay' 'Never-worked']
[ 77516  83311 215646 ...  34066  84661 257302]
['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th'
 '1st-4th' 'Preschool' '12th']
[13  9  7 14  5 10 12 11  4 16 15  3  6  2  1  8]
['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']
['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' '?'
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv']
['Not-in-family' 'Husband' 'Wif

In [48]:
adult_data = adult_data[adult_data['workclass'] != "?"].copy()

In [49]:
adult_data = adult_data[adult_data['occupation'] != "?"].copy()

In [50]:
for col in adult_data:
    print(adult_data[col].unique())

[39 50 38 53 28 37 49 52 31 42 30 23 32 40 34 25 43 54 35 59 56 19 20 45
 22 48 21 24 57 44 41 29 18 47 46 36 79 27 33 76 55 61 70 64 71 66 51 58
 26 17 60 90 75 65 77 62 63 67 74 72 69 68 73 81 78 88 80 84 83 85 82 86]
['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov'
 'Self-emp-inc' 'Without-pay']
[ 77516  83311 215646 ...  84661 257302 201490]
['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th'
 'Preschool' '12th' '1st-4th']
[13  9  7 14  5 10 12 11  4 16 15  3  6  1  8  2]
['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']
['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' 'Protective-serv'
 'Armed-Forces' 'Priv-house-serv']
['Not-in-family' 'Husband' 'Wife' 'Own-child' 'Unmarried' 

In [51]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
import math

In [52]:
adult_data.drop_duplicates()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [53]:
adult_data.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,30718.0,30718.0,30718.0,30718.0,30718.0,30718.0
mean,38.443584,189845.5,10.130314,1106.037079,88.910216,40.949313
std,13.118227,105458.3,2.562469,7497.863364,405.657203,11.985382
min,17.0,13769.0,1.0,0.0,0.0,1.0
25%,28.0,117828.5,9.0,0.0,0.0,40.0
50%,37.0,178517.0,10.0,0.0,0.0,40.0
75%,47.0,237317.0,13.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [54]:
adult_data = adult_data.drop(columns=['education'])


In [55]:
adult_data

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [56]:
import scipy.stats as stats

In [57]:
adult_data["age"] = stats.zscore(adult_data.age)

In [58]:
adult_data["fnlwgt"] = stats.zscore(adult_data.fnlwgt)

In [59]:
adult_data["education-num"] = stats.zscore(adult_data["education-num"])
adult_data["capital-gain"] = stats.zscore(adult_data["capital-gain"])


In [60]:
adult_data["capital-loss"] = stats.zscore(adult_data["capital-loss"])

In [61]:
adult_data.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0.042416,State-gov,-1.065173,1.119909,Never-married,Adm-clerical,Not-in-family,White,Male,0.142438,-0.219179,40,United-States,<=50K
1,0.880958,Self-emp-not-inc,-1.010222,1.119909,Married-civ-spouse,Exec-managerial,Husband,White,Male,-0.147516,-0.219179,13,United-States,<=50K
2,-0.033815,Private,0.244655,-0.441111,Divorced,Handlers-cleaners,Not-in-family,White,Male,-0.147516,-0.219179,40,United-States,<=50K
3,1.109651,Private,0.425535,-1.221621,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,-0.147516,-0.219179,40,United-States,<=50K
4,-0.796125,Private,1.408764,1.119909,Married-civ-spouse,Prof-specialty,Wife,Black,Female,-0.147516,-0.219179,40,Cuba,<=50K


In [62]:
adult_data.income.unique()

array(['<=50K', '>50K'], dtype=object)

In [63]:
adult_data.income.value_counts()

income
<=50K    23068
>50K      7650
Name: count, dtype: int64

In [64]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [65]:
le = LabelEncoder

In [66]:
adult_data['income_label'] = le().fit_transform(adult_data['income'])

In [67]:
adult_data.income_label.value_counts()

income_label
0    23068
1     7650
Name: count, dtype: int64

In [68]:
adult_data.columns

Index(['age', 'workclass', 'fnlwgt', 'education-num', 'marital-status',
       'occupation', 'relationship', 'race', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country', 'income',
       'income_label'],
      dtype='object')

In [69]:
adult_data

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,income_label
0,0.042416,State-gov,-1.065173,1.119909,Never-married,Adm-clerical,Not-in-family,White,Male,0.142438,-0.219179,40,United-States,<=50K,0
1,0.880958,Self-emp-not-inc,-1.010222,1.119909,Married-civ-spouse,Exec-managerial,Husband,White,Male,-0.147516,-0.219179,13,United-States,<=50K,0
2,-0.033815,Private,0.244655,-0.441111,Divorced,Handlers-cleaners,Not-in-family,White,Male,-0.147516,-0.219179,40,United-States,<=50K,0
3,1.109651,Private,0.425535,-1.221621,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,-0.147516,-0.219179,40,United-States,<=50K,0
4,-0.796125,Private,1.408764,1.119909,Married-civ-spouse,Prof-specialty,Wife,Black,Female,-0.147516,-0.219179,40,Cuba,<=50K,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,-0.872356,Private,0.639661,0.729654,Married-civ-spouse,Tech-support,Wife,White,Female,-0.147516,-0.219179,38,United-States,<=50K,0
32557,0.118647,Private,-0.336362,-0.441111,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,-0.147516,-0.219179,40,United-States,>50K,1
32558,1.490806,Private,-0.359727,-0.441111,Widowed,Adm-clerical,Unmarried,White,Female,-0.147516,-0.219179,40,United-States,<=50K,0
32559,-1.253512,Private,0.110419,-0.441111,Never-married,Adm-clerical,Own-child,White,Male,-0.147516,-0.219179,20,United-States,<=50K,0


In [70]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

column_transformer = ColumnTransformer([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features)
], remainder='passthrough')

data_transformed = column_transformer.fit_transform(adult_data)

new_columns = column_transformer.get_feature_names_out()
adult_data_encoded = pd.DataFrame(data_transformed, columns=new_columns)

print(adult_data_encoded.head())

  onehot__workclass_Federal-gov onehot__workclass_Local-gov  \
0                           0.0                         0.0   
1                           0.0                         0.0   
2                           0.0                         0.0   
3                           0.0                         0.0   
4                           0.0                         0.0   

  onehot__workclass_Private onehot__workclass_Self-emp-inc  \
0                       0.0                            0.0   
1                       0.0                            0.0   
2                       1.0                            0.0   
3                       1.0                            0.0   
4                       1.0                            0.0   

  onehot__workclass_Self-emp-not-inc onehot__workclass_State-gov  \
0                                0.0                         1.0   
1                                1.0                         0.0   
2                                0.0        

In [71]:
adult_data_encoded

Unnamed: 0,onehot__workclass_Federal-gov,onehot__workclass_Local-gov,onehot__workclass_Private,onehot__workclass_Self-emp-inc,onehot__workclass_Self-emp-not-inc,onehot__workclass_State-gov,onehot__workclass_Without-pay,onehot__marital-status_Divorced,onehot__marital-status_Married-AF-spouse,onehot__marital-status_Married-civ-spouse,...,onehot__native-country_Vietnam,onehot__native-country_Yugoslavia,remainder__age,remainder__fnlwgt,remainder__education-num,remainder__capital-gain,remainder__capital-loss,remainder__hours-per-week,remainder__income,remainder__income_label
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.042416,-1.065173,1.119909,0.142438,-0.219179,40,<=50K,0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.880958,-1.010222,1.119909,-0.147516,-0.219179,13,<=50K,0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,-0.033815,0.244655,-0.441111,-0.147516,-0.219179,40,<=50K,0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.109651,0.425535,-1.221621,-0.147516,-0.219179,40,<=50K,0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,-0.796125,1.408764,1.119909,-0.147516,-0.219179,40,<=50K,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30713,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,-0.872356,0.639661,0.729654,-0.147516,-0.219179,38,<=50K,0
30714,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.118647,-0.336362,-0.441111,-0.147516,-0.219179,40,>50K,1
30715,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.490806,-0.359727,-0.441111,-0.147516,-0.219179,40,<=50K,0
30716,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.253512,0.110419,-0.441111,-0.147516,-0.219179,20,<=50K,0


In [72]:
adult_data_encoded.columns

Index(['onehot__workclass_Federal-gov', 'onehot__workclass_Local-gov',
       'onehot__workclass_Private', 'onehot__workclass_Self-emp-inc',
       'onehot__workclass_Self-emp-not-inc', 'onehot__workclass_State-gov',
       'onehot__workclass_Without-pay', 'onehot__marital-status_Divorced',
       'onehot__marital-status_Married-AF-spouse',
       'onehot__marital-status_Married-civ-spouse',
       'onehot__marital-status_Married-spouse-absent',
       'onehot__marital-status_Never-married',
       'onehot__marital-status_Separated', 'onehot__marital-status_Widowed',
       'onehot__occupation_Adm-clerical', 'onehot__occupation_Armed-Forces',
       'onehot__occupation_Craft-repair', 'onehot__occupation_Exec-managerial',
       'onehot__occupation_Farming-fishing',
       'onehot__occupation_Handlers-cleaners',
       'onehot__occupation_Machine-op-inspct',
       'onehot__occupation_Other-service',
       'onehot__occupation_Priv-house-serv',
       'onehot__occupation_Prof-specialty'

In [73]:
adult_data_encoded = adult_data_encoded.rename(columns={
    'remainder__age' : 'age',
    'remainder__fnlwgt' : 'fnlwgt',
    'remainder__education-num' : 'education_num',
    'remainder__capital-gain' : 'capital_gain',
    'remainder__capital-loss' : 'capital_loss',
    'remainder__hours-per-week' : 'hours_per_week',
    'remainder__income' : 'income',
    })


In [74]:
adult_data_encoded = adult_data_encoded.rename(columns={
    'remainder__income_label' : 'income_label',
    })

In [75]:
adult_data_encoded = adult_data_encoded.rename(columns={
    'income_label' : 'income_keep',
    })

In [82]:
adult_data_encoded = adult_data_encoded.drop(columns='income')

In [84]:
adult_data_encoded = adult_data_encoded.rename(columns={
    'income_keep' : 'income',
    })

In [85]:
adult_data_encoded

Unnamed: 0,onehot__workclass_Federal-gov,onehot__workclass_Local-gov,onehot__workclass_Private,onehot__workclass_Self-emp-inc,onehot__workclass_Self-emp-not-inc,onehot__workclass_State-gov,onehot__workclass_Without-pay,onehot__marital-status_Divorced,onehot__marital-status_Married-AF-spouse,onehot__marital-status_Married-civ-spouse,...,onehot__native-country_United-States,onehot__native-country_Vietnam,onehot__native-country_Yugoslavia,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,income
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.042416,-1.065173,1.119909,0.142438,-0.219179,40,0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.880958,-1.010222,1.119909,-0.147516,-0.219179,13,0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,-0.033815,0.244655,-0.441111,-0.147516,-0.219179,40,0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.109651,0.425535,-1.221621,-0.147516,-0.219179,40,0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,-0.796125,1.408764,1.119909,-0.147516,-0.219179,40,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30713,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,-0.872356,0.639661,0.729654,-0.147516,-0.219179,38,0
30714,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.118647,-0.336362,-0.441111,-0.147516,-0.219179,40,1
30715,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.490806,-0.359727,-0.441111,-0.147516,-0.219179,40,0
30716,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,-1.253512,0.110419,-0.441111,-0.147516,-0.219179,20,0


In [77]:
from sklearn.model_selection import train_test_split

X = adult_data_encoded.drop('hours_per_week', axis=1) 
Y = adult_data_encoded['hours_per_week']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print(f"Dimensiune set antrenament: {X_train.shape[0]} rânduri")
print(f"Dimensiune set testare: {X_test.shape[0]} rânduri")

Dimensiune set antrenament: 24574 rânduri
Dimensiune set testare: 6144 rânduri


In [86]:
adult_data_encoded.to_csv('adult_data_encoded', index=False)