In [1]:
# import pandas and numpy
import pandas as pd
import numpy as np

# Dataset Information
## Dataset

- The **adult** dataset of UCI is used to predict whether income exceeds \$50K/yr based on census data.
- Dataset link: [adult dataset](https://archive.ics.uci.edu/dataset/2/adult).
- Code repo in GitHub: [notebook](https://gooogle.com).
## Aim in this notebook
- Data cleaning
- Data Transformation
- Data Normalization
- Using `sklearn`: predict whether an individual earns more than \$50,000 per year or not.
## Student
- name: Tran Huu Nhan
- id: 521H0507
- date: 11/9/2023


In [2]:
# Column names for adult.data, which ships without a header row
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]

# Load the raw file: '?' is parsed as missing and the blank that follows
# each comma delimiter is skipped
data = pd.read_csv(
    'adult.data',
    header=None,
    na_values='?',
    skipinitialspace=True,
)

# Attach the column names to the freshly loaded frame
data.columns = column_names

# Preview the first rows of the dataframe
print(data.head())


   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=50K  
1             0             0             

# Data Cleaning
## Handling missing values
- The dataset contains a small number of missing values, and the 3 affected columns are categorical rather than continuous. The options are therefore to impute with a statistical method or to remove the records with missing values.
In this case we use the mode: missing values are filled with the mode of each column.
- In this example:
    find the names of the columns that have missing data, then decide the next step based on how many values are missing.

In [3]:
# Normalise any leftover '?' placeholders to NaN so pandas counts them as missing
data.replace('?', np.nan, inplace=True)

# Boolean mask of missing cells, then the number of missing entries per column
missing_value = data.isna()
missing_columns = missing_value.sum()

# Report only the columns that actually contain missing entries
affected = missing_columns[missing_columns > 0]
print(f"Amount of value missing:\n{affected}")

# Despite the label, this prints the fraction of rows (0-1), not a value out of 100
print(f"\nPercent missing: \n{affected / len(data)}")

Amount of value missing:
workclass         1836
occupation        1843
native-country     583
dtype: int64

Percent missing: 
workclass         0.056386
occupation        0.056601
native-country    0.017905
dtype: float64


**3 columns have missing values:**

|Column|Percent|
|-------|-------|
|workclass|        5.6386 %|
|occupation|       5.6601 %|
|native-country|   1.7905 %|

Because the percentage of missing values is small, the missing values are replaced instead of removing the rows.

In [4]:
# Replace missing values with the column's mode.
# Assign the filled column back instead of calling fillna(inplace=True) on
# data[col]: the in-place call works on an intermediate selection, which
# raises a FutureWarning in recent pandas and leaves `data` unchanged once
# Copy-on-Write is enabled.
for col in ['workclass', 'occupation', 'native-country']:
    # mode() returns a Series; [0] takes the first (most frequent) value
    data[col] = data[col].fillna(data[col].mode()[0])

## Removing irrelevant data
Individuals who are *without-pay* or who have *never-worked* are treated as exceptions for the income prediction model, so their rows are removed from this dataset.

In [5]:
# Remove rows whose workclass is 'Without-pay' or 'Never-worked'.
# .copy() makes the filtered frame independent of the original, so the
# column assignments in later cells do not raise SettingWithCopyWarning.
excluded_workclasses = ['Without-pay', 'Never-worked']
data = data[~data['workclass'].isin(excluded_workclasses)].copy()

# print number of rows in dataset after remove
print(data.shape[0])

32540


 # Data Transformation
 ## Using Label encoder

In [6]:
# import library Label Encoder
from sklearn.preprocessing import LabelEncoder

# Columns holding string categories that must become integer codes
categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country', 'income',
]

# Fitted LabelEncoder per column, kept so codes can be mapped back to labels
encoders = {}

for col in categorical_columns:
    # Learn the label -> integer mapping for this column
    le = LabelEncoder().fit(data[col])

    # Overwrite the column with its integer codes
    data[col] = le.transform(data[col])

    # Remember the encoder under the column name
    encoders[col] = le

# After the loop `le` is still bound to the last encoder ('income');
# the evaluation cell further down reads it.

# Data Normalization
## Using Standard Scaler

In [7]:
from sklearn.preprocessing import StandardScaler

# Numeric columns that get standardised
variables_to_normalize = [
    'age', 'fnlwgt', 'education-num',
    'capital-gain', 'capital-loss', 'hours-per-week',
]

# Learn the scaling parameters from the selected columns
# NOTE(review): the scaler is fit on the whole frame, before the train/test
# split performed later, so test rows contribute to the scaling statistics.
scaler = StandardScaler().fit(data[variables_to_normalize])

# Apply the learned scaling to those same columns
scaled_data = scaler.transform(data[variables_to_normalize])

# Write the standardised values back into the original frame
data[variables_to_normalize] = scaled_data

# Show a few rows of the normalised data
print(data.head())

        age  workclass    fnlwgt  education  education-num  marital-status  \
0  0.030691          5 -1.063518          9       1.134330               4   
1  0.837492          4 -1.008618          9       1.134330               2   
2 -0.042655          2  0.245070         11      -0.420444               0   
3  1.057529          2  0.425779          1      -1.197830               2   
4 -0.776111          2  1.408077          9       1.134330               2   

   occupation  relationship  race  sex  capital-gain  capital-loss  \
0           0             1     4    1      0.148341     -0.216733   
1           3             0     4    1     -0.145941     -0.216733   
2           5             1     4    1     -0.145941     -0.216733   
3           5             0     2    1     -0.145941     -0.216733   
4           9             5     2    0     -0.145941     -0.216733   

   hours-per-week  native-country  income  
0       -0.035922              38       0  
1       -2.223516     

In [8]:
# import necessary library to train model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [9]:
# Separate the features from the target, then split into training and test sets
X = data.drop('income', axis=1)
y = data['income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the machine learning model
model = LogisticRegression(max_iter=1000) # Increase the maximum number of iterations to avoid convergence warning
model.fit(X_train, y_train)

# Predict income on the test set
y_pred = model.predict(X_test)

# Evaluate the performance of the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Decode the integer labels with the encoder stored for 'income'.
# The loop variable `le` from the encoding cell is not used here: it is the
# income encoder only because 'income' is the last column in that loop.
income_encoder = encoders['income']

# Print the first 10 predictions
print("First 10 predictions:")
print(income_encoder.inverse_transform(y_pred[:10]))

# Print the actual values for comparison (.iloc makes the slice explicitly positional)
print("Actual values for the first 10 instances:")
print(income_encoder.inverse_transform(y_test.iloc[:10]))

Accuracy: 0.8246773202212662
First 10 predictions:
['<=50K' '<=50K' '<=50K' '<=50K' '<=50K' '<=50K' '<=50K' '>50K' '<=50K'
 '<=50K']
Actual values for the first 10 instances:
['>50K' '>50K' '<=50K' '<=50K' '<=50K' '<=50K' '<=50K' '<=50K' '<=50K'
 '<=50K']
