# Cencus Income Classification

The Goal is to predict whether a person has an income of more than 50K a year or not.

# Importing required packages

In [1]:
library(readr)
library(dplyr)
library(ggplot2)

"package 'dplyr' was built under R version 4.3.3"

Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union




# Step 1: Preliminary Analysis

In [3]:
# Defining column types
column_types <- cols(
  age = col_integer(),
  workclass = col_character(),
  fnlwgt = col_integer(),
  education = col_character(),
  education_num = col_integer(),  # Assuming education-num in Python corresponds to education_num
  marital_status = col_character(),  # Assuming marital-status in Python corresponds to marital_status
  occupation = col_character(),
  relationship = col_character(),
  race = col_character(),
  sex = col_character(),
  capital_gain = col_integer(),  # If very large values, consider col_double()
  capital_loss = col_integer(),  # If very large values, consider col_double()
  hours_per_week = col_integer(),  # Assuming hours-per-week in Python corresponds to hours_per_week
  native_country = col_character(),  # Assuming native-country in Python corresponds to native_country
  income = col_character()
)


In [4]:
# Reading the CSV file with specified column types
df <- read_csv('census_income.csv', col_types = column_types, show_col_types = FALSE)

"The following named parsers don't match the column names: education_num, marital_status, capital_gain, capital_loss, hours_per_week, native_country"


In [5]:
head(df)

age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
<int>,<chr>,<int>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


## Number of independent and dependent variables:

In [6]:
independent_vars <- names(df)[1:(ncol(df)-1)]
dependent_var <- names(df)[ncol(df)]

In [7]:
cat("Number of independent variables:", length(independent_vars), "\n")
cat("Number of dependent variables: 1\n")

Number of independent variables: 14 
Number of dependent variables: 1


## Number of records:¶

In [8]:
cat("Number of records:", nrow(df), "\n")

Number of records: 48842 


In [9]:
cat("Dimensions of the DataFrame: ", dim(df), "\n")

Dimensions of the DataFrame:  48842 15 


## Data types of variables:

In [10]:
cat("Data types of variables:\n")
print(sapply(df, class))

Data types of variables:
           age      workclass         fnlwgt      education  education-num 
     "integer"    "character"      "integer"    "character"      "numeric" 
marital-status     occupation   relationship           race            sex 
   "character"    "character"    "character"    "character"    "character" 
  capital-gain   capital-loss hours-per-week native-country         income 
     "numeric"      "numeric"      "numeric"    "character"    "character" 


## Summary Statistics

In [11]:
summary(df)

      age         workclass             fnlwgt         education        
 Min.   :17.00   Length:48842       Min.   :  12285   Length:48842      
 1st Qu.:28.00   Class :character   1st Qu.: 117551   Class :character  
 Median :37.00   Mode  :character   Median : 178145   Mode  :character  
 Mean   :38.64                      Mean   : 189664                     
 3rd Qu.:48.00                      3rd Qu.: 237642                     
 Max.   :90.00                      Max.   :1490400                     
 education-num   marital-status      occupation        relationship      
 Min.   : 1.00   Length:48842       Length:48842       Length:48842      
 1st Qu.: 9.00   Class :character   Class :character   Class :character  
 Median :10.00   Mode  :character   Mode  :character   Mode  :character  
 Mean   :10.08                                                           
 3rd Qu.:12.00                                                           
 Max.   :16.00                               

In [12]:
# Converting specified columns to factors
df <- df %>%
  mutate(
    workclass = as.factor(workclass),
    education = as.factor(education),
    marital_status = as.factor(`marital-status`),  # Handle the hyphen by using backticks
    occupation = as.factor(occupation),
    relationship = as.factor(relationship),
    race = as.factor(race),
    sex = as.factor(sex),
    native_country = as.factor(`native-country`),  # Handle the hyphen by using backticks
    income = as.factor(income)
  )

In [13]:
str(df)

tibble [48,842 × 17] (S3: tbl_df/tbl/data.frame)
 $ age           : int [1:48842] 39 50 38 53 28 37 49 52 31 42 ...
 $ workclass     : Factor w/ 9 levels "?","Federal-gov",..: 8 7 5 5 5 5 5 7 5 5 ...
 $ fnlwgt        : int [1:48842] 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
 $ education     : Factor w/ 16 levels "10th","11th",..: 10 10 12 2 10 13 7 12 13 10 ...
 $ education-num : num [1:48842] 13 13 9 7 13 14 5 9 14 13 ...
 $ marital-status: chr [1:48842] "Never-married" "Married-civ-spouse" "Divorced" "Married-civ-spouse" ...
 $ occupation    : Factor w/ 15 levels "?","Adm-clerical",..: 2 5 7 7 11 5 9 5 11 5 ...
 $ relationship  : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
 $ race          : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
 $ sex           : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
 $ capital-gain  : num [1:48842] 2174 0 0 0 0 ...
 $ capital-loss  : num [1:48842] 0 0 0 0 0

# Step 2: Data Cleaning

## Checking for Null Values

## Checking for unique values

## All Features

## Checking for low variance:

## Checking & handling for duplicates

## Handling Missing Values

## Unique counts of categories in columns

## Copy of Original Data

# Step 3: Feature Engineering

# Step 4: Train & Test Split

## Print the number of records in the training and testing data

## Save training data to CSV

## Save testing data to CSV

# Step 5: Feature Scaling

# Step 6: Logistic Regression

# Step 7: Random Forest