In [1]:
# Activate the project environment located in the current working directory,
# so that the `using` statements in the following cells resolve against it.
using Pkg
Pkg.activate(".")

[32m[1m  Activating[22m[39m environment at `~/Desktop/manning-liveprojects/hands-on-datascience-with-julia/data-pre-processing/Project.toml`


In [2]:
using Downloads
using SHA
using DataFrames
using CSV
using FreqTables
using Plots
using StatsBase
using Statistics
using Arrow

In [3]:
# Where the dataset comes from, where it is stored locally, and the digest
# used to validate the local copy.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
file_name = "./data/adult.data"
# Expected SHA-1 digest of the dataset file, as 20 raw bytes.
file_sha1 = hex2bytes("ee86bbe556578f709ae0fd002ac58ac93726482f");

In [4]:
# Make sure the folder that will hold the dataset exists.
data_dir = dirname(file_name)
isdir(data_dir) || begin
    @info "Data directory: $data_dir does not exist. Creating..."
    mkpath(data_dir)
end

# Fetch the Adult dataset from `url` unless a local copy is already present.
if !isfile(file_name)
    @info "$file_name not found. Fetching from source."
    Downloads.download(url, file_name)
else
    @info "$file_name found. Skipping download."
end

┌ Info: ./data/adult.data found. Skipping download.
└ @ Main In[4]:10


In [5]:
# Integrity check: stop unless the SHA-1 digest of the dataset file matches `file_sha1`.
if file_sha1 != open(sha1, file_name)
    error("$file_name file has an invalid SHA1. Aborting!")
end
@info "SHA1 check of $file_name passed."

┌ Info: SHA1 check of ./data/adult.data passed.
└ @ Main In[5]:3


In [17]:
# Display-size hints for rendered output: COLUMNS is the width in characters,
# LINES is the height in lines.
ENV["COLUMNS"] = 200
ENV["LINES"] = 20

# Column names are passed explicitly through `header`.
col_names = [
    :age, :workclass, :fnlwgt, :education, :education_num,
    :marital_status, :occupation, :relationship, :race, :sex,
    :capital_gain, :capital_loss, :hours_per_week, :native_country, :target,
]

# Fields are separated by ", " and the string "?" is read as `missing`.
adult_raw = CSV.read(
    file_name,
    DataFrame;
    header=col_names,
    delim=", ",
    missingstring="?",
)

# Work on a copy so that `adult_raw` keeps the data exactly as loaded.
adult = copy(adult_raw);

In [18]:
# Remove the `fnlwgt` column from the working copy `adult`, in place.
select!(adult, Not("fnlwgt"))

Unnamed: 0_level_0,age,workclass,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
Unnamed: 0_level_1,Int64,String31?,String15,Int64,String31,String31?,String15,String31,String7,Int64,Int64,Int64,String31?
1,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
2,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
3,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
4,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
5,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
6,37,Private,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States
7,49,Private,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica
8,52,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States
9,31,Private,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States
10,42,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States


In [19]:
# Per-column summary statistics of `adult`.
DataFrames.describe(adult)

Unnamed: 0_level_0,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,Type
1,age,38.5816,17,37.0,90,0,Int64
2,workclass,,Federal-gov,,Without-pay,1836,"Union{Missing, String31}"
3,education,,10th,,Some-college,0,String15
4,education_num,10.0807,1,10.0,16,0,Int64
5,marital_status,,Divorced,,Widowed,0,String31
6,occupation,,Adm-clerical,,Transport-moving,1843,"Union{Missing, String31}"
7,relationship,,Husband,,Wife,0,String15
8,race,,Amer-Indian-Eskimo,,White,0,String31
9,sex,,Female,,Male,0,String7
10,capital_gain,1077.65,0,0.0,99999,0,Int64
