In [6]:
# Installing and Importing appropriate packages
%pip install requests

# Statistical Libraries
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.formula.api as sm
from statsmodels.tools.eval_measures import rmse
import seaborn as sns
from scipy import stats
import plotly.express as px
from statsmodels.discrete.discrete_model import Logit
from scipy.special import logit

# Import data Libraries
import requests
from zipfile import ZipFile as zf
import io

Note: you may need to restart the kernel to use updated packages.


In [7]:
# Import data
# Downloads the zip archive at URL, extracts the two member files into the
# current working directory, and loads each into a DataFrame.
URL = 'https://archive.ics.uci.edu/static/public/2/adult.zip'
filename_data = "adult.data"
filename_test = "adult.test"

# Extracting from URL
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of letting an error
# page reach ZipFile and fail with a misleading "not a zip file" message.
response = requests.get(URL, timeout=60)
response.raise_for_status()
# The context manager closes the archive handle once both members are extracted.
with zf(io.BytesIO(response.content)) as zipped:
    zipped.extract(filename_data)
    zipped.extract(filename_test)

# Creating DataFrame from CSV file
# The files carry no header row (header=None), so column names are supplied here.
col_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'above-below']
# index_col=False stops pandas from using the first column as the index;
# skipinitialspace=True drops the blank that follows each comma delimiter.
df_data = pd.read_csv(filename_data, sep=',', header=None, names=col_names, index_col=False, skipinitialspace = True)
print('Training Dataset:')
print(df_data.head(10))
print(" ")
print(" ")

# skiprows=1 discards the first line of the test file
# (presumably a non-data banner line -- TODO confirm against the raw file).
df_test = pd.read_csv(filename_test, sep=',', header=None, skiprows=1, names=col_names, index_col=False, skipinitialspace = True)
print('Testing Dataset:')
print(df_test.head(10))

Training Dataset:
   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   
5   37           Private  284582    Masters             14   
6   49           Private  160187        9th              5   
7   52  Self-emp-not-inc  209642    HS-grad              9   
8   31           Private   45781    Masters             14   
9   42           Private  159449  Bachelors             13   

          marital-status         occupation   relationship   race     sex  \
0          Never-married       Adm-clerical  Not-in-family  White    Male   
1     Married-civ-spouse    Exec-managerial        Husband  White    Male   
2               Divorced  Handlers-cleaners  Not-in-family  White   

In [8]:
# Count the NaN cells across the whole training DataFrame.
# isna() yields a boolean frame; summing its underlying array totals every True.
print('NaN values in DataFrame: ', df_data.isna().to_numpy().sum())

NaN values in DataFrame:  0


In [9]:
# Print the distinct values of every column so that placeholder tokens
# standing in for blanks (rather than real NaN) become visible.
for column_name, column_values in df_data.items():
    print(column_values.unique())

[39 50 38 53 28 37 49 52 31 42 30 23 32 40 34 25 43 54 35 59 56 19 20 45
 22 48 21 24 57 44 41 29 18 47 46 36 79 27 67 33 76 17 55 61 70 64 71 68
 66 51 58 26 60 90 75 65 77 62 63 80 72 74 69 73 81 78 88 82 83 84 85 86
 87]
['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' '?'
 'Self-emp-inc' 'Without-pay' 'Never-worked']
[ 77516  83311 215646 ...  34066  84661 257302]
['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th'
 '1st-4th' 'Preschool' '12th']
[13  9  7 14  5 10 12 11  4 16 15  3  6  2  1  8]
['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']
['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' '?'
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv']
['Not-in-family' 'Husband' 'Wif