In [3]:
from matplotlib.colors import Normalize
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
adult_census = pd.read_csv(r'/Users/oluwa/Desktop/Python/advanced-python-datasci-master/data/adult-census.csv')

In [None]:
adult_census.head()

In [None]:
target_column = 'class'
adult_census[target_column].value_counts()

In [None]:
features = adult_census.drop(columns='class')
features.head()

In [None]:
numeric_columns = features.select_dtypes(include=np.number).columns.values
categorical_columns = features.drop(columns=numeric_columns).columns.values

In [None]:
print(f'''
There are {features.shape[0]} observations and {features.shape[1]} features.

Numeric features: {', '.join(numeric_columns)}.

Categorical features: {', '.join(categorical_columns)}.
''')

In [None]:
adult_census.hist(figsize=(20,14));

In [None]:
adult_census['sex'].value_counts()

In [None]:
adult_census['education'].value_counts()

In [None]:
pd.crosstab(
    index=adult_census['education'],
    columns=adult_census['education-num']
)

In [None]:
# We will plot a subset of the data to keep the plot readable and make the
# plotting faster
n_samples_to_plot = 5000
columns = ['age', 'education-num', 'hours-per-week']
sns.pairplot(data=adult_census[:n_samples_to_plot], vars=columns,
             hue=target_column, plot_kws={'alpha': 0.2},
             height=3, diag_kind='hist', diag_kws={'bins': 30});

In [None]:
ax = sns.scatterplot(
    x="age", y="hours-per-week", data=adult_census[:n_samples_to_plot],
    hue="class", alpha=0.5,
)

age_limit = 27
plt.axvline(x=age_limit, ymin=0, ymax=1, color="black", linestyle="--")

hours_per_week_limit = 40
plt.axhline(
    y=hours_per_week_limit, xmin=0.18, xmax=1, color="black", linestyle="--"
)

plt.annotate("<=50K", (17, 25), rotation=90, fontsize=35)
plt.annotate("<=50K", (35, 20), fontsize=35)
plt.annotate("???", (45, 60), fontsize=35);

Modelling with scikit-learn 

In [None]:
target_col = "class"
feature_col = adult_census.drop(columns=target_col).select_dtypes(np.number).columns.values

In [None]:
target = adult_census[target_col]
target

In [None]:
features = adult_census[feature_col]
features

In [None]:
print(
    f"The dataset contains {features.shape[0]} samples and "
    f"{features.shape[1]} features"
)

In [None]:
from sklearn import set_config
set_config(display='diagram')

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# define the algorithm
model = KNeighborsClassifier()

# fit the model
model.fit(features, target)

target_predicted = model.predict(features)
target_predicted
# accuracy of first 5 predictions
target[:5] == target_predicted[:5]

(target == target_predicted).mean()

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features, 
    target, 
    random_state=123, 
    test_size=0.25,
    stratify=target
)

In [None]:
y_train.shape

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
y_test.shape

In [None]:
y_train.value_counts(normalize=True)

KNeighbors Classifier

In [None]:
model = KNeighborsClassifier()

model.fit(X_train, y_train)

# score model on test data
accuracy = model.score(X_test, y_test)

print(f'The test accuracy using {model.__class__.__name__} is {round(accuracy, 4) * 100}%')

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X_train, y_train)

accuracy_logistic = model.score(X_test, y_test)
accuracy_logistic

In [None]:
# Modular Code
def get_features_and_target():
    '''Split a CSV into a DF of numeric features and a target column.'''
    adult_census = pd.read_csv(r'/Users/oluwa/Desktop/Python/advanced-python-datasci-master/data/adult-census.csv')

    target_col = "class"
    
    raw_features = adult_census.drop(columns=target_col)
    numeric_features = raw_features.select_dtypes(np.number)
    feature_cols = numeric_features.columns.values

    features = adult_census[feature_cols]
    target = adult_census[target_col]
    
    return (features, target)

In [None]:
f, t = get_features_and_target()
f.head()

In [None]:
t.head()

In [None]:
def get_features_and_target(csv_file, target_col):
    '''Split a CSV into a DF of numeric features and a target column.'''
    
    adult_census = pd.read_csv(csv_file)
    
    raw_features = adult_census.drop(columns=target_col)
    numeric_features = raw_features.select_dtypes(np.number)
    feature_cols = numeric_features.columns.values

    features = adult_census[feature_cols]
    target = adult_census[target_col]
    
    return (features, target)

In [None]:
f, t = get_features_and_target(
    csv_file=r'/Users/oluwa/Desktop/Python/advanced-python-datasci-master/data/adult-census.csv',
    target_col='class',
)

In [None]:
features.shape

In [None]:
f.head()

In [None]:
t.head()

In [None]:
import my_module

In [None]:
x = my_module.reverse_and_capitalize('oluwatimilehin')

In [None]:
x

In [None]:
import my_module

In [None]:
features, target = my_module.get_features_and_target(csv_file='../data/adult-census.csv', target_col='class')

In [None]:
features

In [None]:
target

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features, 
    target, 
    random_state=123, 
    train_size=0.75,
    stratify=target
)

In [None]:
X_train.shape

In [None]:
features.shape

In [None]:
target

In [None]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

Feature Engineering

In [1]:
from sklearn import set_config
set_config(display='diagram')

In [4]:
# import data
adult_census = pd.read_csv('../data/adult-census.csv')

# separate feature & target data
target = adult_census['class']
features = adult_census.drop(columns='class')

In [5]:
features.dtypes

age                int64
workclass         object
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
dtype: object

In [6]:
from sklearn.compose import make_column_selector as selector

# create selector object based on data type
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

In [7]:
# get columns of interest
numerical_columns = numerical_columns_selector(features)
categorical_columns = categorical_columns_selector(features)

In [8]:
# results in a list containing relevant column names
numerical_columns

['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

In [9]:
numerical_features = features[numerical_columns]
numerical_features.describe()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,10.078089,1079.067626,87.502314,40.422382
std,13.71051,2.570973,7452.019058,403.004552,12.391444
min,17.0,1.0,0.0,0.0,1.0
25%,28.0,9.0,0.0,0.0,40.0
50%,37.0,10.0,0.0,0.0,40.0
75%,48.0,12.0,0.0,0.0,45.0
max,90.0,16.0,99999.0,4356.0,99.0


In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(numerical_features)

In [11]:
scaler.mean_

array([  38.64358544,   10.07808853, 1079.06762622,   87.50231358,
         40.42238238])

In [12]:
scaler.scale_

array([1.37103696e+01, 2.57094644e+00, 7.45194277e+03, 4.03000427e+02,
       1.23913172e+01])

In [13]:
numerical_features_scaled = scaler.transform(numerical_features)
numerical_features_scaled

array([[-0.99512893, -1.19725891, -0.14480353, -0.2171271 , -0.03408696],
       [-0.04694151, -0.41933527, -0.14480353, -0.2171271 ,  0.77292975],
       [-0.77631645,  0.74755018, -0.14480353, -0.2171271 , -0.03408696],
       ...,
       [ 1.41180837, -0.41933527, -0.14480353, -0.2171271 , -0.03408696],
       [-1.21394141, -0.41933527, -0.14480353, -0.2171271 , -1.64812038],
       [ 0.97418341, -0.41933527,  1.87131501, -0.2171271 , -0.03408696]])

In [14]:
# fitting and transforming in one step
scaler.fit_transform(numerical_features)

array([[-0.99512893, -1.19725891, -0.14480353, -0.2171271 , -0.03408696],
       [-0.04694151, -0.41933527, -0.14480353, -0.2171271 ,  0.77292975],
       [-0.77631645,  0.74755018, -0.14480353, -0.2171271 , -0.03408696],
       ...,
       [ 1.41180837, -0.41933527, -0.14480353, -0.2171271 , -0.03408696],
       [-1.21394141, -0.41933527, -0.14480353, -0.2171271 , -1.64812038],
       [ 0.97418341, -0.41933527,  1.87131501, -0.2171271 , -0.03408696]])

In [15]:
numerical_features = pd.DataFrame(
    numerical_features_scaled,
    columns=numerical_columns
)

In [16]:
numerical_features.describe()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0
mean,1.584958e-16,1.594573e-17,2.294458e-16,7.617582e-17,9.071110000000001e-17
std,1.00001,1.00001,1.00001,1.00001,1.00001
min,-1.578629,-3.53103,-0.1448035,-0.2171271,-3.181452
25%,-0.7763164,-0.4193353,-0.1448035,-0.2171271,-0.03408696
50%,-0.119879,-0.03037346,-0.1448035,-0.2171271,-0.03408696
75%,0.6824334,0.7475502,-0.1448035,-0.2171271,0.3694214
max,3.745808,2.303397,13.27438,10.59179,4.727312


In [18]:
# MinMax Scaler

from sklearn.preprocessing import MinMaxScaler

min_max_features = MinMaxScaler(feature_range=(-1,1))
min_max_features.fit_transform(numerical_features)

array([[-7.80821918e-01, -2.00000000e-01, -1.00000000e+00,
        -1.00000000e+00, -2.04081633e-01],
       [-4.24657534e-01,  6.66666667e-02, -1.00000000e+00,
        -1.00000000e+00,  8.32667268e-17],
       [-6.98630137e-01,  4.66666667e-01, -1.00000000e+00,
        -1.00000000e+00, -2.04081633e-01],
       ...,
       [ 1.23287671e-01,  6.66666667e-02, -1.00000000e+00,
        -1.00000000e+00, -2.04081633e-01],
       [-8.63013699e-01,  6.66666667e-02, -1.00000000e+00,
        -1.00000000e+00, -6.12244898e-01],
       [-4.10958904e-02,  6.66666667e-02, -6.99516995e-01,
        -1.00000000e+00, -2.04081633e-01]])

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(), LogisticRegression())
model

In [20]:
from sklearn.model_selection import train_test_split

# split our data into train & test
X_train, X_test, y_train, y_test = train_test_split(numerical_features, target, random_state=123)

# fit our pipeline model
model.fit(X_train, y_train)

# score our model on the test data
model.score(X_test, y_test)

0.8135287855212513

In [21]:
from sklearn.preprocessing import OrdinalEncoder

# let's illustrate with the 'education' feature
education_column = features[["education"]]

encoder = OrdinalEncoder()
education_encoded = encoder.fit_transform(education_column)
education_encoded

array([[ 1.],
       [11.],
       [ 7.],
       ...,
       [11.],
       [11.],
       [11.]])

In [22]:
encoder.categories_

[array([' 10th', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th', ' 7th-8th',
        ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Doctorate',
        ' HS-grad', ' Masters', ' Preschool', ' Prof-school',
        ' Some-college'], dtype=object)]

In [23]:
ed_levels = [' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' 10th', ' 11th', 
             ' 12th', ' HS-grad', ' Prof-school', ' Some-college', ' Assoc-acdm', 
             ' Assoc-voc', ' Bachelors', ' Masters', ' Doctorate']

encoder = OrdinalEncoder(categories=[ed_levels])
education_encoded = encoder.fit_transform(education_column)
education_encoded

array([[ 6.],
       [ 8.],
       [11.],
       ...,
       [ 8.],
       [ 8.],
       [ 8.]])

In [24]:
encoder.categories_

[array([' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' 10th',
        ' 11th', ' 12th', ' HS-grad', ' Prof-school', ' Some-college',
        ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Masters',
        ' Doctorate'], dtype=object)]

In [25]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)
education_encoded = encoder.fit_transform(education_column)
education_encoded

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [26]:
feature_names = encoder.get_feature_names(input_features=["education"])
pd.DataFrame(education_encoded, columns=feature_names)



Unnamed: 0,education_ 10th,education_ 11th,education_ 12th,education_ 1st-4th,education_ 5th-6th,education_ 7th-8th,education_ 9th,education_ Assoc-acdm,education_ Assoc-voc,education_ Bachelors,education_ Doctorate,education_ HS-grad,education_ Masters,education_ Preschool,education_ Prof-school,education_ Some-college
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
48839,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
48840,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [27]:
# get all categorical features
categorical_features = features[categorical_columns]

# one-hot encode all features
categorical_features_encoded = encoder.fit_transform(categorical_features)

# view as a data frame
columns_encoded = encoder.get_feature_names(categorical_features.columns)
pd.DataFrame(categorical_features_encoded, columns=columns_encoded).head()



Unnamed: 0,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,education_ 10th,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [28]:
# drop the duplicated column `"education-num"` as stated in the data exploration notebook
features = features.drop(columns='education-num')

# create selector object based on data type
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

# get columns of interest
numerical_columns = numerical_columns_selector(features)
categorical_columns = categorical_columns_selector(features)

# split into train & test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=123)

In [29]:
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

In [30]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)
])

In [31]:
model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))
model

In [32]:
# fit our model
_ = model.fit(X_train, y_train)

# score on test set
model.score(X_test, y_test)

0.8503808041929408