In [1]:
from matplotlib.colors import Normalize
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
adult_census = pd.read_csv(r'/Users/oluwa/Desktop/Python/advanced-python-datasci-master/data/adult-census.csv')

In [None]:
adult_census.head()

In [None]:
target_column = 'class'
adult_census[target_column].value_counts()

In [None]:
features = adult_census.drop(columns='class')
features.head()

In [None]:
numeric_columns = features.select_dtypes(include=np.number).columns.values
categorical_columns = features.drop(columns=numeric_columns).columns.values

In [None]:
print(f'''
There are {features.shape[0]} observations and {features.shape[1]} features.

Numeric features: {', '.join(numeric_columns)}.

Categorical features: {', '.join(categorical_columns)}.
''')

In [None]:
adult_census.hist(figsize=(20,14));

In [None]:
adult_census['sex'].value_counts()

In [None]:
adult_census['education'].value_counts()

In [None]:
pd.crosstab(
    index=adult_census['education'],
    columns=adult_census['education-num']
)

In [None]:
# We will plot a subset of the data to keep the plot readable and make the
# plotting faster
n_samples_to_plot = 5000
columns = ['age', 'education-num', 'hours-per-week']
sns.pairplot(data=adult_census[:n_samples_to_plot], vars=columns,
             hue=target_column, plot_kws={'alpha': 0.2},
             height=3, diag_kind='hist', diag_kws={'bins': 30});

In [None]:
ax = sns.scatterplot(
    x="age", y="hours-per-week", data=adult_census[:n_samples_to_plot],
    hue="class", alpha=0.5,
)

age_limit = 27
plt.axvline(x=age_limit, ymin=0, ymax=1, color="black", linestyle="--")

hours_per_week_limit = 40
plt.axhline(
    y=hours_per_week_limit, xmin=0.18, xmax=1, color="black", linestyle="--"
)

plt.annotate("<=50K", (17, 25), rotation=90, fontsize=35)
plt.annotate("<=50K", (35, 20), fontsize=35)
plt.annotate("???", (45, 60), fontsize=35);

Modelling with scikit-learn 

In [None]:
target_col = "class"
feature_col = adult_census.drop(columns=target_col).select_dtypes(np.number).columns.values

In [None]:
target = adult_census[target_col]
target

In [None]:
features = adult_census[feature_col]
features

In [None]:
print(
    f"The dataset contains {features.shape[0]} samples and "
    f"{features.shape[1]} features"
)

In [None]:
from sklearn import set_config
set_config(display='diagram')

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# define the algorithm
model = KNeighborsClassifier()

# fit the model
model.fit(features, target)

target_predicted = model.predict(features)
target_predicted
# accuracy of first 5 predictions
target[:5] == target_predicted[:5]

(target == target_predicted).mean()

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features, 
    target, 
    random_state=123, 
    test_size=0.25,
    stratify=target
)

In [None]:
y_train.shape

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
y_test.shape

In [None]:
y_train.value_counts(normalize=True)

KNeighbors Classifier

In [None]:
model = KNeighborsClassifier()

model.fit(X_train, y_train)

# score model on test data
accuracy = model.score(X_test, y_test)

print(f'The test accuracy using {model.__class__.__name__} is {round(accuracy, 4) * 100}%')

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X_train, y_train)

accuracy_logistic = model.score(X_test, y_test)
accuracy_logistic

In [None]:
# Modular Code
def get_features_and_target():
    '''Split a CSV into a DF of numeric features and a target column.'''
    adult_census = pd.read_csv(r'/Users/oluwa/Desktop/Python/advanced-python-datasci-master/data/adult-census.csv')

    target_col = "class"
    
    raw_features = adult_census.drop(columns=target_col)
    numeric_features = raw_features.select_dtypes(np.number)
    feature_cols = numeric_features.columns.values

    features = adult_census[feature_cols]
    target = adult_census[target_col]
    
    return (features, target)

In [None]:
f, t = get_features_and_target()
f.head()

In [None]:
t.head()

In [None]:
def get_features_and_target(csv_file, target_col):
    '''Split a CSV into a DF of numeric features and a target column.'''
    
    adult_census = pd.read_csv(csv_file)
    
    raw_features = adult_census.drop(columns=target_col)
    numeric_features = raw_features.select_dtypes(np.number)
    feature_cols = numeric_features.columns.values

    features = adult_census[feature_cols]
    target = adult_census[target_col]
    
    return (features, target)

In [None]:
f, t = get_features_and_target(
    csv_file=r'/Users/oluwa/Desktop/Python/advanced-python-datasci-master/data/adult-census.csv',
    target_col='class',
)

In [None]:
features.shape

In [None]:
f.head()

In [None]:
t.head()

In [None]:
import my_module

In [None]:
x = my_module.reverse_and_capitalize('oluwatimilehin')

In [None]:
x

In [None]:
import my_module

In [None]:
features, target = my_module.get_features_and_target(csv_file='../data/adult-census.csv', target_col='class')

In [None]:
features

In [None]:
target

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features, 
    target, 
    random_state=123, 
    train_size=0.75,
    stratify=target
)

In [None]:
X_train.shape

In [None]:
features.shape

In [None]:
target

In [None]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

Feature Engineering

In [None]:
from sklearn import set_config
set_config(display='diagram')

In [None]:
# import data
adult_census = pd.read_csv('../data/adult-census.csv')

# separate feature & target data
target = adult_census['class']
features = adult_census.drop(columns='class')

In [None]:
features.dtypes

In [None]:
from sklearn.compose import make_column_selector as selector

# create selector object based on data type
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

In [None]:
# get columns of interest
numerical_columns = numerical_columns_selector(features)
categorical_columns = categorical_columns_selector(features)

In [None]:
# results in a list containing relevant column names
numerical_columns

In [None]:
numerical_features = features[numerical_columns]
numerical_features.describe()

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(numerical_features)

In [None]:
scaler.mean_

In [None]:
scaler.scale_

In [None]:
numerical_features_scaled = scaler.transform(numerical_features)
numerical_features_scaled

In [None]:
# fitting and transforming in one step
scaler.fit_transform(numerical_features)

In [None]:
numerical_features = pd.DataFrame(
    numerical_features_scaled,
    columns=numerical_columns
)

In [None]:
numerical_features.describe()

In [None]:
# MinMax Scaler

from sklearn.preprocessing import MinMaxScaler

min_max_features = MinMaxScaler(feature_range=(-1,1))
min_max_features.fit_transform(numerical_features)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(), LogisticRegression())
model

In [None]:
from sklearn.model_selection import train_test_split

# split our data into train & test
X_train, X_test, y_train, y_test = train_test_split(numerical_features, target, random_state=123)

# fit our pipeline model
model.fit(X_train, y_train)

# score our model on the test data
model.score(X_test, y_test)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# let's illustrate with the 'education' feature
education_column = features[["education"]]

encoder = OrdinalEncoder()
education_encoded = encoder.fit_transform(education_column)
education_encoded

In [None]:
encoder.categories_

In [None]:
ed_levels = [' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' 10th', ' 11th', 
             ' 12th', ' HS-grad', ' Prof-school', ' Some-college', ' Assoc-acdm', 
             ' Assoc-voc', ' Bachelors', ' Masters', ' Doctorate']

encoder = OrdinalEncoder(categories=[ed_levels])
education_encoded = encoder.fit_transform(education_column)
education_encoded

In [None]:
encoder.categories_

In [None]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)
education_encoded = encoder.fit_transform(education_column)
education_encoded

In [None]:
feature_names = encoder.get_feature_names(input_features=["education"])
pd.DataFrame(education_encoded, columns=feature_names)

In [None]:
# get all categorical features
categorical_features = features[categorical_columns]

# one-hot encode all features
categorical_features_encoded = encoder.fit_transform(categorical_features)

# view as a data frame
columns_encoded = encoder.get_feature_names(categorical_features.columns)
pd.DataFrame(categorical_features_encoded, columns=columns_encoded).head()

In [None]:
# drop the duplicated column `"education-num"` as stated in the data exploration notebook
features = features.drop(columns='education-num')

# create selector object based on data type
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

# get columns of interest
numerical_columns = numerical_columns_selector(features)
categorical_columns = categorical_columns_selector(features)

# split into train & test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=123)

In [None]:
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

In [None]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)
])

In [None]:
model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))
model

In [None]:
# fit our model
_ = model.fit(X_train, y_train)

# score on test set
model.score(X_test, y_test)

Model Evaluation & Selection

In [None]:
from sklearn.model_selection import train_test_split

# import data
adult_census = pd.read_csv('../data/adult-census.csv')

# separate feature & target data
target = adult_census['class']
features = adult_census.drop(columns='class')

# drop the duplicated column `"education-num"` as stated in the data exploration notebook
features = features.drop(columns='education-num')

# split into train & test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=123)

In [None]:
# packages used
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# create selector object based on data type
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

# get columns of interest
numerical_columns = numerical_columns_selector(features)
categorical_columns = categorical_columns_selector(features)

# preprocessors to handle numeric and categorical features
numerical_preprocessor = StandardScaler()
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

# transformer to associate each of these preprocessors with their respective columns
preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)
])

In [None]:
# packages used
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Pipeline object to chain together modeling processes
model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))
model

# fit our model
_ = model.fit(X_train, y_train)

# score on test set
model.score(X_test, y_test)

In [None]:
%%time
from sklearn.model_selection import cross_validate

cv_result = cross_validate(model, X_train, y_train, cv=5)
cv_result

In [None]:
scores = cv_result["test_score"]
print("The mean cross-validation accuracy is: "
      f"{scores.mean():.3f} +/- {scores.std():.3f}")

Using KNeighborsClassifier

In [None]:
# packages used
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

# Pipeline object to chain together modeling processes
model = make_pipeline(preprocessor, KNeighborsClassifier())
model

knn_results = cross_validate(model, X_train, y_train, cv=5)
knn_results

In [None]:
knn_results['test_score'].mean()

In [None]:
knn_results['test_score'].std()

Evaluation metrics

In [None]:
# toy data
from sklearn.datasets import load_breast_cancer
X_cancer, y_cancer = load_breast_cancer(return_X_y=True)

# fit model
clf = LogisticRegression(solver='liblinear').fit(X_cancer, y_cancer)

# score 
clf.score(X_cancer, y_cancer)

In [None]:
# toy data
from sklearn.datasets import load_boston
X_boston, y_boston = load_boston(return_X_y=True)

# fit model
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_boston, y_boston)

# score
reg.score(X_boston, y_boston)

In [None]:
# packages used
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

# Pipeline object to chain together modeling processes
model = make_pipeline(preprocessor, KNeighborsClassifier())
model

metrics = ['accuracy', 'roc_auc']
knn_results = cross_validate(model, X_train, y_train, cv=5, scoring=metrics)
knn_results

In [None]:
%%time
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# set hyperparameter in KNN model 
model = KNeighborsClassifier(n_neighbors=10)

# create preprocessor & modeling pipeline
pipeline = make_pipeline(preprocessor, model)

# 5-fold cross validation using AUC error metric
results = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='roc_auc')

f'KNN model with 10 neighbors: AUC = {np.mean(results):.3f}'

In [None]:
%%time
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# basic model object
knn = KNeighborsClassifier()

# Create grid of hyperparameter values
hyper_grid = {'knn__n_neighbors': [5, 10, 15, 20]}

# create preprocessor & modeling pipeline
pipeline = Pipeline([('prep', preprocessor), ('knn', knn)])

# Tune a knn model using grid search
grid_search = GridSearchCV(pipeline, hyper_grid, cv=5, scoring='roc_auc', n_jobs=-1)
results = grid_search.fit(X_train, y_train)

# Best model's cross validated AUC
abs(results.best_score_)

In [None]:
results.best_params_

In [None]:
from sklearn.ensemble import RandomForestClassifier

# basic model object
rf = RandomForestClassifier(random_state=123)

# create preprocessor & modeling pipeline
pipeline = Pipeline([('prep', preprocessor), ('rf', rf)])

In [None]:
from scipy.stats import loguniform


class loguniform_int:
    """Integer valued version of the log-uniform distribution"""
    def __init__(self, a, b):
        self._distribution = loguniform(a, b)

    def rvs(self, *args, **kwargs):
        """Random variable sample"""
        return self._distribution.rvs(*args, **kwargs).astype(int)

In [None]:
# specify hyperparameter distributions to randomly sample from
param_distributions = {
    'rf__n_estimators': loguniform_int(50, 1000),
    'rf__max_features': loguniform(.1, .5),
    'rf__max_depth': loguniform_int(4, 20),
    'rf__min_samples_leaf': loguniform_int(1, 100),
    'rf__max_samples': loguniform(.5, 1),
}

In [None]:
%%time
from sklearn.model_selection import RandomizedSearchCV

# perform 10 random iterations
random_search = RandomizedSearchCV(
    pipeline, 
    param_distributions=param_distributions, 
    n_iter=10,
    cv=5, 
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1,
)

results = random_search.fit(X_train, y_train)

In [None]:
results.best_score_

In [None]:
results.best_params_

Writing a Preprocessor Function

In [None]:
from my_module import get_features_and_target, make_preprocessor

In [None]:
features, target = get_features_and_target('../data/adult-census.csv', 'class')

In [None]:
features

In [None]:
target

In [None]:
preprocessor = make_preprocessor(features)

In [None]:
print(preprocessor)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(preprocessor, LogisticRegression())

In [None]:
pipeline

In [8]:
import my_module
from sklearn.preprocessing import Normalizer, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

In [9]:
features, target = my_module.get_features_and_target(csv_file='../data/adult-census.csv', target_col='class', 

                                                    )
features = features.drop('education-num', axis=1)
target = target.str.contains('>50K').astype(int)

preprocessor = my_module.make_preprocessor(features, numeric_preprocessor=Normalizer())
model = make_pipeline(preprocessor, LogisticRegression())

X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=123)

_ = model.fit(X_train, y_train)
model.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8434198673327328