# 40-modeling-sklearn
> Modeling using scikit-learn

In this notebook, we train models using the pipeline functionality from scikit-learn.

In [None]:
#tables and visualizations
import pandas as pd
import numpy as np
import seaborn as sns

#machine learning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, StandardScaler
from sklearn import config_context
from sklearn.metrics import classification_report

# Load the data

In [None]:
# Path to the input CSV file. It is read with pd.read_csv in the next cell,
# so replace the empty string with the location of your data before running on.
data_filename = ''

In [None]:
#load the data and learn a bit
# read the CSV at data_filename into a DataFrame
df = pd.read_csv(data_filename)
# preview the first rows
display(df.head())
# print the column-level summary (df.info writes directly to the cell output)
df.info()

# Split the data
Here, we perform the initial split that separates the training set from the golden holdout test set. This split may instead be done in a prior notebook so that every modeling strategy uses the same partition. Make sure to uncomment `class_column` below and fill in the name of the column of interest.

In [None]:
#class_column = 'target_column'
random_seed = 2435

# separate the predictors from the label column before splitting
features = df.drop(columns=class_column)
target = df[class_column]

# hold out 25% of the rows, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=random_seed,
    stratify=target,
)

Quick sanity check to make sure that everything seems OK:

In [None]:
# X train: report the shape and preview the first rows of the training features
print('On X train: ')
print('X train dimensions: ', X_train.shape)
display(X_train.head())

# X test: same report for the held-out features
print('\nOn X test: ')
print('X test dimensions: ', X_test.shape)
display(X_test.head())

In [None]:
# y train: report the shape and preview the first values of the training labels
print('On y train: ')
print('y train dimensions: ', y_train.shape)
display(y_train.head())

# y test: same report for the held-out labels
print('\nOn y test: ')
print('y test dimensions: ', y_test.shape)
display(y_test.head())

# Create training pipeline
The example below uses logistic regression.

In [None]:
#individual pipelines for differing datatypes
cat_pipeline = Pipeline(steps=[('cat_impute', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
                               ('onehot_cat', OneHotEncoder(drop='if_binary'))])
num_pipeline = Pipeline(steps=[('impute_num', SimpleImputer(missing_values=np.nan, strategy='mean')),
                               ('scale_num', StandardScaler())])     

In [None]:
#establish preprocessing pipeline by columns
preproc = ColumnTransformer([('cat_pipe', cat_pipeline, make_column_selector(dtype_include=object)),
                             ('num_pipe', num_pipeline, make_column_selector(dtype_include=np.number))],
                             remainder='passthrough')

In [None]:
#generate the whole modeling pipeline with preprocessing
pipe = Pipeline(steps=[('preproc', preproc),
                       ('mdl', LogisticRegression(penalty='elasticnet', solver='saga', tol=0.01))])

#visualization for steps
with config_context(display='diagram'):
    display(pipe)

# Cross-validation with hyperparameter tuning

In [None]:
# candidate values, keyed by '<pipeline step>__<parameter>'
l1_ratio_candidates = np.linspace(0, 1, 5)
c_candidates = np.logspace(-1, 6, 3)
tuning_grid = {
    'mdl__l1_ratio': l1_ratio_candidates,
    'mdl__C': c_candidates,
}

# exhaustive search over the grid with 5-fold cross-validation; training scores are kept as well
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=tuning_grid,
    cv=5,
    return_train_score=True,
)

In [None]:
# run the cross-validated search over tuning_grid using the training split only
grid_search.fit(X_train, y_train)

In [None]:
# best_score_ and best_params_ as recorded by the fitted search
print(grid_search.best_score_)
grid_search.best_params_

In [None]:
# full cross-validation results of the search, shown as a table
pd.DataFrame(grid_search.cv_results_)

# Final fit
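If the grid search was run with `refit=True` (the default), it has already refit the best parameter combination on the full training set, so no separate final fit is needed. In that case the fitted pipeline is available through the `best_estimator_` attribute shown below.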

In [None]:
# display the pipeline selected by the search
grid_search.best_estimator_

# Variable importance
Now we assess variable importance in the selected model, using the magnitude and sign of its coefficients, to reveal any potential insights.

In [None]:
# class labels exposed by the fitted search
grid_search.classes_

In [None]:
# take the first row of the fitted model's coefficient array
# NOTE(review): if the target has more than two classes, coef_ presumably holds one row
# per class and this keeps only the first -- confirm against grid_search.classes_
vip = grid_search.best_estimator_['mdl'].coef_[0]
vip

In [None]:
#get names in correct preproc order
# Ask the fitted ColumnTransformer for its output names instead of stitching them together by hand:
# - OneHotEncoder.get_feature_names() no longer exists in current scikit-learn releases
# - columns forwarded by remainder='passthrough' are included, so the names always line up
#   one-to-one with the model coefficients
# - a dataset without object columns no longer breaks the lookup
fitted_preproc = grid_search.best_estimator_.named_steps['preproc']
prefixed_names = fitted_preproc.get_feature_names_out()

# names come back as '<transformer>__<feature>'; drop the transformer prefix for readable labels
feat_names = [name.split('__', 1)[-1] for name in prefixed_names]

#create df with vip info
coef_info = pd.DataFrame({'feat_names': feat_names, 'vip': vip})

#get sign and magnitude information
coef_info = coef_info.assign(coef_mag = coef_info['vip'].abs(),
                             coef_sign = np.sign(coef_info['vip']))

#sort and plot
coef_info = coef_info.set_index('feat_names').sort_values(by='coef_mag', ascending=False)
sns.barplot(y=coef_info.index, x='coef_mag', hue='coef_sign', data=coef_info, orient='h', dodge=False);

# Performance metrics on test data

In [None]:
# predict on the held-out test set with the selected pipeline, then summarize per-class metrics
test_predictions = grid_search.best_estimator_.predict(X_test)
print(classification_report(y_test, test_predictions))