## Process as Pipeline
A final pipeline that can be tuned with `GridSearchCV`.

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.metrics import make_scorer, accuracy_score, recall_score, fbeta_score, precision_score, roc_auc_score

from wbcustom.feature_extraction import FeatureExtraction
from wbcustom.nsfw_cleaner import NsfwCleaner

In [17]:
# Three-stage pipeline: a FeatureUnion of two transformers, then the
# NsfwCleaner step, then an AdaBoost classifier. The step names defined here
# are the prefixes used for the parameter-grid keys.
ab_pipe = Pipeline(steps=[
    (
        'features',
        FeatureUnion(transformer_list=[
            ('cvec', CountVectorizer()),
            ('detect_nsfw', FeatureExtraction()),
        ]),
    ),
    ('removal', NsfwCleaner()),
    ('model', AdaBoostClassifier()),
])

# Parameter grid for the pipeline above. Each key is
# ``<step>__<sub-step>__<parameter>`` (or ``<step>__<parameter>``) and each
# value is the list of candidates to try.
ab_pipe_params = dict(
    features__cvec__ngram_range=[(1, 1)],
    features__cvec__stop_words=['english'],
    model__n_estimators=[10, 20],
)

# Metrics evaluated for every grid-search candidate, keyed by the name that
# shows up in ``cv_results_`` (e.g. ``mean_test_recall_score``).
scorers = dict(
    precision_score=make_scorer(precision_score),
    recall_score=make_scorer(recall_score),
    accuracy_score=make_scorer(accuracy_score),
)

def gs_wrapper(pipe = ab_pipe, pipe_params = ab_pipe_params, scorers=scorers):
    """Grid-search ``pipe``, print evaluation metrics and plot importances.

    The search is fitted on the notebook-level globals ``X_train`` and
    ``y_train`` and evaluated on ``X_test`` and ``y_test``; the cell that
    creates them must have been run first, otherwise a ``NameError`` is
    raised.

    Parameters
    ----------
    pipe : Pipeline
        Pipeline to tune. For the feature-importance plot it must have a
        ``'features'`` step that is a ``FeatureUnion`` and a ``'model'`` step
        exposing ``feature_importances_``.
    pipe_params : dict
        Parameter grid handed to ``GridSearchCV``.
    scorers : dict
        Mapping of scorer name to scorer. It must contain ``'recall_score'``,
        which is the metric used to refit the best estimator.

    Returns
    -------
    None
        Results are printed and plotted.
    """
    # Imported locally because the notebook's import cell does not bring
    # these two names into scope.
    from sklearn.metrics import classification_report, confusion_matrix

    gridsearch = GridSearchCV(pipe, pipe_params, cv=5, verbose=1, n_jobs=-1,
                              scoring=scorers, refit='recall_score')
    gs = gridsearch.fit(X_train, y_train)

    # With multi-metric scoring, ``score`` reports the ``refit`` metric
    # (recall here).
    print('Train Score: ', gs.score(X_train, y_train))
    print('Test Score: ', gs.score(X_test, y_test))

    # Predict once and reuse the result for both reports.
    y_pred = gs.predict(X_test)

    # use the classification report function for more classification metrics
    print(classification_report(y_test, y_pred))
    print('---')

    # A bare expression inside a function is discarded, so print explicitly.
    print(confusion_matrix(y_test, y_pred))

    print(gs.best_params_)

    # get feature importances from the model, plot
    # Steps are looked up by name: positional indexing picked the 'removal'
    # step instead of the classifier.
    best_pipe = gs.best_estimator_
    fi = best_pipe.named_steps['model'].feature_importances_

    fn = []
    for _, transformer in best_pipe.named_steps['features'].transformer_list:
        fn.extend(_feature_names(transformer))

    # NOTE(review): assumes the intermediate 'removal' step keeps the number
    # and order of columns produced by 'features'; otherwise the names will
    # not line up with the importances - confirm against NsfwCleaner.
    pd.DataFrame(fi, index=fn).sort_values(by=0, ascending=False).head(20).plot(kind='barh', figsize=(12,7))


def _feature_names(transformer):
    """Return the output feature names of ``transformer`` as a plain list.

    Prefers ``get_feature_names_out`` (current scikit-learn API) and falls
    back to the older ``get_feature_names`` when that is all the transformer
    provides.
    """
    getter = getattr(transformer, 'get_feature_names_out', None)
    if getter is None:
        getter = transformer.get_feature_names
    return list(getter())



# Run the grid search with the default pipeline, parameter grid and scorers.
gs_wrapper()

NameError: name 'X_train' is not defined

In [13]:
# Plot mean train/test score (with a +/- 1 std band) for each scorer against
# one searched parameter, marking the best test score per scorer.
#
# Relies on two names that must be defined by an earlier cell:
#   results - a ``cv_results_``-style mapping from a fitted GridSearchCV
#   scoring - the collection of scorer names used in that search
# NOTE(review): the x-axis uses ``param_min_samples_split`` and fixed axis
# limits, none of which match the parameters in ``ab_pipe_params`` - confirm
# which search this cell is meant to visualise.
plt.figure(figsize=(13, 13))
plt.title("GridSearchCV evaluating using multiple scorers simultaneously",
          fontsize=16)

plt.xlabel("min_samples_split")
plt.ylabel("Score")

ax = plt.gca()
ax.set_xlim(0, 402)
ax.set_ylim(0.73, 1)

# Get the regular numpy array from the MaskedArray
X_axis = np.array(results['param_min_samples_split'].data, dtype=float)

# NOTE(review): only two colours are supplied, so ``zip`` silently drops any
# scorer beyond the first two (in sorted order).
for scorer, color in zip(sorted(scoring), ['g', 'k']):
    for sample, style in (('train', '--'), ('test', '-')):
        sample_score_mean = results['mean_%s_%s' % (sample, scorer)]
        sample_score_std = results['std_%s_%s' % (sample, scorer)]
        # The shaded std band is only visible for the test curve
        # (alpha is 0 for train).
        ax.fill_between(X_axis, sample_score_mean - sample_score_std,
                        sample_score_mean + sample_score_std,
                        alpha=0.1 if sample == 'test' else 0, color=color)
        ax.plot(X_axis, sample_score_mean, style, color=color,
                alpha=1 if sample == 'test' else 0.7,
                label="%s (%s)" % (scorer, sample))

    # First candidate ranked 1 on the test split for this scorer
    best_index = np.nonzero(results['rank_test_%s' % scorer] == 1)[0][0]
    best_score = results['mean_test_%s' % scorer][best_index]

    # Plot a dotted vertical line at the best score for that scorer marked by x
    ax.plot([X_axis[best_index], ] * 2, [0, best_score],
            linestyle='-.', color=color, marker='x', markeredgewidth=3, ms=8)

    # Annotate the best score for that scorer
    ax.annotate("%0.2f" % best_score,
                (X_axis[best_index], best_score + 0.005))

plt.legend(loc="best")
# Pass a real boolean: the string 'off' is truthy and would switch the grid
# on in current matplotlib.
plt.grid(False)
plt.show()

NameError: name 'plt' is not defined