# Grid Search
Let's incorporate grid search into your modeling process. To start, include an import statement for `GridSearchCV` below.

In [1]:
# Print the value returned by os.cpu_count() for the machine running this kernel.
import os
print(os.cpu_count())

1


In [2]:
import nltk
# Download the three NLTK resources named below.
# NOTE(review): presumably these back word_tokenize / sent_tokenize, WordNetLemmatizer
# and nltk.pos_tag used later in this notebook — confirm against the NLTK docs.
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [4]:
# Raw string: the regex escapes `\(` and `\)` are passed to `re` verbatim instead
# of being parsed as (invalid) Python string escapes, which triggers a
# DeprecationWarning/SyntaxWarning. The pattern value itself is unchanged.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'


def tokenize(text):
    """Turn ``text`` into a list of normalised tokens.

    Every substring matched by ``url_regex`` is first replaced with the
    literal ``"urlplaceholder"``. The result is split with ``word_tokenize``
    and each token is lemmatized, lower-cased and stripped of surrounding
    whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        list: The cleaned tokens, in their original order.
    """
    # URLs are detected once on the incoming text, then substituted one by one.
    for url in re.findall(url_regex, text):
        text = text.replace(url, "urlplaceholder")

    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(tok).lower().strip()
        for tok in word_tokenize(text)
    ]


class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """Transformer that emits one boolean per document.

    The value is True when at least one sentence of the document starts with
    a token tagged ``VB``/``VBP`` or with the token ``rt`` (any casing).
    """

    def starting_verb(self, text):
        """Return True if any sentence in ``text`` starts with a verb or 'rt'.

        Args:
            text: A single document (string).

        Returns:
            bool: True on the first sentence whose leading token is tagged
            ``VB`` or ``VBP`` or equals ``rt`` ignoring case; False otherwise,
            including when ``text`` yields no sentences or tokens.
        """
        for sentence in nltk.sent_tokenize(text):
            pos_tags = nltk.pos_tag(tokenize(sentence))
            if not pos_tags:
                # A sentence that tokenizes to nothing has no first word;
                # indexing pos_tags[0] here used to raise IndexError.
                continue
            first_word, first_tag = pos_tags[0]
            # tokenize() lower-cases every token, so a comparison against the
            # upper-case literal 'RT' could never succeed; compare case-insensitively.
            if first_tag in ('VB', 'VBP') or first_word.lower() == 'rt':
                return True
        return False

    def fit(self, x, y=None):
        """No-op fit; returns ``self`` so the transformer can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Apply ``starting_verb`` to every element of ``X``.

        Returns:
            pandas.DataFrame: A single column holding the per-document result.
        """
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

### View parameters in pipeline
Before modifying your `build_model` function to include grid search, view the parameters in your pipeline here.

In [5]:
# Two top-level steps: 'features' (a union of the count -> tf-idf text pipeline
# and the StartingVerbExtractor flag) followed by the 'clf' random forest.
pipeline = Pipeline(steps=[
    (
        'features',
        FeatureUnion(transformer_list=[
            (
                'text_pipeline',
                Pipeline(steps=[
                    ('vect', CountVectorizer(tokenizer=tokenize)),
                    ('tfidf', TfidfTransformer()),
                ]),
            ),
            ('starting_verb', StartingVerbExtractor()),
        ]),
    ),
    ('clf', RandomForestClassifier()),
])

In [6]:
# Show every parameter of the pipeline; these are the parameters available to search over.
pipeline.get_params()

{'memory': None, 'steps': [('features', FeatureUnion(n_jobs=1,
          transformer_list=[('text_pipeline', Pipeline(memory=None,
        steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_... smooth_idf=True, sublinear_tf=False, use_idf=True))])), ('starting_verb', StartingVerbExtractor())],
          transformer_weights=None)),
  ('clf',
   RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
               max_depth=None, max_features='auto', max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
               oob_score=False, random_state=None, verbose=0,
               warm_start=False))], 'features': Featu

### Modify your `build_model` function to return a GridSearchCV object.
Try to grid search some parameters in your data transformation steps as well as those for your classifier! Browse the parameters you can search above.

In [7]:
def build_model(random_state=None, cv=5):
    """Build the text-classification pipeline wrapped in a grid search.

    The pipeline unions TF-IDF text features with the ``StartingVerbExtractor``
    flag and feeds them to a ``RandomForestClassifier``.

    Args:
        random_state: Seed handed to the ``RandomForestClassifier``. The
            default ``None`` leaves the forest unseeded, exactly as before;
            pass an int to make repeated searches comparable.
        cv: Value forwarded to ``GridSearchCV``'s ``cv`` argument (default 5,
            the value that was previously hard-coded).

    Returns:
        GridSearchCV: An unfitted search object created with ``refit=True``.
    """
    pipeline = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            ('starting_verb', StartingVerbExtractor())
        ])),

        ('clf', RandomForestClassifier(random_state=random_state))
    ])

    # Grid keys use the `step__substep__param` path into the pipeline.
    # 2 x 2 x 2 = 8 parameter combinations are evaluated.
    parameters = {
        'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
        'clf__n_estimators': [80, 100],
        'clf__max_depth': [4, 5],
    }

    return GridSearchCV(pipeline, param_grid=parameters, cv=cv, refit=True)


def pipeline_without_transformers(random_state=100):
    """Return a pipeline whose only step is the random forest classifier.

    No grid search and no feature transformers are involved; callers are
    expected to pass already-transformed features to ``fit``/``predict``.

    Args:
        random_state: Seed for the ``RandomForestClassifier``. Defaults to
            100, the value that was previously hard-coded. Note that
            ``pipeline_with_transformers`` seeds its forest with 200, so pass
            200 here when the two pipelines are to be compared like for like.

    Returns:
        Pipeline: An unfitted single-step pipeline with the step named ``'clf'``.
    """
    return Pipeline([
        ('clf', RandomForestClassifier(random_state=random_state)),
    ])
    


def pipeline_with_transformers(random_state=200):
    """Return a pipeline that vectorizes, weights, scales and classifies raw text.

    Steps, in order: ``CountVectorizer`` (using ``tokenize``),
    ``TfidfTransformer``, ``StandardScaler(with_mean=False)`` and a
    ``RandomForestClassifier``.

    Args:
        random_state: Seed for the ``RandomForestClassifier``. Defaults to
            200, the value that was previously hard-coded. Note that
            ``pipeline_without_transformers`` seeds its forest with 100, so
            pass 100 here when the two pipelines are to be compared like for
            like.

    Returns:
        Pipeline: An unfitted four-step pipeline.
    """
    return Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', RandomForestClassifier(random_state=random_state)),
    ])

### Run program to test
Running grid search can take a while, especially if you are searching over a lot of parameters! If you want to reduce it to a few minutes, try commenting out some of your parameters to grid search over just 1 or 2 parameters with a small number of values each. Once you know that works, feel free to add more parameters and see how well your final model can perform! You can try this out on the next page.

In [8]:
def load_data(filepath='corporate_messaging.csv', encoding='latin-1'):
    """Load the messaging CSV and return message texts and their categories.

    Only rows whose ``category:confidence`` equals 1 and whose ``category``
    is not ``'Exclude'`` are kept.

    Args:
        filepath: Path of the CSV file to read. Defaults to the file name that
            was previously hard-coded, so ``load_data()`` behaves as before.
        encoding: Text encoding passed to ``pandas.read_csv``.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the array of the ``text`` column and
        ``y`` the array of the ``category`` column for the retained rows.
    """
    df = pd.read_csv(filepath, encoding=encoding)
    keep = (df["category:confidence"] == 1) & (df['category'] != 'Exclude')
    df = df[keep]
    X = df.text.values
    y = df.category.values
    return X, y


def display_results(cv, y_test, y_pred):
    """Print label sets, the confusion matrix and accuracy for some predictions.

    Args:
        cv: Not used inside this function; kept in the signature for callers.
        y_test: True labels.
        y_pred: Predicted labels, compared element-wise with ``y_test``.
    """
    true_labels = np.unique(y_test)
    predicted_labels = np.unique(y_pred)

    # The matrix is laid out according to the labels found in y_test.
    matrix = confusion_matrix(y_test, y_pred, labels=true_labels)
    # Share of positions where the prediction equals the true label.
    hit_rate = (y_pred == y_test).mean()

    print("Predicted Labels:", predicted_labels)
    print("True Labels:", true_labels)
    print("Confusion Matrix:\n", matrix)
    print("Accuracy:", hit_rate)
    
    
def main():
    """Load the data, fit the grid-search model on a train split and print test results."""
    features, targets = load_data()
    train_X, test_X, train_y, test_y = train_test_split(features, targets)

    grid = build_model()
    grid.fit(train_X, train_y)
    predictions = grid.predict(test_X)

    display_results(grid, test_y, predictions)

In [9]:
# Experiment: apply the count -> tf-idf -> scaling transformers by hand, then fit
# the classifier-only pipeline on the already-transformed features.
X, y = load_data()
# Fixed seed so the train/test split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

countvectorizer = CountVectorizer(tokenizer=tokenize)
tfidftransformer = TfidfTransformer()
# NOTE(review): with_mean=False presumably avoids centering the sparse matrix — confirm.
stdscaler = StandardScaler(with_mean=False)

# Fit each transformer on the training data only, chaining their outputs.
X_train_countvectorizer = countvectorizer.fit_transform(X_train)
X_train_tfidftransformer = tfidftransformer.fit_transform(X_train_countvectorizer)
X_train_stdscaler = stdscaler.fit_transform(X_train_tfidftransformer)

# Re-use the fitted transformers on the test data (transform only, no refit).
X_test_countvectorizer = countvectorizer.transform(X_test)
X_test_tfidftransformer = tfidftransformer.transform(X_test_countvectorizer)
X_test_stdscaler = stdscaler.transform(X_test_tfidftransformer)

# The classifier-only pipeline receives the pre-transformed features.
model_without = pipeline_without_transformers()
model_without.fit(X_train_stdscaler, y_train)

y_pred_train = model_without.predict(X_train_stdscaler)
y_pred_test = model_without.predict(X_test_stdscaler)

print('Train metrics without transformers')
display_results(model_without, y_train, y_pred_train)

print('')

print('Test metrics without transformers')
display_results(model_without, y_test, y_pred_test)

Train metrics without transformers
Predicted Labels: ['Action' 'Dialogue' 'Information']
True Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 358    0    3]
 [   0   82    1]
 [   1    0 1357]]
Accuracy: 0.997225305216

Test metrics without transformers
Predicted Labels: ['Action' 'Dialogue' 'Information']
True Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 77   0  18]
 [  1  31   9]
 [ 11   0 454]]
Accuracy: 0.935108153078


In [10]:
# Experiment: same split as the previous cell, but the transformers live inside
# the pipeline, so it is fitted and queried on the raw text directly.
# Note: pipeline_with_transformers() seeds its forest with 200 while
# pipeline_without_transformers() uses 100, so metric differences between the
# two cells may stem from the seed rather than from the pipeline structure.
model_with = pipeline_with_transformers()
model_with.fit(X_train, y_train)

# These reassign y_pred_train / y_pred_test from the previous cell.
y_pred_train = model_with.predict(X_train)
y_pred_test = model_with.predict(X_test)

print('Train metrics with transformers')
display_results(model_with, y_train, y_pred_train)

print('')

print('Test metrics with transformers')
display_results(model_with, y_test, y_pred_test)

Train metrics with transformers
Predicted Labels: ['Action' 'Dialogue' 'Information']
True Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 359    0    2]
 [   0   83    0]
 [   2    0 1356]]
Accuracy: 0.997780244173

Test metrics with transformers
Predicted Labels: ['Action' 'Dialogue' 'Information']
True Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 79   0  16]
 [  2  28  11]
 [ 14   1 450]]
Accuracy: 0.926788685524
