# Grid Search
Let's incorporate grid search into your modeling process. To start, include an import statement for `GridSearchCV` below.

In [1]:
# Fetch the NLTK resources used further down: the 'punkt' tokenizer models,
# the 'wordnet' corpus for lemmatization and the perceptron POS-tagger model.
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

[nltk_data] Downloading package punkt to C:\Users\Victor
[nltk_data]     Pontello\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Victor
[nltk_data]     Pontello\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Victor Pontello\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
def load_data(data_path='../data/corporate_messaging.csv'):
    """Load the corporate messaging data set and return features and labels.

    Only rows whose ``category:confidence`` equals 1 and whose ``category``
    is not ``'Exclude'`` are kept.

    Args:
        data_path: Path (or any source accepted by ``pandas.read_csv``) of the
            CSV file. Defaults to the location used by this notebook, so
            calling ``load_data()`` with no arguments behaves as before.

    Returns:
        tuple: ``(X, y)`` where ``X`` is an array of the ``text`` column and
        ``y`` is an array of the matching ``category`` values.
    """
    df = pd.read_csv(data_path, encoding='latin-1')

    # Keep only fully confident annotations that belong to a real category.
    keep = (df['category:confidence'] == 1) & (df['category'] != 'Exclude')
    df = df[keep]

    X = df['text'].values
    y = df['category'].values
    return X, y


def tokenize(text):
    """Normalize, tokenize and lemmatize a piece of text.

    The text is lower-cased, every URL is replaced by the literal token
    ``urlplaceholder``, the result is split into word tokens, and each token
    is stripped of surrounding whitespace and lemmatized.

    Args:
        text (str): Raw message text.

    Returns:
        list of str: The cleaned tokens, in their original order.
    """
    # Raw string on purpose: the pattern contains regex escapes (\w, \.) that
    # are not valid *string* escapes and make a plain literal emit a warning.
    url_str_pattern = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"

    # Normalize case, then replace each URL with the word "urlplaceholder".
    text = re.sub(url_str_pattern, 'urlplaceholder', text.lower())

    # Split `text` into tokens.
    words = word_tokenize(text)

    # For each token: strip leading/trailing white space, then lemmatize.
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word.strip()) for word in words]

    return words


def model_pipeline():
    """Assemble the unfitted text-classification pipeline.

    Two feature extractors run side by side inside a ``FeatureUnion`` named
    ``parallel_run``:

    * ``text_pipeline`` - bag-of-words counts (using ``tokenize``) followed by
      a TF-IDF transform;
    * ``verb_extractor`` - the ``StartingVerbExtractor`` feature.

    Their combined output feeds a ``RandomForestClassifier`` step named ``clf``.

    Returns:
        Pipeline: The assembled, unfitted pipeline.
    """
    # Bag-of-words -> TF-IDF branch.
    text_branch = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
    ])

    # Both branches are evaluated on the same input and concatenated.
    feature_union = FeatureUnion([
        ('text_pipeline', text_branch),
        ('verb_extractor', StartingVerbExtractor()),
    ])

    return Pipeline([
        ('parallel_run', feature_union),
        ('clf', RandomForestClassifier()),
    ])


def display_results(cv, y_test, y_pred):
    """Print evaluation results for a fitted grid search.

    Prints the class labels, the confusion matrix normalized over the true
    labels (each row sums to 1), the overall accuracy and the best parameter
    combination found by the search.

    Args:
        cv: Fitted search object exposing ``best_params_``.
        y_test: Array-like of true labels.
        y_pred: Array-like of predicted labels, same length as ``y_test``.
    """
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)

    # Use one sorted label list for both the printout and the matrix so that
    # the i-th printed label is guaranteed to describe the i-th row/column.
    # (A plain set() has arbitrary order and would not line up with the matrix.)
    labels = np.unique(np.concatenate((y_test, y_pred)))
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels, normalize='true')
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels.tolist())
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)
    print("\nBest Parameters:", cv.best_params_)


class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """Transformer yielding one boolean feature per document.

    The feature is True when at least one sentence of the document starts
    with a verb (POS tag ``VB`` or ``VBP``) or with the retweet marker "RT".
    """

    def starting_verb(self, text):
        """Return True if any sentence in ``text`` starts with a verb or "RT".

        Args:
            text (str): A single document.

        Returns:
            bool: True on the first matching sentence, otherwise False
            (including for empty text or text that yields no tokens).
        """
        for sent in sent_tokenize(text):
            # tokenize each sentence into words and tag part of speech
            pos_tags = pos_tag(tokenize(sent))

            # A sentence can produce no tokens at all; skip it rather than
            # failing on the index lookup below.
            if not pos_tags:
                continue

            # index pos_tags to get the first word and part of speech tag
            first_word, first_tag = pos_tags[0]

            # tokenize() lower-cases its input, so the retweet marker has to
            # be compared case-insensitively or it can never match.
            if first_tag in ('VB', 'VBP') or first_word.lower() == 'rt':
                return True

        # No sentence matched (or there were no sentences): always a bool,
        # never None.
        return False

    def fit(self, X, y=None):
        """Nothing to learn; present for the scikit-learn estimator API."""
        return self

    def transform(self, X):
        """Apply ``starting_verb`` to every document in ``X``.

        Args:
            X: Iterable of document strings.

        Returns:
            pandas.DataFrame: One column with one boolean per document.
        """
        X_tagged = pd.Series(X).apply(self.starting_verb).values

        return pd.DataFrame(X_tagged)


### View parameters in pipeline
Before modifying your `build_model` function to include grid search, view the parameters available in your pipeline here.

In [4]:
# Build an unfitted pipeline and list all of its parameters. Nested steps
# appear under keys of the form `<step>__<param>`, which is the key format
# the grid search parameter dictionary uses.
pipeline = model_pipeline()
pipeline.get_params()

{'memory': None,
 'steps': [('parallel_run',
   FeatureUnion(transformer_list=[('text_pipeline',
                                   Pipeline(steps=[('vect',
                                                    CountVectorizer(tokenizer=<function tokenize at 0x000001938A6D7E58>)),
                                                   ('tfidf',
                                                    TfidfTransformer())])),
                                  ('verb_extractor', StartingVerbExtractor())])),
  ('clf', RandomForestClassifier())],
 'verbose': False,
 'parallel_run': FeatureUnion(transformer_list=[('text_pipeline',
                                 Pipeline(steps=[('vect',
                                                  CountVectorizer(tokenizer=<function tokenize at 0x000001938A6D7E58>)),
                                                 ('tfidf',
                                                  TfidfTransformer())])),
                                ('verb_extractor', StartingVerbExt

### Modify your `build_model` function to return a GridSearchCV object.
Try to grid search some parameters in your data transformation steps as well as those for your classifier! Browse the parameters you can search above.

In [5]:
def build_model():
    """Wrap the modeling pipeline in a grid search.

    The grid covers the forest size and bootstrap setting of the classifier
    (``clf``) and the ``smooth_idf`` option of the TF-IDF step, i.e.
    5 x 2 x 2 = 20 parameter combinations.

    Returns:
        GridSearchCV: Unfitted search object built around ``model_pipeline()``.
    """
    # Keys follow the `<step>__<param>` convention of nested estimators.
    search_space = {
        'clf__n_estimators': [10, 50, 100, 200, 400],
        'clf__bootstrap': [True, False],
        'parallel_run__text_pipeline__tfidf__smooth_idf': [True, False],
    }

    return GridSearchCV(model_pipeline(), param_grid=search_space)

### Run program to test
Running grid search can take a while, especially if you are searching over a lot of parameters! To reduce the run time to a few minutes, try commenting out some of your parameters so that the grid search covers just 1 or 2 parameters with a small number of values each. Once you know that works, feel free to add more parameters and see how well your final model can perform! You can try this out on the next page.

In [6]:
def main():
    """Run the whole workflow: load, split, grid-search, evaluate.

    Loads the data, holds out a test split, fits the grid search returned by
    ``build_model`` on the training split and prints the evaluation of its
    predictions on the held-out split.
    """
    messages, categories = load_data()
    X_train, X_test, y_train, y_test = train_test_split(messages, categories)

    grid_search = build_model()
    grid_search.fit(X_train, y_train)
    predictions = grid_search.predict(X_test)

    display_results(grid_search, y_test, predictions)


# Kick off the full run; the grid search inside can take a while.
main()

Labels: {'Information', 'Action', 'Dialogue'}
Confusion Matrix:
 [[0.78947368 0.         0.21052632]
 [0.         0.87878788 0.12121212]
 [0.00660793 0.         0.99339207]]
Accuracy: 0.9484193011647255

Best Parameters: {'clf__bootstrap': False, 'clf__n_estimators': 50, 'parallel_run__text_pipeline__tfidf__smooth_idf': False}
