# Implementing Feature Union
Using the given custom transformer, `StartingVerbExtractor`, add a feature union to your pipeline to incorporate a feature that indicates with a boolean value whether the starting token of a post is identified as a verb.

In [1]:
import nltk
# Download the NLTK data packages listed below; packages that are already
# installed are only reported as up-to-date. Presumably these back the
# tokenizer, the WordNet lemmatizer and the POS tagging done by
# StartingVerbExtractor -- confirm against custom_transformer.
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

[nltk_data] Downloading package punkt to C:\Users\Victor
[nltk_data]     Pontello\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Victor
[nltk_data]     Pontello\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Victor Pontello\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [8]:
from custom_transformer import StartingVerbExtractor

[nltk_data] Downloading package punkt to C:\Users\Victor
[nltk_data]     Pontello\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Victor
[nltk_data]     Pontello\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Victor Pontello\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Build your pipeline to have this structure:
- Pipeline
    - feature union
        - text pipeline
            - count vectorizer
            - TFIDF transformer
        - starting verb extractor
    - classifier

In [6]:
def load_data(filepath='../data/corporate_messaging.csv'):
    """Load the corporate messaging dataset and return texts and labels.

    Only rows whose ``category:confidence`` equals 1 and whose ``category``
    is not ``'Exclude'`` are kept.

    Args:
        filepath: Path of the CSV file to read. Defaults to the location
            used by this notebook, so existing ``load_data()`` calls keep
            working unchanged.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the array of values from the
        ``text`` column and ``y`` is the array of values from the
        ``category`` column, both taken from the filtered rows.
    """
    # The file is decoded as latin-1, exactly as the original loader did.
    df = pd.read_csv(filepath, encoding='latin-1')

    # Keep fully-confident annotations and drop the 'Exclude' category.
    is_confident = df["category:confidence"] == 1
    is_included = df["category"] != 'Exclude'
    df = df[is_confident & is_included]

    X = df["text"].values
    y = df["category"].values
    return X, y


def tokenize(text):
    """Turn a raw message into a list of normalized, lemmatized tokens.

    The text is lower-cased, every URL is replaced by the literal word
    ``urlplaceholder``, the result is split with ``word_tokenize`` and each
    token is stripped of surrounding whitespace and lemmatized.

    Args:
        text: The message to tokenize (a string).

    Returns:
        A list of lemmatized token strings.
    """
    # Raw string: `\w` and `\.` are regex escapes, not string escapes. In a
    # normal string literal they are invalid escape sequences, which newer
    # Python versions warn about. The pattern value itself is unchanged.
    url_str_pattern = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"

    # Lower-case first, so URL schemes written in upper case are matched too,
    # then replace each URL with the placeholder word.
    text = re.sub(url_str_pattern, 'urlplaceholder', text.lower())

    # Split into tokens, then strip and lemmatize each one.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token.strip()) for token in word_tokenize(text)]


def display_results(y_test, y_pred):
    """Print the labels, the row-normalized confusion matrix and the accuracy.

    Args:
        y_test: True labels (array-like).
        y_pred: Predicted labels (array-like), same length as ``y_test``.

    The printed label list is sorted and covers every class that occurs in
    either input. The same list is passed to ``confusion_matrix``, so the
    i-th printed label is the i-th row and column of the printed matrix.
    """
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)

    # A set built from y_pred alone has arbitrary order and misses classes
    # that were never predicted, so it would not line up with the matrix.
    labels = np.unique(np.concatenate((y_test, y_pred)))
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels, normalize='true')

    # Fraction of predictions that match the true label.
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels.tolist())
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)

In [3]:
from sklearn.pipeline import FeatureUnion


def model_pipeline():
    """Assemble the classification pipeline used by this notebook.

    Structure:
        - ``parallel_run``: a ``FeatureUnion`` combining
            - ``text_pipeline``: ``CountVectorizer`` (using ``tokenize``)
              followed by ``TfidfTransformer``
            - ``verb_extractor``: ``StartingVerbExtractor``
        - ``clf``: ``RandomForestClassifier`` with default settings

    Returns:
        The unfitted ``Pipeline``.
    """
    # Text branch: token counts re-weighted by TF-IDF.
    text_branch = Pipeline(steps=[
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
    ])

    # Both branches receive the same input; their outputs are combined.
    combined_features = FeatureUnion(transformer_list=[
        ('text_pipeline', text_branch),
        ('verb_extractor', StartingVerbExtractor()),
    ])

    return Pipeline(steps=[
        ('parallel_run', combined_features),
        ('clf', RandomForestClassifier()),
    ])

In [4]:
def main(random_state=None):
    """Load the data, train the pipeline and print evaluation results.

    Args:
        random_state: Optional seed applied to both the train/test split and
            the random forest so a run can be reproduced. The default
            ``None`` keeps the original unseeded behaviour, so results
            differ from run to run.
    """
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state
    )

    model = model_pipeline()
    # Seed the classifier through its pipeline step name ('clf'); passing
    # None leaves the estimator at its default.
    model.set_params(clf__random_state=random_state)

    # train classifier
    model.fit(X_train, y_train)

    # predict on test data
    y_pred = model.predict(X_test)

    # display results
    display_results(y_test, y_pred)

In [9]:
# Run the full load -> split -> fit -> predict -> report workflow.
# Called without a seed, so the split and the forest are random and the
# printed metrics vary between runs.
main()

Labels: {'Information', 'Dialogue', 'Action'}
Confusion Matrix:
 [[0.89090909 0.         0.10909091]
 [0.03703704 0.77777778 0.18518519]
 [0.00431034 0.         0.99568966]]
Accuracy: 0.9667221297836939
