# Implementing Pipeline
Using what you learned about pipelining, rewrite your machine learning code from the last section to use sklearn's Pipeline. For reference, the previous main function implementation is provided in the second-to-last cell. Refactor this in the last cell.

In [9]:
import nltk
# NLTK packages this notebook relies on: tokenizer models (punkt) and the
# WordNet corpus (wordnet). Kept as the cell's last expression so the
# download status is still displayed.
_NLTK_PACKAGES = ['punkt', 'wordnet']
nltk.download(_NLTK_PACKAGES)

[nltk_data] Downloading package punkt to C:\Users\Victor
[nltk_data]     Pontello\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Victor
[nltk_data]     Pontello\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [11]:
# Generic URL matcher: "http://" or "https://" followed by one or more URL
# characters or %XX escapes. Written as a raw string so the regex escapes
# (such as \( and \)) are not parsed as invalid Python string escapes.
# NOTE(review): nothing visible in this notebook references this constant;
# tokenize() defines its own pattern.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'


def load_data(filepath='../data/corporate_messaging.csv'):
    """Load the corporate messaging CSV and return message texts and labels.

    Only rows whose ``category:confidence`` equals 1 and whose ``category``
    is not ``'Exclude'`` are kept.

    Args:
        filepath: Location of the latin-1 encoded CSV, passed straight to
            ``pandas.read_csv``. Defaults to the path the notebook has always
            used, so existing ``load_data()`` calls behave as before.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds the ``text`` column values and
        ``y`` the ``category`` column values of the retained rows.
    """
    df = pd.read_csv(filepath, encoding='latin-1')
    keep = (df["category:confidence"] == 1) & (df["category"] != 'Exclude')
    kept = df.loc[keep]
    X = kept["text"].values
    y = kept["category"].values
    return X, y


def tokenize(text):
    """Turn a raw message into a list of lower-cased, lemmatized tokens.

    The text is lower-cased, every URL is replaced by the literal token
    ``urlplaceholder``, the result is split with NLTK's ``word_tokenize``,
    and each token is stripped of surrounding whitespace and lemmatized
    with ``WordNetLemmatizer``.

    Args:
        text (str): Message to tokenize.

    Returns:
        list: Lemmatized tokens in order of appearance.
    """
    # Raw string: the pattern is full of regex escapes (\w, \.) that are not
    # valid Python string escapes and trigger a warning in a normal literal.
    # Groups: scheme, host with at least one dot, optional path/query.
    url_str_pattern = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"

    # Lower-case before matching: the pattern only lists lower-case schemes,
    # so this also catches URLs written with an upper-case scheme.
    text = re.sub(url_str_pattern, 'urlplaceholder', text.lower())

    # Split `text` into tokens.
    words = word_tokenize(text)

    # Lemmatize each token after stripping leading and trailing whitespace.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word.strip()) for word in words]


def display_results(y_test, y_pred):
    """Print the class labels, the row-normalized confusion matrix and accuracy.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned element-wise with ``y_test``.

    Returns:
        None. The results are printed.
    """
    # Sorted union of true and predicted labels, passed explicitly to
    # confusion_matrix so the printed label order is exactly the matrix's
    # row/column order. A set built from y_pred alone prints in arbitrary
    # order and omits any class that was never predicted.
    labels = sorted(set(y_test) | set(y_pred))
    # normalize='true' scales each row by the number of true samples in it.
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels, normalize='true')
    # Fraction of positions where prediction and truth agree.
    accuracy = np.mean(np.asarray(y_pred) == np.asarray(y_test))

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)

In [12]:
def old_main(random_state=None):
    """Reference implementation: train and evaluate step by step, without a Pipeline.

    Loads the data, splits it, fits a CountVectorizer -> TfidfTransformer ->
    RandomForestClassifier chain on the training split by hand, predicts on
    the test split and prints the evaluation via ``display_results``.

    Args:
        random_state: Optional seed forwarded to ``train_test_split`` and
            ``RandomForestClassifier``. The default ``None`` leaves both
            unseeded, as before; pass an int to get a repeatable split and
            model.

    Returns:
        None.
    """
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state
    )

    vect = CountVectorizer(tokenizer=tokenize)
    tfidf = TfidfTransformer()
    clf = RandomForestClassifier(random_state=random_state)

    # train classifier: each transformer is fitted on the training split only
    X_train_counts = vect.fit_transform(X_train)
    X_train_tfidf = tfidf.fit_transform(X_train_counts)
    clf.fit(X_train_tfidf, y_train)

    # predict on test data, reusing the already-fitted transformers
    X_test_counts = vect.transform(X_test)
    X_test_tfidf = tfidf.transform(X_test_counts)
    y_pred = clf.predict(X_test_tfidf)

    # display results
    display_results(y_test, y_pred)

In [15]:
from sklearn.pipeline import Pipeline


def main(random_state=None):
    """Train and evaluate the message classifier using an sklearn Pipeline.

    Loads the data, splits it, fits a CountVectorizer -> TfidfTransformer ->
    RandomForestClassifier pipeline on the training split, predicts on the
    test split and prints the evaluation via ``display_results``.

    Args:
        random_state: Optional seed forwarded to ``train_test_split`` and
            ``RandomForestClassifier``. The default ``None`` leaves both
            unseeded, as before; pass an int to get a repeatable split and
            model.

    Returns:
        None.
    """
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state
    )

    # build pipeline: the steps run in the listed order on fit and predict
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier(random_state=random_state)),
    ])

    # train classifier
    pipeline.fit(X_train, y_train)

    # predict on test data
    y_pred = pipeline.predict(X_test)

    # display results
    display_results(y_test, y_pred)

In [16]:
# Run the Pipeline-based implementation. Called without a seed, the split and
# the forest are random, so the printed figures change from run to run.
main()

Labels: {'Dialogue', 'Action', 'Information'}
Confusion Matrix:
 [[0.80530973 0.         0.19469027]
 [0.         0.84       0.16      ]
 [0.00215983 0.00215983 0.99568035]]
Accuracy: 0.9534109816971714


In [17]:
# Run the step-by-step reference implementation. Called without a seed, it
# draws its own random split, so its figures are not directly comparable with
# another run's.
old_main()

Labels: {'Dialogue', 'Action', 'Information'}
Confusion Matrix:
 [[0.73333333 0.         0.26666667]
 [0.03571429 0.85714286 0.10714286]
 [0.0042735  0.         0.9957265 ]]
Accuracy: 0.9434276206322796
