# Machine Learning Workflow
Work through the steps below to build the machine learning workflow for this classifier.

In [1]:
# Fetch the NLTK resources requested below (the 'punkt' and 'wordnet' packages).
# NOTE(review): newer NLTK releases may look for 'punkt_tab' rather than 'punkt'
# when tokenizing — confirm against the installed NLTK version.
import nltk
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
# Pattern used to find http/https URLs in raw message text.
# Written as a raw string so the `\(` and `\)` escapes reach the regex engine
# verbatim instead of relying on Python passing unknown string escapes through
# (which triggers an invalid-escape warning on recent Python versions).
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

def load_data(filepath='corporate_messaging.csv'):
    """Load the corporate messaging dataset and return texts and labels.

    Only rows whose `category:confidence` equals 1 and whose `category`
    is not 'Exclude' are kept.

    Args:
        filepath: Path to the CSV file, read with latin-1 encoding.
            Defaults to 'corporate_messaging.csv' in the working directory.

    Returns:
        A tuple ``(X, y)`` of arrays: the `text` column and the `category`
        column of the retained rows.
    """
    df = pd.read_csv(filepath, encoding='latin-1')
    keep = (df['category:confidence'] == 1) & (df['category'] != 'Exclude')
    df = df[keep]
    # Bracket access avoids any clash between column names and DataFrame attributes.
    X = df['text'].values
    y = df['category'].values
    return X, y

def tokenize(text):
    """Split a message into normalized tokens.

    Every URL matched by `url_regex` is replaced with the literal token
    "urlplaceholder"; the text is then word-tokenized, and each token is
    lemmatized, lower-cased and stripped of surrounding whitespace.

    Args:
        text: Raw message string.

    Returns:
        List of cleaned token strings, in order of appearance.
    """
    # Substitute all matches in a single pass. Replacing each detected URL with
    # str.replace could corrupt a longer URL that starts with a shorter URL
    # found elsewhere in the same text (leaving e.g. "urlplaceholder/b").
    text = re.sub(url_regex, "urlplaceholder", text)

    lemmatizer = WordNetLemmatizer()
    # NOTE(review): tokens are lemmatized before lower-casing; if the lemmatizer
    # is case-sensitive, capitalized tokens may pass through unlemmatized.
    # Confirm before reordering, since that would change the extracted features.
    return [
        lemmatizer.lemmatize(tok).lower().strip()
        for tok in word_tokenize(text)
    ]

### Step 1: Load data and perform a train test split
Hint: you can use the [`train_test_split`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) method from sklearn.

In [4]:
from sklearn.model_selection import train_test_split

# Pull the message texts (X) and their category labels (y)
X, y = load_data()

# Reserve 30% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

### Step 2: Train classifier
* Fit and transform the training data with [`CountVectorizer`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html). Hint: You can include your tokenize function in the `tokenizer` keyword argument!
* Fit and transform these word counts with [`TfidfTransformer`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html).
* Fit or train a classifier to these tfidf values. Hint: you can use the [`RandomForestClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) or another classifier of your choice.


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Word-count vectorizer that splits text with the custom tokenize function
vect = CountVectorizer(tokenizer=tokenize)

# Tf-idf transformer with idf smoothing switched off
tfidf = TfidfTransformer(smooth_idf=False)

# Random forest classifier with a fixed seed
clf = RandomForestClassifier(random_state=42)

# Fit both feature extractors on the training texts and transform them
X_train_counts = vect.fit_transform(X_train)
X_train_tfidf = tfidf.fit_transform(X_train_counts)

# Train the classifier on the tf-idf features
model = clf.fit(X_train_tfidf, y_train)


### Step 3: Predict on test data
* Transform (no fitting) the test data with the same `CountVectorizer` and `TfidfTransformer`
* Predict labels on these tfidf values.

In [6]:
# Apply the already-fitted vectorizer and tf-idf transformer (transform only, no refit)
X_test_counts = vect.transform(X_test)
X_test_tfidf = tfidf.transform(X_test_counts)

# Predict a label for each held-out message
y_pred = model.predict(X_test_tfidf)

In [7]:
# Show the distinct labels that appear in the predictions
set(y_pred)

{'Action', 'Dialogue', 'Information'}

### Step 4: Display results
Use the variable `labels` to store all unique labels in the model's prediction. Then display a confusion matrix and accuracy score based on the model's predictions. 

Hint: you can use the [`confusion_matrix`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html) in sklearn to generate a confusion matrix.

Hint: to calculate the accuracy, you can use the number of correct predictions divided by the total number of predictions.

In [8]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Derive the label order from the data instead of hard-coding it. Labels seen in
# either array are included so that no test sample is left out of the matrix.
labels = sorted(set(y_test) | set(y_pred))
confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
accuracy = accuracy_score(y_test, y_pred)

print("Labels:", labels)
print("Confusion Matrix:\n", confusion_mat)
print("Accuracy:", accuracy)

Labels: ['Action', 'Dialogue', 'Information']
Confusion Matrix:
 [[101   0  23]
 [  1  36   9]
 [ 12   2 537]]
Accuracy: 0.934812760055


# Final Step: Refactor
Organize these steps into the following functions.

In [9]:
def display_results(y_test, y_pred, labels=None):
    """Print the label order, confusion matrix and accuracy of predictions.

    Args:
        y_test: True labels of the evaluated samples.
        y_pred: Predicted labels, aligned with `y_test`.
        labels: Optional label order for the confusion matrix rows and
            columns. When omitted, the sorted labels found in either
            `y_test` or `y_pred` are used.

    Returns:
        None. The results are written to standard output.
    """
    if labels is None:
        # Derive the labels from the data rather than hard-coding them, and
        # include labels from both arrays so no sample is left out of the matrix.
        labels = sorted(set(y_test) | set(y_pred))
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = accuracy_score(y_test, y_pred)
    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)


def main():
    """Run the whole workflow: load, split, vectorize, train, predict, report."""
    # Step 1: load the data and split it 70/30 with a fixed seed
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Step 2: build the feature extractors and the classifier
    vect = CountVectorizer(tokenizer=tokenize)
    tfidf = TfidfTransformer(smooth_idf=False)
    clf = RandomForestClassifier(random_state=42)

    # Fit the extractors on the training texts, then train on the tf-idf features
    train_features = tfidf.fit_transform(vect.fit_transform(X_train))
    clf.fit(train_features, y_train)

    # Step 3: transform the test texts with the fitted extractors and predict
    test_features = tfidf.transform(vect.transform(X_test))
    y_pred = clf.predict(test_features)

    # Step 4: report the confusion matrix and accuracy
    display_results(y_test, y_pred)
    

In [10]:
# Execute the complete workflow end to end and print the evaluation results
main()

Labels: ['Action', 'Dialogue', 'Information']
Confusion Matrix:
 [[101   0  23]
 [  1  36   9]
 [ 12   2 537]]
Accuracy: 0.934812760055
