# Machine Learning Workflow
Complete the steps below to complete the machine learning workflow for this classifier.

In [1]:
import nltk
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to C:\Users\Victor
[nltk_data]     Pontello\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Victor
[nltk_data]     Pontello\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [28]:
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [3]:
def clean_text(text):

    url_str_pattern = "(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"

    # Identify any urls in `text`, and replace each one with the word, `"urlplaceholder"`.
    # Normalize case
    text = re.sub(url_str_pattern,'urlplaceholder',text.lower())
    # Split `text` into tokens.
    words = word_tokenize(text)
    # For each token: lemmatize, and strip leading and trailing white space.
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word.strip()) for word in words]
    
    return words

In [6]:
def load_data():
    df = pd.read_csv('../data/corporate_messaging.csv', encoding='latin-1')
    df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
    X = df.text.values
    y = df.category.values
    return X, y

### Step 1: Load data and perform a train test split
Hint: you can use the [`train_test_split`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) method from sklearn.

In [23]:
# load data
X, y =load_data()

# perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Step 2: Train classifier
* Fit and transform the training data with [`CountVectorizer`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html). Hint: You can include your tokenize function in the `tokenizer` keyword argument!
* Fit and transform these word counts with [`TfidfTransformer`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html).
* Fit or train a classifier to these tfidf values. Hint, you can use the [`RandomeForestClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) or other classifier of your choice.


In [24]:
# Instantiate transformers and classifier
vect = CountVectorizer(tokenizer=clean_text)
tfidf = TfidfTransformer(smooth_idf=False)
clf = RandomForestClassifier()

# Fit and/or transform each to the training data
# Hint: you can use the fit_transform method
X_train = vect.fit_transform(X_train)
X_train = tfidf.fit_transform(X_train)

# Fit or train the classifier
clf.fit(X_train,y_train)


RandomForestClassifier()

### Step 3: Predict on test data
* Transform (no fitting) the test data with the same `CountVectorizer` and `TfidfTransformer`
* Predict labels on these tfidf values.

In [25]:
X_test = vect.transform(X_test)
X_test = tfidf.transform(X_test)

y_pred = clf.predict(X_test)

### Step 4: Display results
Use the variable `labels` to store all unique labels in the model's prediction. Then display a confusion matrix and accuracy score based on the model's predictions. 

Hint: you can use the [`confusion_matrix`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html) in sklearn to generate a confusion matrix.

Hint: to calculate the accuracy, you can use the number of correct predictions divided by the total number of predictions.

In [32]:
labels = list(set(y_pred))
confusion_mat = confusion_matrix(y_test, y_pred,normalize='true')
accuracy = sum(y_pred==y_test)/y_test.shape[0]

print("Labels:", labels)
print("Confusion Matrix:\n", confusion_mat)
print("Accuracy:", accuracy)

Labels: ['Information', 'Dialogue', 'Action']
Confusion Matrix:
 [[0.77419355 0.         0.22580645]
 [0.06521739 0.7826087  0.15217391]
 [0.00725953 0.00181488 0.99092559]]
Accuracy: 0.9403606102635229


# Final Step: Refactor
Organize these steps into the following functions.

In [35]:
def display_results(y_test,y_pred):
    # insert step 4 here
    labels = set(y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred,normalize='true')
    accuracy = sum(y_pred==y_test)/y_test.shape[0]

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)


def main():

    # load data
    X, y =load_data()

    # perform train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Instantiate transformers and classifier
    vect = CountVectorizer(tokenizer=clean_text)
    tfidf = TfidfTransformer(smooth_idf=False)
    clf = RandomForestClassifier()

    # Fit and/or transform each to the training data
    # Hint: you can use the fit_transform method
    X_train = vect.fit_transform(X_train)
    X_train = tfidf.fit_transform(X_train)

    # Fit or train the classifier
    clf.fit(X_train,y_train)

    
    X_test = vect.transform(X_test)
    X_test = tfidf.transform(X_test)

    y_pred = clf.predict(X_test)

    display_results(y_test,y_pred)

In [36]:
# run program
main()

Labels: {'Information', 'Dialogue', 'Action'}
Confusion Matrix:
 [[0.75806452 0.         0.24193548]
 [0.06521739 0.76086957 0.17391304]
 [0.01088929 0.00181488 0.98729583]]
Accuracy: 0.9334257975034674
