# Machine Learning Workflow
Complete the steps below to complete the machine learning workflow for this classifier.

In [1]:
import nltk
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DJ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DJ\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

def load_data():
    df = pd.read_csv('corporate_messaging.csv', encoding='latin-1')
    df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
    X = df.text.values
    y = df.category.values
    return X, y

def tokenize(text):
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

### Step 1: Load data and perform a train test split

In [4]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [5]:
# load data
X, y = load_data()

# perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [21]:
y_train

array(['Information', 'Information', 'Information', ..., 'Action',
       'Information', 'Dialogue'], dtype=object)

In [22]:
X_train

array(['Limited Mortgage Finance Role for U.S. Government Gains Support:  http://t.co/drV92iu2 #Citigroup #BRK',
       'L.C.: Our mission is to enhance quality of life, with good food & beverages & personalized nutrition http://bit.ly/l4GbIt #Nestle #NestleIR',
       'John Foreyt:  The goal of behavioural intervention is to help obese individuals adhere to a healthy lifestyle  #NINS2013',
       ...,
       'Is your community impacted by crime? See how @TheCCJamaica reduced crime w/ fish farms: http://t.co/0uoWaOuX9h #FTCitiAwards #progressmakers',
       "Banks Looking for 'Right Thing' on CEO Pay: Krawcheck: Succession Planning for Business Owners... http://t.co/TZGPWad2YK #Citigroup #BRK",
       'Happy #WorldWaterDay! We are committed to improving access to clean water.  Thank you @safewaternet for all you do!   http://t.co/tWElyXNdZu'],
      dtype=object)

### Step 2: Train classifier
* Fit and transform the training data with `CountVectorizer`. Hint: You can include your tokenize function in the `tokenizer` keyword argument!
* Fit and transform these word counts with `TfidfTransformer`.
* Fit a classifier to these tfidf values.

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate transformers and classifier
vect = CountVectorizer(tokenizer=tokenize)

# Fit and/or transform each to the data
X_train_counts = vect.fit_transform(X_train)


In [23]:
X_train_counts.shape

(1802, 5502)

In [17]:
print(X_train_counts)

  (0, 1)	2
  (0, 333)	1
  (0, 911)	1
  (0, 1154)	1
  (0, 2091)	1
  (0, 2152)	1
  (0, 2247)	1
  (0, 2335)	1
  (0, 3045)	1
  (0, 3358)	1
  (0, 4270)	1
  (0, 4741)	1
  (0, 5023)	1
  (0, 5082)	1
  (1, 1)	2
  (1, 4)	2
  (1, 43)	1
  (1, 57)	1
  (1, 333)	1
  (1, 776)	1
  (1, 1850)	1
  (1, 2139)	1
  (1, 2329)	1
  (1, 2795)	1
  (1, 2937)	1
  :	:
  (1800, 3814)	1
  (1800, 4722)	1
  (1800, 4870)	1
  (1800, 5082)	1
  (1801, 0)	2
  (1801, 1)	1
  (1801, 57)	1
  (1801, 337)	1
  (1801, 370)	1
  (1801, 485)	1
  (1801, 592)	1
  (1801, 1183)	1
  (1801, 1257)	1
  (1801, 1653)	1
  (1801, 2152)	1
  (1801, 2434)	1
  (1801, 2658)	1
  (1801, 4323)	1
  (1801, 4843)	1
  (1801, 4900)	2
  (1801, 5082)	1
  (1801, 5227)	1
  (1801, 5236)	1
  (1801, 5358)	1
  (1801, 5415)	2


In [8]:
from sklearn.feature_extraction.text import TfidfTransformer

# Instantiate transformers and classifier
tfidf = TfidfTransformer()

# Fit and/or transform each to the data using counts from count vectorizer results to compute tf-idf values
X_train_tfidf = tfidf.fit_transform(X_train_counts)


In [9]:
from sklearn.ensemble import RandomForestClassifier
# Fit a classifier to these tfidf values
clf = RandomForestClassifier()
clf.fit(X_train_tfidf, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Step 3: Predict on test data
* **Transform (no fitting) the test data with the same CountVectorizer and TfidfTransformer**
* Predict labels on these tfidf values.

In [10]:
# Transform test data
X_test_counts = vect.transform(X_test)
X_test_tfidf = tfidf.transform(X_test_counts)

# Predict test labels
y_pred = clf.predict(X_test_tfidf)

In [11]:
y_pred.shape

(601,)

### Step 4: Display results
Display a confusion matrix and accuracy score based on the model's predictions.

In [12]:
import numpy as np

In [13]:
labels = np.unique(y_pred)
confusion_mat = confusion_matrix(y_test, y_pred, labels = labels)
accuracy = (y_pred == y_test).mean()

print("Labels:", labels)
print("Confusion Matrix:\n", confusion_mat)
print("Accuracy:", accuracy)

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 75   0  18]
 [  0  29   4]
 [  8   0 467]]
Accuracy: 0.9500831946755408


# Final Step: Refactor
Organize these steps into the following functions.

In [24]:
import nltk
nltk.download(['punkt', 'wordnet'])

import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DJ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DJ\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
def display_results(y_test,  y_pred):
    # insert step 4 here
    labels = np.unique(y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels = labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)
    
    pass


def main():
    # insert steps 1 through 3 here
    # load data
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    vect = CountVectorizer(tokenizer=tokenize)
    tfidf = TfidfTransformer()
    clf = RandomForestClassifier()   
    
    X_train_counts = vect.fit_transform(X_train)   
    X_train_tfidf = tfidf.fit_transform(X_train_counts) 
    clf.fit(X_train_tfidf, y_train)

    X_test_counts = vect.transform(X_test)
    X_test_tfidf = tfidf.transform(X_test_counts)
    y_pred = clf.predict(X_test_tfidf)
    
    # display results
    display_results(y_test, y_pred)

    pass

In [26]:
# run program
main()



Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 94   0  34]
 [  0  19   3]
 [  5   0 446]]
Accuracy: 0.930116472545757
