# Machine Learning Workflow
Work through the steps below to build the full machine learning workflow for this classifier.

In [10]:
import nltk
# Fetch the 'punkt' and 'wordnet' NLTK data packages; presumably these back
# word_tokenize and WordNetLemmatizer, which tokenize() relies on further down.
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\victo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\victo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

In [12]:
# Pattern used to detect http/https URLs in message text.
# Written as a raw string so the regex escapes \( and \) are not interpreted as
# (invalid) Python string escapes, which trigger a warning on recent Python versions.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

def load_data(filepath='corporate_messaging.csv'):
    """Load the messaging dataset and return texts and category labels.

    Only rows whose ``category:confidence`` equals 1 and whose ``category``
    is not ``'Exclude'`` are kept.

    Args:
        filepath: Path to the CSV file (read with latin-1 encoding). Defaults
            to ``'corporate_messaging.csv'`` so existing ``load_data()`` calls
            behave as before.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: the ``text`` column values and the
        matching ``category`` column values.
    """
    df = pd.read_csv(filepath, encoding='latin-1')
    # Keep fully-confident annotations only and drop the 'Exclude' category.
    keep = (df['category:confidence'] == 1) & (df['category'] != 'Exclude')
    df = df[keep]
    X = df['text'].values
    y = df['category'].values
    return X, y

def tokenize(text):
    """Turn raw message text into a list of normalized tokens.

    Every URL matched by ``url_regex`` is replaced by the literal
    ``"urlplaceholder"``; the text is then split with ``word_tokenize`` and
    each token is lemmatized, lower-cased and stripped of surrounding
    whitespace.

    Args:
        text: The message string to tokenize.

    Returns:
        List of cleaned token strings, in their original order.
    """
    # Mask URLs first so they end up as a single, uniform token.
    for found_url in re.findall(url_regex, text):
        text = text.replace(found_url, "urlplaceholder")

    words = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    return [lemmatizer.lemmatize(word).lower().strip() for word in words]

### Step 1: Load data and perform a train test split

In [13]:
# Step 1a: read the filtered messages (X) and their category labels (y)
X, y = load_data()

# Step 1b: hold out a test set; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [14]:
# Sanity check: number of training labels produced by the split
y_train.shape

(1802,)

### Step 2: Train classifier
* Fit and transform the training data with `CountVectorizer`. Hint: You can include your tokenize function in the `tokenizer` keyword argument!
* Fit and transform these word counts with `TfidfTransformer`.
* Fit a classifier to these tfidf values.

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import GaussianNB

# Instantiate transformers and classifier
vect = CountVectorizer(tokenizer=tokenize)
tfidf = TfidfTransformer()
clf = GaussianNB()

# Learn the vocabulary from the training text and build the count matrix in one pass
x_train_transfomred = vect.fit_transform(x_train)
print(x_train_transfomred.shape)

# Re-weight the raw counts with tf-idf (fitted on the training data only)
x_train_tfidf = tfidf.fit_transform(x_train_transfomred)
print(x_train_tfidf.shape)

# Convert to a dense array for GaussianNB. Use .toarray() (plain ndarray) instead of
# .todense() (np.matrix): newer scikit-learn releases reject np.matrix input, and
# .toarray() is also what the prediction step uses.
x_train_tfidf = x_train_tfidf.toarray()

# train (kept as the last expression so the fitted estimator is displayed)
clf.fit(x_train_tfidf, y_train)



(1802, 5503)
(1802, 5503)


GaussianNB(priors=None, var_smoothing=1e-09)

### Step 3: Predict on test data
* Transform (no fitting) the test data with the same CountVectorizer and TfidfTransformer
* Predict labels on these tfidf values.

In [16]:
# Apply the already-fitted vectorizer and tf-idf weighting to the test text
# (transform only, nothing is re-fitted here)
x_test_transformed = vect.transform(x_test)
x_test_tfidf = tfidf.transform(x_test_transformed)

# Predict the test labels from the dense feature array
y_pred = clf.predict(x_test_tfidf.toarray())

### Step 4: Display results
Display a confusion matrix and accuracy score based on the model's predictions.

In [17]:
# Sanity check: one prediction per test sample
y_pred.shape

(601,)

In [18]:
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np

# Build the label set from both the true and the predicted values, so a class that is
# predicted but never occurs in y_test still gets its own row/column instead of being
# silently left out of the matrix.
labels = np.union1d(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
accuracy = accuracy_score(y_test, y_pred)

print("Labels:", labels)
print("Confusion Matrix:\n", confusion_mat)
print("Accuracy:", accuracy)

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 57   0  38]
 [  8  21  12]
 [ 19   1 445]]
Accuracy: 0.870216306156406


# Final Step: Refactor
Organize these steps into the following functions.

In [19]:

def display_results(y_test, y_pred):
    """Print the label set, confusion matrix and accuracy for a set of predictions.

    Args:
        y_test: Array-like of true labels.
        y_pred: Array-like of predicted labels, aligned with ``y_test``.

    Returns:
        None. The results are written to standard output.
    """
    from sklearn.metrics import confusion_matrix, accuracy_score
    import numpy as np

    # Use the union of true and predicted labels: taking them from y_test alone would
    # drop any class that was predicted but never occurs in the test labels, and those
    # predictions would vanish from the confusion matrix.
    labels = np.union1d(y_test, y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = accuracy_score(y_test, y_pred)

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)
    
def main():
    """Run the whole workflow: load, split, train, predict and report.

    Loads the data with ``load_data()``, holds out 20% of it as a test set,
    fits a CountVectorizer -> TfidfTransformer -> GaussianNB chain on the
    training part, predicts the test part and prints the evaluation through
    ``display_results``.

    Returns:
        None. Results are printed.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.naive_bayes import GaussianNB

    # load data
    X, y = load_data()

    # perform train test split
    # NOTE: this uses an explicit 20% hold-out, whereas the step-by-step cell earlier in
    # the notebook relies on train_test_split's default test size, so the reported
    # numbers differ slightly between the two.
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=.2, random_state=42
    )

    # Instantiate transformers and classifier
    vect = CountVectorizer(tokenizer=tokenize)
    tfidf = TfidfTransformer()
    clf = GaussianNB()

    # Fit the transformers on the training text only
    x_train_counts = vect.fit_transform(x_train)
    x_train_tfidf = tfidf.fit_transform(x_train_counts)

    # Convert to a dense array for GaussianNB. .toarray() yields a plain ndarray;
    # .todense() would yield np.matrix, which newer scikit-learn releases reject.
    clf.fit(x_train_tfidf.toarray(), y_train)

    # Transform test data with the fitted transformers (no re-fitting)
    x_test_counts = vect.transform(x_test)
    x_test_tfidf = tfidf.transform(x_test_counts)

    # Predict test labels
    y_pred = clf.predict(x_test_tfidf.toarray())

    display_results(y_test, y_pred)


In [20]:
# run program: executes the refactored load / train / predict / report pipeline in main()
main()

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 50   0  29]
 [  8  20   8]
 [ 16   1 349]]
Accuracy: 0.8711018711018711
