### Train a Model for analyzing Tweets

This should be familiar from previous weeks. This code trains a sentiment classifier on the NLTK `twitter_samples` dataset.

In [1]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import FreqDist, classify, NaiveBayesClassifier
import nltk
import re, string, random

In [2]:
# NLTK resources this notebook needs, fetched in this order.
nltk_resources = (
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'punkt',
    'twitter_samples',
)
download_results = [nltk.download(resource) for resource in nltk_resources]
# Display the status of the final download as the cell's value.
download_results[-1]

[nltk_data] Downloading package stopwords to /home/warner/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/warner/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/warner/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/warner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/warner/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [3]:
# Matches http/https URLs. Raw strings keep the `\(` / `\)` escapes valid;
# compiling once avoids re-resolving the pattern for every token.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
    r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
# Matches Twitter-style @mentions.
_MENTION_PATTERN = re.compile(r"(@[A-Za-z0-9_]+)")


def remove_noise(tweet_tokens, stop_words = ()):
    """Normalize the tokens of one tweet into a list of cleaned features.

    Each token is POS-tagged, stripped of URLs and @mentions, lemmatized,
    and lowercased. Tokens are dropped when they end up empty, when they
    are found inside ``string.punctuation``, or when their lowercase form
    is a stop word.

    Args:
        tweet_tokens: Sequence of token strings for a single tweet.
        stop_words: Iterable of lowercase words to exclude. Defaults to
            no stop words.

    Returns:
        list[str]: The cleaned, lowercased tokens in their original order.
    """
    # A set gives constant-time membership checks instead of scanning the
    # stop-word list once per token.
    stop_word_set = frozenset(stop_words)
    # One lemmatizer serves the whole tweet; building it per token is
    # loop-invariant work.
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = _URL_PATTERN.sub('', token)
        token = _MENTION_PATTERN.sub('', token)

        # Map the tagger's tag prefix to the lemmatizer's part-of-speech code.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)
        lowered = token.lower()

        # `token not in string.punctuation` is a substring test, so any
        # token that appears inside the punctuation string is discarded.
        if len(token) > 0 and token not in string.punctuation and lowered not in stop_word_set:
            cleaned_tokens.append(lowered)
    return cleaned_tokens

def get_all_words(cleaned_tokens_list):
    """Lazily yield every token from every token list, in order."""
    for tweet_words in cleaned_tokens_list:
        yield from tweet_words

def get_tweets_for_model(cleaned_tokens_list):
    """Lazily yield one feature dict per tweet, mapping each token to True."""
    for words in cleaned_tokens_list:
        features = {word: True for word in words}
        yield features

In [4]:
# Fixed seed so the shuffle, the train/test split, and the reported accuracy
# are the same on every Restart & Run All.
RANDOM_SEED = 42
# Number of shuffled examples used for training; the rest form the test set.
TRAIN_SIZE = 7000

positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
# NOTE(review): `text` and `tweet_tokens` are not used anywhere else in this
# cell; they are kept in case other cells rely on them.
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

stop_words = stopwords.words('english')

positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

# Clean every tweet with the same noise removal and stop-word list.
positive_cleaned_tokens_list = [
    remove_noise(tokens, stop_words) for tokens in positive_tweet_tokens
]
negative_cleaned_tokens_list = [
    remove_noise(tokens, stop_words) for tokens in negative_tweet_tokens
]

# Quick sanity check: the most frequent tokens in the positive tweets.
all_pos_words = get_all_words(positive_cleaned_tokens_list)

freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

# Pair each feature dict with its label.
positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]
dataset = positive_dataset + negative_dataset

# A dedicated Random instance makes the shuffle reproducible without
# touching the global random state.
random.Random(RANDOM_SEED).shuffle(dataset)

train_data = dataset[:TRAIN_SIZE]
test_data = dataset[TRAIN_SIZE:]

classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

# show_most_informative_features() prints the table itself and returns None,
# so wrapping it in print() only adds a stray "None" line to the output.
classifier.show_most_informative_features(10)

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]
Accuracy is: 0.995
Most Informative Features
                      :( = True           Negati : Positi =   2060.1 : 1.0
                      :) = True           Positi : Negati =    998.1 : 1.0
                     sad = True           Negati : Positi =     35.5 : 1.0
                follower = True           Positi : Negati =     32.0 : 1.0
                    glad = True           Positi : Negati =     18.5 : 1.0
               community = True           Positi : Negati =     17.2 : 1.0
                 welcome = True           Positi : Negati =     16.3 : 1.0
                     x15 = True           Negati : Positi =     15.5 : 1.0
                     ugh = True           Negati : Positi =     14.9 : 1.0
                     bro = True           Positi : Negati =     13.1 : 1.0
None


In [5]:
custom_tweet = "Big bird is on tonight at 12."

# Preprocess the tweet the same way the training tweets were preprocessed,
# including the stop-word list, so inference features match training features.
custom_tokens = remove_noise(word_tokenize(custom_tweet), stop_words)
custom_features = {token: True for token in custom_tokens}

print(custom_tweet, classifier.classify(custom_features))

Big bird is on tonight at 12. Negative


### Preserving our Model

In Python it can be useful to preserve objects for later use. Most objects can be preserved (pickled). It would not be very convenient if we had to train a model just before every use, so we train the model once and save it by pickling it. Python typically uses a module called `pickle` for preserving objects. However, according to the `sklearn` documentation linked below, `joblib` is preferred for persisting models. When in doubt, follow the docs. :-) We will save our trained model as a binary file that can be loaded by other Python files. Only load pickled or joblib files that you trust, since loading them can execute arbitrary code.

* Learn more about `pickle` [here.](https://realpython.com/python-pickle-module/)

* Learn more about `sklearn` preserving models [here.](https://scikit-learn.org/stable/modules/model_persistence.html)


In [6]:
from joblib import dump, load

# Persist the trained classifier to disk as "tweet_classifier.joblib".
model_file = 'tweet_classifier.joblib'
dump(classifier, model_file)

['tweet_classifier.joblib']

In [7]:
# Reload the classifier from disk and check it still classifies our tweet.
# Loading unpickles the file, so only load files from a trusted source.
joblib_classifier = load('tweet_classifier.joblib')

reloaded_features = {token: True for token in custom_tokens}
reloaded_label = joblib_classifier.classify(reloaded_features)
print(custom_tweet, reloaded_label)

print('Hooray! We preserved our model!')

Big bird is on tonight at 12. Negative
Hooray! We preserved our model!
