In [1]:
import pandas as pd
import os
import urllib.request 
import shutil
import zipfile

In [2]:
# Kernel working directory at the time this cell runs; used below to locate the zipped dataset
base_folder = os.getcwd()

In [3]:
# Extract the zipped dataset into a freshly created scratch folder
local_file_name = os.path.join(base_folder, "training_dataset", "trainingandtestdata.zip")
temporary_folder = os.path.join(os.getcwd(), "tmp")

# Start from a clean slate: drop whatever a previous run left behind, then recreate the folder
if os.path.exists(temporary_folder):
    shutil.rmtree(temporary_folder)
os.makedirs(temporary_folder)

with zipfile.ZipFile(local_file_name, mode="r") as archive:
    archive.extractall(path=temporary_folder)

### Load Training Dataset

The following function loads the training file and splits it into training and test datasets

It takes the following arguments:

* **sample_size**: the number of rows from the file that we want to load. The whole file has 1.6 million rows, and it is impractical to work with that many on a local machine. For the final training with the whole dataset, a Hadoop cluster is advised. If the argument is not provided, the function will return all the rows, split into two lists of dicts: one for training and another for testing
* **test_size_frac**: the fraction of lines that will be reserved for testing the model

**Note**: we are converting the Pandas DataFrame to a list of dict because nltk package does not work with Pandas

In [4]:
def load_training_dataset(sample_size = None, test_size_frac = 0.5, random_state = None):
    """Load the tweet training CSV and split it into training and test records.

    The file "training.1600000.processed.noemoticon.csv" is read from the
    module-level ``temporary_folder`` (the folder the zip archive was
    extracted into).

    Args:
        sample_size: number of rows to randomly keep from the file before
            splitting. ``None`` keeps every row.
        test_size_frac: fraction of the (possibly sampled) rows reserved for
            testing; the remaining rows are used for training.
        random_state: optional seed forwarded to ``DataFrame.sample`` so that
            the sampling and the split can be reproduced. ``None`` (default)
            keeps the original non-deterministic behaviour.

    Returns:
        A ``(training_records, testing_records)`` tuple. Each element is a
        list of dicts (one dict per row) with the keys "polarity",
        "tweet_id", "date", "query", "user" and "tweet".
    """
    training_dataset_path = os.path.join(
        temporary_folder,
        "training.1600000.processed.noemoticon.csv")

    # on_bad_lines="warn" (pandas >= 1.3) skips malformed lines and warns about them.
    # It replaces warn_bad_lines=True / error_bad_lines=False, which were removed in pandas 2.0.
    full_dataset = pd.read_csv(
        training_dataset_path,
        encoding="latin-1",
        on_bad_lines="warn",
        header=None,
        names=["polarity", "tweet_id", "date", "query", "user", "tweet"])

    if sample_size is not None:
        full_dataset = full_dataset.sample(sample_size, random_state=random_state)

    # Draw the test rows first, then keep everything that was not drawn for training
    testing_dataset = full_dataset.sample(frac=test_size_frac, random_state=random_state)
    training_dataset = full_dataset.drop(testing_dataset.index)

    return training_dataset.to_dict("records"), testing_dataset.to_dict("records")

In [5]:
# Load test and training dataset for exploration
# sample_size=None keeps every row of the file; test_size_frac=.5 reserves half of the rows for testing
training_data, testing_data = load_training_dataset(sample_size = None, test_size_frac=.5)

In [6]:
# The records are plain dicts, so rebuild a DataFrame just to display the first 10 of them
pd.DataFrame(training_data).head(10)

Unnamed: 0,polarity,tweet_id,date,query,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812579,Mon Apr 06 22:20:17 PDT 2009,NO_QUERY,pardonlauren,I just re-pierced my ears


Column definitions:

0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)

1 - the id of the tweet

2 - the date of the tweet

3 - the query. If there is no query, then this value is NO_QUERY.

4 - the user that tweeted

5 - the text of the tweet

In [7]:
# We noticed that there is not a single case of Neutral (2) polarity
# (training and testing records are concatenated so that the counts cover every loaded row)
pd.DataFrame(training_data + testing_data).polarity.value_counts()

4    800000
0    800000
Name: polarity, dtype: int64

### Pre-process Tweets

The following class prepares the dataset by:

* Extracting the text from HTML (for the training dataset provided, we already have the text, but we want to avoid using any HTML tag for classification)
* Converting all words to lower case
* Replacing any URL with "URL" constant (to enable the removal of them on a further step)
* Replacing any tagging of users with "USERTAGGING" (to enable the removal of them in a further step)
* Removing any "#" from hashtags
* Removing punctuation (has little or no weight on classification as it can be used for both intentions)
* Tokenizing (create a list of words)
* And finally, removing words and punctuation that have little or no weight on classification (and can even create biases):
    * Stop words: a set of common words that are used regardless of the intention (things like it, that, a, the)
    * Remove the two constants that we used to replace user tagging and URLs
    
**Note**: we are creating a class for this process because we want to "pickle" (serialize and save as a file) it for usage on the implementation of the streaming process 

In [8]:
import re
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.corpus import stopwords 
from bs4 import BeautifulSoup

class PreProcessTweets:
    """Cleans raw tweet text and turns it into a list of meaningful tokens.

    Each tweet goes through: HTML stripping, lower-casing, replacement of
    URLs and user mentions by placeholder constants, removal of the "#" of
    hashtags, punctuation removal, tokenization and, finally, removal of
    English stop words and of the two placeholder constants.
    """

    # Translation table that deletes every character listed in string.punctuation
    _PUNCTUATION_TABLE = str.maketrans("", "", punctuation)

    def __init__(self):
        # The two placeholders are upper-case and are inserted after the text has been
        # lower-cased, so they can never collide with a real word of the tweet.
        self._stopwords = set(stopwords.words("english") + ["USERTAGGING","URL"])

    def processTweets(self, list_of_tweets):
        """Pre-process a collection of labelled tweets.

        Args:
            list_of_tweets: iterable of dicts, each one with at least the
                keys "tweet" (raw text) and "polarity" (label).

        Returns:
            A list of ``(tokens, polarity)`` tuples, in the input order.
        """
        return [
            (self.processTweet(tweet["tweet"]), tweet["polarity"])
            for tweet in list_of_tweets
        ]

    def processTweet(self, tweet):
        """Clean and tokenize the raw text of one tweet.

        Args:
            tweet: the raw text of the tweet.

        Returns:
            The list of remaining tokens, in their original order.
        """
        # Extracts text from HTML (just in case!). The parser is named explicitly so the
        # result does not depend on which optional parsers are installed.
        tweet = BeautifulSoup(tweet, "html.parser").get_text()
        tweet = tweet.lower() # Converts text to lower-case
        tweet = re.sub(r"((www\.[^\s]+)|(https?://[^\s]+))", "URL", tweet) # Replaces URLs by the URL constant
        tweet = re.sub(r"@[^\s]+", "USERTAGGING", tweet) # Replaces usernames by the USERTAGGING constant
        tweet = re.sub(r"#([^\s]+)", r"\1", tweet) # Removes the # in #hashtag
        tweet = tweet.translate(self._PUNCTUATION_TABLE) # Removes punctuation in a single pass
        tokens = word_tokenize(tweet) # Creates a list of words
        return [word for word in tokens if word not in self._stopwords]

In [9]:
# Load test and training dataset for modeling
# Only a random sample of 10000 rows is loaded this time; half of it is held out for testing
training_data, testing_data = load_training_dataset(sample_size = 10000, test_size_frac=.5)

In [10]:
# Preprocessing tweets: the result is a list of (tokens, polarity) tuples, one per training tweet
tweet_processor = PreProcessTweets()
pp_training_data = tweet_processor.processTweets(training_data)

In [11]:
# Let's take a look at what some tweets look like after cleansing and tokenization
pp_training_data[:4]

[(['afternoon',
   'took',
   'boys',
   'swimming',
   'morning',
   'lovely',
   'pink',
   'hair',
   'gone',
   'yellow'],
  0),
 (['trying', 'get', 'swine', 'flu'], 4),
 (['lol'], 4),
 (['guys',
   'always',
   'hang',
   'conserv',
   'never',
   'invite',
   'hahaha',
   'emo',
   'shiz',
   'hi',
   'charles'],
  0)]

### Build Vocabulary

The function below builds the vocabulary, that is, the list of all words that we are going to use to train our model and later use to evaluate a tweet

Some people argue that it is better to focus on the most used words (e.g. the 2500 most used in our training dataset) and/or the words present in the most documents (in our case tweets - that is, the words that appear in the largest number of tweets)

For the sake of this project, as it is focused not on the accuracy of the model itself but on the implementation of a pipeline using a model, we are going to use all words

In [12]:
import nltk 

def build_vocabulary(preprocessed_training_dataset):
    """Collect the distinct words used across all pre-processed tweets.

    Args:
        preprocessed_training_dataset: iterable of ``(words, polarity)``
            tuples; only the words are used.

    Returns:
        The keys view of an ``nltk.FreqDist`` built from every word of every
        tweet, i.e. each distinct word once, in order of first appearance.
    """
    word_counts = nltk.FreqDist(
        word
        for words, _polarity in preprocessed_training_dataset
        for word in words
    )
    return word_counts.keys()

In [13]:
# Then we build our vocabulary (from the pre-processed training tweets only; the test tweets are not used)
word_features = build_vocabulary(pp_training_data)

In [14]:
# and let's take a look at its first 10 words:
list(word_features)[:10]

['afternoon',
 'took',
 'boys',
 'swimming',
 'morning',
 'lovely',
 'pink',
 'hair',
 'gone',
 'yellow']

### Generating Features
The function below needs to be called for each tweet. It builds a dictionary with one entry per word of the vocabulary previously built and tags (with True) the words that are used in that specific tweet. Thus, the majority of words will be tagged as False and a small number of them (the ones contained in the tweet) as True

**To-do**: this should also be encapsulated on a class in order to have it pickled. Or maybe encapsulate the whole code?!?!?

In [15]:
def extract_features(tweet, vocabulary=None):
    """Build the presence/absence feature dictionary of one tokenized tweet.

    Args:
        tweet: iterable with the words (tokens) of the tweet.
        vocabulary: iterable with the words to generate features for. When
            omitted, the module-level ``word_features`` is used, which keeps
            single-argument calls working exactly as before.

    Returns:
        A dict with one ``'contains(<word>)'`` key per vocabulary word, whose
        value is True when the word is present in the tweet and False otherwise.
    """
    if vocabulary is None:
        vocabulary = word_features
    tweet_words = set(tweet)  # set membership keeps each lookup cheap
    return {'contains(%s)' % word: (word in tweet_words) for word in vocabulary}

In [16]:
# Building the training features: extract_features is applied to the tokens of each (tokens, polarity) tuple
# NOTE(review): apply_features presumably evaluates lazily rather than building every dict up front - confirm in the NLTK docs
training_features = nltk.classify.apply_features(extract_features,pp_training_data)

In [17]:
# And taking a look at the first 1000 characters of the first feature set.
# The slice is taken before materializing the list, so only one feature dict
# is built instead of one per training tweet.
print(str(list(training_features[:1]))[0:1000], "...")

[({'contains(afternoon)': True, 'contains(took)': True, 'contains(boys)': True, 'contains(swimming)': True, 'contains(morning)': True, 'contains(lovely)': True, 'contains(pink)': True, 'contains(hair)': True, 'contains(gone)': True, 'contains(yellow)': True, 'contains(trying)': False, 'contains(get)': False, 'contains(swine)': False, 'contains(flu)': False, 'contains(lol)': False, 'contains(guys)': False, 'contains(always)': False, 'contains(hang)': False, 'contains(conserv)': False, 'contains(never)': False, 'contains(invite)': False, 'contains(hahaha)': False, 'contains(emo)': False, 'contains(shiz)': False, 'contains(hi)': False, 'contains(charles)': False, 'contains(believe)': False, 'contains(better)': False, 'contains(nick)': False, 'contains(matter)': False, 'contains(sounds)': False, 'contains(much)': False, 'contains(burntwell)': False, 'contains(would)': False, 'contains(wiser)': False, 'contains(put)': False, 'contains(salad)': False, 'contains(dressing)': False, 'contains(n

### Training the model
And finally, we are going to train the model using Naive Bayes. We could have tried other classification algorithms but again, the main purpose of this project is the implementation of the pipeline, not the accuracy of the model

In [18]:
# Fit the Naive Bayes classifier on the labelled training feature sets
NBayesClassifier = nltk.NaiveBayesClassifier.train(training_features)

### Using the model
The following code uses the model trained to classify each one of the tweets of our testing dataset

Note that before we do the classification, we need to apply the preprocess (cleansing and tokenizing) that we have built before and extract the features using our dictionary

In [19]:
# Label reported when the classifier is not confident enough about its prediction
NEUTRAL_POLARITY = 2
# Minimum probability required to keep a prediction. With 0 every prediction is kept
# (.65 is the alternative value left in the original comment).
threshold = 0

li = []
for each_tweet in testing_data:
    words = tweet_processor.processTweet(each_tweet["tweet"])
    features = extract_features(words)
    # A single prob_classify call yields both the most likely label and its probability;
    # calling classify() as well would compute the same distribution a second time.
    label_distribution = NBayesClassifier.prob_classify(features)
    predicted = label_distribution.max()
    probability = label_distribution.prob(predicted)
    li.append({
        "polarity": each_tweet["polarity"],
        "tweet_id": each_tweet["tweet_id"],
        "date": each_tweet["date"],
        "query": each_tweet["query"],
        "user": each_tweet["user"],
        "tweet": each_tweet["tweet"],
        "predicted": predicted if probability > threshold else NEUTRAL_POLARITY,
        "probability": probability
    })

The next code snippet just creates a Pandas DataFrame with the results of our prediction along with some variables that we are going to use on our evaluation of the model

In [20]:
# One row per test tweet, with the true and the predicted polarity side by side
final_dataset = pd.DataFrame(li)
Y_test = final_dataset["polarity"]  # ground-truth labels
predicted = final_dataset["predicted"]  # model output; rebinds the scalar `predicted` left over by the prediction loop
final_dataset

Unnamed: 0,polarity,tweet_id,date,query,user,tweet,predicted,probability
0,4,1752963165,Sat May 09 23:36:37 PDT 2009,NO_QUERY,iMartha182,"well i guess it all depends,undergarments.",0,0.531430
1,0,2262731169,Sat Jun 20 23:06:48 PDT 2009,NO_QUERY,hisydneyxo,"cant sleep. blaring inseparable, just friends,...",4,0.977596
2,0,2048547569,Fri Jun 05 15:22:44 PDT 2009,NO_QUERY,megan_spfan,@beckyscherger aww... i'm so sorry about your ...,0,0.800908
3,4,2175246659,Mon Jun 15 00:28:10 PDT 2009,NO_QUERY,laura_dolcepics,@MsYuri I know what you mean about bad sleep s...,0,0.871236
4,4,2004969209,Tue Jun 02 09:01:06 PDT 2009,NO_QUERY,mishhhx3,Ok... I lied. Mini tacos for breakfast: not so...,4,0.996309
...,...,...,...,...,...,...,...,...
4995,4,1565430530,Mon Apr 20 06:44:25 PDT 2009,NO_QUERY,wipeout_ut,@BearMate hard to be productive on Mondays.,0,0.523514
4996,0,1694562733,Mon May 04 03:11:12 PDT 2009,NO_QUERY,lenje,"@Silverlines Oh well, though I don't like meeb...",0,0.968952
4997,4,2063449641,Sun Jun 07 02:42:21 PDT 2009,NO_QUERY,thinkadrian,Where can I see live #eu09 results on the inte...,4,0.795563
4998,4,1962308800,Fri May 29 10:50:47 PDT 2009,NO_QUERY,niamhsmith,@mrelihan oh hey thanks for the recommendation...,4,0.995011


Here is the Confusion Matrix (just reminding that we did not use the whole training dataset, just a sample of it)

In [21]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# confusion_matrix receives the true labels first and the predictions second
print("Confusion Matrix:\n", confusion_matrix(Y_test,predicted))

Confusion Matrix:
 [[1760  744]
 [ 727 1769]]


And here is our classification report

In [22]:
# Same argument order as above: true labels first, predictions second
print("Classification Report:\n", classification_report(Y_test,predicted))

Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.70      0.71      2504
           4       0.70      0.71      0.71      2496

    accuracy                           0.71      5000
   macro avg       0.71      0.71      0.71      5000
weighted avg       0.71      0.71      0.71      5000



Just extracting the accuracy (with more precision....hahaha)

In [23]:
# accuracy_score is the fraction of tweets whose predicted label equals the true one,
# so the value is labelled as accuracy (precision is a different, per-class metric).
print("Accuracy:\n", accuracy_score(Y_test, predicted))

Precision:
 0.7058


In [28]:
import dill as pickle

In [30]:
# Where the trained model is persisted: <working directory>/saved_models/twitter_sentiment.pk
model_folder = os.path.join(os.getcwd(), 'saved_models')
model_full_path = os.path.join(model_folder, "twitter_sentiment.pk")

# Create the folder on the first run only
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

In [32]:
# Serialize the trained classifier. The context manager guarantees that the file
# is flushed and closed even if the dump fails.
with open(model_full_path, "wb") as model_file:
    pickle.dump(NBayesClassifier, model_file)

In [33]:
# Reload the classifier to check that the saved file is usable.
# NOTE: unpickling executes code stored in the file, so only load model files
# produced by a trusted source (here, the file written by this notebook).
with open(model_full_path, "rb") as model_file:
    test_pickle = pickle.load(model_file)

In [34]:
# Sanity check of the reloaded model on the last test tweet. The tweet is selected
# explicitly instead of relying on the `words` variable left over by the prediction loop.
words = tweet_processor.processTweet(testing_data[-1]["tweet"])
features = extract_features(words)
# Distinct names keep the `predicted` Series used by the evaluation cells intact
reloaded_prediction = test_pickle.classify(features)
reloaded_probability = test_pickle.prob_classify(features).prob(reloaded_prediction)

print(words, reloaded_prediction, reloaded_probability)

['trotzdem', 'guten', 'morgen', 'alle', 'followfriday'] 4 0.9822019301460024


In [6]:
# Load the separate test file from the folder the archive was extracted into
test_dataset_path = os.path.join(
    temporary_folder,
    "testdata.manual.2009.06.14.csv")

# on_bad_lines="warn" (pandas >= 1.3) skips malformed lines and warns about them.
# It replaces warn_bad_lines=True / error_bad_lines=False, which were removed in pandas 2.0.
test_dataset = pd.read_csv(
    test_dataset_path,
    encoding="latin-1",
    on_bad_lines="warn",
    header=None,
    names=["polarity", "tweet_id", "date", "query", "user", "tweet"])

test_dataset

Unnamed: 0,polarity,tweet_id,date,query,user,tweet
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...
...,...,...,...,...,...,...
493,2,14072,Sun Jun 14 04:31:43 UTC 2009,latex,proggit,Ask Programming: LaTeX or InDesign?: submitted...
494,0,14073,Sun Jun 14 04:32:17 UTC 2009,latex,sam33r,"On that note, I hate Word. I hate Pages. I hat..."
495,4,14074,Sun Jun 14 04:36:34 UTC 2009,latex,iamtheonlyjosie,Ahhh... back in a *real* text editing environm...
496,0,14075,Sun Jun 14 21:36:07 UTC 2009,iran,plutopup7,"Trouble in Iran, I see. Hmm. Iran. Iran so far..."


In [24]:
# Delete the temporary folder holding the extracted CSV files.
# Run this last: cells that read from temporary_folder fail once it is gone.
if os.path.exists(temporary_folder):
    shutil.rmtree(temporary_folder)