In [1]:
import os
import shutil
import zipfile

In [2]:
# Resolve the working directory once and derive the scratch folder from it
base_folder = os.getcwd()
temporary_folder = os.path.join(base_folder, "tmp")

In [3]:
def unzip_files(archive_path=None, destination=None):
    """Extract the dataset archive into a freshly created folder.

    Any existing content of the destination folder is deleted first, so every
    call starts from an empty folder.

    Args:
        archive_path: Path of the zip file to extract. Defaults to
            ``training_dataset/trainingandtestdata.zip`` under ``base_folder``.
        destination: Folder that receives the extracted files. Defaults to
            ``temporary_folder``. WARNING: this folder is removed recursively
            before the extraction.

    Raises:
        FileNotFoundError: If the archive does not exist.
        zipfile.BadZipFile: If the archive is not a valid zip file.
    """
    if archive_path is None:
        archive_path = os.path.join(base_folder, "training_dataset", "trainingandtestdata.zip")
    if destination is None:
        destination = temporary_folder

    # Start from an empty folder so files from a previous run never leak in
    if os.path.exists(destination):
        shutil.rmtree(destination)
    os.makedirs(destination, exist_ok=True)

    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(destination)

### Pre-process Tweets

The following function prepares the tweet by:

* Extracting the text from HTML (for the training dataset provided, we already have the text, but we want to avoid using any HTML tag for classification)
* Converting all words to lower case
* Replacing any URL with "URL" constant (to enable the removal of them on a further step)
* Replacing any tagging of users with "USERTAGGING" (to enable the removal of them in a further step)
* Removing any "#" from hashtags
* Removing punctuation (has little or no weight on classification as it can be used for both intentions)
* And finally, removing words and punctuation that have little or no weight on classification (and can even create biases):
    * Stop words: a set of common words that are used regardless of the intention (things like it, that, a, the)
    * Remove the two constants that we used to replace user tagging and URLs

In [4]:
def cleansing(tweet):
    """Normalise a raw tweet into a space-separated string of relevant words.

    Steps: strip HTML markup, lower-case, replace URLs and user mentions with
    placeholder constants, drop the ``#`` of hashtags, remove punctuation,
    tokenize, and finally discard English stop words and the placeholders.

    Args:
        tweet: Raw tweet text. ``None`` (a null cell in the DataFrame) is
            treated as an empty tweet.

    Returns:
        The remaining words joined by single spaces ("" when nothing is left).
    """
    # Imports are kept local to the function, as in the original cell
    # (presumably so the function is self-contained when shipped as a Spark UDF).
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from string import punctuation
    from bs4 import BeautifulSoup
    import re

    if tweet is None:
        return ""

    terms_to_remove = set(stopwords.words("english") + ["USERTAGGING", "URL"])
    tweet = BeautifulSoup(tweet, 'html.parser').get_text()  # Extracts text from HTML (just in case!)
    tweet = tweet.lower()  # Converts text to lower-case
    # The placeholders are upper-case and inserted AFTER lower-casing, so they
    # can never collide with a real (lower-cased) word of the tweet.
    # Raw strings avoid invalid-escape warnings for \. and \s.
    tweet = re.sub(r"((www\.[^\s]+)|(https?://[^\s]+))", "URL", tweet)  # Replaces URLs by URL constant
    tweet = re.sub(r"@[^\s]+", "USERTAGGING", tweet)  # Replaces usernames by USERTAGGING constant
    tweet = re.sub(r"#([^\s]+)", r"\1", tweet)  # Removes the # in #hashtag
    tweet = tweet.translate(str.maketrans("", "", punctuation))  # Removes punctuation
    tokens = word_tokenize(tweet)  # Creates a list of words
    return " ".join(word for word in tokens if word not in terms_to_remove)

In [5]:
# Extract the dataset archive into the temporary folder before it is read below
unzip_files()

In [6]:
# Start Spark session, load the test dataset into a Spark DataFrame and then adjust column names
from pyspark.sql import SparkSession, functions

spark = SparkSession.builder.master("local").appName("Training Twitter Sentiment Analysis").getOrCreate()

# Read from the folder unzip_files() extracts into, rather than a hard-coded
# relative "tmp/..." path that only resolves while the working directory is unchanged.
test_data_path = os.path.join(temporary_folder, "testdata.manual.2009.06.14.csv")

# The file is read without a header, so Spark names the columns _c0.._c5;
# toDF assigns all six names at once (and fails loudly if the column count differs).
test_data = spark.read.load(test_data_path, format="csv").toDF(
    "label", "tweet_id", "date", "query", "user", "tweet")

In [7]:
# Wrap cleansing() as a Spark UDF and store its result for every tweet in "tweet_cleansed"
udf_cleansing = functions.udf(cleansing)
test_data = test_data.withColumn(
    "tweet_cleansed",
    udf_cleansing(test_data["tweet"]))
test_data.show(5)

+-----+--------+--------------------+-------+--------+--------------------+--------------------+
|label|tweet_id|                date|  query|    user|               tweet|      tweet_cleansed|
+-----+--------+--------------------+-------+--------+--------------------+--------------------+
|    4|       3|Mon May 11 03:17:...|kindle2|  tpryan|@stellargirl I lo...|loooooooovvvvvvee...|
|    4|       4|Mon May 11 03:18:...|kindle2|  vcu451|Reading my kindle...|reading kindle2 l...|
|    4|       5|Mon May 11 03:18:...|kindle2|  chadfu|Ok, first assesme...|ok first assesmen...|
|    4|       6|Mon May 11 03:19:...|kindle2|   SIX15|@kenburbary You'l...|youll love kindle...|
|    4|       7|Mon May 11 03:21:...|kindle2|yamarama|@mikefish  Fair e...|fair enough kindl...|
+-----+--------+--------------------+-------+--------+--------------------+--------------------+
only showing top 5 rows



### Tokenizing

The following code snippet creates a list of every remaining word (after cleansing) that will be used to build the features for training the model

In [8]:
from pyspark.ml.feature import Tokenizer

# Turn the cleansed tweet text into a list of words stored in the "words" column
tokenizer = Tokenizer(inputCol="tweet_cleansed", outputCol="words")
test_data = tokenizer.transform(test_data)
test_data.show(5)

+-----+--------+--------------------+-------+--------+--------------------+--------------------+--------------------+
|label|tweet_id|                date|  query|    user|               tweet|      tweet_cleansed|               words|
+-----+--------+--------------------+-------+--------+--------------------+--------------------+--------------------+
|    4|       3|Mon May 11 03:17:...|kindle2|  tpryan|@stellargirl I lo...|loooooooovvvvvvee...|[loooooooovvvvvve...|
|    4|       4|Mon May 11 03:18:...|kindle2|  vcu451|Reading my kindle...|reading kindle2 l...|[reading, kindle2...|
|    4|       5|Mon May 11 03:18:...|kindle2|  chadfu|Ok, first assesme...|ok first assesmen...|[ok, first, asses...|
|    4|       6|Mon May 11 03:19:...|kindle2|   SIX15|@kenburbary You'l...|youll love kindle...|[youll, love, kin...|
|    4|       7|Mon May 11 03:21:...|kindle2|yamarama|@mikefish  Fair e...|fair enough kindl...|[fair, enough, ki...|
+-----+--------+--------------------+-------+--------+--

### HashingTF

The following code snippet creates the features: the words of each "document" (tweet) are hashed into a fixed-size sparse vector (262144 positions in the output below), in which each position holds how often the terms mapped to it occur in that specific "document"

In [9]:
from pyspark.ml.feature import HashingTF
# Build the "features" vector from the "words" list of each tweet.
# NOTE(review): the number of features is left at its default; it presumably
# has to equal the value used when the model was trained - confirm.
hashingTF = HashingTF(inputCol="words", outputCol="features")
test_data = hashingTF.transform(test_data)
test_data.show(5)

+-----+--------+--------------------+-------+--------+--------------------+--------------------+--------------------+--------------------+
|label|tweet_id|                date|  query|    user|               tweet|      tweet_cleansed|               words|            features|
+-----+--------+--------------------+-------+--------+--------------------+--------------------+--------------------+--------------------+
|    4|       3|Mon May 11 03:17:...|kindle2|  tpryan|@stellargirl I lo...|loooooooovvvvvvee...|[loooooooovvvvvve...|(262144,[12524,83...|
|    4|       4|Mon May 11 03:18:...|kindle2|  vcu451|Reading my kindle...|reading kindle2 l...|[reading, kindle2...|(262144,[53570,73...|
|    4|       5|Mon May 11 03:18:...|kindle2|  chadfu|Ok, first assesme...|ok first assesmen...|[ok, first, asses...|(262144,[41748,12...|
|    4|       6|Mon May 11 03:19:...|kindle2|   SIX15|@kenburbary You'l...|youll love kindle...|[youll, love, kin...|(262144,[1546,218...|
|    4|       7|Mon May 11 

In [10]:
# Creates a numeric index for the labels
from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="label", outputCol="labelIndex")
# NOTE(review): the indexer is fitted on the test data itself, so the
# label -> index mapping is derived from this dataset. It presumably has to
# match the mapping used when the model was trained - confirm, or reuse the
# indexer fitted at training time.
model = stringIndexer.fit(test_data)
test_data = model.transform(test_data)
test_data.show(5)

+-----+--------+--------------------+-------+--------+--------------------+--------------------+--------------------+--------------------+----------+
|label|tweet_id|                date|  query|    user|               tweet|      tweet_cleansed|               words|            features|labelIndex|
+-----+--------+--------------------+-------+--------+--------------------+--------------------+--------------------+--------------------+----------+
|    4|       3|Mon May 11 03:17:...|kindle2|  tpryan|@stellargirl I lo...|loooooooovvvvvvee...|[loooooooovvvvvve...|(262144,[12524,83...|       0.0|
|    4|       4|Mon May 11 03:18:...|kindle2|  vcu451|Reading my kindle...|reading kindle2 l...|[reading, kindle2...|(262144,[53570,73...|       0.0|
|    4|       5|Mon May 11 03:18:...|kindle2|  chadfu|Ok, first assesme...|ok first assesmen...|[ok, first, asses...|(262144,[41748,12...|       0.0|
|    4|       6|Mon May 11 03:19:...|kindle2|   SIX15|@kenburbary You'l...|youll love kindle...|[you

### Loading saved model

Here we load the saved model that we have previously persisted on disk and use it to classify our test set (which is different from the one we used to evaluate when modeling). It is important that the DataFrame being used has the same columns (features and labelIndex) that we used when training it.

In [11]:
model_folder = os.path.join(os.getcwd(), 'saved_models')
model_full_path = os.path.join(model_folder, "twitter_sentiment_spark")
# Fail fast with a clear error when the persisted model is missing, instead of
# letting the load below fail with a less obvious Spark error.
if not os.path.exists(model_full_path):
    raise FileNotFoundError("Saved model not found at: " + model_full_path)

from pyspark.ml.classification import NaiveBayesModel
loadModel = NaiveBayesModel.load(model_full_path)


In [12]:
# Classifying the test set using the saved model
predicted = loadModel.transform(test_data)

In [13]:
# Show the original label and its index next to the model's prediction (NB_pred)
predicted.select("label", "tweet", "labelIndex", "NB_pred").show(5)

+-----+--------------------+----------+-------+
|label|               tweet|labelIndex|NB_pred|
+-----+--------------------+----------+-------+
|    4|@stellargirl I lo...|       0.0|    0.0|
|    4|Reading my kindle...|       0.0|    0.0|
|    4|Ok, first assesme...|       0.0|    0.0|
|    4|@kenburbary You'l...|       0.0|    0.0|
|    4|@mikefish  Fair e...|       0.0|    1.0|
+-----+--------------------+----------+-------+
only showing top 5 rows



In [14]:
# Evaluating the results with the test dataset: share of rows whose predicted
# index equals the label index.
# It is important to note that our training set didn't have a single Neutral (polarity = 2) case
total = predicted.count()
correct = predicted.filter(
    predicted['labelIndex'] == predicted['NB_pred']
).count()
accuracy = correct / total
print(
    "\nTotal:", total,
    "\nCorrect:", correct,
    "\nAccuracy:", accuracy)


Total: 498 
Correct: 277 
Accuracy: 0.5562248995983936


### Applying the initial label

As we needed to create an index for our labels, we now need to "translate" the predicted label index back to our label. So, we first create a "domain" table with them

In [15]:
# Domain table: one row per (label index, original label) pair found in the data
labels = predicted.select(
    functions.col("labelIndex").alias("label_id"),
    functions.col("label").alias("label_predicted"),
).distinct()
labels.show()

+--------+---------------+
|label_id|label_predicted|
+--------+---------------+
|     0.0|              4|
|     2.0|              2|
|     1.0|              0|
+--------+---------------+



Now we join our "domain" table back to get the original labels

In [16]:
# Attach the original label that corresponds to each predicted index.
# NOTE: this overwrites `predicted` with the joined result, so re-running this
# cell alone joins `labels` onto an already joined DataFrame - re-run the
# classification cell first.
predicted = predicted.join(labels, predicted["NB_pred"] == labels["label_id"])

In [17]:
# Show the true label next to the translated predicted label and the probability vector
predicted.select("label", "tweet_id", "date", "user", "tweet", "label_predicted", "NB_prob").show()

+-----+--------+--------------------+--------------+--------------------+---------------+--------------------+
|label|tweet_id|                date|          user|               tweet|label_predicted|             NB_prob|
+-----+--------+--------------------+--------------+--------------------+---------------+--------------------+
|    4|       3|Mon May 11 03:17:...|        tpryan|@stellargirl I lo...|              4|[0.95150414342958...|
|    4|       4|Mon May 11 03:18:...|        vcu451|Reading my kindle...|              4|[0.98968317981950...|
|    4|       5|Mon May 11 03:18:...|        chadfu|Ok, first assesme...|              4|[0.70624622765694...|
|    4|       6|Mon May 11 03:19:...|         SIX15|@kenburbary You'l...|              4|[0.59613169847051...|
|    4|       7|Mon May 11 03:21:...|      yamarama|@mikefish  Fair e...|              0|[0.43800417559941...|
|    4|       8|Mon May 11 03:22:...|  GeorgeVHulme|@richardebaker no...|              4|[0.83254331905408...|
|

In [18]:

def get_probability(probability_vector, predicted_label_index):
    """Return the probability the model assigned to the predicted class.

    Args:
        probability_vector: Object exposing ``tolist()`` that yields the
            per-class probabilities (the ``NB_prob`` value of a row).
        predicted_label_index: Index of the predicted class (the ``NB_pred``
            value of a row); converted with ``int()`` before indexing.

    Returns:
        The element of the probability list at the predicted index.
    """
    class_probabilities = probability_vector.tolist()
    position = int(predicted_label_index)
    return class_probabilities[position]

In [19]:
from pyspark.sql.types import DoubleType

# Declare the return type explicitly: without it the UDF result is typed as a
# string, so the probability would be stored as text instead of a number.
udf_get_probability = functions.udf(get_probability, DoubleType())

In [20]:

# Add a "probability" column holding the probability of the predicted class,
# picked out of the NB_prob vector at the position given by NB_pred
predicted_with_prob = predicted.withColumn("probability", udf_get_probability(
    functions.col("NB_prob"), functions.col("NB_pred")))

In [21]:
# Check the extracted probability against the predicted index and the full vector
predicted_with_prob.select("NB_pred", "NB_prob", "probability").show()

+-------+--------------------+------------------+
|NB_pred|             NB_prob|       probability|
+-------+--------------------+------------------+
|    0.0|[0.95150414342958...| 0.951504143429587|
|    0.0|[0.98968317981950...|0.9896831798195027|
|    0.0|[0.70624622765694...|0.7062462276569403|
|    0.0|[0.59613169847051...|0.5961316984705186|
|    1.0|[0.43800417559941...|0.5619958244005896|
|    0.0|[0.83254331905408...|0.8325433190540862|
|    1.0|[0.01199913436638...|0.9880008656336186|
|    0.0|[0.88875378612207...| 0.888753786122072|
|    0.0|[0.62671938887919...|0.6267193888791934|
|    0.0|[0.71466638750544...|0.7146663875054433|
|    0.0|[0.86587624628720...|0.8658762462872054|
|    1.0|[0.13753544592722...| 0.862464554072773|
|    1.0|[0.25473063985281...|0.7452693601471875|
|    0.0|[0.59915849188848...|0.5991584918884824|
|    1.0|[0.08884416442885...|0.9111558355711441|
|    1.0|[0.07365394457048...|0.9263460554295169|
|    1.0|[0.00658412496222...|0.9934158750377707|


In [22]:
probability_threshold = .70
# Predictions whose probability is below the threshold are relabelled "2";
# all other rows keep the label translated from the predicted index.
is_uncertain = functions.col("probability") < probability_threshold
predicted_with_prob = predicted_with_prob.withColumn(
    "label_predicted",
    functions.when(is_uncertain, "2").otherwise(functions.col("label_predicted")))

In [23]:
# Show the predicted labels after applying the probability threshold
predicted_with_prob.select("label", "tweet_id", "date", "user", "tweet", "label_predicted", "NB_prob").show()

+-----+--------+--------------------+--------------+--------------------+---------------+--------------------+
|label|tweet_id|                date|          user|               tweet|label_predicted|             NB_prob|
+-----+--------+--------------------+--------------+--------------------+---------------+--------------------+
|    4|       3|Mon May 11 03:17:...|        tpryan|@stellargirl I lo...|              4|[0.95150414342958...|
|    4|       4|Mon May 11 03:18:...|        vcu451|Reading my kindle...|              4|[0.98968317981950...|
|    4|       5|Mon May 11 03:18:...|        chadfu|Ok, first assesme...|              4|[0.70624622765694...|
|    4|       6|Mon May 11 03:19:...|         SIX15|@kenburbary You'l...|              2|[0.59613169847051...|
|    4|       7|Mon May 11 03:21:...|      yamarama|@mikefish  Fair e...|              2|[0.43800417559941...|
|    4|       8|Mon May 11 03:22:...|  GeorgeVHulme|@richardebaker no...|              4|[0.83254331905408...|
|

In [24]:
# Re-evaluate after the threshold adjustment, this time comparing the original
# label with the (possibly relabelled) predicted label, and print the new
# figures next to the ones computed before the adjustment.
# NOTE(review): `total_ajd` is presumably a typo for `total_adj`; the name is
# kept as-is here.
total_ajd = predicted_with_prob.count()
correct_adj = predicted_with_prob.where(predicted_with_prob['label'] == predicted_with_prob['label_predicted']).count()
accuracy_adj = correct_adj/total_ajd
print(
    "\nTotal:", total_ajd, " was ", total, 
    "\nCorrect:", correct_adj, " was ", correct,
    "\nAccuracy:", accuracy_adj, " was ", accuracy)


Total: 498  was  498 
Correct: 279  was  277 
Accuracy: 0.5602409638554217  was  0.5562248995983936


In [25]:
# Delete temporary folder (the one the dataset archive was extracted into),
# if it still exists
if os.path.exists(temporary_folder):
    shutil.rmtree(temporary_folder)   