In [1]:
import os
import shutil
import zipfile

In [2]:
base_folder = os.getcwd()
temporary_folder = os.path.join(os.getcwd(), "tmp")

In [3]:
def unzip_files():
# Unzip file on a temporary folder
    if os.path.exists(temporary_folder):
        shutil.rmtree(temporary_folder)
        
    if not os.path.exists(temporary_folder):
        os.makedirs(temporary_folder)
        
    local_file_name = os.path.join(base_folder, "training_dataset", "trainingandtestdata.zip")
    with zipfile.ZipFile(local_file_name, 'r') as zip_ref:
        zip_ref.extractall(temporary_folder)

### Pre-process Tweets

The following function prepares the tweet by:

* Extracting the text from HTML (for the training dataset provided, we already have the text, but we want to avoid using any HTML tag for classification
* Converting all words to lower case
* Replacing any URL with "URL" constant (to enable the removal of them on a further step)
* Replacing any tagging of users with "USERTAGGING" (to enable the removal of them in a further step)
* Removing any "#" from hashtags
* Removing punctuation (has little or no weight on classification as it can be used for both intentions)
* And finally, removing words and punctuation that has little or no weight on classification (and can even create biases):
    * Stop words: set of common words that are used doesn't matter the intenttion (things like it, that, a, the)
    * Remove the two constants that we used to replace user tagging and URLs

In [4]:
def cleansing(tweet):
# Cleansing tweet
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords 
    from string import punctuation 
    from bs4 import BeautifulSoup
    import re
    
    terms_to_remove = set(stopwords.words("english") + ["USERTAGGING","URL"])
    tweet = BeautifulSoup(tweet, 'html.parser').get_text() # Extracts text from HTML (just in case!)
    tweet = tweet.lower() # Converts text to lower-case
    tweet = re.sub("((www\.[^\s]+)|(https?://[^\s]+))", "URL", tweet) # Replces URLs by URL constan
    tweet = re.sub("@[^\s]+", "USERTAGGING", tweet) # Replaces usernames by USERTAGGING constant 
    tweet = re.sub(r"#([^\s]+)", r"\1", tweet) # Removes the # in #hashtag
    for p in punctuation: 
        tweet = tweet.replace(p, "") # Removes punctiation
    tweet = word_tokenize(tweet) # Creates a list of words
    words = ""
    for each_word in tweet:
        if each_word not in terms_to_remove:
            words = words + " " + each_word
    # return [word for word in tweet if word not in terms_to_remove]
    return words[1:]

In [5]:
unzip_files()

In [6]:
# Start Spark session, load the dataset into a Spark DataFrame and then adjust column names
from pyspark.sql import SparkSession, functions

spark = SparkSession.builder.master("local").appName("Training Twitter Sentiment Analysis").getOrCreate()
training_data = spark.read.load(
    "tmp/training.1600000.processed.noemoticon.csv",
    format="csv")
training_data = training_data.withColumnRenamed("_c0", "label") \
    .withColumnRenamed("_c1", "tweet_id") \
    .withColumnRenamed("_c2", "date") \
    .withColumnRenamed("_c3", "query") \
    .withColumnRenamed("_c4", "user") \
    .withColumnRenamed("_c5", "tweet")


In [7]:
# We are loading just a bunch of lines locally. On the server we will use the whole dataset to train the model
sample_size = 20000
training_data = training_data.sample(sample_size / training_data.count())

training_data = training_data.select(functions.col("label"), functions.col("tweet"))

In [8]:
# Run the cleansing UDF for tweet column
udf_cleansing = functions.udf(cleansing)
training_data = training_data.withColumn("tweet_cleansed", udf_cleansing(functions.col("tweet")))
training_data.show(5)

+-----+--------------------+--------------------+
|label|               tweet|      tweet_cleansed|
+-----+--------------------+--------------------+
|    0|spring break in p...|spring break plai...|
|    0|i think my arms a...|think arms sore t...|
|    0|@SarahReedSC trea...| treaty isnt defined|
|    0|Think I'm going t...|think im going be...|
|    0|Uh oh... I think ...|uh oh think getti...|
+-----+--------------------+--------------------+
only showing top 5 rows



### Tokenizing

The following code snippet creates a list of every remaining word (after cleansing) that will be used to build the features for training the model

In [9]:
from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="tweet_cleansed", outputCol="words")
training_data = tokenizer.transform(training_data)
training_data.show(5)

+-----+--------------------+--------------------+--------------------+
|label|               tweet|      tweet_cleansed|               words|
+-----+--------------------+--------------------+--------------------+
|    0|spring break in p...|spring break plai...|[spring, break, p...|
|    0|i think my arms a...|think arms sore t...|[think, arms, sor...|
|    0|@SarahReedSC trea...| treaty isnt defined|[treaty, isnt, de...|
|    0|Think I'm going t...|think im going be...|[think, im, going...|
|    0|Uh oh... I think ...|uh oh think getti...|[uh, oh, think, g...|
+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



### HashingTF

The following code snippet creates the features, which means a list of all terms available on all "documents" with a "tag" indicating its presence or not on the specific "document"

In [10]:
from pyspark.ml.feature import HashingTF
hashingTF = HashingTF(inputCol="words", outputCol="features")
training_data = hashingTF.transform(training_data)
training_data.show(5)

+-----+--------------------+--------------------+--------------------+--------------------+
|label|               tweet|      tweet_cleansed|               words|            features|
+-----+--------------------+--------------------+--------------------+--------------------+
|    0|spring break in p...|spring break plai...|[spring, break, p...|(262144,[36879,12...|
|    0|i think my arms a...|think arms sore t...|[think, arms, sor...|(262144,[46044,13...|
|    0|@SarahReedSC trea...| treaty isnt defined|[treaty, isnt, de...|(262144,[113957,1...|
|    0|Think I'm going t...|think im going be...|[think, im, going...|(262144,[21641,31...|
|    0|Uh oh... I think ...|uh oh think getti...|[uh, oh, think, g...|(262144,[18184,15...|
+-----+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [11]:
# Creates a numeric index for the labels
from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="label", outputCol="labelIndex")
model = stringIndexer.fit(training_data)
training_data = model.transform(training_data)
training_data.show(5)

+-----+--------------------+--------------------+--------------------+--------------------+----------+
|label|               tweet|      tweet_cleansed|               words|            features|labelIndex|
+-----+--------------------+--------------------+--------------------+--------------------+----------+
|    0|spring break in p...|spring break plai...|[spring, break, p...|(262144,[36879,12...|       0.0|
|    0|i think my arms a...|think arms sore t...|[think, arms, sor...|(262144,[46044,13...|       0.0|
|    0|@SarahReedSC trea...| treaty isnt defined|[treaty, isnt, de...|(262144,[113957,1...|       0.0|
|    0|Think I'm going t...|think im going be...|[think, im, going...|(262144,[21641,31...|       0.0|
|    0|Uh oh... I think ...|uh oh think getti...|[uh, oh, think, g...|(262144,[18184,15...|       0.0|
+-----+--------------------+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [12]:
# Split dataset into training and test
training, test = training_data.randomSplit([0.5, 0.5])

### Training and evaluating the model
In this step we train the NaiveBayes model using our training slice. Then, we use the test slice to evaluate the level of accuracy

In [13]:
from pyspark.ml.classification import NaiveBayes
#Naive bayes
nb = NaiveBayes(featuresCol="features", labelCol="labelIndex", predictionCol="NB_pred",
                probabilityCol="NB_prob", rawPredictionCol="NB_rawPred")
nbModel = nb.fit(training)
cv = nbModel.transform(test)
total = cv.count()
correct = cv.where(cv['labelIndex'] == cv['NB_pred']).count()
accuracy = correct/total

print(
    "\nTotal:", total, 
    "\nCorrect:", correct, 
    "\nAccuracy:", accuracy)


Total: 10242 
Correct: 7393 
Accuracy: 0.7218316735012693


In [14]:
cv.select("label", "tweet", "labelIndex", "NB_pred", "NB_rawPred", "NB_prob").show()

+-----+--------------------+----------+-------+--------------------+--------------------+
|label|               tweet|labelIndex|NB_pred|          NB_rawPred|             NB_prob|
+-----+--------------------+----------+-------+--------------------+--------------------+
|    0|       FS keeps c...|       0.0|    0.0|[-78.703316673940...|[0.79291434174872...|
|    0|   (must i say mo...|       0.0|    1.0|[-19.507001476810...|[0.35424702859924...|
|    0|  I TALKED TO U B...|       0.0|    0.0|[-56.236574224334...|[0.73819625884837...|
|    0|  i was too slow ...|       0.0|    0.0|[-38.401551806386...|[0.63381871955419...|
|    0|  im sick  'cough...|       0.0|    0.0|[-36.952716408033...|[0.99598596262325...|
|    0| #IMISSCATH #IMIS...|       0.0|    1.0|[-152.00561764964...|[0.47699695038052...|
|    0| #p1wimax no sign...|       0.0|    1.0|[-49.748570255830...|[0.49022116629620...|
|    0| ....  i don't kn...|       0.0|    0.0|[-15.160465677834...|[0.67468331301980...|
|    0| ;(

In [15]:
# Saving trained model for usage in a Pipeline (so you don't need to re-train everytime you need to use it)
model_folder = os.path.join(os.getcwd(), 'saved_models')

if not os.path.exists(model_folder):
    os.makedirs(model_folder)
    
model_full_path = os.path.join(model_folder, "twitter_sentiment_spark")
nbModel.write().overwrite().save(model_full_path)

In [16]:
# Delete temporary folder
if os.path.exists(temporary_folder):
    shutil.rmtree(temporary_folder)    