In [1]:
import os
import shutil
import zipfile

In [2]:
base_folder = os.getcwd()
temporary_folder = os.path.join(os.getcwd(), "tmp")

In [3]:
def unzip_files():
# Unzip file on a temporary folder
    if os.path.exists(temporary_folder):
        shutil.rmtree(temporary_folder)
        
    if not os.path.exists(temporary_folder):
        os.makedirs(temporary_folder)
        
    local_file_name = os.path.join(base_folder, "training_dataset", "trainingandtestdata.zip")
    with zipfile.ZipFile(local_file_name, 'r') as zip_ref:
        zip_ref.extractall(temporary_folder)

### Pre-process Tweets

The following function prepares the tweet by:

* Extracting the text from HTML (for the training dataset provided, we already have the text, but we want to avoid using any HTML tag for classification
* Converting all words to lower case
* Replacing any URL with "URL" constant (to enable the removal of them on a further step)
* Replacing any tagging of users with "USERTAGGING" (to enable the removal of them in a further step)
* Removing any "#" from hashtags
* Removing punctuation (has little or no weight on classification as it can be used for both intentions)
* And finally, removing words and punctuation that has little or no weight on classification (and can even create biases):
    * Stop words: set of common words that are used doesn't matter the intenttion (things like it, that, a, the)
    * Remove the two constants that we used to replace user tagging and URLs

In [4]:
def cleansing(tweet):
# Cleansing tweet
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords 
    from string import punctuation 
    from bs4 import BeautifulSoup
    import re
    
    terms_to_remove = set(stopwords.words("english") + ["USERTAGGING","URL"])
    tweet = BeautifulSoup(tweet, 'html.parser').get_text() # Extracts text from HTML (just in case!)
    tweet = tweet.lower() # Converts text to lower-case
    tweet = re.sub("((www\.[^\s]+)|(https?://[^\s]+))", "URL", tweet) # Replces URLs by URL constan
    tweet = re.sub("@[^\s]+", "USERTAGGING", tweet) # Replaces usernames by USERTAGGING constant 
    tweet = re.sub(r"#([^\s]+)", r"\1", tweet) # Removes the # in #hashtag
    for p in punctuation: 
        tweet = tweet.replace(p, "") # Removes punctiation
    tweet = word_tokenize(tweet) # Creates a list of words
    words = ""
    for each_word in tweet:
        if each_word not in terms_to_remove:
            words = words + " " + each_word
    # return [word for word in tweet if word not in terms_to_remove]
    return words[1:]

In [5]:
unzip_files()

In [6]:
# Start Spark session, load the dataset into a Spark DataFrame and then adjust column names
from pyspark.sql import SparkSession, functions

spark = SparkSession.builder.master("local").appName("Training Twitter Sentiment Analysis").getOrCreate()
training_data = spark.read.load(
    "tmp/training.1600000.processed.noemoticon.csv",
    format="csv")
training_data = training_data.withColumnRenamed("_c0", "label") \
    .withColumnRenamed("_c1", "tweet_id") \
    .withColumnRenamed("_c2", "date") \
    .withColumnRenamed("_c3", "query") \
    .withColumnRenamed("_c4", "user") \
    .withColumnRenamed("_c5", "tweet")


In [7]:
# We are loading just a bunch of lines locally. On the server we will use the whole dataset to train the model
sample_size = 20000
training_data = training_data.sample(sample_size / training_data.count())

training_data = training_data.select(functions.col("label"), functions.col("tweet"))

In [8]:
# Run the cleansing UDF for tweet column
udf_cleansing = functions.udf(cleansing)
training_data = training_data.withColumn("tweet_cleansed", udf_cleansing(functions.col("tweet")))
training_data.show(5)

+-----+--------------------+--------------------+
|label|               tweet|      tweet_cleansed|
+-----+--------------------+--------------------+
|    0|blagh class at 8 ...|blagh class 8 tom...|
|    0|@tea oh! i'm so s...|oh im sorry didnt...|
|    0|A bad nite for th...|bad nite favorite...|
|    0|@ColinDeMar Far t...|   far way rail tips|
|    0|@rcompo RACHEL! h...|rachel hang outag...|
+-----+--------------------+--------------------+
only showing top 5 rows



### Tokenizing

The following code snippet creates a list of every remaining word (after cleansing) that will be used to build the features for training the model

In [9]:
from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="tweet_cleansed", outputCol="words")
training_data = tokenizer.transform(training_data)
training_data.show(5)

+-----+--------------------+--------------------+--------------------+
|label|               tweet|      tweet_cleansed|               words|
+-----+--------------------+--------------------+--------------------+
|    0|blagh class at 8 ...|blagh class 8 tom...|[blagh, class, 8,...|
|    0|@tea oh! i'm so s...|oh im sorry didnt...|[oh, im, sorry, d...|
|    0|A bad nite for th...|bad nite favorite...|[bad, nite, favor...|
|    0|@ColinDeMar Far t...|   far way rail tips|[far, way, rail, ...|
|    0|@rcompo RACHEL! h...|rachel hang outag...|[rachel, hang, ou...|
+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



### HashingTF

The following code snippet creates the features, which means a list of all terms available on all "documents" with a "tag" indicating its presence or not on the specific "document"

In [10]:
from pyspark.ml.feature import HashingTF
hashingTF = HashingTF(inputCol="words", outputCol="features")
training_data = hashingTF.transform(training_data)
training_data.show(5)

+-----+--------------------+--------------------+--------------------+--------------------+
|label|               tweet|      tweet_cleansed|               words|            features|
+-----+--------------------+--------------------+--------------------+--------------------+
|    0|blagh class at 8 ...|blagh class 8 tom...|[blagh, class, 8,...|(262144,[8254,291...|
|    0|@tea oh! i'm so s...|oh im sorry didnt...|[oh, im, sorry, d...|(262144,[18184,31...|
|    0|A bad nite for th...|bad nite favorite...|[bad, nite, favor...|(262144,[9664,623...|
|    0|@ColinDeMar Far t...|   far way rail tips|[far, way, rail, ...|(262144,[51471,16...|
|    0|@rcompo RACHEL! h...|rachel hang outag...|[rachel, hang, ou...|(262144,[3386,178...|
+-----+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [11]:
# Creates a numeric index for the labels
from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="label", outputCol="labelIndex")
model = stringIndexer.fit(training_data)
training_data = model.transform(training_data)
training_data.show(5)

+-----+--------------------+--------------------+--------------------+--------------------+----------+
|label|               tweet|      tweet_cleansed|               words|            features|labelIndex|
+-----+--------------------+--------------------+--------------------+--------------------+----------+
|    0|blagh class at 8 ...|blagh class 8 tom...|[blagh, class, 8,...|(262144,[8254,291...|       1.0|
|    0|@tea oh! i'm so s...|oh im sorry didnt...|[oh, im, sorry, d...|(262144,[18184,31...|       1.0|
|    0|A bad nite for th...|bad nite favorite...|[bad, nite, favor...|(262144,[9664,623...|       1.0|
|    0|@ColinDeMar Far t...|   far way rail tips|[far, way, rail, ...|(262144,[51471,16...|       1.0|
|    0|@rcompo RACHEL! h...|rachel hang outag...|[rachel, hang, ou...|(262144,[3386,178...|       1.0|
+-----+--------------------+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [12]:
# Split dataset into training and test
training, test = training_data.randomSplit([0.5, 0.5])

In [13]:
from pyspark.ml.classification import NaiveBayes
#Naive bayes
nb = NaiveBayes(featuresCol="features", labelCol="labelIndex", predictionCol="NB_pred",
                probabilityCol="NB_prob", rawPredictionCol="NB_rawPred")
nbModel = nb.fit(training)
cv = nbModel.transform(test)
total = cv.count()
correct = cv.where(cv['labelIndex'] == cv['NB_pred']).count()
accuracy = correct/total

In [14]:
print(
    "\nTotal:", total, 
    "\nCorrect:", correct, 
    "\nAccuracy:", accuracy)


Total: 10024 
Correct: 7159 
Accuracy: 0.7141859537110934


In [15]:
cv.select("label", "tweet", "labelIndex", "NB_pred").show()

+-----+--------------------+----------+-------+
|label|               tweet|labelIndex|NB_pred|
+-----+--------------------+----------+-------+
|    0| 1 dinger in and ...|       1.0|    0.0|
|    0| @shanedawson i'm...|       1.0|    1.0|
|    0| @thesmartmama I ...|       1.0|    1.0|
|    0| I caught up. I W...|       1.0|    1.0|
|    0| I guess I'm no S...|       1.0|    1.0|
|    0| I need another w...|       1.0|    1.0|
|    0| Just finished th...|       1.0|    1.0|
|    0| Misses someone  ...|       1.0|    1.0|
|    0| My stomach is ac...|       1.0|    1.0|
|    0| Only @ninapolita...|       1.0|    0.0|
|    0| That guys superr...|       1.0|    1.0|
|    0| Thats cause the ...|       1.0|    1.0|
|    0| Too Emotional ri...|       1.0|    1.0|
|    0| a kid at my scho...|       1.0|    1.0|
|    0|     called in again|       1.0|    1.0|
|    0| cant afford to s...|       1.0|    0.0|
|    0| damn, Kayley doe...|       1.0|    1.0|
|    0| don't go to jail...|       1.0| 

In [16]:
model_folder = os.path.join(os.getcwd(), 'saved_models')

if not os.path.exists(model_folder):
    os.makedirs(model_folder)

In [17]:
model_full_path = os.path.join(model_folder, "twitter_sentiment_spark")
nbModel.write().overwrite().save(model_full_path)

In [18]:
# Delete temporary folder
if os.path.exists(temporary_folder):
    shutil.rmtree(temporary_folder)    