In [1]:
import os
import shutil
import zipfile

In [2]:
base_folder = os.getcwd()
temporary_folder = os.path.join(os.getcwd(), "tmp")

In [3]:
def unzip_files():
# Unzip file on a temporary folder
    if os.path.exists(temporary_folder):
        shutil.rmtree(temporary_folder)
        
    if not os.path.exists(temporary_folder):
        os.makedirs(temporary_folder)
        
    local_file_name = os.path.join(base_folder, "training_dataset", "trainingandtestdata.zip")
    with zipfile.ZipFile(local_file_name, 'r') as zip_ref:
        zip_ref.extractall(temporary_folder)

In [4]:
def cleansing_and_tokenizing(tweet):
# Cleansing and tokenizing tweet
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords 
    from string import punctuation 
    from bs4 import BeautifulSoup
    import re
    
    terms_to_remove = set(stopwords.words("english") + ["USERTAGGING","URL"])
    tweet = BeautifulSoup(tweet, 'html.parser').get_text() # Extracts text from HTML (just in case!)
    tweet = tweet.lower() # Converts text to lower-case
    tweet = re.sub("((www\.[^\s]+)|(https?://[^\s]+))", "URL", tweet) # Replces URLs by URL constan
    tweet = re.sub("@[^\s]+", "USERTAGGING", tweet) # Replaces usernames by USERTAGGING constant 
    tweet = re.sub(r"#([^\s]+)", r"\1", tweet) # Removes the # in #hashtag
    for p in punctuation: 
        tweet = tweet.replace(p, "") # Removes punctiation
    tweet = word_tokenize(tweet) # Creates a list of words
    words = ""
    for each_word in tweet:
        if each_word not in terms_to_remove:
            words = words + " " + each_word
    # return [word for word in tweet if word not in terms_to_remove]
    return words[1:]

In [5]:
unzip_files()

In [6]:
from pyspark.sql import SparkSession, functions

spark = SparkSession.builder.master("local").appName("Training Twitter Sentiment Analysis").getOrCreate()
training_data = spark.read.load(
    "tmp/training.1600000.processed.noemoticon.csv",
    format="csv")
training_data = training_data.withColumnRenamed("_c0", "label") \
    .withColumnRenamed("_c1", "tweet_id") \
    .withColumnRenamed("_c2", "date") \
    .withColumnRenamed("_c3", "query") \
    .withColumnRenamed("_c4", "user") \
    .withColumnRenamed("_c5", "tweet")

In [7]:
sample_size = 10000
training_data = training_data.sample(sample_size / 1600000)

training_data = training_data.select(functions.col("label"), functions.col("tweet"))

In [8]:
udf_cleansing_and_tokenizing = functions.udf(cleansing_and_tokenizing)
training_data = training_data.withColumn("tweet_cleansed", udf_cleansing_and_tokenizing(functions.col("tweet")))
training_data = training_data.withColumn("tweet_cleansed", functions.split("tweet_cleansed", " ")) 

In [9]:
training_data.show(5)

+-----+--------------------+--------------------+
|label|               tweet|      tweet_cleansed|
+-----+--------------------+--------------------+
|    0|Saw an ad on Crai...|[saw, ad, craigsl...|
|    0|@maxime68 @megeld...|[im, useless, don...|
|    0|doesn't enjoy lea...|[doesnt, enjoy, l...|
|    0|@theresawhite thi...|[true, lol, still...|
|    0|@siddharth_ind ye...|[yeah, applicatio...|
+-----+--------------------+--------------------+
only showing top 5 rows



In [10]:
vocabulary = training_data.withColumn("word", functions.explode("tweet_cleansed")).select(functions.col("word"))
#print("Count:", vocabulary.count())
vocabulary = vocabulary.distinct()
#print("Distinct count:", vocabulary.count())
vocabulary = vocabulary.withColumn("dummy_col", functions.lit(1))
vocabulary_list = vocabulary.groupBy("dummy_col").agg(functions.collect_list("word"))
vocabulary_list = vocabulary_list.withColumnRenamed("collect_list(word)", "words")


Count: 72070
Distinct count: 14628


In [11]:
training_data = training_data.join(vocabulary_list.select("words"))
#df.withColumnRenamed('id', 'id1').crossJoin(df.withColumnRenamed('id', 'id2')).show()
training_data.show(5)

+-----+--------------------+--------------------+--------------------+
|label|               tweet|      tweet_cleansed|               words|
+-----+--------------------+--------------------+--------------------+
|    0|Saw an ad on Crai...|[saw, ad, craigsl...|[still, online, a...|
|    0|@maxime68 @megeld...|[im, useless, don...|[still, online, a...|
|    0|doesn't enjoy lea...|[doesnt, enjoy, l...|[still, online, a...|
|    0|@theresawhite thi...|[true, lol, still...|[still, online, a...|
|    0|@siddharth_ind ye...|[yeah, applicatio...|[still, online, a...|
+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [12]:
def extract_features(tweet, vocabulary):
# Extract features (tag words using on twitter into a instance of dictionary)
    tweet_words=set(tweet)
    features={}
    for word in vocabulary:
        features['contains(%s)' % word]=(word in tweet_words)
    return features 

In [13]:
udf_extract_features = functions.udf(extract_features)
training_data = training_data.withColumn(
    "features", 
    udf_extract_features(functions.col("tweet"), functions.col("words"))
)

In [14]:
training_data.show(5)

+-----+--------------------+--------------------+--------------------+--------------------+
|label|               tweet|      tweet_cleansed|               words|            features|
+-----+--------------------+--------------------+--------------------+--------------------+
|    0|Saw an ad on Crai...|[saw, ad, craigsl...|[still, online, a...|{contains(coding)...|
|    0|@maxime68 @megeld...|[im, useless, don...|[still, online, a...|{contains(coding)...|
|    0|doesn't enjoy lea...|[doesnt, enjoy, l...|[still, online, a...|{contains(coding)...|
|    0|@theresawhite thi...|[true, lol, still...|[still, online, a...|{contains(coding)...|
|    0|@siddharth_ind ye...|[yeah, applicatio...|[still, online, a...|{contains(coding)...|
+-----+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [15]:
# from pyspark.sql.functions import struct
# labeled_features = joined.withColumn("labeled_features",struct(joined.features,joined.label))

In [16]:
training, test = training_data.select("features", "label").randomSplit([0.5, 0.5])

In [17]:
#training = training.select(functions.col("features"), functions.col("label"))

In [18]:
training.show(5)

KeyboardInterrupt: 

In [None]:
#training_features = training.rdd.map(tuple).collect()

In [None]:
#training_features

In [None]:
# import nltk
# Classifier = nltk.NaiveBayesClassifier.train(training_features)