In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark import StorageLevel
from pyspark.sql import functions as f

spark = (
    SparkSession.builder.appName("ModelTraining")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

pd.options.display.max_columns = None
pd.options.display.max_rows = 30
pd.options.display.max_colwidth = 150

schema = "polarity FLOAT, id LONG, date_time TIMESTAMP, query STRING, user STRING, text STRING"
timestampformat = "EEE MMM dd HH:mm:ss zzz yyyy"


IN_PATH = "datasets/sentiment-140-training-data/CLEAN"
OUT_PATH = "datasets/sentiment-140-training-data/MODEL"

spark_reader = spark.read.schema(schema)


df_clean = spark_reader.parquet(IN_PATH)
df_clean = (
    df_clean
    # Remove all numbers
    .withColumn("text", f.regexp_replace(f.col("text"), "[^a-zA-Z']", " "))
    # Remove all double/multiple spaces
    .withColumn("text", f.regexp_replace(f.col("text"), " +", " "))
    # Remove leading and trailing whitespaces
    .withColumn("text", f.trim(f.col("text")))
    # Ensure we don't end up with empty rows
    .filter("text != ''")
)

data = df_clean.select("text", "polarity").coalesce(3).cache()

In [2]:
(training_data, validation_data, test_data) = data.randomSplit([0.98, 0.01, 0.01], seed=2020)

In [3]:
%%time
from pyspark.ml.feature import (
    StopWordsRemover,
    Tokenizer,
    HashingTF,
    IDF
)
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Tokenizer converts input string to lowercase and then splits it by white spaces.
# https://spark.apache.org/docs/3.0.1/api/python/pyspark.ml.html#pyspark.ml.feature.Tokenizer
# Params:
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words1"
)

# A feature transformer that filters out stop words from input.
# https://spark.apache.org/docs/3.0.1/api/python/pyspark.ml.html#pyspark.ml.feature.StopWordsRemover
# Params:
stopword_remover = StopWordsRemover(
    inputCol="words1",
    outputCol="words2",
    stopWords = StopWordsRemover.loadDefaultStopWords("english")
)

# Maps a sequence of terms to their term frequencies using the hashing trick
# https://spark.apache.org/docs/3.0.1/api/python/pyspark.ml.html#pyspark.ml.feature.HashingTF
# Params:
hashing_tf = HashingTF(
    inputCol="words2",
    outputCol="term_frequency"
)

# Compute the Inverse Document Frequency (IDF) given a collection of documents
# https://spark.apache.org/docs/3.0.1/api/python/pyspark.ml.html#pyspark.ml.feature.IDF
# Params:
idf = IDF(
    inputCol="term_frequency",
    outputCol="features",
    minDocFreq=5
)

lr = LogisticRegression(labelCol="polarity")

semantic_analysis_pipeline = Pipeline(
        stages = [
            tokenizer,
            stopword_remover,
            hashing_tf,
            idf,
            lr
        ]
)

#semantic_analysis_model = semantic_analysis_pipeline.fit(training_data)

Wall time: 5min 42s


In [4]:
%%time
trained_df = semantic_analysis_model.transform(training_data)
val_df = semantic_analysis_model.transform(validation_data)
test_df = semantic_analysis_model.transform(test_data)

trained_df.show()
val_df.show()
test_df.show()

+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|                text|polarity|              words1|              words2|      term_frequency|            features|       rawPrediction|         probability|prediction|
+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|                   '|     4.0|                 [']|                 [']|(262144,[186171],...|(262144,[186171],...|[8.07978709803161...|[0.46818513567162...|       4.0|
|         ' ' ' ' ' '|     4.0|  [', ', ', ', ', ']|  [', ', ', ', ', ']|(262144,[186171],...|(262144,[186171],...|[8.45869192231039...|[0.65111331376428...|       0.0|
|' Bored on aim wa...|     4.0|[', bored, on, ai...|[', bored, aim, w...|(262144,[9958,180...|(262144,[9958,180...|[6.87635157225856...|[0.07204831822632..

In [5]:
%%time
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="polarity", metricName="accuracy")
accuracy_val = evaluator.evaluate(val_df)
accuracy_test = evaluator.evaluate(test_df)
print("Validation Data:")
print(f"Accuracy: {accuracy_val*100:.5f}%")
print("Testing Data:")
print(f"Accuracy: {accuracy_test*100:.5f}%")

Validation Data:
Accuracy: 77.20708%
Testing Data:
Accuracy: 76.91339%
Wall time: 18.7 s


In [7]:
final_model = semantic_analysis_pipeline.fit(data)
accuracy_test = evaluator.evaluate(final_model.transform(test_data))


print(f"Accuracy: {accuracy_test*100:.5f}%")


Accuracy: 79.23780%


In [11]:
final_model.save(OUT_PATH)