In [None]:
from datetime import datetime

from pyspark import SparkConf, SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SparkSession

## Constants

In [2]:
# --- Data and artifact locations ---
# CSV file loaded as the training set.
TRAINING_FILE = "dataset/dataset.csv"
# Prefix under which the fitted model is written.
MODEL_PATH = "model/"

# --- Spark connection ---
SPARK_APP_NAME = "Final - PSPD - Train"
SPARK_MASTER = "spark://localhost:5000"
# Extra Maven coordinates handed to Spark through "spark.jars.packages".
PACKAGES = "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0"

# --- Kafka ---
# NOTE(review): not referenced by any cell visible in this notebook; presumably
# shared with a companion streaming notebook - confirm before removing.
KAFKA_SERVER = "localhost:9093"

## Startup

In [None]:
# Build the Spark configuration from the notebook constants; additional
# packages are requested through "spark.jars.packages".
conf = (
    SparkConf()
    .setMaster(SPARK_MASTER)
    .setAppName(SPARK_APP_NAME)
    .set("spark.jars.packages", PACKAGES)
)

# getOrCreate() keeps this cell re-runnable: constructing SparkContext(conf=conf)
# a second time in the same kernel raises because a context is already active.
context = SparkContext.getOrCreate(conf=conf)
context.setLogLevel("ERROR")

# The ingest cell reads through `spark`, which was never defined anywhere in
# the notebook; create the session on top of the context configured above.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

## Ingest Dataset

In [None]:
# Read the semicolon-separated CSV (first row holds the column names) and keep
# only the text column plus the sentiment column cast to a float `label`.
training = (
    spark.read
    .format("csv")
    .options(sep=";", header="true")
    .load(TRAINING_FILE)
    .selectExpr(
        "sentence",
        "CAST(sentiment AS FLOAT) AS label",
    )
)

## Model Architecture

In [None]:
# Feature extraction: tokenize the "sentence" column into "words", then hash
# the tokens into the "features" vector consumed by the classifier.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

# Classifier and the hyper-parameter grid explored by cross-validation
# (6 x 5 x 5 = 150 candidate settings).
lr = LogisticRegression()
lrparamGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.001, 0.01, 0.1, 0.5, 1.0, 2.0])
    .addGrid(lr.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0])
    .addGrid(lr.maxIter, [1, 5, 10, 20, 50])
    .build()
)

# LogisticRegression is a classifier, so candidates are ranked by accuracy.
# RMSE on predicted class indices treats the labels as numeric distances,
# which is only meaningful for regression targets.
lrevaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# A fixed seed makes the fold assignment reproducible across runs.
lrcv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=lrparamGrid,
    evaluator=lrevaluator,
    numFolds=5,
    seed=42,
)

# Features are computed once by the first two stages; the cross-validated
# classifier is the final stage.
pipeline = Pipeline(stages=[tokenizer, hashingTF, lrcv])

## Training

In [None]:
# Fit every pipeline stage on the training frame. The last stage performs the
# cross-validated grid search, so this is where the training time is spent.
model = pipeline.fit(dataset=training)

## Saving

In [None]:
# Tag the saved model with a filesystem-safe timestamp. Interpolating
# datetime.now() directly yields "YYYY-MM-DD HH:MM:SS.ffffff": the space and
# colons are invalid in Windows paths and fragile for Hadoop-style path
# handling, so format the stamp explicitly instead.
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
model.save(f"{MODEL_PATH}{timestamp}.model")