In [1]:
from datetime import datetime

from pyspark import SparkConf, SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession

## Constants

In [2]:
# Spark connection settings.
SPARK_APP_NAME: str = "Final - PSPD - Train"
SPARK_MASTER: str = "spark://localhost:5000"

# Input CSV to train on, and the directory prefix used when saving the model.
# MODEL_PATH is concatenated directly with a file name, so it keeps its
# trailing slash.
TRAINING_FILE: str = "dataset/dataset.csv"
MODEL_PATH: str = "model/"

## Startup

In [3]:
# Describe the application: which master to connect to and the name to
# register the application under.
conf = (
    SparkConf()
    .setMaster(SPARK_MASTER)
    .setAppName(SPARK_APP_NAME)
)

# Only one SparkContext may be active at a time, so constructing a new one
# raises when this cell is re-run in the same kernel. getOrCreate returns the
# running context if there is one and only builds a new one (from `conf`)
# otherwise, which keeps the cell idempotent.
context = SparkContext.getOrCreate(conf=conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/17 21:55:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Obtain the SQL/DataFrame entry point; getOrCreate returns the existing
# session if one is already active instead of building a second one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

## Ingest Dataset

In [5]:
# Configure a CSV reader: the first row holds column names and fields are
# comma-separated.
csv_reader = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("sep", ",")
)

# Keep the raw text column and expose the `sentiment` column as a float
# column named `label`.
training = csv_reader.load(TRAINING_FILE).selectExpr(
    "sentence",
    "CAST(sentiment AS FLOAT) AS label",
)

                                                                                

In [6]:
# Show the DataFrame's repr (column names and types) as a schema sanity check.
training

DataFrame[sentence: string, label: float]

## Model Architecture

In [7]:
# Feature extraction: turn each sentence into a list of words, then hash the
# words into a term-frequency vector stored in the "features" column.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

# Classifier, left at its default column names and parameters; the tuned
# parameters are supplied by the grid below.
lr = LogisticRegression()

# Hyper-parameter grid: 6 x 5 x 4 = 120 candidate settings.
lrparamGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.001, 0.01, 0.1, 0.5, 1.0, 2.0])
    .addGrid(lr.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0])
    .addGrid(lr.maxIter, [1, 5, 10, 20])
    .build()
)

# Rank candidates with a classification metric. RMSE between predicted and
# true class indices (the previous RegressionEvaluator) treats labels as
# ordered quantities, which is not meaningful for a classifier; accuracy is
# computed on the same "prediction"/"label" columns.
lrevaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# 5-fold cross-validation over the grid (120 settings x 5 folds = 600 fits,
# plus the final refit). The fixed seed makes the fold assignment
# reproducible across runs.
lrcv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=lrparamGrid,
    evaluator=lrevaluator,
    numFolds=5,
    seed=42,
)

# Only the classifier is cross-validated; tokenizing and hashing run once as
# ordinary pipeline stages ahead of it.
pipeline = Pipeline(stages=[tokenizer, hashingTF, lrcv])

## Training

In [8]:
# Fit every pipeline stage on the training DataFrame. This is the expensive
# cell: the cross-validation stage trains one model per grid point per fold.
model = pipeline.fit(training)

                                                                                

22/09/17 21:55:58 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/09/17 21:55:58 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
22/09/17 21:59:34 WARN BlockManager: Asked to remove block broadcast_2712_piece0, which does not exist
22/09/17 22:01:36 WARN BlockManager: Asked to remove block broadcast_4514, which does not exist
22/09/17 22:02:20 WARN BlockManagerMaster: Failed to remove broadcast 5070 with removeFromMaster = true - Block broadcast_5070 does not exist
org.apache.spark.SparkException: Block broadcast_5070 does not exist
	at org.apache.spark.errors.SparkCoreErrors$.blockDoesNotExistError(SparkCoreErrors.scala:234)
	at org.apache.spark.storage.BlockInfoManager.blockInfo(BlockInfoManager.scala:237)
	at org.apache.spark.storage.BlockInfoManager.removeBlock(BlockInfoManager.scala:500)
	at org.apache.spark.storage.BlockManager.removeBlockInternal(BlockManager.scala:1984)
	at org.apache.spark.storag

## Saving

In [9]:
# Name the saved model after the current time. str(datetime.now()) contains a
# space and colons (e.g. "2022-09-17 22:02:30.123456"), which are not portable
# in file-system paths, so format the stamp explicitly. Microseconds are kept
# so that consecutive saves still get distinct names.
saved_at = datetime.now().strftime("%Y%m%d-%H%M%S-%f")
model.save(f"{MODEL_PATH}{saved_at}.model")

                                                                                

## END

In [10]:
# Shut down in order: the session first, then the context it was built on.
for resource in (spark, context):
    resource.stop()