In [1]:
import os
from datetime import datetime

from dotenv import load_dotenv
from pyspark import SparkConf, SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession
# Load key/value pairs from a local .env file (if one is found) into the
# process environment so the os.getenv lookups in the Constants cell see them.
load_dotenv()

True

## Constants

In [4]:
# Run-time configuration. Each value can be overridden through an environment
# variable of the same name; the second argument is the fallback.
_env = os.environ

TRAINING_FILE = _env.get("TRAINING_FILE", "dataset/dataset.csv")  # CSV with the labelled sentences
MODEL_PATH = _env.get("MODEL_PATH", "model/")                     # directory prefix for saved models
SPARK_MASTER = _env.get("SPARK_MASTER", "spark://gpu3.esw:7077")  # Spark master URL

# Name under which this job is registered with Spark (not overridable).
SPARK_APP_NAME = "Final - PSPD - Train"

## Startup

In [6]:
# Describe the Spark application: which master to connect to and the name the
# job is registered under.
conf = (
    SparkConf()
    .setMaster(SPARK_MASTER)
    .setAppName(SPARK_APP_NAME)
)

# getOrCreate keeps this cell idempotent: on a fresh kernel it builds a new
# context from `conf`; when the cell is re-run it returns the context that is
# already active instead of raising
# "ValueError: Cannot run multiple SparkContexts at once".
context = SparkContext.getOrCreate(conf=conf)

22/09/18 17:52:52 WARN Utils: Your hostname, notebook resolves to a loopback address: 127.0.0.1; using 192.168.43.185 instead (on interface wlp2s0)
22/09/18 17:52:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/18 17:52:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
# Obtain the SQL entry point. No extra options are set on the builder, so the
# session is obtained via getOrCreate() with the builder's defaults.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

## Ingest Dataset

In [8]:
# Configure a CSV reader: comma-separated, first row holds the column names.
csv_reader = spark.read.format("csv").options(sep=",", header="true")

# Every CSV column is read as a string (no schema inference is requested).
raw_rows = csv_reader.load(TRAINING_FILE)

# Keep only the text and its target; the target is cast to a numeric column
# named "label".
training = raw_rows.selectExpr(
    "sentence",
    "CAST(sentiment AS FLOAT) AS label",
)

                                                                                

In [9]:
# Show the DataFrame's repr (column names and types) to confirm the
# sentence/label projection and the FLOAT cast.
training

DataFrame[sentence: string, label: float]

## Model Architecture

In [10]:
# Seed for the cross-validation fold split so repeated runs compare the same folds.
CV_SEED = 42

# Text featurisation: sentence -> list of tokens -> hashed term-frequency vector.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

# Classifier; uses the default "features" / "label" column names produced above.
lr = LogisticRegression()

# Hyper-parameter grid: 6 x 5 x 6 = 180 candidate settings.
lrparamGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.001, 0.01, 0.1, 0.5, 1.0, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10, 20, 50, 100])
             .build())

# Score candidates with a classification metric. The previous
# RegressionEvaluator(rmse) treated class ids as numeric distances, which is
# only meaningful by accident for 0/1 labels; accuracy ranks candidates the
# same way in the binary case and stays valid with more than two classes.
lrevaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# 5-fold cross-validation over the grid; only the classifier is refit per
# candidate, the featurisation stages run once ahead of it in the pipeline.
lrcv = CrossValidator(estimator = lr,
                    estimatorParamMaps = lrparamGrid,
                    evaluator = lrevaluator,
                    numFolds = 5,
                    seed = CV_SEED)

pipeline = Pipeline(stages=[tokenizer, hashingTF, lrcv])

## Training

In [None]:
# Fit the whole pipeline. The cross-validation stage trains one model per grid
# candidate per fold (180 candidates x 5 folds with the grid defined above)
# before refitting the best setting, so expect this cell to be long-running.
model = pipeline.fit(training)

## Saving

In [None]:
# Build a filesystem-safe, sortable timestamp. str(datetime.now()) contains
# spaces and colons; colons are rejected in HDFS path components and on
# Windows, so the save could fail depending on where MODEL_PATH points.
model_stamp = datetime.now().strftime("%Y%m%dT%H%M%S")

# os.path.join inserts the separator only when MODEL_PATH lacks a trailing one,
# so both "model" and "model/" produce "model/<stamp>.model".
model_target = os.path.join(MODEL_PATH, f"{model_stamp}.model")

model.save(model_target)

## END

In [None]:
# Release cluster resources: stop the SQL session first, then the context
# created in the Startup section.
for spark_handle in (spark, context):
    spark_handle.stop()