In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.pipeline import PipelineModel
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.types import StringType
from pyspark.sql.functions import lower, when, col, udf

## Constants

In [2]:
# --- Model -------------------------------------------------------------
# Path the trained pipeline is loaded from at prediction time.
MODEL_FILE = "model/trained.model"

# --- Spark -------------------------------------------------------------
SPARK_MASTER = "spark://localhost:5000"
SPARK_APP_NAME = "Final - PSPD - Predict"
# Maven coordinates passed to `spark.jars.packages` (Kafka source for
# Structured Streaming).
# NOTE(review): the version presumably has to track the installed Spark
# build -- confirm when upgrading PySpark.
PACKAGES = "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0"

# --- Kafka -------------------------------------------------------------
KAFKA_SERVER = "localhost:9093"
PREDICT_TOPIC = "predict"

## Startup

In [3]:
# Spark configuration: cluster master, application name, and the extra
# package (Kafka connector) that must be resolved before the JVM starts.
conf = (
    SparkConf()
    .setMaster(SPARK_MASTER)
    .setAppName(SPARK_APP_NAME)
    .set("spark.jars.packages", PACKAGES)
)

# Start the context with that configuration and silence everything below
# ERROR so the console sink output stays readable.
context = SparkContext(conf=conf)
context.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/home/thiago/.local/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/thiago/.ivy2/cache
The jars for the packages stored in: /home/thiago/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-4dca2192-51a0-4b8d-9200-c51dda7b61bb;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.0 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubating in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central

22/09/17 22:44:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# SQL / Structured Streaming entry point; no builder options are set here,
# so the session is obtained with the builder defaults.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

## Cleaner

In [10]:
import re

# One stopword per line; loaded into the `stopwords` set below.
STOPWORDS_PATH = "dataset/stopwords.txt"

# Character class of everything stripped from a sentence before tokenising:
# punctuation, brackets, quotes, math symbols and digits.
# The hyphen is escaped on purpose: written as a bare "+-:" it is parsed as
# the *range* U+002B..U+003A, which only matched the intended characters by
# coincidence and would silently change meaning if the class were reordered.
CLEAN_REGEX = r"[.,/\\\[\]\{\}`~^\d&!@#$%*\)\(\'\"<>=+\-:;?]"

# Load the stopword list once, trimmed and lower-cased so membership tests
# against lower-cased input tokens match.
# The encoding is given explicitly so accented words do not depend on the
# machine's locale -- assumes the file is UTF-8; adjust if it is not.
with open(STOPWORDS_PATH, "r", encoding="utf-8") as stop_file:
    stopwords = {
        word
        for word in (line.strip().lower() for line in stop_file)
        if word  # blank lines would otherwise add "" to the set
    }

def cleaner(sentence):
    """Strip unwanted characters and stopwords from a sentence.

    Every character matched by ``CLEAN_REGEX`` is removed, the result is
    split on whitespace, tokens present in ``stopwords`` are dropped and the
    remaining tokens are re-joined with single spaces.

    The comparison against ``stopwords`` is case-sensitive; the caller is
    expected to lower-case the text first.

    Args:
        sentence: Text to clean, or ``None``.

    Returns:
        The cleaned sentence. ``""`` when ``sentence`` is ``None`` or when
        nothing is left after cleaning.
    """
    # A record with no payload reaches the UDF as None; re.sub would raise
    # TypeError on it and fail the whole batch, so map it to "no tokens".
    if sentence is None:
        return ""

    tokens = re.sub(CLEAN_REGEX, "", sentence).split()
    return " ".join(token for token in tokens if token not in stopwords)

## Prediction

In [11]:
# Column-level wrapper around `cleaner`. The function is passed directly:
# wrapping it in `lambda s: cleaner(s)` added a call layer with no effect.
cleaner_func = udf(cleaner, StringType())
_cached_model = None  # trained pipeline; filled on the first micro-batch


def _get_model() -> PipelineModel:
    """Return the trained pipeline, loading it from MODEL_FILE only once."""
    global _cached_model
    if _cached_model is None:
        _cached_model = PipelineModel.load(MODEL_FILE)
    return _cached_model


def foreach_batch_func(df: DataFrame, _):
    """Classify one micro-batch of messages and print the result.

    The ``value`` column of ``df`` is decoded to text, lower-cased and passed
    through ``cleaner_func``; the trained pipeline then scores the resulting
    ``sentence`` column and the rows are written to the console sink with a
    human-readable label.

    Args:
        df: Micro-batch DataFrame; must expose a ``value`` column.
        _: Second positional argument supplied by ``foreachBatch``; unused.

    Note:
        The model is loaded from disk once and reused for every following
        batch. A model re-trained while the query is running is only picked
        up after this cell is re-executed (which resets the cache).
    """
    # The Kafka source delivers `value` as binary; cast explicitly instead
    # of relying on implicit coercion inside lower().
    raw_text = df.value.cast(StringType())
    sentences = df.select(cleaner_func(lower(raw_text)).alias("sentence"))

    prediction = _get_model().transform(sentences)

    # 1.0 is reported as "positive", every other value as "negative".
    label = when(col("prediction") == 1.0, "positive").otherwise("negative")

    prediction \
        .select("sentence", "probability", label.alias("prediction")) \
        .write \
        .format("console") \
        .save()

## Streaming query (Kafka source to console sink)

In [12]:
# Source: subscribe to the prediction topic on the Kafka broker.
kafka_stream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option("subscribe", PREDICT_TOPIC)
    .option("failOnDataLoss", "false")
    .load()
)

# Sink: hand every micro-batch to `foreach_batch_func`, triggering every
# 10 seconds. `lines` is the handle of the running query (stopped at the
# end of the notebook).
lines = (
    kafka_stream.writeStream
    .foreachBatch(foreach_batch_func)
    .option("checkpointLocation", "/tmp/spark/mllib-predict")
    .trigger(processingTime="10 seconds")
    .start()
)

                                                                                

+--------+--------------------+----------+
|sentence|         probability|prediction|
+--------+--------------------+----------+
|   odeio|[0.27990843001391...|  positive|
|horrível|[0.27990843001391...|  positive|
|   feliz|[0.18525478159637...|  positive|
|  amavel|[0.27990843001391...|  positive|
+--------+--------------------+----------+

+--------+--------------------+----------+
|sentence|         probability|prediction|
+--------+--------------------+----------+
|   otimo|[0.27990843001391...|  positive|
|        |[0.27990843001391...|  positive|
|  triste|[0.27990843001391...|  positive|
|  triste|[0.27990843001391...|  positive|
|   feliz|[0.18525478159637...|  positive|
+--------+--------------------+----------+

+--------+--------------------+----------+
|sentence|         probability|prediction|
+--------+--------------------+----------+
|  triste|[0.27990843001391...|  positive|
+--------+--------------------+----------+



                                                                                

+--------+--------------------+----------+
|sentence|         probability|prediction|
+--------+--------------------+----------+
|horrivel|[0.27990843001391...|  positive|
+--------+--------------------+----------+



# End

In [None]:
# Shut down in order: the streaming query first, then the session, and
# finally the context.
for handle in (lines, spark, context):
    handle.stop()