In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.pipeline import PipelineModel
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.types import StringType
from pyspark.sql.functions import lower, when, col, udf, split, lit, format_string
from dotenv import load_dotenv
import os
load_dotenv()

True

## Constants

In [2]:
# --- Files -------------------------------------------------------------------
# Every path can be overridden through an environment variable of the same name.
TRAINING_FILE = os.getenv("TRAINING_FILE", "dataset/dataset.csv")
PRETRAINED_MODEL_PATH = os.getenv("PRETRAINED_MODEL_PATH", "model/trained.model")
STOPWORDS_PATH = os.getenv("STOPWORDS_PATH", "dataset/stopwords.txt")

# --- Spark -------------------------------------------------------------------
SPARK_MASTER = os.getenv("SPARK_MASTER", "spark://gpu3.esw:7077")
SPARK_APP_NAME = "Final - PSPD - Predict"
# Kept as a string; meant for Spark's `spark.cores.max` setting.
SPARK_CORES_MAX = os.getenv("SPARK_CORES_MAX", "2")
# Maven coordinates handed to `spark.jars.packages` (Kafka source/sink connector).
PACKAGES = "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0"

# --- Kafka -------------------------------------------------------------------
KAFKA_SERVER = os.getenv("KAFKA_SERVER", "localhost:9092")
# Topic the streaming query subscribes to.
PREDICT_TOPIC = os.getenv("PREDICT_TOPIC", "election")
# Topic the aggregated per-candidate counts are written to.
STATS_TOPIC = os.getenv("STATS_TOPIC", "test-elasticsearch-sink")

# --- Streaming ---------------------------------------------------------------
# Processing-time trigger interval of the streaming query.
INTERVAL = os.getenv("INTERVAL", "10 seconds")

## Startup

In [3]:
# Build the Spark configuration from the constants defined above. The chain is
# wrapped in parentheses so no backslash continuations are needed.
conf = (
    SparkConf()
    .setMaster(SPARK_MASTER)
    .setAppName(SPARK_APP_NAME)
    # Extra packages (the Kafka connector) resolved when the context starts.
    .set("spark.jars.packages", PACKAGES)
    # Use the configurable SPARK_CORES_MAX (default "2") rather than a literal,
    # so the environment variable of the same name actually takes effect.
    .set("spark.cores.max", SPARK_CORES_MAX)
)

context = SparkContext(conf=conf)
# Keep the notebook output readable: only log errors.
context.setLogLevel("ERROR")



:: loading settings :: url = jar:file:/home/rcleydsonr/spark-3.2.2-bin-hadoop3.2/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/rcleydsonr/.ivy2/cache
The jars for the packages stored in: /home/rcleydsonr/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-605420fc-5cf7-4dad-a1f7-1cf27100cef8;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.0 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubating in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in

In [4]:
# Session used for the streaming read in the Sink section. No builder options
# are set here, so it presumably attaches to the SparkContext created in the
# Startup cell — TODO confirm.
spark = SparkSession.builder.getOrCreate()

## Cleaner

In [5]:
import re

# Character class of everything removed from a sentence before prediction:
# punctuation, brackets, digits and the opening curly quote.
# The hyphen is escaped so it is matched literally. Unescaped, `+-:` formed a
# range from '+' to ':'; every character in that range is already listed
# explicitly or covered by \d, so the set of matched characters is unchanged.
CLEAN_REGEX = r"[.,/\\\[\]\{\}`~^\d&!@#$%*\)\(\'\"<>=+\-:;?“]"

# Stop words, one per line, normalised to lower case. The encoding is given
# explicitly so accented words are decoded the same way on every platform
# (assumes the file is UTF-8 — confirm), and blank lines are skipped.
with open(STOPWORDS_PATH, "r", encoding="utf-8") as stop_file:
    stopwords = {
        word
        for word in (line.strip().lower() for line in stop_file)
        if word
    }

def cleaner(sentence):
    """Remove unwanted characters and stop words from a sentence.

    Characters matched by ``CLEAN_REGEX`` are deleted, the remainder is split
    on whitespace, and tokens found in ``stopwords`` are dropped. The stop-word
    comparison is case-sensitive and ``stopwords`` holds lower-case entries, so
    callers should lower-case the text first.

    Args:
        sentence: Text to clean, or ``None``. A Spark UDF passes ``None`` for
            SQL nulls, e.g. when a record had no message part.

    Returns:
        str: The surviving tokens joined by single spaces; ``""`` when
        ``sentence`` is ``None`` or nothing survives.
    """
    # Guard against nulls: re.sub would raise TypeError and fail the batch.
    if sentence is None:
        return ""
    tokens = re.sub(CLEAN_REGEX, '', sentence).split()
    return " ".join(token for token in tokens if token not in stopwords)

# Spark UDF applying `cleaner` to a column and returning a string column.
# `cleaner` is passed directly; the lambda wrapper only forwarded its argument.
cleaner_col = udf(cleaner, StringType())

## Load Pre-trained Model

In [6]:
# Load the saved pipeline model from PRETRAINED_MODEL_PATH; foreach_batch_func
# applies it to every micro-batch.
model = PipelineModel.load(PRETRAINED_MODEL_PATH)

                                                                                

## Prediction

In [7]:
def foreach_batch_func(df: DataFrame, _):
    """Score one micro-batch and publish the results.

    Each record's ``value`` is split at the first comma into a candidate and a
    message. The message is lower-cased, cleaned with ``cleaner_col`` and run
    through ``model``. The labelled rows are written to the console, and
    per-(candidate, prediction) counts are written as JSON strings to
    ``STATS_TOPIC`` on ``KAFKA_SERVER`` under the constant key ``'1'``.

    Records whose value has no comma (or is null) have no message part and are
    skipped.

    Args:
        df: Micro-batch DataFrame passed in by ``foreachBatch``; only its
            ``value`` column is read.
        _: Second argument passed by ``foreachBatch`` (unused).
    """
    # Preparations - split into candidate and message and clean.
    # The cast makes the string conversion explicit (the Kafka source is
    # documented to deliver `value` as binary); it is a no-op for strings.
    candidateMessage = split(df.value.cast("string"), ",", 2)
    message = candidateMessage.getItem(1)
    sentences = (
        df
        # Filter first so the Python UDF never receives a null message.
        .filter(message.isNotNull())
        .withColumn("candidate", candidateMessage.getItem(0))
        .withColumn("sentence", cleaner_col(lower(message)))
    )

    # Predict
    prediction = model.transform(sentences).select(
        "candidate",
        "sentence",
        "probability",
        when(col("prediction") == 1.0, "positive").otherwise("negative").alias("prediction"),
    )

    # `prediction` feeds two sinks. Cache it so the UDF and the model run once
    # per batch instead of once per sink, and always release the cache.
    prediction.persist()
    try:
        # Write in console
        prediction.write.format("console").save()

        # Prepare prediction to elasticsearch format
        # Group by candidate and prediction and format to json
        # NOTE(review): `candidate` is interpolated without JSON escaping; a
        # candidate containing '"' or '\' would produce invalid JSON.
        predictionElastic = (
            prediction
            .groupBy("candidate", "prediction")
            .count()
            .select(
                lit('1').alias("key"),
                format_string(
                    "{\"candidate\": \"%s\", \"%s\": %d}",
                    col("candidate"), col("prediction"), col("count")
                ).alias("value"),
            )
        )

        # Write to kafka elasticsearch topic
        (
            predictionElastic.write
            .format("kafka")
            .option("kafka.bootstrap.servers", KAFKA_SERVER)
            .option('topic', STATS_TOPIC)
            .save()
        )
    finally:
        prediction.unpersist()

## Sink

In [8]:
# Subscribe to PREDICT_TOPIC, hand every micro-batch to foreach_batch_func and
# start the streaming query. Despite its name, `lines` holds whatever .start()
# returns; it is the object stopped at the end of the notebook.
lines = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option("subscribe", PREDICT_TOPIC)
    .option("failOnDataLoss", "false")
    .load()
    .writeStream
    .foreachBatch(foreach_batch_func)
    .option("checkpointLocation", "/tmp/spark/mllib-predict")
    # One micro-batch per INTERVAL of processing time.
    .trigger(processingTime=INTERVAL)
    .start()
)

+---------+--------+-----------+----------+
|candidate|sentence|probability|prediction|
+---------+--------+-----------+----------+
+---------+--------+-----------+----------+



                                                                                

+---------+--------------------+--------------------+----------+
|candidate|            sentence|         probability|prediction|
+---------+--------------------+--------------------+----------+
|     Lula|rt blogdomiro tuí...|[0.05473215335437...|  positive|
|     Lula|rt carollinesarda...|[0.00176872172651...|  positive|
|     Lula|rt houston_souza ...|[0.04372798265905...|  positive|
|     Lula|felpweber pesquis...|[0.57049744512042...|  negative|
|     Lula|dcm_online eleiçõ...|[0.01699080700473...|  positive|
|     Lula|rt tradutordobr t...|[0.05473215335437...|  positive|
|     Lula|rt fernandoholida...|[0.05473215335437...|  positive|
|     Lula|rt skingotic reai...|[0.05473215335437...|  positive|
|     Lula|rt tigrinhapnd gu...|[0.05473215335437...|  positive|
|     Lula|obviamente cenári...|[0.08689507741586...|  positive|
|Bolsonaro|rt radiogenova cr...|[0.05473215335437...|  positive|
|Bolsonaro|rt jnascim sb “vi...|[0.89542213740066...|  negative|
|Bolsonaro|rt mfriasofici

                                                                                

+---------+--------------------+--------------------+----------+
|candidate|            sentence|         probability|prediction|
+---------+--------------------+--------------------+----------+
|     Lula|rt kimpaim eleito...|[0.00513322302445...|  positive|
|     Lula|rt terrabrasilnot...|[0.57200899456174...|  negative|
|     Lula|    andrectelles mto|[0.05473215335437...|  positive|
|     Lula|rt raimundodante ...|[0.13595631677014...|  positive|
|     Lula|ouvindo anestesis...|[0.02005886618672...|  positive|
|     Lula|rt oclannnn janei...|[0.03016229946684...|  positive|
|     Lula|rt jrguzzofatos g...|[0.05473215335437...|  positive|
|     Lula|racsouni clarisac...|[0.00919857986617...|  positive|
|     Lula|rt eixopolitico ?...|[0.01699080700473...|  positive|
|     Lula|rt jornalismojoao...|[0.01651828612116...|  positive|
|Bolsonaro|rt pedrohu taliri...|[0.05473215335437...|  positive|
|Bolsonaro|rt anneminas gent...|[0.02377783146400...|  positive|
|Bolsonaro|rt alexandreku

                                                                                

+---------+--------------------+--------------------+----------+
|candidate|            sentence|         probability|prediction|
+---------+--------------------+--------------------+----------+
|     Lula|         nhann livre|[0.02433642334546...|  positive|
|     Lula|rt vaibrasil vamo...|[0.50792903147364...|  negative|
|     Lula|rt linsaquiles je...|[0.05473215335437...|  positive|
|     Lula|demori enterrar e...|[0.00642662134076...|  positive|
|     Lula|rt pedroronchi vo...|[0.02156316250035...|  positive|
|     Lula|inaciocaotico lul...|[0.42967398700241...|  positive|
|     Lula|rt ____leh jairme...|[0.05473215335437...|  positive|
|     Lula|rt carvalho_a_hel...|[0.52954216306748...|  negative|
|     Lula|presidente turno ...|[0.20457842099022...|  positive|
|     Lula|rt jrguzzofatos g...|[0.05473215335437...|  positive|
|Bolsonaro|rt brom_elisa ima...|[0.01836784594453...|  positive|
|Bolsonaro|rt filipesabara r...|[0.04707871155792...|  positive|
|Bolsonaro|rt andreporci 

                                                                                

## End

In [None]:
# Stop the streaming query started in the Sink section.
lines.stop()

In [None]:
# Shut Spark down: stop the session, then the context created in the Startup section.
spark.stop()
context.stop()