In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.pipeline import PipelineModel
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.types import StringType
from pyspark.sql.functions import lower, when, col, udf, split, lit, format_string
from dotenv import load_dotenv
import os
load_dotenv()

True

## Constants

In [2]:
# --- Spark ------------------------------------------------------------------
SPARK_APP_NAME = "Final - PSPD - Predict"
SPARK_MASTER = os.environ.get("SPARK_MASTER", "spark://gpu3.esw:7077")
SPARK_CORES_MAX = os.environ.get("SPARK_CORES_MAX", "2")
# Maven coordinates handed to spark.jars.packages at startup.
PACKAGES = "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0"

# --- Kafka ------------------------------------------------------------------
KAFKA_SERVER = os.environ.get("KAFKA_SERVER", "localhost:9092")
# Topic the streaming query subscribes to.
PREDICT_TOPIC = os.environ.get("PREDICT_TOPIC", "election")
# Topic the aggregated per-candidate counts are written to.
STATS_TOPIC = os.environ.get("STATS_TOPIC", "test-elasticsearch-sink")

# --- Streaming --------------------------------------------------------------
# Passed to trigger(processingTime=...) when the query is started.
INTERVAL = os.environ.get("INTERVAL", "10 seconds")

# --- Files ------------------------------------------------------------------
# NOTE: TRAINING_FILE is not referenced by the cells visible in this notebook.
TRAINING_FILE = os.environ.get("TRAINING_FILE", "dataset/dataset.csv")
PRETRAINED_MODEL_PATH = os.environ.get("PRETRAINED_MODEL_PATH", "model/trained.model")
STOPWORDS_PATH = os.environ.get("STOPWORDS_PATH", "dataset/stopwords.txt")

## Startup

In [3]:
# Build the Spark configuration from the notebook constants.
conf = (
    SparkConf()
    .setMaster(SPARK_MASTER)
    .setAppName(SPARK_APP_NAME)
    # Extra packages resolved at startup (the Kafka connector coordinates).
    .set("spark.jars.packages", PACKAGES)
    # Use the configurable SPARK_CORES_MAX (default "2") instead of a
    # hard-coded literal, so the environment variable actually takes effect.
    .set("spark.cores.max", SPARK_CORES_MAX)
)

context = SparkContext(conf=conf)
# Keep the notebook output readable: only errors are logged.
context.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/home/thiago/.local/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/thiago/.ivy2/cache
The jars for the packages stored in: /home/thiago/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-911440c7-d108-4004-8b1a-777d05ec1389;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.0 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubating in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central

22/09/18 22:36:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Session handle used further down to open the streaming read.
spark = SparkSession.builder.getOrCreate()

## Cleaner

In [5]:
import re

# Character class of everything deleted from a sentence before prediction:
# punctuation, brackets, digits and the left curly quote.
# The hyphen is escaped so it is a literal '-' instead of silently forming the
# range '+'..':'; the set of matched characters is unchanged, because every
# character of that range is also listed explicitly (or covered by \d).
# NOTE(review): the right curly quote is not in the class — confirm whether
# that is intentional before changing it.
CLEAN_REGEX = r"[.,/\\\[\]\{\}`~^\d&!@#$%*\)\(\'\"<>=+\-:;?“]"

# Load the stopword list (one entry per line) as a lower-cased set.
# The encoding is pinned so non-ASCII entries decode the same way on every
# platform instead of depending on the locale default.
# NOTE(review): assumes the stopword file is UTF-8 — confirm.
with open(STOPWORDS_PATH, "r", encoding="utf-8") as stop_file:
    # Blank lines are skipped; they could never match a token anyway.
    stopwords = {line.strip().lower() for line in stop_file if line.strip()}

def cleaner(sentence, stop_words=None, pattern=None):
    """Strip unwanted characters and stopwords from a sentence.

    Args:
        sentence: Text to clean. ``None`` (e.g. a null value coming from the
            upstream column) is treated as empty text instead of raising.
        stop_words: Optional collection of tokens to drop. Defaults to the
            module-level ``stopwords`` set.
        pattern: Optional regular expression whose matches are deleted.
            Defaults to the module-level ``CLEAN_REGEX``.

    Returns:
        The remaining whitespace-separated tokens joined by single spaces;
        an empty string when nothing is left.
    """
    if sentence is None:
        return ""
    # Resolve the defaults at call time so the module-level values are used
    # unless the caller overrides them.
    if stop_words is None:
        stop_words = stopwords
    if pattern is None:
        pattern = CLEAN_REGEX

    tokens = re.sub(pattern, '', sentence).split()
    return " ".join(token for token in tokens if token not in stop_words)

# Column-level wrapper around cleaner(); the resulting column is a string.
cleaner_col = udf(
    f=lambda text: cleaner(text),
    returnType=StringType(),
)

## Load Pre-trained Model

In [6]:
# Load the saved pipeline from PRETRAINED_MODEL_PATH; it is applied to every
# micro-batch in the prediction step below.
model = PipelineModel.load(PRETRAINED_MODEL_PATH)

                                                                                

## Prediction

In [7]:
def foreach_batch_func(df: DataFrame, _):
    """Classify one micro-batch and publish per-candidate prediction counts.

    The ``value`` column is split on its first comma into a candidate and a
    message. The message is lower-cased and cleaned, scored with the loaded
    pipeline, printed to the console, and then aggregated into one JSON
    string per (candidate, prediction) pair that is written to the
    ``STATS_TOPIC`` Kafka topic.

    Args:
        df: Micro-batch DataFrame passed in by ``foreachBatch``; it must
            contain a ``value`` column.
        _: Second argument passed by ``foreachBatch`` (unused).
    """
    # Preparations - split into candidate and message and clean.
    # The Kafka source exposes ``value`` as binary, so cast it explicitly
    # instead of relying on an implicit conversion inside split().
    raw_value = col("value").cast("string")
    # limit=2: only the first comma separates the fields, so commas inside
    # the message are preserved.
    parts = split(raw_value, ",", 2)
    sentences = (
        df
        .withColumn("candidate", parts.getItem(0))
        .withColumn("sentence", cleaner_col(lower(parts.getItem(1))))
    )

    # Predict: map the numeric label to a readable one (1.0 -> "positive").
    prediction = model.transform(sentences).select(
        "candidate",
        "sentence",
        "probability",
        when(col("prediction") == 1.0, "positive").otherwise("negative").alias("prediction"),
    )

    # The scored batch feeds two separate writes; persist it so the second
    # write can reuse what was already computed, and always release it.
    prediction.persist()
    try:
        # Write in console
        prediction.write.format("console").save()

        # Prepare prediction to elasticsearch format:
        # group by candidate and prediction and format to json.
        # Every record carries the same constant key '1'.
        # NOTE(review): the candidate is interpolated without JSON escaping;
        # a candidate containing a double quote would yield invalid JSON.
        stats = (
            prediction
            .groupBy("candidate", "prediction")
            .count()
            .select(
                lit('1').alias("key"),
                format_string(
                    "{\"candidate\": \"%s\", \"%s\": %d}",
                    col("candidate"), col("prediction"), col("count")
                ).alias("value"),
            )
        )

        # Write to kafka elasticsearch topic
        (
            stats.write
            .format("kafka")
            .option("kafka.bootstrap.servers", KAFKA_SERVER)
            .option('topic', STATS_TOPIC)
            .save()
        )
    finally:
        prediction.unpersist()

## Sink

In [8]:
# Wire the pipeline end to end: read the prediction topic from Kafka and hand
# every micro-batch to foreach_batch_func. Despite its name, `lines` holds the
# handle returned by start(), which the final cells use to stop the stream.
lines = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": KAFKA_SERVER,
        "subscribe": PREDICT_TOPIC,
        "failOnDataLoss": "false",
    })
    .load()
    .writeStream
    .foreachBatch(foreach_batch_func)
    .option("checkpointLocation", "/tmp/spark/mllib-predict")
    # One micro-batch per INTERVAL.
    .trigger(processingTime=INTERVAL)
    .start()
)

+---------+--------+-----------+----------+
|candidate|sentence|probability|prediction|
+---------+--------+-----------+----------+
+---------+--------+-----------+----------+



                                                                                

+---------+--------------------+--------------------+----------+
|candidate|            sentence|         probability|prediction|
+---------+--------------------+--------------------+----------+
|     Lula|rt centraleleicoe...|[0.05473215335437...|  positive|
|     Lula|rt silviogrimaldo...|[0.01699080700473...|  positive|
|     Lula|voto voto voto co...|[0.05473215335437...|  positive|
|     Lula|andrejanonesadv m...|[0.99999846087890...|  negative|
|     Lula|rt mary_brunna de...|[0.05473215335437...|  positive|
|     Lula|rt nelly pau ferr...|[0.74190901942733...|  negative|
|     Lula|rt acheiteresa te...|[0.88172004002475...|  negative|
|     Lula|rt monederojc eeu...|[0.05473215335437...|  positive|
|     Lula|rt thiagoresiste ...|[0.01699080700473...|  positive|
|     Lula|thiagoresiste lul...|[0.05473215335437...|  positive|
|Bolsonaro|rt silviogrimaldo...|[0.01699080700473...|  positive|
|Bolsonaro|rt dalcolgiovane ...|[0.64888942855328...|  negative|
|Bolsonaro|rt filipesabar

                                                                                

+---------+--------------------+--------------------+----------+
|candidate|            sentence|         probability|prediction|
+---------+--------------------+--------------------+----------+
|     Lula|rt arquivoxandy m...|[0.01022091803771...|  positive|
|     Lula|rt pedroronchi tr...|[0.05473215335437...|  positive|
|     Lula|rt diocla governa...|[0.99844748193180...|  negative|
|     Lula|anaaraphaela nils...|[0.36659381609298...|  positive|
|     Lula|rt joaquimmonstra...|[0.24767692088327...|  positive|
|     Lula|marelhos cher_gue...|[0.28786085655245...|  positive|
|     Lula|rt geraldoalckmin...|[0.37168065933096...|  positive|
|     Lula|vestia olhem cami...|[4.59582117257998...|  positive|
|     Lula|andreaad monark l...|[0.96631028672029...|  negative|
|     Lula|pedroronchi salár...|[0.04013510271677...|  positive|
+---------+--------------------+--------------------+----------+



                                                                                

+---------+--------------------+--------------------+----------+
|candidate|            sentence|         probability|prediction|
+---------+--------------------+--------------------+----------+
|Bolsonaro|rt ale_pavanelli ...|[0.98326493015951...|  negative|
|Bolsonaro|rt medoedeliriobr...|[0.05473215335437...|  positive|
|Bolsonaro|rt benitorperez ⁦...|[0.89542213740066...|  negative|
|Bolsonaro|rt jpdoficial ree...|[0.04274809677888...|  positive|
|Bolsonaro|lulaverso cabra v...|[0.05473215335437...|  positive|
|Bolsonaro|rt thiagoresiste ...|[0.19203629038244...|  positive|
|Bolsonaro|rt leandroruschel...|[0.05473215335437...|  positive|
|Bolsonaro|rt pedroronchi le...|[0.81417785731452...|  negative|
|Bolsonaro|rt lira president...|[0.37423303445666...|  positive|
|Bolsonaro|rt carlosjordy ca...|[0.01699080700473...|  positive|
+---------+--------------------+--------------------+----------+



                                                                                

+---------+--------------------+--------------------+----------+
|candidate|            sentence|         probability|prediction|
+---------+--------------------+--------------------+----------+
|     Lula|rt kimpaim eleito...|[0.00513322302445...|  positive|
|     Lula|rt pedroronchi en...|[0.00398240512232...|  positive|
|     Lula|rt kimpaim eleito...|[0.00513322302445...|  positive|
|     Lula|jairmearrependi p...|[0.75124716926032...|  negative|
|     Lula|rt umdedodearte f...|[0.05473215335437...|  positive|
|     Lula|rt eliasjabbour i...|[0.01699080700473...|  positive|
|     Lula|rt kimpaim eleito...|[0.00513322302445...|  positive|
|     Lula|rt celle_a_celle ...|[0.12031581956437...|  positive|
|     Lula|costajr lulaofici...|[0.34420151030417...|  positive|
|     Lula|rt josivamalves l...|[0.36659381609298...|  positive|
+---------+--------------------+--------------------+----------+



                                                                                

+------------+--------------------+--------------------+----------+
|   candidate|            sentence|         probability|prediction|
+------------+--------------------+--------------------+----------+
|   Bolsonaro|rt netinhoespinha...|[0.08049147681987...|  positive|
|   Bolsonaro|rt dilmaresiste w...|[0.00943102952331...|  positive|
|   Bolsonaro|rt athenasmgf mer...|[0.11383431800661...|  positive|
|   Bolsonaro|rt flvialeo belís...|[0.03922312990939...|  positive|
|   Bolsonaro|rt rafaelbboa err...|[0.05473215335437...|  positive|
|   Bolsonaro|rt senadorhumbert...|[0.71045025666700...|  negative|
|   Bolsonaro|rt bernilton paul...|[2.41930989551548...|  positive|
|   Bolsonaro|rt pedroronchi en...|[0.00398240512232...|  positive|
|   Bolsonaro|rt jamilchade exc...|[0.05473215335437...|  positive|
|   Bolsonaro|rt profpaulamaris...|[0.05473215335437...|  positive|
|Simone Tebet|rt henriolliveira...|[0.41434642448243...|  positive|
|Simone Tebet|rt henriolliveira...|[0.4143464244

                                                                                

+---------+--------------------+--------------------+----------+
|candidate|            sentence|         probability|prediction|
+---------+--------------------+--------------------+----------+
|     Lula|rt monark ganhara...|[0.30803668091843...|  positive|
|     Lula|rt kimpaim tamanh...|[0.01544559309229...|  positive|
|     Lula|rt burlamaquip gl...|[0.05473215335437...|  positive|
|     Lula|acabou postar víd...|[0.98866585003161...|  negative|
|     Lula|rt folha janones ...|[0.00513322302445...|  positive|
|     Lula|rt heldersalomao ...|[0.01699080700473...|  positive|
|     Lula|luciolacolares lu...|[0.05473215335437...|  positive|
|     Lula|rt uolnoticias da...|[0.05473215335437...|  positive|
|     Lula|rt viccommie bent...|[0.05293129498728...|  positive|
|     Lula|rt damadeferroofi...|[0.05473215335437...|  positive|
|Bolsonaro|lcpj centraleleic...|[0.84642205838004...|  negative|
|Bolsonaro|monark danilogent...|[0.01441770223105...|  positive|
|Bolsonaro|rt augustonpis

                                                                                

+---------+--------------------+--------------------+----------+
|candidate|            sentence|         probability|prediction|
+---------+--------------------+--------------------+----------+
|     Lula|rt kimpaim tamanh...|[0.01544559309229...|  positive|
|     Lula|rt _janoninho bra...|[0.99303356470823...|  negative|
|     Lula|rt uylibertarios ...|[0.05473215335437...|  positive|
|     Lula|rt brianmtelesur ...|[0.09506243406881...|  positive|
|     Lula|q voltar pra gent...|[0.66470525795489...|  negative|
|     Lula|rt acheiteresa te...|[0.88172004002475...|  negative|
|     Lula|rt kimpaim eleito...|[0.00513322302445...|  positive|
|     Lula|rt joaogoulartjoa...|[0.05473215335437...|  positive|
|     Lula|rt renato_rovai f...|[0.05473215335437...|  positive|
|     Lula|rt burlamaquip gl...|[0.05473215335437...|  positive|
|Bolsonaro|rt erikamo impren...|[0.05473215335437...|  positive|
|Bolsonaro|rt augustonpistol...|[0.01699080700473...|  positive|
|Bolsonaro|glauber_braga 

                                                                                

+---------+--------------------+--------------------+----------+
|candidate|            sentence|         probability|prediction|
+---------+--------------------+--------------------+----------+
|     Lula|rt marcianobrito ...|[4.36863700934153...|  positive|
|     Lula|rt elaineg bolson...|[0.05473215335437...|  positive|
|     Lula|rt burlamaquip gl...|[0.05473215335437...|  positive|
|     Lula|rt paulolo hj rin...|[0.96593855111309...|  negative|
|     Lula|rt jornalismojoao...|[0.01651828612116...|  positive|
|     Lula|revistaoeste inst...|[0.96631028672029...|  negative|
|     Lula|compreensível des...|[0.97687882291981...|  negative|
|     Lula|rt centraleleicoe...|[0.05473215335437...|  positive|
|     Lula|rt arnoldonunes n...|[0.05371745543959...|  positive|
|     Lula|rt monark ganhara...|[0.30803668091843...|  positive|
|Bolsonaro|adoro pisa nordes...|[0.05473215335437...|  positive|
|Bolsonaro|rt alexandrekunz ...|[0.01784463538045...|  positive|
|Bolsonaro|rt laderechadi

                                                                                

# End

In [9]:
# Stop the streaming query started in the Sink cell.
lines.stop()

In [10]:
# Release Spark resources: stop the session, then the context created at startup.
spark.stop()
context.stop()