In [31]:
from pyspark import SparkConf, SparkContext
from pyspark.ml.pipeline import PipelineModel
from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import (
    col,
    format_string,
    lit,
    lower,
    split,
    struct,
    to_json,
    udf,
    when,
)
from pyspark.sql.types import StringType

## Constants

In [2]:
# --- Endpoints -------------------------------------------------------------
SPARK_MASTER = "spark://localhost:5000"
KAFKA_SERVER = "localhost:9093"

# --- Job identity and micro-batch cadence ----------------------------------
SPARK_APP_NAME = "Final - PSPD - Predict"
INTERVAL = "10 seconds"  # used as the processingTime trigger of the stream

# --- Kafka topics ----------------------------------------------------------
PREDICT_TOPIC = "predict"                # topic the stream subscribes to
STATS_TOPIC = "test-elasticsearch-sink"  # topic the aggregated counts are written to

# --- Extra packages handed to spark.jars.packages --------------------------
PACKAGES = "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0"

# --- Local artifacts -------------------------------------------------------
MODEL_PATH = "model/trained.model"
STOPWORDS_PATH = "dataset/stopwords.txt"

## Startup

In [3]:
# Assemble the Spark configuration one setting at a time: master URL,
# application name and the extra packages to fetch at startup.
conf = SparkConf()
conf.setMaster(SPARK_MASTER)
conf.setAppName(SPARK_APP_NAME)
conf.set("spark.jars.packages", PACKAGES)

# Create the context from that configuration and silence everything below ERROR.
context = SparkContext(conf=conf)
context.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/home/thiago/.local/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/thiago/.ivy2/cache
The jars for the packages stored in: /home/thiago/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-8ddb0f41-291b-49ce-9a02-8f33792ba28c;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.0 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubating in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central

22/09/18 16:10:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Obtain the SparkSession used for all DataFrame / streaming reads below.
# No options are set on the builder here; presumably it attaches to the
# SparkContext created in the previous cell — TODO confirm.
spark = SparkSession.builder.getOrCreate()

## Cleaner

In [16]:
import re

# Character class of everything stripped from a sentence before stop-word
# filtering: punctuation, brackets, quotes, arithmetic/comparison symbols and
# digits.
# The hyphen is escaped so it is matched literally. Left unescaped, `+-:` is
# parsed as a character *range* from '+' to ':'; that range happened to cover
# only characters already listed here, so the matched set is unchanged, but
# reordering the class would have silently altered it.
CLEAN_REGEX = r"[.,/\\\[\]\{\}`~^\d&!@#$%*\)\(\'\"<>=+\-:;?]"

# Stop-word lookup set: one entry per line of the stop-word file, with
# surrounding whitespace removed and lower-cased.
stopwords = set()

with open(STOPWORDS_PATH, "r") as stop_file:
    stopwords.update(entry.strip().lower() for entry in stop_file)

def cleaner(sentence, stop_words=None, pattern=None):
    """Strip unwanted characters and stop words from a sentence.

    Every match of ``pattern`` is deleted from ``sentence``, the result is
    split on whitespace, tokens present in ``stop_words`` are dropped, and the
    remaining tokens are re-joined with single spaces.

    Args:
        sentence: Text to clean. ``None`` is accepted because the Spark
            column feeding this function is null when the incoming message
            contains no comma.
        stop_words: Collection of tokens to discard. Defaults to the
            notebook-level ``stopwords`` set.
        pattern: Regular expression whose matches are removed. Defaults to the
            notebook-level ``CLEAN_REGEX``.

    Returns:
        The cleaned sentence, or an empty string when ``sentence`` is ``None``
        or nothing survives the cleaning.
    """
    # A null input would make re.sub raise TypeError and fail the whole batch.
    if sentence is None:
        return ""

    if stop_words is None:
        stop_words = stopwords
    if pattern is None:
        pattern = CLEAN_REGEX

    tokens = re.sub(pattern, '', sentence).split()
    return " ".join(token for token in tokens if token not in stop_words)

# Column-level wrapper around `cleaner`, declared to return a string.
cleaner_col = udf(
    f=lambda s: cleaner(s),
    returnType=StringType(),
)

## Load Pre-trained Model

In [6]:
# Load the previously saved pipeline from MODEL_PATH; it is applied to every
# micro-batch via model.transform(...) in the prediction section.
model = PipelineModel.load(MODEL_PATH)

                                                                                

## Test

In [23]:
# Read the sample file as plain text: one row per line, in a single string
# column named `value`. Schema inference does not apply to the text source,
# so no `inferSchema` option is passed.
lines = spark.read.text("test.txt")

In [24]:
# Split each line on the first comma only (limit=2): the part before it is the
# candidate, everything after it — further commas included — is the sentence.
candidateMessage = split(lines["value"], ",", 2)

sentences = (
    lines
    .withColumn("candidate", candidateMessage.getItem(0))
    .withColumn("sentence", candidateMessage.getItem(1))
)

sentences.show()

+--------------------+---------+---------------+
|               value|candidate|       sentence|
+--------------------+---------+---------------+
|lula,pessimo,asdadsa|     lula|pessimo,asdadsa|
|  bolsonaro,horrivel|bolsonaro|       horrivel|
|          lula,otimo|     lula|          otimo|
+--------------------+---------+---------------+



## Prediction

In [36]:
def foreach_batch_func(df: DataFrame, _: int) -> None:
    """Classify one micro-batch of messages and publish per-candidate counts.

    Each record's ``value`` is expected to look like ``"<candidate>,<message>"``.
    The message is lower-cased, cleaned with ``cleaner_col`` and scored by the
    loaded ``model``. The scored rows are printed to the console, then counted
    per (candidate, prediction) and written as JSON to ``STATS_TOPIC``.

    Args:
        df: The micro-batch handed over by ``foreachBatch``.
        _: Second positional argument supplied by ``foreachBatch``; unused.
    """
    # Kafka delivers `value` as bytes; cast explicitly rather than relying on
    # implicit coercion inside split().
    raw_message = col("value").cast("string")

    # Split on the first comma only, so commas inside the message are kept.
    parts = split(raw_message, ",", 2)
    sentences = (
        df
        .withColumn("candidate", parts.getItem(0))
        .withColumn("sentence", cleaner_col(lower(parts.getItem(1))))
    )

    # Predict and map the numeric label to a readable one.
    prediction = model.transform(sentences).select(
        "candidate",
        "sentence",
        "probability",
        when(col("prediction") == 1.0, "positive").otherwise("negative").alias("prediction"),
    )

    # `prediction` feeds two independent writes below. Without caching, the
    # cleaning UDF and the model would be evaluated once per write.
    prediction.persist()
    try:
        # Write to console
        prediction.write.format("console").save()

        # Aggregate per candidate and label, then serialise as JSON for the
        # Elasticsearch sink. to_json escapes the candidate text, which comes
        # straight from the incoming message; hand-formatting the JSON would
        # produce an invalid document for a candidate containing `"` or `\`.
        # Only the field matching the row's label is non-null, and null fields
        # are omitted, giving e.g. {"candidate":"lula","positive":3}.
        counts = prediction.groupBy("candidate", "prediction").count()
        predictionElastic = counts.select(
            lit('1').alias("key"),
            to_json(
                struct(
                    col("candidate"),
                    when(col("prediction") == "positive", col("count")).alias("positive"),
                    when(col("prediction") == "negative", col("count")).alias("negative"),
                ),
                {"ignoreNullFields": "true"},
            ).alias("value"),
        )

        # Write to the Kafka topic consumed by the Elasticsearch sink
        (
            predictionElastic.write
            .format("kafka")
            .option("kafka.bootstrap.servers", KAFKA_SERVER)
            .option('topic', STATS_TOPIC)
            .save()
        )
    finally:
        # Release the cached batch even if one of the writes fails.
        prediction.unpersist()

## Sink

In [37]:
# Streaming source: subscribe to PREDICT_TOPIC on the Kafka broker.
kafka_source = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option("subscribe", PREDICT_TOPIC)
    .option("failOnDataLoss", "false")
    .load()
)

# Hand every micro-batch to foreach_batch_func, once per INTERVAL.
# Despite its name, `lines` holds the handle returned by start(); it is kept
# so the stream can be stopped in the final section.
lines = (
    kafka_source.writeStream
    .foreachBatch(foreach_batch_func)
    .option("checkpointLocation", "/tmp/spark/mllib-predict")
    .trigger(processingTime=INTERVAL)
    .start()
)

+---------+--------+--------------------+----------+
|candidate|sentence|         probability|prediction|
+---------+--------+--------------------+----------+
|     lula|  alegra|[0.05473215335437...|  positive|
+---------+--------+--------------------+----------+



                                                                                

+---------+----------------+--------------------+----------+
|candidate|        sentence|         probability|prediction|
+---------+----------------+--------------------+----------+
|bolsonaro|odeio filho puta|[0.99998649826756...|  negative|
+---------+----------------+--------------------+----------+



                                                                                

+---------+--------+--------------------+----------+
|candidate|sentence|         probability|prediction|
+---------+--------+--------------------+----------+
|     lula|      la|[0.05473215335437...|  positive|
|     lula|      la|[0.05473215335437...|  positive|
+---------+--------+--------------------+----------+



                                                                                

+---------+--------+--------------------+----------+
|candidate|sentence|         probability|prediction|
+---------+--------+--------------------+----------+
|     lula|      la|[0.05473215335437...|  positive|
+---------+--------+--------------------+----------+



                                                                                

+---------+----------------+--------------------+----------+
|candidate|        sentence|         probability|prediction|
+---------+----------------+--------------------+----------+
|     lula|odeio filho puta|[0.99998649826756...|  negative|
|bolsonaro|           fuder|[0.05473215335437...|  positive|
+---------+----------------+--------------------+----------+



                                                                                

+---------+--------+--------------------+----------+
|candidate|sentence|         probability|prediction|
+---------+--------+--------------------+----------+
|     lula|   fuder|[0.05473215335437...|  positive|
+---------+--------+--------------------+----------+



                                                                                

+---------+--------+--------------------+----------+
|candidate|sentence|         probability|prediction|
+---------+--------+--------------------+----------+
|bolsonaro|tomar cu|[0.99152905470935...|  negative|
|     lula|amo voce|[0.05473215335437...|  positive|
|bolsonaro|amo voce|[0.05473215335437...|  positive|
+---------+--------+--------------------+----------+



                                                                                

+---------+--------------------+--------------------+----------+
|candidate|            sentence|         probability|prediction|
+---------+--------------------+--------------------+----------+
|     lula|            tomar cu|[0.99152905470935...|  negative|
|bolsonaro|            tomar cu|[0.99152905470935...|  negative|
|bolsonaro|               feliz|[0.03131361789908...|  positive|
|     lula|               feliz|[0.03131361789908...|  positive|
|     lula|            tomar cu|[0.99152905470935...|  negative|
|bolsonaro|            tomar cu|[0.99152905470935...|  negative|
|bolsonaro|               feliz|[0.03131361789908...|  positive|
|     lula|               feliz|[0.03131361789908...|  positive|
|     lula|            tomar cu|[0.99152905470935...|  negative|
|bolsonaro|            tomar cu|[0.99152905470935...|  negative|
|bolsonaro|               feliz|[0.03131361789908...|  positive|
|     lula|               feliz|[0.03131361789908...|  positive|
|     lula|            to

                                                                                

# End

In [34]:
# Stop the streaming query started in the Sink section (`lines` is the handle
# returned by .start(), not a DataFrame).
lines.stop()

In [None]:
# Shut down the session, then the context created in the Startup section.
# NOTE(review): SparkSession.stop() is documented to stop the underlying
# SparkContext, so the second call is likely redundant — confirm before removing.
spark.stop()
context.stop()