In [1]:
import os
import socket
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, length, when, col
from pyspark.sql.types import BooleanType, IntegerType, LongType, StringType, ArrayType, FloatType, StructType, StructField, DoubleType, TimestampType
from pyspark.sql.functions import *
from pyspark.sql.functions import pandas_udf
from pyspark.sql.functions import PandasUDFType
from jinja2 import Environment, FileSystemLoader

In [2]:
# setting constants
APP_NAME = "VVORONIN-SPARK-APP_receive2"
# '/' and ':' are swapped for '_'; the normalized name is used in the file names below.
NORMALIZED_APP_NAME = APP_NAME.translate(str.maketrans("/:", "__"))

# Working directories, all resolved against the current working directory.
_WORK_DIR = os.getcwd()
APPS_TMP_DIR = os.path.join(_WORK_DIR, "tmp")
APPS_CONF_DIR = os.path.join(_WORK_DIR, "conf")
APPS_LOGS_DIR = os.path.join(_WORK_DIR, "logs")

# Per-application log4j properties file and log file.
LOG4J_PROP_FILE = os.path.join(APPS_CONF_DIR, f"pyspark-log4j-{NORMALIZED_APP_NAME}.properties")
LOG_FILE = os.path.join(APPS_LOGS_DIR, f"pyspark-{NORMALIZED_APP_NAME}.log")

# JVM system properties, joined with single spaces into one option string.
EXTRA_JAVA_OPTIONS = " ".join([
    f"-Dlog4j.configuration=file://{LOG4J_PROP_FILE}",
    "-Dspark.hadoop.dfs.replication=1",
    "-Dhttps.protocols=TLSv1.0,TLSv1.1,TLSv1.2,TLSv1.3",
])

# IPv4 address that this machine's own hostname resolves to.
LOCAL_IP = socket.gethostbyname(socket.gethostname())

In [3]:
# preparing configuration files from templates
# exist_ok=True creates each directory only when it is missing, without a
# separate existence check that could race with another process.
for directory in (APPS_CONF_DIR, APPS_LOGS_DIR, APPS_TMP_DIR):
    os.makedirs(directory, exist_ok=True)

# Render the log4j template found under /opt, passing the log file path as
# `logfile`, and write the result to LOG4J_PROP_FILE.
env = Environment(loader=FileSystemLoader('/opt'))
template = env.get_template("pyspark_log4j.properties.template")
template.stream(logfile=LOG_FILE).dump(LOG4J_PROP_FILE)

In [4]:
SPARK_ADDRESS = 'local[4]'

# Spark properties for this session; they are applied in the order listed.
# NOTE(review): the master is local[4], so the executor and spark.kubernetes.*
# properties presumably have no effect in this mode - confirm before relying on them.
_SPARK_PROPERTIES = {
    "spark.driver.host": LOCAL_IP,
    "spark.driver.bindAddress": "0.0.0.0",
    "spark.executor.instances": "2",
    "spark.executor.cores": "3",
    "spark.memory.fraction": "0.8",
    "spark.memory.storageFraction": "0.6",
    "spark.executor.memory": "3g",
    "spark.driver.memory": "3g",
    "spark.driver.maxResultSize": "1g",
    "spark.kubernetes.memoryOverheadFactor": "0.3",
    "spark.driver.extraJavaOptions": EXTRA_JAVA_OPTIONS,
    "spark.kubernetes.namespace": "vvoronin-306285",
    "spark.kubernetes.driver.label.appname": APP_NAME,
    "spark.kubernetes.executor.label.appname": APP_NAME,
    "spark.kubernetes.container.image": "node03.st:5000/spark-executor:vvoronin-306285",
    "spark.sql.streaming.checkpointLocation": "hdfs:///home/vvoronin-306285",
    "spark.local.dir": "/tmp/spark",
    "spark.driver.extraClassPath": "/home/jovyan/shared-data/my-project-name-jar-with-dependencies.jar",
    "spark.executor.extraClassPath": "/home/jovyan/shared-data/my-project-name-jar-with-dependencies.jar",
    "spark.kubernetes.executor.volumes.emptyDir.spark-local-dir-tmp-spark.mount.path": "/tmp/spark",
    "spark.kubernetes.executor.volumes.emptyDir.spark-local-dir-tmp-spark.mount.readOnly": "false",
    "spark.kubernetes.executor.volumes.hostPath.depdir.mount.path": "/home/jovyan/shared-data",
    "spark.kubernetes.executor.volumes.hostPath.depdir.options.path": "/nfs/shared",
    "spark.kubernetes.executor.volumes.hostPath.depdir.options.type": "Directory",
    "spark.kubernetes.executor.volumes.hostPath.depdir.mount.readOnly": "true",
}

# run spark
_session_builder = SparkSession.builder.appName(APP_NAME).master(SPARK_ADDRESS)
for _property, _setting in _SPARK_PROPERTIES.items():
    _session_builder = _session_builder.config(_property, _setting)
spark = _session_builder.getOrCreate()

In [21]:
from kafka import KafkaConsumer, KafkaProducer
import json
import random

['M', 0, 18, '1 hour'],
 ['M', 0, 18, '1 day'],
 ['M', 0, 18, '1 week'],
 ['M', 18, 27, '1 hour'],
 ['M', 18, 27, '1 day'],
 ['M', 18, 27, '1 week'],
 ['M', 27, 40, '1 hour'],
 ['M', 27, 40, '1 day'],
 ['M', 27, 40, '1 week'],
 ['M', 40, 60, '1 hour'],
 ['M', 40, 60, '1 day'],
 ['M', 40, 60, '1 week'],
 ['M', 60, 130, '1 hour'],
 ['M', 60, 130, '1 day'],
 ['M', 60, 130, '1 week'],
 ['F', 0, 18, '1 hour'],
 ['F', 0, 18, '1 day'],
 ['F', 0, 18, '1 week'],
 ['F', 18, 27, '1 hour'],
 ['F', 18, 27, '1 day'],
 ['F', 18, 27, '1 week'],
 ['F', 27, 40, '1 hour'],
 ['F', 27, 40, '1 day'],
 ['F', 27, 40, '1 week'],
 ['F', 40, 60, '1 hour'],
 ['F', 40, 60, '1 day'],
 ['F', 40, 60, '1 week'],
 ['F', 60, 130, '1 hour'],
 ['F', 60, 130, '1 day'],
 ['F', 60, 130, '1 week']]

In [24]:
def favorite_stream(name = "F_28-39", period = '1 day', bootstrap_servers = "kafka-svc:9092"):
    """Stream per-window token counts for one Kafka topic and print them.

    Reads JSON messages from the Kafka topic ``name``, splits each ``text`` on
    single spaces, and sums the token counts per ``age_group`` and per window
    of length ``period`` over ``date``. Each updated aggregate is written as
    JSON to a second Kafka topic named ``name`` followed by a random integer
    in [0, 99]. The function then consumes that topic and prints every decoded
    message value.

    This call does not return on its own: it keeps printing until it is
    interrupted (for example with a kernel interrupt). On the way out the
    consumer is closed and the streaming query is stopped, so neither keeps
    running in the background.

    Args:
        name: Kafka topic to read the source messages from.
        period: Window duration string, used as both the window length and
            the slide interval.
        bootstrap_servers: Kafka bootstrap address used for the source, the
            sink and the printing consumer.
    """
    # Layout of the JSON payload carried in each Kafka message value.
    schema = StructType([
        StructField("user_id", DoubleType()),
        StructField("date", TimestampType()),
        StructField("text", StringType()),
        StructField("sex", StringType()),
        StructField("age", IntegerType()),
        StructField("age_group", StringType())])

    stream_df1 = spark.readStream \
                .format("kafka")\
                .option("kafka.bootstrap.servers", bootstrap_servers)\
                .option("subscribe", name)\
                .load()

    # Kafka delivers key/value as binary: cast to strings, then parse the JSON value.
    stream_df1 = stream_df1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    stream_df1 = stream_df1.select('value', from_json("value", schema).alias("value_struct"))
    stream_df1 = stream_df1.select("value",
        "value_struct.sex",
        'value_struct.age_group',
        "value_struct.date",
        "value_struct.text")

    # Random suffix gives each call its own output topic.
    # NOTE(review): only 100 possible suffixes, so two calls can collide.
    subtopic_name = name + str(int(random.random() * 100))

    # Keep the query handle so the background query can be stopped on exit.
    query = stream_df1.withColumn("tokens", split(col('text'), ' '))\
                .select("sex", "date", "age_group", 'text', 'tokens')\
                .groupBy("age_group", window('date', period, period))\
                .agg(sum(size("tokens")).alias('count'))\
                .selectExpr('to_json(struct(*)) as value')\
                .writeStream\
                .outputMode("update")\
                .format("kafka") \
                .option("kafka.bootstrap.servers", bootstrap_servers)\
                .option("topic", subtopic_name)\
                .start()
    print('start')

    consumer = None
    try:
        consumer = KafkaConsumer(bootstrap_servers=bootstrap_servers, value_deserializer=lambda value: json.loads(value))
        consumer.subscribe([subtopic_name])
        # No consumer timeout is configured, so this loop runs until interrupted.
        for message in consumer:
            print(message.value)
    finally:
        # Runs on KeyboardInterrupt too: release the consumer and stop the query.
        if consumer is not None:
            consumer.close()
        query.stop()

In [13]:
# Run the aggregation for topic 'F_28-39' with 1-day windows. This call loops
# forever printing results; interrupt the kernel to stop it.
favorite_stream('F_28-39', '1 day')

{'age_group': '28-39', 'window': {'start': '2019-04-04T00:00:00.000Z', 'end': '2019-04-05T00:00:00.000Z'}, 'count': 4}
{'age_group': '28-39', 'window': {'start': '2019-10-21T00:00:00.000Z', 'end': '2019-10-22T00:00:00.000Z'}, 'count': 148}
{'age_group': '28-39', 'window': {'start': '2019-05-28T00:00:00.000Z', 'end': '2019-05-29T00:00:00.000Z'}, 'count': 3}
{'age_group': '28-39', 'window': {'start': '2019-10-17T00:00:00.000Z', 'end': '2019-10-18T00:00:00.000Z'}, 'count': 4}
{'age_group': '28-39', 'window': {'start': '2019-09-02T00:00:00.000Z', 'end': '2019-09-03T00:00:00.000Z'}, 'count': 1}
{'age_group': '28-39', 'window': {'start': '2019-08-31T00:00:00.000Z', 'end': '2019-09-01T00:00:00.000Z'}, 'count': 2}
{'age_group': '28-39', 'window': {'start': '2019-04-14T00:00:00.000Z', 'end': '2019-04-15T00:00:00.000Z'}, 'count': 3}
{'age_group': '28-39', 'window': {'start': '2019-03-30T00:00:00.000Z', 'end': '2019-03-31T00:00:00.000Z'}, 'count': 6}
{'age_group': '28-39', 'window': {'start': '20

KeyboardInterrupt: 

# Draft 
Источник вдохновения

In [8]:
import pyspark.sql.functions as F

In [9]:
schema = StructType([
    # One StructField per (name, type) pair, in payload order.
    StructField(field_name, field_type)
    for field_name, field_type in (
        ("user_id", DoubleType()),
        ("date", TimestampType()),
        ("text", StringType()),
        ("sex", StringType()),
        ("age", IntegerType()),
        ("age_group", StringType()),
    )
])

In [10]:
name = "F_28-39"

# Single pipeline: subscribe to the Kafka topic, cast the binary key/value to
# strings, parse the JSON value with `schema`, and keep the raw value plus the
# sex, age_group, date and text fields.
stream_df1 = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka-svc:9092")
    .option("subscribe", name)
    .load()
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .select('value', from_json("value", schema).alias("value_struct"))
    .select(
        "value",
        "value_struct.sex",
        'value_struct.age_group',
        "value_struct.date",
        "value_struct.text",
    )
)

In [12]:
period = '1 day'

# Sum the token counts per age_group and per window of length `period`, and
# write each updated aggregate as JSON to the Kafka topic 'hot'.
# The query handle is bound to a name so the background query can later be
# inspected or stopped with hot_query.stop().
hot_query = (
    stream_df1.withColumn("tokens", split(col('text'), ' '))
    .select("sex", "date", "age_group", 'text', 'tokens')
    .groupBy("age_group", window('date', period, period))
    .agg(sum(size("tokens")).alias('count'))
    .selectExpr('to_json(struct(*)) as value')
    .writeStream
    .outputMode("update")
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka-svc:9092")
    .option("topic", 'hot')
    .start()
)
# Last expression of the cell, so the query object is still shown as its output.
hot_query

<pyspark.sql.streaming.StreamingQuery at 0x7f3f28e01c70>

In [13]:
# Print every JSON-decoded message value from the 'hot' topic.
consumer_f27 = KafkaConsumer(bootstrap_servers="kafka-svc:9092", value_deserializer=lambda value: json.loads(value))
try:
    consumer_f27.subscribe(['hot'])
    # No consumer timeout is configured, so this loop runs until the kernel
    # is interrupted; an outer `while True` is not needed.
    for message in consumer_f27:
        print(message.value)
finally:
    # Runs on KeyboardInterrupt too, so the consumer is always closed.
    consumer_f27.close()

{'age_group': '28-39', 'window': {'start': '2019-04-04T00:00:00.000Z', 'end': '2019-04-05T00:00:00.000Z'}, 'count': 4}
{'age_group': '28-39', 'window': {'start': '2019-10-21T00:00:00.000Z', 'end': '2019-10-22T00:00:00.000Z'}, 'count': 148}
{'age_group': '28-39', 'window': {'start': '2019-05-28T00:00:00.000Z', 'end': '2019-05-29T00:00:00.000Z'}, 'count': 3}
{'age_group': '28-39', 'window': {'start': '2019-10-17T00:00:00.000Z', 'end': '2019-10-18T00:00:00.000Z'}, 'count': 4}
{'age_group': '28-39', 'window': {'start': '2019-09-02T00:00:00.000Z', 'end': '2019-09-03T00:00:00.000Z'}, 'count': 1}
{'age_group': '28-39', 'window': {'start': '2019-08-31T00:00:00.000Z', 'end': '2019-09-01T00:00:00.000Z'}, 'count': 2}
{'age_group': '28-39', 'window': {'start': '2019-04-14T00:00:00.000Z', 'end': '2019-04-15T00:00:00.000Z'}, 'count': 3}
{'age_group': '28-39', 'window': {'start': '2019-03-30T00:00:00.000Z', 'end': '2019-03-31T00:00:00.000Z'}, 'count': 6}
{'age_group': '28-39', 'window': {'start': '20

KeyboardInterrupt: 

from nltk.tokenize import word_tokenize

# Consumer for the 'main' topic; each message value is decoded from JSON.
consumer = KafkaConsumer(bootstrap_servers="kafka-svc:9092", value_deserializer=json.loads)
consumer.subscribe('main')

# Every topic name a message has been forwarded to so far.
topic_set = set()

# NOTE(review): stop_words_ru, get_topic_name and producer are not defined in
# the visible part of this notebook - confirm they exist before running.
for message in consumer:
    transaction: dict = message.value
    # Replace the text with its tokens, dropping those whose lowercase form is a stop word.
    transaction['text'] = [
        token
        for token in word_tokenize(transaction['text'])
        if token.lower() not in stop_words_ru
    ]
    topic = get_topic_name(transaction)
    topic_set.add(topic)
    producer.send(topic, value=transaction)