In [1]:
import os
import socket
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, length, when, col
from pyspark.sql.types import BooleanType, IntegerType, LongType, StringType, ArrayType, FloatType, StructType, StructField, DoubleType, TimestampType
from pyspark.sql.functions import *
from pyspark.sql.functions import pandas_udf
from pyspark.sql.functions import PandasUDFType
from jinja2 import Environment, FileSystemLoader

In [2]:
# Notebook-wide constants: application identity, working directories,
# log4j file locations and the JVM options handed to the Spark driver.
APP_NAME = "VVORONIN-SPARK-APP_receive"
# Application name with '/' and ':' replaced by '_' so it can be embedded in file names.
NORMALIZED_APP_NAME = APP_NAME.replace('/', '_').replace(':', '_')

# Working directories live under the current working directory.
APPS_TMP_DIR, APPS_CONF_DIR, APPS_LOGS_DIR = (
    os.path.join(os.getcwd(), subdir) for subdir in ("tmp", "conf", "logs")
)
LOG4J_PROP_FILE = os.path.join(APPS_CONF_DIR, f"pyspark-log4j-{NORMALIZED_APP_NAME}.properties")
LOG_FILE = os.path.join(APPS_LOGS_DIR, f"pyspark-{NORMALIZED_APP_NAME}.log")
# JVM system properties: log4j configuration file, HDFS replication factor, allowed TLS versions.
EXTRA_JAVA_OPTIONS = (
    f"-Dlog4j.configuration=file://{LOG4J_PROP_FILE} "
    "-Dspark.hadoop.dfs.replication=1 "
    "-Dhttps.protocols=TLSv1.0,TLSv1.1,TLSv1.2,TLSv1.3"
)

# IPv4 address that the local host name resolves to.
LOCAL_IP = socket.gethostbyname(socket.gethostname())

In [3]:
# Prepare the working directories and render the log4j properties file from its template.
# exist_ok=True makes creation idempotent and removes the check-then-create race
# that a separate os.path.exists() test leaves open.
for directory in [APPS_CONF_DIR, APPS_LOGS_DIR, APPS_TMP_DIR]:
    os.makedirs(directory, exist_ok=True)

# The template is looked up under /opt and rendered with `logfile` set to LOG_FILE;
# the result is streamed straight into LOG4J_PROP_FILE.
env = Environment(loader=FileSystemLoader('/opt'))
template = env.get_template("pyspark_log4j.properties.template")
template.stream(logfile=LOG_FILE).dump(LOG4J_PROP_FILE)

In [4]:
SPARK_ADDRESS = 'local[4]'

# Every setting passed to the session builder, keyed by Spark property name.
spark_settings = {
    # driver networking
    "spark.driver.host": LOCAL_IP,
    "spark.driver.bindAddress": "0.0.0.0",
    # executors and memory
    "spark.executor.instances": "2",
    "spark.executor.cores": '3',
    "spark.memory.fraction": "0.8",
    "spark.memory.storageFraction": "0.6",
    "spark.executor.memory": '3g',
    "spark.driver.memory": "3g",
    "spark.driver.maxResultSize": "1g",
    "spark.kubernetes.memoryOverheadFactor": "0.3",
    # JVM options built in the constants cell (log4j file, replication, TLS)
    "spark.driver.extraJavaOptions": EXTRA_JAVA_OPTIONS,
    # Kubernetes namespace, labels and executor image
    "spark.kubernetes.namespace": "vvoronin-306285",
    "spark.kubernetes.driver.label.appname": APP_NAME,
    "spark.kubernetes.executor.label.appname": APP_NAME,
    "spark.kubernetes.container.image": "node03.st:5000/spark-executor:vvoronin-306285",
    # default checkpoint root for streaming queries and local scratch directory
    "spark.sql.streaming.checkpointLocation": "hdfs:///home/vvoronin-306285",
    "spark.local.dir": "/tmp/spark",
    # the same jar is put on the driver and executor class paths
    "spark.driver.extraClassPath": "/home/jovyan/shared-data/my-project-name-jar-with-dependencies.jar",
    "spark.executor.extraClassPath": "/home/jovyan/shared-data/my-project-name-jar-with-dependencies.jar",
    # executor volume mounts
    "spark.kubernetes.executor.volumes.emptyDir.spark-local-dir-tmp-spark.mount.path": "/tmp/spark",
    "spark.kubernetes.executor.volumes.emptyDir.spark-local-dir-tmp-spark.mount.readOnly": "false",
    "spark.kubernetes.executor.volumes.hostPath.depdir.mount.path": "/home/jovyan/shared-data",
    "spark.kubernetes.executor.volumes.hostPath.depdir.options.path": "/nfs/shared",
    "spark.kubernetes.executor.volumes.hostPath.depdir.options.type": "Directory",
    "spark.kubernetes.executor.volumes.hostPath.depdir.mount.readOnly": "true",
}

# run spark: name the application, select the master, apply the settings, then get or create the session
session_builder = SparkSession.builder.appName(APP_NAME).master(SPARK_ADDRESS)
for option_name, option_value in spark_settings.items():
    session_builder = session_builder.config(option_name, option_value)
spark = session_builder.getOrCreate()

In [5]:
# Fields expected in the JSON message payload, in order, with their Spark SQL types.
schema_fields = [
    ("user_id", DoubleType()),
    ("date", TimestampType()),
    ("text", StringType()),
    ("sex", StringType()),
    ("age", IntegerType()),
]
schema = StructType([StructField(field_name, field_type) for field_name, field_type in schema_fields])

In [6]:
# Streaming source: subscribe to the "main" topic on the kafka-svc:9092 broker.
kafka_reader = spark.readStream.format("kafka")
kafka_reader = kafka_reader.option("kafka.bootstrap.servers", "kafka-svc:9092")
kafka_reader = kafka_reader.option("subscribe", "main")
stream_df = kafka_reader.load()

In [7]:
# Keep only the record key and value, both cast to strings.
string_casts = ["CAST(key AS STRING)", "CAST(value AS STRING)"]
stream_df = stream_df.selectExpr(*string_casts)

In [8]:
# Parse the JSON string in `value` with `schema`, then lift each parsed field
# to a top-level column next to the raw `value`.
payload_fields = ["user_id", "date", "text", "sex", "age"]
stream_df = (
    stream_df
    .select('value', from_json("value", schema).alias("value_struct"))
    .select("value", *[f"value_struct.{field_name}" for field_name in payload_fields])
)


Удаляем стоп-слова

In [10]:
import nltk
# Add the local corpus directory to the list of paths NLTK searches for data.
nltk.data.path.append("/home/jovyan/nltk_data")

# Stop-word lists come from NLTK; tokenising/removal transformers come from Spark ML.
from nltk.corpus import stopwords
# NOTE(review): RegexTokenizer is not referenced in the visible cells — confirm it is still needed.
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover

In [11]:
# Russian stop-word list from the NLTK corpus.
stop_words_ru = stopwords.words('russian')
# Broadcast copy of the list.
# NOTE(review): not referenced in the visible cells — confirm it is used elsewhere.
stopwords_broadcast = spark.sparkContext.broadcast(stop_words_ru)
# Transformer that reads the "tokens" column and writes "cleaned_tokens" without the stop words.
remover = StopWordsRemover(
    inputCol="tokens",
    outputCol="cleaned_tokens",
    stopWords=stop_words_ru,
)

In [27]:
@udf(returnType=StringType())
def get_categories(age):
    """Map an age to the label of its age group.

    Buckets are half-open (lower bound inclusive, upper bound exclusive), so
    every non-negative age belongs to exactly one group:
    [0, 18) -> '0-18', [18, 27) -> '18-27', [27, 40) -> '27-40',
    [40, 60) -> '40-60', 60 and above -> '60+'.

    Previously both bounds were exclusive, so ages 0, 18, 27, 40 and 60 matched
    no bucket, and the '60+' bucket stopped at 130.

    Args:
        age: age in years, or None when the record carries no age.

    Returns:
        The group label, or None when ``age`` is None or negative.
    """
    # A missing age must not reach the comparisons below: comparing None with
    # an int raises TypeError in Python 3.
    if age is None or age < 0:
        return None

    # (exclusive upper bound, label) pairs in ascending order; the first bound
    # greater than the age identifies the bucket.
    upper_bounds = ((18, '0-18'), (27, '18-27'), (40, '27-40'), (60, '40-60'))
    for upper, label in upper_bounds:
        if age < upper:
            return label
    return '60+'

In [29]:
# Start one streaming query per (sex, age bucket, period) segment. Each query
# counts non-stop-word tokens per time window and publishes the counts as JSON
# to its own Kafka topic named "<sex>_<age_lower>_<age_upper>_<period>".
sex_list = ["M", "F"]
# Age buckets are half-open: age_lower <= age < age_upper. With strict bounds on
# both sides, users aged exactly 18, 27, 40 or 60 were dropped from every topic.
categories = [(0, 18), (18, 27), (27, 40), (40, 60), (60, 500)]
periods = ['hour', 'day', 'week']

# Value written to the "age_group" field for each bucket. Every query is already
# restricted to a single bucket, so the label is a constant for that query and
# no per-row Python UDF call is needed to compute it.
age_group_labels = {
    (0, 18): '0-18',
    (18, 27): '18-27',
    (27, 40): '27-40',
    (40, 60): '40-60',
    (60, 500): '60+',
}
# window() durations spelled out in the documented "<count> <unit>" form;
# the topic name keeps the bare period name.
window_durations = {'hour': '1 hour', 'day': '1 day', 'week': '1 week'}

# Tokenising and stop-word removal are the same for every segment: build that
# part of the plan once instead of once per query.
tokenized_df = remover.transform(stream_df.withColumn("tokens", split(col('text'), ' ')))

# Handles of the started queries, kept so they can be inspected or stopped later
# (e.g. `for q in streaming_queries: q.stop()`).
streaming_queries = []

for sex in sex_list:
    for age in categories:
        age_lower, age_upper = age
        segment_filter = (col("sex") == sex) & (col("age") >= age_lower) & (col("age") < age_upper)
        age_group = lit(age_group_labels[age]).alias("age_group")
        for period in periods:
            topic = f"{sex}_{age_lower}_{age_upper}_{period}"
            print(f"Dividing topic: {topic}")
            window_duration = window_durations[period]

            query = (
                tokenized_df
                .where(segment_filter)
                # one output row per remaining token
                .select("date", age_group, "sex", explode("cleaned_tokens").alias("token"))
                # tumbling window: slide duration equals window duration
                .groupBy('sex', 'age_group', window('date', window_duration, window_duration))
                .agg(count('token').alias('tokens_count'))
                # the Kafka sink reads the payload from a column named "value"
                .selectExpr('to_json(struct(*)) as value')
                .writeStream
                .outputMode("update")
                .format("kafka")
                .option("kafka.bootstrap.servers", "kafka-svc:9092")
                .option("topic", topic)
                .start()
            )
            streaming_queries.append(query)

Dividing topic: M_0_18_hour
Dividing topic: M_0_18_day
Dividing topic: M_0_18_week
Dividing topic: M_18_27_hour
Dividing topic: M_18_27_day
Dividing topic: M_18_27_week
Dividing topic: M_27_40_hour
Dividing topic: M_27_40_day
Dividing topic: M_27_40_week
Dividing topic: M_40_60_hour
Dividing topic: M_40_60_day
Dividing topic: M_40_60_week
Dividing topic: M_60_500_hour
Dividing topic: M_60_500_day
Dividing topic: M_60_500_week
Dividing topic: F_0_18_hour
Dividing topic: F_0_18_day
Dividing topic: F_0_18_week
Dividing topic: F_18_27_hour
Dividing topic: F_18_27_day
Dividing topic: F_18_27_week
Dividing topic: F_27_40_hour
Dividing topic: F_27_40_day
Dividing topic: F_27_40_week
Dividing topic: F_40_60_hour
Dividing topic: F_40_60_day
Dividing topic: F_40_60_week
Dividing topic: F_60_500_hour
Dividing topic: F_60_500_day
Dividing topic: F_60_500_week


In [30]:
from kafka import KafkaConsumer
import json

In [31]:
# Consumer used to inspect one of the result topics. consumer_timeout_ms=1000
# makes message iteration stop after one second without new messages.
consumer = KafkaConsumer(bootstrap_servers="kafka-svc:9092", consumer_timeout_ms=1000)
# subscribe() is documented to take a list of topic names, so pass a list
# rather than a bare string.
consumer.subscribe(['M_18_27_day'])

In [None]:
# Print every received message as parsed JSON. When the inner iteration over
# the consumer ends, the outer loop simply starts iterating again, so this
# cell runs until it is interrupted.
while True:
    for record in consumer:
        payload = json.loads(record.value)
        print(payload)