In [32]:
import os
import socket
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, length, when, col
from pyspark.sql.types import BooleanType, IntegerType, LongType, StringType, ArrayType, FloatType, StructType, StructField
from pyspark.sql.functions import *
from pyspark.sql.functions import pandas_udf
from pyspark.sql.functions import PandasUDFType
from jinja2 import Environment, FileSystemLoader

In [33]:
# setting constants
APP_NAME = "VVORONIN-SPARK-APP_send"
NORMALIZED_APP_NAME = APP_NAME.replace('/', '_').replace(':', '_')

APPS_TMP_DIR = os.path.join(os.getcwd(), "tmp")
APPS_CONF_DIR = os.path.join(os.getcwd(), "conf")
APPS_LOGS_DIR = os.path.join(os.getcwd(), "logs")
LOG4J_PROP_FILE = os.path.join(APPS_CONF_DIR, "pyspark-log4j-{}.properties".format(NORMALIZED_APP_NAME))
LOG_FILE = os.path.join(APPS_LOGS_DIR, 'pyspark-{}.log'.format(NORMALIZED_APP_NAME))
EXTRA_JAVA_OPTIONS = "-Dlog4j.configuration=file://{} -Dspark.hadoop.dfs.replication=1 -Dhttps.protocols=TLSv1.0,TLSv1.1,TLSv1.2,TLSv1.3"\
    .format(LOG4J_PROP_FILE)

LOCAL_IP = socket.gethostbyname(socket.gethostname())

In [34]:
# preparing configuration files from templates
for directory in [APPS_CONF_DIR, APPS_LOGS_DIR, APPS_TMP_DIR]:
    if not os.path.exists(directory):
        os.makedirs(directory)

env = Environment(loader=FileSystemLoader('/opt'))
template = env.get_template("pyspark_log4j.properties.template")
template\
    .stream(logfile=LOG_FILE)\
    .dump(LOG4J_PROP_FILE)

In [35]:
SPARK_ADDRESS = "local[4]"

spark = SparkSession\
    .builder\
    .appName("VVORONIN-SPARK-APP")\
    .master(SPARK_ADDRESS)\
    .config("spark.ui.port", "4040")\
    .config("spark.memory.fraction", "0.8")\
    .config("spark.memory.storageFraction", "0.6")\
    .config("spark.driver.memory", "4g")\
    .config("spark.driver.extraJavaOptions", EXTRA_JAVA_OPTIONS)\
    .config("spark.executor.memory", "6g")\
    .getOrCreate()

In [None]:
#для каждого поста получим id его автора
posts_data = spark.read.json("hdfs:///shared/bigdata20/followers_posts_api_final.json")
posts_df = posts_data.where((col("text") != "") & (col("text") != " "))\
                .select(col("owner_id").alias("user_id"), "date", "text")

In [None]:
#для каждого id получим возраст пользовалетя и пол
followers_data = spark.read.json("hdfs:///shared/bigdata20/followers_info.json")

In [None]:
followers_df = followers_data.select(col("id").alias("user_id"), "sex", "bdate")\
                    .withColumn("month_between", 
                        months_between(current_date(), regexp_extract(col("bdate"), r"(\d{1,2})\.(\d{1,2})\.(\d{4})", 3).cast("date")))\
                    .select("user_id", "sex", (col("month_between")/12).cast("int").alias("age"))\
                    .where(col("sex").isNotNull() & col("age").isNotNull())

In [None]:
#соединим одно с другим
posts_with_user_info_df = posts_df.join(followers_df, ["user_id"])\
                            .select("user_id", col("date"), "text", (when(col("sex") == 1, "M").otherwise("F")).alias("sex"), 
                                    "age")

In [None]:
posts_with_user_info_df.toPandas().head()

Начинаем передачу данных через kafka

In [None]:
from kafka import KafkaProducer
import json
import time
import random
#from playsound import playsound

In [None]:
producer = KafkaProducer(bootstrap_servers="kafka-svc:9092", value_serializer=lambda value: json.dumps(value).encode())

In [38]:
for row in posts_with_user_info_df.rdd.toLocalIterator():
    data = row.asDict()
    print(f"post: {data['user_id']}, date: {data['date']}, sex - {data['sex']}, age - {data['age']}")
    
    producer.send("main", data)
    producer.flush()
    time.sleep(random.random())

#playsound('~/nfs-home/beep-07a.wav')  
print("Broadcast has been finished")

KeyboardInterrupt: 

# Draft
Источник вдохновения. Удаляется без последствий для основной части

In [None]:
def connect_kafka_producer():
    _producer = None
    try:
        _producer = KafkaProducer(bootstrap_servers=['localhost:9092'], api_version=(0, 10))
    except Exception as ex:
        print('Exception while connecting Kafka')
        print(str(ex))
    finally:
        return _producer