In [1]:
from pyspark.sql import SparkSession

# Access the driver on port 4040.
spark = (
    SparkSession.builder
    .master("spark://192.168.0.63:7077")
    .appName("JupyterSparkSession2")
    # Optional resource limits, currently disabled. Uncomment the lines
    # below to cap driver/executor resources for this session:
    # .config("spark.driver.memory", "512m")
    # .config("spark.executor.memory", "512m")
    # .config("spark.executor.cores", "1")
    # .config("spark.python.worker.memory", "512m")
    # .config("spark.driver.maxResultSize", "512m")
    .getOrCreate()
)

In [5]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

# Kafka broker address. Override with the KAFKA_BROKER environment variable;
# the default is the previously hard-coded value.
kafka_broker = os.environ.get("KAFKA_BROKER", "kafka-broker-local:29092")

# PostgreSQL connection settings. Every value can be overridden through the
# environment so that credentials and the database name are not baked into
# the notebook; the defaults are the previously hard-coded values.
postgres_host = os.environ.get("POSTGRES_HOST", "postgres")  # service name defined in Docker Compose
postgres_port = os.environ.get("POSTGRES_PORT", "5432")
# NOTE(review): the JDBC load below has been observed to fail with
# 'FATAL: database "postgres" does not exist'. The official postgres Docker
# image names its default database after POSTGRES_USER when POSTGRES_DB is
# not set, so the real database may be called "username" -- confirm against
# the compose file and set POSTGRES_DB accordingly.
postgres_db = os.environ.get("POSTGRES_DB", "postgres")
postgres_user = os.environ.get("POSTGRES_USER", "username")
postgres_password = os.environ.get("POSTGRES_PASSWORD", "password")

# Create the Spark session, requesting the Kafka source and the PostgreSQL
# JDBC driver as extra packages.
# NOTE(review): getOrCreate() returns a session that is already active in this
# kernel if there is one; in that case the master URL and spark.jars.packages
# given here may not take effect. Restart the kernel (or stop the old session)
# if the Kafka/JDBC classes cannot be found -- TODO confirm.
spark = (
    SparkSession.builder
    .appName("ClickEventProcessing")
    .master("spark://spark-master:7077")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0,org.postgresql:postgresql:42.2.18",
    )
    .getOrCreate()
)

# Schema of the JSON payload carried in each click-event message.
click_event_schema = StructType([
    StructField("user_id", StringType(), True),
    StructField("ad_id", StringType(), True),
    StructField("timestamp", TimestampType(), True),
])

# Read the click-event stream from Kafka: the message value is cast to a
# string, parsed as JSON with the schema above, and flattened into columns.
click_events = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_broker)
    .option("subscribe", "click_events")
    .load()
    .select(from_json(col("value").cast("string"), click_event_schema).alias("data"))
    .select("data.*")
)

# Read user information from PostgreSQL as a static (batch) DataFrame.
jdbc_url = f"jdbc:postgresql://{postgres_host}:{postgres_port}/{postgres_db}"
user_info = (
    spark.read.format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", "users")
    .option("user", postgres_user)
    .option("password", postgres_password)
    .option("driver", "org.postgresql.Driver")
    .load()
)

# Join each click event with the matching user row on user_id.
joined_data = click_events.join(user_info, "user_id")

# Emit the joined rows to the console sink.
query = (
    joined_data.writeStream
    .outputMode("append")
    .format("console")
    .start()
)

try:
    # Block until the streaming query terminates.
    query.awaitTermination()
finally:
    # Stop the stream when the cell is interrupted or the query fails, so it
    # does not keep running in the background of the notebook kernel.
    query.stop()

Py4JJavaError: An error occurred while calling o163.load.
: org.postgresql.util.PSQLException: FATAL: database "postgres" does not exist
	at org.postgresql.core.v3.QueryExecutorImpl.receiveErrorResponse(QueryExecutorImpl.java:2553)
	at org.postgresql.core.v3.QueryExecutorImpl.readStartupMessages(QueryExecutorImpl.java:2665)
	at org.postgresql.core.v3.QueryExecutorImpl.<init>(QueryExecutorImpl.java:147)
	at org.postgresql.core.v3.ConnectionFactoryImpl.openConnectionImpl(ConnectionFactoryImpl.java:273)
	at org.postgresql.core.ConnectionFactory.openConnection(ConnectionFactory.java:51)
	at org.postgresql.jdbc.PgConnection.<init>(PgConnection.java:225)
	at org.postgresql.Driver.makeConnection(Driver.java:465)
	at org.postgresql.Driver.connect(Driver.java:264)
	at org.apache.spark.sql.execution.datasources.jdbc.connection.BasicConnectionProvider.getConnection(BasicConnectionProvider.scala:49)
	at org.apache.spark.sql.execution.datasources.jdbc.connection.ConnectionProviderBase.create(ConnectionProvider.scala:102)
	at org.apache.spark.sql.jdbc.JdbcDialect.$anonfun$createConnectionFactory$1(JdbcDialects.scala:160)
	at org.apache.spark.sql.jdbc.JdbcDialect.$anonfun$createConnectionFactory$1$adapted(JdbcDialects.scala:156)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.getQueryOutputSchema(JDBCRDD.scala:63)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:58)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation$.getSchema(JDBCRelation.scala:241)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:37)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:346)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:229)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:211)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:172)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
