In [70]:
# Spark Hello World

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("WordCount").getOrCreate()

text = "Hello Spark Hello Python Hello Docker Hello World"
words = spark.sparkContext.parallelize(text.split(" "))
wordCounts = words.map(lambda word: (word, 1)). reduceByKey(lambda a, b: a + b)

for wc in wordCounts.collect():
    print(wc[0], wc[1])

spark.stop()

                                                                                

Docker 1
World 1
Hello 4
Python 1
Spark 1


In [73]:
from pyspark import SparkConf, SparkContext

spark_conf = SparkConf().setAppName("spark-context-test")
spark = SparkContext.getOrCreate(conf=spark_conf)
print(spark.getConf().getAll())  # 현재 설정 확인

[('spark.app.id', 'local-1717048920277'), ('spark.executor.id', 'driver'), ('spark.app.name', 'spark-context-test'), ('spark.driver.port', '34115'), ('spark.driver.extraJavaOptions', '-Dio.netty.tryReflectionSetAccessible=true'), ('spark.driver.host', '76bea6c276dc'), ('spark.app.startTime', '1717048920223'), ('spark.rdd.compress', 'True'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.submit.pyFiles', ''), ('spark.submit.deployMode', 'client'), ('spark.executor.extraJavaOptions', '-Dio.netty.tryReflectionSetAccessible=true'), ('spark.ui.showConsoleProgress', 'true')]


In [74]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
        .master("local")
        .appName("spark-session-test")
        ## .config("spark.some.config.option", "some-value")
        .getOrCreate()
)
print(f"Hello, {spark.sparkContext.sparkUser()}")

# enableHiveSupport()

Hello, jovyan


데이터 확인
--

In [75]:
df = spark.read.option("header", "true").option("inferSchema", "true").csv("./data/IMDb Top TV Series.csv")
df.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Parental Rating: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Number of Votes: string (nullable = true)
 |-- Description: string (nullable = true)



In [76]:
df.count()

900

In [62]:
df.show(5)

+-------------------+---------+---------------+------+---------------+--------------------+
|              Title|     Year|Parental Rating|Rating|Number of Votes|         Description|
+-------------------+---------+---------------+------+---------------+--------------------+
| 1. Game of Thrones|2011–2019|          TV-MA|   9.2|           2.3M|Nine noble famili...|
|    2. Breaking Bad|2008–2013|          TV-MA|   9.5|           2.1M|A chemistry teach...|
| 3. Stranger Things|2016–2025|          TV-14|   8.7|           1.3M|When a young boy ...|
|         4. Friends|1994–2004|          TV-14|   8.9|           1.1M|Follows the perso...|
|5. The Walking Dead|2010–2022|          TV-MA|   8.1|           1.1M|Sheriff Deputy Ri...|
+-------------------+---------+---------------+------+---------------+--------------------+
only showing top 5 rows



In [77]:
df = spark.read.format('csv') \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("./data/IMDb Top TV Series.csv")
df.show(3)

+------------------+---------+---------------+------+---------------+--------------------+
|             Title|     Year|Parental Rating|Rating|Number of Votes|         Description|
+------------------+---------+---------------+------+---------------+--------------------+
|1. Game of Thrones|2011–2019|          TV-MA|   9.2|           2.3M|Nine noble famili...|
|   2. Breaking Bad|2008–2013|          TV-MA|   9.5|           2.1M|A chemistry teach...|
|3. Stranger Things|2016–2025|          TV-14|   8.7|           1.3M|When a young boy ...|
+------------------+---------+---------------+------+---------------+--------------------+
only showing top 3 rows



In [78]:
df.dtypes

[('Title', 'string'),
 ('Year', 'string'),
 ('Parental Rating', 'string'),
 ('Rating', 'double'),
 ('Number of Votes', 'string'),
 ('Description', 'string')]

In [79]:
from pyspark.sql.types import StructType, StructField, StringType, DateType, LongType, BooleanType, FloatType
# https://spark.apache.org/docs/latest/sql-ref-datatypes.html

schema = StructType([
      StructField("Title", StringType(), True),
      StructField("Year", LongType(), True),
      StructField("Parental Rating", StringType(), False),
      StructField("Rating", FloatType(), True),
      StructField("Number of Votes", StringType(), True),
      StructField("Description", StringType(), True)
  ])


In [80]:
df = spark.read.schema(schema).json("./data/IMDb Top TV Series.csv")
df.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Year: long (nullable = true)
 |-- Parental Rating: string (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Number of Votes: string (nullable = true)
 |-- Description: string (nullable = true)

