In [5]:
# Import SparkSession
from pyspark.sql import SparkSession

# Get (or create) the SparkSession, with the driver host and bind address
# both set to the loopback address 127.0.0.1.
spark = (
    SparkSession.builder
    .appName("SparkByExamples.com")
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

In [6]:
# Turn a small in-memory list of (name, number) tuples into an RDD.
dataList = [
    ("Java", 20000),
    ("Python", 100000),
    ("Scala", 3000),
]
rdd = spark.sparkContext.parallelize(dataList)

In [8]:
# Bare expression: the notebook echoes the RDD object's repr (shown in the output below).
rdd

ParallelCollectionRDD[1] at readRDDFromFile at PythonRDD.scala:287

In [9]:
# collect() returns the RDD's elements as a Python list, displayed as the cell output.
rdd.collect()

[('Java', 20000), ('Python', 100000), ('Scala', 3000)]

In [28]:
# Department (name, id) pairs, distributed as a second RDD.
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
rdd2 = spark.sparkContext.parallelize(dept)


In [29]:
# collect() returns the department RDD's elements as a Python list, displayed as the cell output.
rdd2.collect()

[('Finance', 10), ('Marketing', 20), ('Sales', 30), ('IT', 40)]

In [30]:
# Convert the department RDD to a DataFrame without supplying column names;
# the schema printed below shows the resulting column names _1 and _2.
df = rdd2.toDF()
df.printSchema()
# truncate=False is passed so cell values are not shortened in the printed table.
df.show(truncate=False)

root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)

+---------+---+
|_1       |_2 |
+---------+---+
|Finance  |10 |
|Marketing|20 |
|Sales    |30 |
|IT       |40 |
+---------+---+



In [31]:
# Same conversion as before, but with explicit column names passed to toDF().
deptColumns = ["dept_name", "dept_id"]
df2 = rdd2.toDF(schema=deptColumns)
df2.printSchema()
df2.show(truncate=False)

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+



In [32]:
# Build the department DataFrame through SparkSession.createDataFrame.
# The input must be rdd2 (the department RDD). `rdd` holds the
# ("Java", 20000)-style tuples from the first RDD cell, so passing it here
# would put that data under the dept_name/dept_id columns on a fresh run.
deptDF = spark.createDataFrame(rdd2, schema=deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+



### StructType으로 스키마를 직접 정의해서 DataFrame 생성

In [33]:
from pyspark.sql.types import StructType,StructField, StringType
# Explicit schema: both columns are declared as nullable strings, so the
# printed schema reports dept_id as string even though the RDD holds ints.
# NOTE(review): if dept_id should stay numeric, declare it with an integer
# type instead of StringType — confirm the intent.
deptSchema = StructType([
    StructField('dept_name', StringType(), True),
    StructField('dept_id', StringType(), True),
])

# Use rdd2 (the department RDD). `rdd` holds the ("Java", 20000)-style tuples
# from the first RDD cell, so it would not reproduce the department table
# when the notebook is re-run from a fresh kernel.
deptDF1 = spark.createDataFrame(rdd2, schema=deptSchema)
deptDF1.printSchema()
deptDF1.show(truncate=False)

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: string (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+

