In [17]:
import findspark

In [18]:
# Initialise findspark before the pyspark imports in the next cell.
# NOTE(review): presumably this makes the local Spark installation importable — confirm against the findspark docs.
findspark.init()

In [19]:
import pyspark
from pyspark.sql import SparkSession

pyspark.sql.SparkSession 
- Main entry point for DataFrame and SQL functionality.

pyspark.sql.DataFrame 
- A distributed collection of data grouped into named columns.

pyspark.sql.Column 
- A column expression in a DataFrame.

pyspark.sql.Row 
- A row of data in a DataFrame.

pyspark.sql.GroupedData 
- Aggregation methods, returned by DataFrame.groupBy().

pyspark.sql.DataFrameNaFunctions 
- Methods for handling missing data (null values).

pyspark.sql.DataFrameStatFunctions 
- Methods for statistics functionality.

pyspark.sql.functions 
- List of built-in functions available for DataFrame.

pyspark.sql.types 
- List of data types available.

pyspark.sql.Window 
- For working with window functions.

SparkSession은 데이터프레임을 생성할 때 사용된다. 또한 데이터프레임을 테이블로 등록하거나, 테이블에 SQL문을 실행하거나, 테이블을 캐싱하거나, parquet 파일을 읽을 때도 쓰인다. 아래의 빌더(builder) 패턴으로 SparkSession을 생성할 수 있다.

In [7]:
# SparkSession.builder is a class attribute holding the Builder that constructs sessions.
#   master("local")  - sets the master to "local".
#   appName(...)     - names the application; the name is shown in the Spark web UI.
#   config(key, val) - sets one option; options set this way are propagated to both
#                      SparkConf and the SparkSession's own configuration.
#   getOrCreate()    - returns the existing session, or creates one from the options above.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Word Count")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [8]:
# When a SparkConf object already exists, pass it to the builder through the `conf`
# keyword instead of setting options one at a time.
# The call stops at config(), so the cell displays the Builder itself (no getOrCreate() here).

from pyspark.conf import SparkConf
SparkSession.builder.config(conf=SparkConf())

<pyspark.sql.session.SparkSession.Builder at 0x6f5bc10>

#### getOrCreate()
Gets an existing SparkSession or, if there is no existing one, creates a new one based on the options set in this builder.

This method first checks whether there is a valid global default SparkSession and, if so, returns that one. If no valid global default SparkSession exists, the method creates a new SparkSession and assigns the newly created SparkSession as the global default.

In [9]:
# Set option "k1" through the builder, then read it back from the returned session's
# runtime configuration; the recorded result is True.
s1 = SparkSession.builder.config("k1", "v1").getOrCreate()
s1.conf.get("k1") == "v1"

True

In [10]:
# Second builder call, this time setting option "k2".
# NOTE(review): the expression below compares the value of "k1" read from s1 with the value
# of "k2" read from s2 ("v1" vs "v2"), which is why the recorded result is False. It does not
# show whether s1 and s2 share a session; comparing the SAME key on both objects would —
# confirm the intent.
s2 = SparkSession.builder.config("k2", "v2").getOrCreate()
s1.conf.get("k1") == s2.conf.get("k2")

False

In [11]:
# "k2" was set while building s2. Reading it through s1 as well checks that getOrCreate()
# handed back the already-existing session and applied the new option to it.
# Comparing s2's value with itself would be True by construction, so one side reads via s1.
s1.conf.get("k2") == s2.conf.get("k2")

True

#### createDataFrame
Creates a DataFrame from an RDD, a list or a pandas.DataFrame.

In [32]:
# Build a DataFrame from a plain Python list of tuples.
# The list is bound to `I` (capital i) because later cells pass `I` to sc.parallelize().
# The calls below must use that same name: `l` is not defined until a much later cell,
# so referring to it here raises NameError on a fresh kernel.
I = [('Alice', 1)]
# Without column names (not displayed: only a cell's last expression is shown).
spark.createDataFrame(I).collect()

# With explicit column names; this is the value the cell displays.
spark.createDataFrame(I, ['name', 'age']).collect()

[Row(name='Alice', age=1)]

In [33]:
# Build a DataFrame from a list of dicts: the dict keys become the column names
# (the recorded output lists `age` before `name`).
d = [{'name': 'Alice', 'age': 1}]
spark.createDataFrame(d).collect()

[Row(age=1, name='Alice')]

In [34]:
# Configuration object handed to SparkContext.getOrCreate() in the next cell:
# application name 'appName', master 'local'.
from pyspark import SparkConf, SparkContext
conf = (
    SparkConf()
    .setAppName('appName')
    .setMaster('local')
)

In [35]:
# Obtain a SparkContext.
# NOTE(review): a SparkSession was already started above, so getOrCreate() presumably
# returns that session's context and `conf` has no effect — confirm.
sc = SparkContext.getOrCreate(conf=conf)

# Distribute the local list `I` as an RDD.
rdd = sc.parallelize(I)

# Conversion without column names; the result is discarded because it is not the
# cell's last expression.
spark.createDataFrame(rdd).collect()

# Same RDD with column names. `df` is reused by later cells.
df = spark.createDataFrame(rdd, schema=['name', 'age'])
df.collect()

[Row(name='Alice', age=1)]

In [36]:
# NOTE(review): these statements repeat, token for token, the last three statements of the
# preceding cell and rebind `df` to an equivalent DataFrame — candidate for removal.
# Conversion without column names (result discarded, not the last expression).
spark.createDataFrame(rdd).collect()

# Conversion with column names; displayed as the cell output.
df = spark.createDataFrame(rdd, ['name', 'age'])
df.collect()

[Row(name='Alice', age=1)]

### SparkSession

데이터셋을 다루기 위해 가장 먼저 알아야 하는 것은 SparkSession이다. RDD를 생성하기 위해 SparkContext가 필요했던 것처럼 데이터프레임을 생성하기 위해서는 SparkSession을 이용해야 한다. SparkSession은 인스턴스 생성을 위한 builder 속성을 제공하고, 이 빌더의 getOrCreate() 메서드를 이용하면 기존 인스턴스를 재사용하거나 새로운 인스턴스를 생성할 수 있다.

In [37]:
# Row('name', 'age') creates a row template; calling it with values fills those fields.
from pyspark.sql import Row
Person = Row('name', 'age')
# Convert every (name, age) tuple of the RDD into a Person row.
person = rdd.map(lambda name_and_age: Person(*name_and_age))
# No schema argument: the recorded output shows the column names coming from the Row fields.
df2 = spark.createDataFrame(person)
df2.collect()

[Row(name='Alice', age=1)]

In [38]:
# NOTE(review): wildcard import kept because other cells may rely on the names it brings in;
# only StructType, StructField, StringType and IntegerType are used in this cell.
from pyspark.sql.types import *
# Explicit schema: a nullable string column "name" and a nullable integer column "age".
schema = StructType([
    StructField("name", StringType(), nullable=True),
    StructField("age", IntegerType(), nullable=True),
])
# Apply the schema to the tuple RDD instead of passing a list of column names.
df3 = spark.createDataFrame(rdd, schema=schema)
df3.collect()

[Row(name='Alice', age=1)]

In [39]:
# Round trip: convert `df` to a pandas DataFrame and build a Spark DataFrame from it again;
# the recorded output keeps the name/age columns.
spark.createDataFrame(df.toPandas()).collect()  

[Row(name='Alice', age=1)]

In [41]:
# Build a Spark DataFrame from a pandas DataFrame that has no column names:
# the recorded output shows the pandas labels 0 and 1 used as field names.
import pandas
spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  

[Row(0=1, 1=2)]

In [51]:
# range(start, end, step): single `id` column; the recorded output is 1, 3, 5 (end value 7 not included).
spark.range(1, 7, 2).collect()

[Row(id=1), Row(id=3), Row(id=5)]

In [50]:
# With a single argument the recorded output is ids 0, 1, 2.
spark.range(3).collect()

[Row(id=0), Row(id=1), Row(id=2)]

In [48]:
# Expose `df` to SQL under the view name "table1" (queried in the following cells).
df.createOrReplaceTempView("table1")

In [52]:
# Query the view with SQL, renaming age -> f1 and name -> f2. Rebinds `df2`.
df2 = spark.sql("SELECT age AS f1, name as f2 from table1")

In [53]:
# Materialise the query result; the recorded output shows the aliased fields f1 and f2.
df2.collect()

[Row(f1=1, f2='Alice')]

In [54]:
# Re-register the view, load it back by name with spark.table(), and check that it holds
# the same rows as `df` (sorted so row order does not matter); the recorded result is True.
df.createOrReplaceTempView("table1")
df2 = spark.table("table1")
sorted(df.collect()) == sorted(df2.collect())

True

In [59]:
from pyspark.sql import SQLContext 
import pandas as pd 


In [60]:
# Wrap the existing SparkContext `sc` in a SQLContext; the cells below repeat the
# createDataFrame examples through this object.
# NOTE(review): SQLContext is the older entry point and is marked deprecated in recent
# PySpark releases in favour of SparkSession — confirm whether these cells should use `spark`.
sqlContext = SQLContext(sc)

In [61]:
# List of tuples without column names: the recorded output shows the default names _1 and _2.
# `l` (lowercase L) holds the same data as the earlier `I` (capital i).
l = [('Alice', 1)]
sqlContext.createDataFrame(l).collect()

[Row(_1='Alice', _2=1)]

In [62]:
# Same list, now with explicit column names.
sqlContext.createDataFrame(l, ['name', 'age']).collect()

[Row(name='Alice', age=1)]

In [63]:
# List of dicts through the SQLContext: column names come from the dict keys.
d = [{'name': 'Alice', 'age': 1}]
sqlContext.createDataFrame(d).collect()

[Row(age=1, name='Alice')]

In [64]:
# Rebuild the RDD from the list `I` (capital i, defined in the first createDataFrame cell).
rdd = sc.parallelize(I)

In [66]:
# Bring the RDD contents back to the driver and print them.
print(rdd.collect())

[('Alice', 1)]


In [67]:
# Print the local list; the recorded output matches the collected RDD printed above.
print(I)

[('Alice', 1)]


In [69]:
# NOTE(review): repeats an earlier cell unchanged (list `l` with explicit column names).
sqlContext.createDataFrame(l, ['name', 'age']).collect()

[Row(name='Alice', age=1)]

In [70]:
# NOTE(review): repeats the earlier list-of-dicts cell unchanged.
d = [{'name': 'Alice', 'age': 1}]
sqlContext.createDataFrame(d).collect()



[Row(age=1, name='Alice')]

In [71]:
# Register `df` under the name "table1" through the SQLContext API.
sqlContext.registerDataFrameAsTable(df, "table1")

In [73]:
# Remove the "table1" registration again.
sqlContext.dropTempTable("table1")

In [75]:
# Register `df` as "table1" once more so the table listing in the next cell can find it.
sqlContext.registerDataFrameAsTable(df, "table1")

In [76]:
# tables() returns the table listing as a DataFrame (rebinding `df2`); keep only the row for
# "table1". The recorded output shows it flagged isTemporary=True.
df2 = sqlContext.tables()
df2.filter("tableName = 'table1'").first()

Row(database='', tableName='table1', isTemporary=True)

In [77]:
# Rebind `df` to a new three-row DataFrame with columns Col1 (letters) and Col2 (numbers);
# the name/age frame previously held in `df` is no longer reachable through this name.
df = spark.createDataFrame(
    [("a", 1), ("b", 2), ("c", 3)],
    schema=["Col1", "Col2"],
)

In [78]:
# Select columns by a back-quoted regular expression on the column name;
# per the recorded output, this pattern keeps Col2 and leaves out Col1.
df.select(df.colRegex("`(Col1)?+.+`")).show()

+----+
|Col2|
+----+
|   1|
|   2|
|   3|
+----+



In [79]:
# Fetch all rows of `df` as a list of Row objects.
df.collect()

[Row(Col1='a', Col2=1), Row(Col1='b', Col2=2), Row(Col1='c', Col2=3)]

In [80]:
# Column names of `df`, as a Python list.
df.columns

['Col1', 'Col2']

In [81]:
# Number of rows in `df` (3 in the recorded output).
df.count()

3