# SparkSQL Example

This notebook demonstrates using SparkSQL within a PySpark session.

Several [Spark examples](/tree/examples/spark) are included with TAP.

Related Documentation:
- [SparkSQL Documentation](http://spark.apache.org/docs/latest/sql-programming-guide.html)
- [PySpark API documentation](http://spark.apache.org/docs/latest/api/python/)

In [1]:
import pyspark

# Get a SparkContext in local mode. getOrCreate() returns the already-active
# context when this cell is re-run, instead of failing with
# "ValueError: Cannot run multiple SparkContexts at once".
# Note: the "local" master setting only takes effect when a new context is
# actually created; an existing context is returned unchanged.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster("local"))

# Create a SQLContext from the SparkContext
sqlContext = pyspark.SQLContext(sc)

In [2]:
# Sample rows: the numbers 1 through 10, each paired with a letter from
# 'a' to 'e' (the five letters repeat for the numbers 6 through 10)
data = list(zip(range(1, 11), 'abcde' * 2))

# Build a Spark DataFrame from the local rows, naming the two columns
df = sqlContext.createDataFrame(data, schema=['numbers', 'letters'])

# Render as a Pandas DataFrame so the notebook displays it as a table
df.toPandas()

Unnamed: 0,numbers,letters
0,1,a
1,2,b
2,3,c
3,4,d
4,5,e
5,6,a
6,7,b
7,8,c
8,9,d
9,10,e


In [3]:
# Register the DataFrame as a temporary table named "mytable" so that it can
# be referenced by name in SQL queries
df.registerTempTable("mytable")

# Perform a simple select of every row and column from the table
results = sqlContext.sql("select * from mytable")

# Convert the results to a Pandas DataFrame for easy viewing
results.toPandas()

Unnamed: 0,numbers,letters
0,1,a
1,2,b
2,3,c
3,4,d
4,5,e
5,6,a
6,7,b
7,8,c
8,9,d
9,10,e


In [4]:
# Filter with a where clause and sort with order by.
# Adjacent string literals are joined into a single query string; each
# piece ends with a space so the clauses stay separated.
results = sqlContext.sql(
    "select * from mytable "
    "where numbers < 8 "
    "order by numbers desc"
)

# Show the query output as a Pandas DataFrame
results.toPandas()

Unnamed: 0,numbers,letters
0,7,b
1,6,a
2,5,e
3,4,d
4,3,c
5,2,b
6,1,a


In [5]:
# Aggregate per letter: row count, average, and sum of the numbers column.
# Adjacent string literals are joined into a single query string; each
# piece ends with a space so the clauses stay separated.
results = sqlContext.sql(
    "select letters, count(*) as count, avg(numbers) as avg, sum(numbers) as sum "
    "from mytable "
    "group by letters"
)

# Show the query output as a Pandas DataFrame
results.toPandas()

Unnamed: 0,letters,count,avg,sum
0,a,2,3.5,7
1,b,2,4.5,9
2,c,2,5.5,11
3,d,2,6.5,13
4,e,2,7.5,15


## Stop the Spark Context

In [6]:
# Stop the context when you are done with it. Stopping the SparkContext
# releases its resources; no further operations can be performed within
# that context afterwards.
sc.stop()