# Using DataFrame - Spark SQL using Python in Jupyter Notebook

## Initialization

In [1]:
import os
import sys

# Note that the path to spark home could be different for you. This one is for HortonWorks Data Platform
os.environ["SPARK_HOME"] = "/usr/hdp/current/spark2-client"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
# In below two lines, use /usr/bin/python2.7 if you want to use Python 2
os.environ["PYSPARK_PYTHON"] = "/usr/local/anaconda/bin/python" 
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/local/anaconda/bin/python"

# The version for py4j could be different for you
sys.path.insert(0, os.environ["PYLIB"] +"/py4j-0.10.4-src.zip")
sys.path.insert(0, os.environ["PYLIB"] +"/pyspark.zip")

# NOTE: Whichever package you want mention here.
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.10:0.4.1 pyspark-shell' 
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell'

In [None]:
# # For spark 2.3
# import os
# import sys
 
# os.environ["SPARK_HOME"] = "/usr/spark2.3/"
# os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
# # In below two lines, use /usr/bin/python2.7 if you want to use Python 2
# os.environ["PYSPARK_PYTHON"] = "/usr/local/anaconda/bin/python" 
# os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/local/anaconda/bin/python"
# sys.path.insert(0, os.environ["PYLIB"] +"/py4j-0.10.7-src.zip")
# sys.path.insert(0, os.environ["PYLIB"] +"/pyspark.zip")

In [2]:
# #Entrypoint 1.x

# from pyspark import SparkContext, SparkConf
# conf = SparkConf().setAppName("appName")
# sc = SparkContext(conf=conf)

In [2]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().getOrCreate()
# specify .master("yarn")

## Hive Connectivite Check

In [10]:
spark.sql("show databases").show()

+------------------+
|      databaseName|
+------------------+
|             400bc|
|             a_vis|
|               abc|
|              abc1|
|              abc2|
|           abhinav|
|adarshbhoodhoo5510|
|             adi15|
|            aditya|
|     aduri1ram4183|
|        afarir4475|
|          aishboom|
|          ajaydb_1|
|         ajaydb_x1|
|   aleinsteinr2621|
|  alokdeosingh1995|
|alokpandey09933735|
|   amankisku931845|
|                an|
|ansarirahamath7347|
+------------------+
only showing top 20 rows



## Loading XML

In [11]:
!hadoop fs -cat /data/spark/books.xml

<?xml version="1.0"?>
<catalog>
   <book id="bk101">
      <author>Gambardella, Matthew</author>
      <title>XML Developer's Guide</title>
      <genre>Computer</genre>
      <price>44.95</price>
      <publish_date>2000-10-01</publish_date>
      <description>


         An in-depth look at creating applications
         with XML.This manual describes Oracle XML DB, and how you can use it to store, generate, manipulate, manage,
         and query XML data in the database.


         After introducing you to the heart of Oracle XML DB, namely the XMLType framework and Oracle XML DB repository,
         the manual provides a brief introduction to design criteria to consider when planning your Oracle XML DB
         application. It provides examples of how and where you can use Oracle XML DB.


         The manual then describes ways you can store and retrieve XML data using Oracle XML DB, APIs for manipulating
         XMLType data, and ways you can view, generate

In [12]:
df = spark.read.format("xml").option("rowTag", "book").load("/data/spark/books.xml")

In [13]:
df.show()

+-----+--------------------+--------------------+---------------+-----+------------+--------------------+
|  _id|              author|         description|          genre|price|publish_date|               title|
+-----+--------------------+--------------------+---------------+-----+------------+--------------------+
|bk101|Gambardella, Matthew|


         An in...|       Computer|44.95|  2000-10-01|XML Developer's G...|
|bk102|          Ralls, Kim|A former architec...|        Fantasy| 5.95|  2000-12-16|       Midnight Rain|
|bk103|         Corets, Eva|After the collaps...|        Fantasy| 5.95|  2000-11-17|     Maeve Ascendant|
|bk104|         Corets, Eva|In post-apocalyps...|        Fantasy| 5.95|  2001-03-10|     Oberon's Legacy|
|bk105|         Corets, Eva|The two daughters...|        Fantasy| 5.95|  2001-09-10|  The Sundered Grail|
|bk106|    Randall, Cynthia|When Carla meets ...|        Romance| 4.95|  2000-09-02|         Lover Birds|
|bk107|      Thurman, Paula|A deep sea diver .

## Loading JSON DATA

In [3]:
!hadoop fs -cat /data/spark/people.json

{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}


In [3]:
df = spark.read.json("/data/spark/people.json")

In [15]:
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



## Dataframe Operations

In [19]:
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [20]:
df.select("name").show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+



In [8]:
df.filter("age > 21").show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [11]:
df.groupBy("age").count().show()

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+



## Using SQL

In [12]:
df.createOrReplaceTempView("people")

In [14]:
sqlDF = spark.sql("SELECT * FROM people")
sqlDF.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



## Inferring the Schema Using Reflection

In [16]:
from pyspark.sql.types import Row

sc = spark.sparkContext
textRDD = sc.textFile("/data/spark/people.txt")

arrayRDD = textRDD.map(lambda x: x.split(","))

rowRDD = arrayRDD.map(lambda arr: Row(name=arr[0], age=int(arr[1].strip())))

peopleDF = rowRDD.toDF()
peopleDF.show()


+---+-------+
|age|   name|
+---+-------+
| 29|Michael|
| 30|   Andy|
| 19| Justin|
+---+-------+



In [17]:
# Register the DataFrame as a temporary view
peopleDF.createOrReplaceTempView("people")

# SQL statements can be run by using the sql methods provided by Spark
teenagersDF = spark.sql("SELECT * FROM people WHERE age BETWEEN 13 AND 19")
teenagersDF.show()

teenagersDF.rdd.map(lambda teenager: "Name: " + str(teenager[1])).collect()

#[u'Name: Justin']




+---+------+
|age|  name|
+---+------+
| 19|Justin|
+---+------+



['Name: Justin']

## Programmatically Specifying the Schema

In [19]:
# Creating Rows RDD
filename = "/data/spark/people.txt"
peopleRDD = spark.sparkContext.textFile(filename)
peopleRDD.take(10)

['Michael, 29', 'Andy, 30', 'Justin, 19']

In [28]:
from pyspark.sql import *
from pyspark.sql.types import *
# The schema is encoded in a string. User provided variable
schemaString = "name age"
fieldsArray = schemaString.split(" ")
fields = list(map(lambda f: StructField(f, StringType(), nullable = True),
fieldsArray))
schema = StructType(fields)


In [30]:
schema

StructType(List(StructField(name,StringType,true),StructField(age,StringType,true)))

In [31]:
arrRDD = peopleRDD.map(lambda x: x.split(","))
rowRDD = arrRDD.map(lambda attr: Row(attr[0], attr[1]))


In [32]:
rowRDD.take(10)

[<Row(Michael,  29)>, <Row(Andy,  30)>, <Row(Justin,  19)>]

In [33]:
# Creating Dataframe
peopleDF = spark.createDataFrame(rowRDD, schema)
peopleDF.show()

+-------+---+
|   name|age|
+-------+---+
|Michael| 29|
|   Andy| 30|
| Justin| 19|
+-------+---+

