In [30]:
# importing pyspark session
from pyspark.sql import SparkSession
# Entry point for the DataFrame / SQL API. getOrCreate() hands back an
# already-running session if there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName('Demo To SparkSQl')
    .getOrCreate()
)

In [3]:
# importing row
from pyspark.sql.types import Row
from datetime import datetime

In [19]:
# creating RDD
# Distribute three student records as an RDD of Row objects. The SparkContext
# is taken from the `spark` session built at the top of the notebook instead
# of a shell-injected `sc` global, so this cell also runs on a fresh kernel.
# Each Row mixes scalar fields with a list (clubs), a dict (subjects) and a
# datetime (enrolled); the SQL cells further down index into those columns.
# NOTE(review): the recorded output lists the columns alphabetically; other
# Spark versions may keep keyword order instead - confirm if order matters.
records = spark.sparkContext.parallelize([
    Row(
        id=1,
        name='Jill',
        active=True,
        clubs=['chess', 'hockey'],
        subjects={'maths': 85, 'english': 87},
        enrolled=datetime(2019, 8, 1, 14, 5),
    ),
    Row(
        id=2,
        name='George',
        active=False,
        clubs=['badminton', 'football'],
        subjects={'maths': 89, 'english': 79},
        enrolled=datetime(2019, 4, 1, 22, 8),
    ),
    Row(
        id=3,
        name='Alice',
        active=True,
        clubs=['cricket', 'pool'],
        subjects={'maths': 95, 'english': 77},
        enrolled=datetime(2019, 10, 1, 16, 5),
    ),
])
# Bare expression: shows the RDD's repr as the cell output.
records

ParallelCollectionRDD[32] at parallelize at PythonRDD.scala:195

In [20]:
# Turn the RDD of Rows into a DataFrame; the schema is inferred from the Row
# fields. This is the explicit form of the `records.toDF()` shorthand.
records_df = spark.createDataFrame(records)
# Print the DataFrame as a text table (long cell values are truncated).
records_df.show()

+------+--------------------+-------------------+---+------+--------------------+
|active|               clubs|           enrolled| id|  name|            subjects|
+------+--------------------+-------------------+---+------+--------------------+
|  true|     [chess, hockey]|2019-08-01 14:05:00|  1|  Jill|[english -> 87, m...|
| false|[badminton, footb...|2019-04-01 22:08:00|  2|George|[english -> 79, m...|
|  true|     [cricket, pool]|2019-10-01 16:05:00|  3| Alice|[english -> 77, m...|
+------+--------------------+-------------------+---+------+--------------------+



In [21]:
# Register the DataFrame as a temporary view named 'records'. The view is
# scoped to the current SparkSession and goes away when that session ends.
records_df.createOrReplaceTempView('records')

# Query the view through the `spark` session created at the top of the
# notebook; `sqlContext` is never defined here and is a legacy entry point.
all_records_df = spark.sql('SELECT * FROM records')

# spark.sql() returns its result as a DataFrame, so .show() works on it.
all_records_df.show()

+------+--------------------+-------------------+---+------+--------------------+
|active|               clubs|           enrolled| id|  name|            subjects|
+------+--------------------+-------------------+---+------+--------------------+
|  true|     [chess, hockey]|2019-08-01 14:05:00|  1|  Jill|[english -> 87, m...|
| false|[badminton, footb...|2019-04-01 22:08:00|  2|George|[english -> 79, m...|
|  true|     [cricket, pool]|2019-10-01 16:05:00|  3| Alice|[english -> 77, m...|
+------+--------------------+-------------------+---+------+--------------------+



In [22]:
# complex sql queries
# Index into the nested columns: clubs[1] is the second array element
# (indexing is 0-based) and subjects["maths"] looks up a map key.
# Uses the notebook's own `spark` session rather than an undefined `sqlContext`.
spark.sql('SELECT id, clubs[1], subjects["maths"] FROM records').show()

+---+--------+---------------+
| id|clubs[1]|subjects[maths]|
+---+--------+---------------+
|  1|  hockey|             85|
|  2|football|             89|
|  3|    pool|             95|
+---+--------+---------------+



In [25]:
# filtering data
# Keep only the rows whose boolean `active` column is true.
# Uses the notebook's own `spark` session rather than an undefined `sqlContext`.
spark.sql('SELECT * FROM records WHERE active=True').show()

+------+---------------+-------------------+---+-----+--------------------+
|active|          clubs|           enrolled| id| name|            subjects|
+------+---------------+-------------------+---+-----+--------------------+
|  true|[chess, hockey]|2019-08-01 14:05:00|  1| Jill|[english -> 87, m...|
|  true|[cricket, pool]|2019-10-01 16:05:00|  3|Alice|[english -> 77, m...|
+------+---------------+-------------------+---+-----+--------------------+



In [27]:
# comparison operators
# Filter on a map value: keep rows whose "english" score is greater than 80.
# Uses the notebook's own `spark` session rather than an undefined `sqlContext`.
spark.sql('SELECT * FROM records WHERE subjects["english"] > 80').show()

+------+---------------+-------------------+---+----+--------------------+
|active|          clubs|           enrolled| id|name|            subjects|
+------+---------------+-------------------+---+----+--------------------+
|  true|[chess, hockey]|2019-08-01 14:05:00|  1|Jill|[english -> 87, m...|
+------+---------------+-------------------+---+----+--------------------+



In [28]:
# Register a global temporary view, visible to every SparkSession of this
# Spark application. The "OrReplace" variant keeps the cell re-runnable:
# plain createGlobalTempView() raises if 'global_records' already exists.
records_df.createOrReplaceGlobalTempView('global_records')

In [31]:
# Accessing the global view: it must be qualified with the `global_temp`
# database name. Queried through the notebook's own `spark` session rather
# than an undefined `sqlContext`.
spark.sql('SELECT * FROM global_temp.global_records').show()

+------+--------------------+-------------------+---+------+--------------------+
|active|               clubs|           enrolled| id|  name|            subjects|
+------+--------------------+-------------------+---+------+--------------------+
|  true|     [chess, hockey]|2019-08-01 14:05:00|  1|  Jill|[english -> 87, m...|
| false|[badminton, footb...|2019-04-01 22:08:00|  2|George|[english -> 79, m...|
|  true|     [cricket, pool]|2019-10-01 16:05:00|  3| Alice|[english -> 77, m...|
+------+--------------------+-------------------+---+------+--------------------+

