In [204]:
# Initialize the entry point
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructField,IntegerType,StructType
from pyspark.sql.functions import desc, asc, col, cast, avg, round, min, max
spark = SparkSession.builder.appName("local_spark").getOrCreate()

In [205]:
spark

1. Read data from CSV

inferSchema=True assumes datatype

In [206]:
sparkdf = spark.read.csv('students.csv', header=True)
sparkdf.show()

+---+----------------+-----+----+------+
| id|            name|class|mark|gender|
+---+----------------+-----+----+------+
|  1|    Emily Hardie| Four|  75|female|
|  2|       John Star|Three|  85|  male|
|  3|  Arnold Walker |Three|  55|  male|
|  4|      Reana Talu| Four|  60|female|
|  5| Sidona Williams| Four|  60|female|
|  6|       Alex John| Four|  55|  male|
|  7|    Robert John |Three|  78|  male|
|  8|       Lee Malva| Four|  85|  male|
|  9|    Wookie Davey|  Two|  78|  male|
| 10|      Diane Rose|  Two|  55|female|
| 11|    Holly Daives|  Two|  89|female|
| 12|        Eva Cup |Three|  94|female|
| 13| Victoria Mathew| Four|  88|female|
| 14|       Iris Zhao|  Two|  88|female|
| 15|       Scott Row| Four|  88|  male|
| 16|     Daniel Page| Four|  88|  male|
| 17|  James Williams|Three|  54|  male|
| 18|Martin Johnston | Four|  75|  male|
| 19|     John Smith | Four|  48|  male|
| 20|       Lenny Lee|Three|  65|female|
+---+----------------+-----+----+------+
only showing top

2. Print 5 rows ( !!! head() method behaves differently in pySpark) 

In [207]:
# Displays table
sparkdf.show(5)

+---+---------------+-----+----+------+
| id|           name|class|mark|gender|
+---+---------------+-----+----+------+
|  1|   Emily Hardie| Four|  75|female|
|  2|      John Star|Three|  85|  male|
|  3| Arnold Walker |Three|  55|  male|
|  4|     Reana Talu| Four|  60|female|
|  5|Sidona Williams| Four|  60|female|
+---+---------------+-----+----+------+
only showing top 5 rows



In [208]:
# In a matrix if used the Head() method
sparkdf.head(5)

[Row(id='1', name='Emily Hardie', class='Four', mark='75', gender='female'),
 Row(id='2', name='John Star', class='Three', mark='85', gender='male'),
 Row(id='3', name='Arnold Walker ', class='Three', mark='55', gender='male'),
 Row(id='4', name='Reana Talu', class='Four', mark='60', gender='female'),
 Row(id='5', name='Sidona Williams', class='Four', mark='60', gender='female')]

3.Show schema of the spark data frame. 

In [209]:
sparkdf.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- class: string (nullable = true)
 |-- mark: string (nullable = true)
 |-- gender: string (nullable = true)



4. Update schema after reading form CSV. (StructField)

In [210]:
schema = StructType([ 
        StructField("id", IntegerType(),True),
        StructField("name", StringType(),True),
        StructField("class", StringType(),True),
        StructField("mark", IntegerType(),True),
        StructField("gender", StringType(),True),
])

In [211]:
df = spark.read.schema(schema).csv('students.csv')
df.printSchema()
df.show()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- class: string (nullable = true)
 |-- mark: integer (nullable = true)
 |-- gender: string (nullable = true)

+----+----------------+-----+----+------+
|  id|            name|class|mark|gender|
+----+----------------+-----+----+------+
|null|            name|class|null|gender|
|   1|    Emily Hardie| Four|  75|female|
|   2|       John Star|Three|  85|  male|
|   3|  Arnold Walker |Three|  55|  male|
|   4|      Reana Talu| Four|  60|female|
|   5| Sidona Williams| Four|  60|female|
|   6|       Alex John| Four|  55|  male|
|   7|    Robert John |Three|  78|  male|
|   8|       Lee Malva| Four|  85|  male|
|   9|    Wookie Davey|  Two|  78|  male|
|  10|      Diane Rose|  Two|  55|female|
|  11|    Holly Daives|  Two|  89|female|
|  12|        Eva Cup |Three|  94|female|
|  13| Victoria Mathew| Four|  88|female|
|  14|       Iris Zhao|  Two|  88|female|
|  15|       Scott Row| Four|  88|  male|
|  16|     Da

5)	Give schema as option while reading from CSV

In [212]:
df = spark.read.schema(schema).csv('students.csv')
df.show()

+----+----------------+-----+----+------+
|  id|            name|class|mark|gender|
+----+----------------+-----+----+------+
|null|            name|class|null|gender|
|   1|    Emily Hardie| Four|  75|female|
|   2|       John Star|Three|  85|  male|
|   3|  Arnold Walker |Three|  55|  male|
|   4|      Reana Talu| Four|  60|female|
|   5| Sidona Williams| Four|  60|female|
|   6|       Alex John| Four|  55|  male|
|   7|    Robert John |Three|  78|  male|
|   8|       Lee Malva| Four|  85|  male|
|   9|    Wookie Davey|  Two|  78|  male|
|  10|      Diane Rose|  Two|  55|female|
|  11|    Holly Daives|  Two|  89|female|
|  12|        Eva Cup |Three|  94|female|
|  13| Victoria Mathew| Four|  88|female|
|  14|       Iris Zhao|  Two|  88|female|
|  15|       Scott Row| Four|  88|  male|
|  16|     Daniel Page| Four|  88|  male|
|  17|  James Williams|Three|  54|  male|
|  18|Martin Johnston | Four|  75|  male|
|  19|     John Smith | Four|  48|  male|
+----+----------------+-----+----+

update schema without reading the files again

In [213]:
df.withColumn('MARK',col('mark').cast(IntegerType())).printSchema

<bound method DataFrame.printSchema of DataFrame[id: int, name: string, class: string, MARK: int, gender: string]>

6)	Show columns and show summary statistics of numeric columns 

In [214]:
df.columns

['id', 'name', 'class', 'mark', 'gender']

In [215]:
df.describe().show()  # SUMMARY OF pySpark

+-------+------------------+---------+-----+-----------------+------+
|summary|                id|     name|class|             mark|gender|
+-------+------------------+---------+-----+-----------------+------+
|  count|                35|       36|   36|               35|    36|
|   mean|              18.0|     null| null|75.51428571428572|  null|
| stddev|10.246950765959598|     null| null|13.95448784772974|  null|
|    min|                 1|Alex John| Four|               48|female|
|    max|                35|     name|class|               96|  male|
+-------+------------------+---------+-----+-----------------+------+



In [216]:
# With quartiles
df.summary().show()

+-------+------------------+---------+-----+-----------------+------+
|summary|                id|     name|class|             mark|gender|
+-------+------------------+---------+-----+-----------------+------+
|  count|                35|       36|   36|               35|    36|
|   mean|              18.0|     null| null|75.51428571428572|  null|
| stddev|10.246950765959598|     null| null|13.95448784772974|  null|
|    min|                 1|Alex John| Four|               48|female|
|    25%|                 9|     null| null|               60|  null|
|    50%|                18|     null| null|               79|  null|
|    75%|                27|     null| null|               88|  null|
|    max|                35|     name|class|               96|  male|
+-------+------------------+---------+-----+-----------------+------+



7)	Read from JSON

In [217]:
js= json_spark_df = spark.read.json('people.json')
js.show()


+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  36|   Alan|
|  19| Justin|
+----+-------+



8)	Subset the Dataframe for one and then more than one columns. 

In [218]:
id = df.select('id')
id.show()

+----+
|  id|
+----+
|null|
|   1|
|   2|
|   3|
|   4|
|   5|
|   6|
|   7|
|   8|
|   9|
|  10|
|  11|
|  12|
|  13|
|  14|
|  15|
|  16|
|  17|
|  18|
|  19|
+----+
only showing top 20 rows



In [219]:
id_name = df.select('id','mark')
id_name.show()

+----+----+
|  id|mark|
+----+----+
|null|null|
|   1|  75|
|   2|  85|
|   3|  55|
|   4|  60|
|   5|  60|
|   6|  55|
|   7|  78|
|   8|  85|
|   9|  78|
|  10|  55|
|  11|  89|
|  12|  94|
|  13|  88|
|  14|  88|
|  15|  88|
|  16|  88|
|  17|  54|
|  18|  75|
|  19|  48|
+----+----+
only showing top 20 rows



9)	Filter Data frame based on condition. 
- Filter according to gender 
- Filter according to mark >50
- Filter by multiple conditions


In [220]:
# Can use [] brackets to run the query
df.filter(df['gender'] =='male').show()

+---+----------------+-----+----+------+
| id|            name|class|mark|gender|
+---+----------------+-----+----+------+
|  2|       John Star|Three|  85|  male|
|  3|  Arnold Walker |Three|  55|  male|
|  6|       Alex John| Four|  55|  male|
|  7|    Robert John |Three|  78|  male|
|  8|       Lee Malva| Four|  85|  male|
|  9|    Wookie Davey|  Two|  78|  male|
| 15|       Scott Row| Four|  88|  male|
| 16|     Daniel Page| Four|  88|  male|
| 17|  James Williams|Three|  54|  male|
| 18|Martin Johnston | Four|  75|  male|
| 19|     John Smith | Four|  48|  male|
| 23|       Sam Adan |Three|  79|  male|
| 24|   Nova Prescott|  Two|  78|  male|
| 25|  William Taylor| Four|  88|  male|
| 26|   Laurin Wilson|Three|  79|  male|
| 29|         Ben Day| Four|  55|  male|
| 31|      Chris Ball| Four|  88|  male|
| 34|   Garry Richard|Three|  69|  male|
+---+----------------+-----+----+------+



In [221]:
df.filter(df.gender =='male').show()

+---+----------------+-----+----+------+
| id|            name|class|mark|gender|
+---+----------------+-----+----+------+
|  2|       John Star|Three|  85|  male|
|  3|  Arnold Walker |Three|  55|  male|
|  6|       Alex John| Four|  55|  male|
|  7|    Robert John |Three|  78|  male|
|  8|       Lee Malva| Four|  85|  male|
|  9|    Wookie Davey|  Two|  78|  male|
| 15|       Scott Row| Four|  88|  male|
| 16|     Daniel Page| Four|  88|  male|
| 17|  James Williams|Three|  54|  male|
| 18|Martin Johnston | Four|  75|  male|
| 19|     John Smith | Four|  48|  male|
| 23|       Sam Adan |Three|  79|  male|
| 24|   Nova Prescott|  Two|  78|  male|
| 25|  William Taylor| Four|  88|  male|
| 26|   Laurin Wilson|Three|  79|  male|
| 29|         Ben Day| Four|  55|  male|
| 31|      Chris Ball| Four|  88|  male|
| 34|   Garry Richard|Three|  69|  male|
+---+----------------+-----+----+------+



In [222]:
df.filter(df.mark>50).show()

+---+----------------+-----+----+------+
| id|            name|class|mark|gender|
+---+----------------+-----+----+------+
|  1|    Emily Hardie| Four|  75|female|
|  2|       John Star|Three|  85|  male|
|  3|  Arnold Walker |Three|  55|  male|
|  4|      Reana Talu| Four|  60|female|
|  5| Sidona Williams| Four|  60|female|
|  6|       Alex John| Four|  55|  male|
|  7|    Robert John |Three|  78|  male|
|  8|       Lee Malva| Four|  85|  male|
|  9|    Wookie Davey|  Two|  78|  male|
| 10|      Diane Rose|  Two|  55|female|
| 11|    Holly Daives|  Two|  89|female|
| 12|        Eva Cup |Three|  94|female|
| 13| Victoria Mathew| Four|  88|female|
| 14|       Iris Zhao|  Two|  88|female|
| 15|       Scott Row| Four|  88|  male|
| 16|     Daniel Page| Four|  88|  male|
| 17|  James Williams|Three|  54|  male|
| 18|Martin Johnston | Four|  75|  male|
| 20|       Lenny Lee|Three|  65|female|
| 21|    Ava Williams|  Two|  69|female|
+---+----------------+-----+----+------+
only showing top

In [223]:
df.filter((df.gender == 'male') & (df.mark > 50)).show()

+---+----------------+-----+----+------+
| id|            name|class|mark|gender|
+---+----------------+-----+----+------+
|  2|       John Star|Three|  85|  male|
|  3|  Arnold Walker |Three|  55|  male|
|  6|       Alex John| Four|  55|  male|
|  7|    Robert John |Three|  78|  male|
|  8|       Lee Malva| Four|  85|  male|
|  9|    Wookie Davey|  Two|  78|  male|
| 15|       Scott Row| Four|  88|  male|
| 16|     Daniel Page| Four|  88|  male|
| 17|  James Williams|Three|  54|  male|
| 18|Martin Johnston | Four|  75|  male|
| 23|       Sam Adan |Three|  79|  male|
| 24|   Nova Prescott|  Two|  78|  male|
| 25|  William Taylor| Four|  88|  male|
| 26|   Laurin Wilson|Three|  79|  male|
| 29|         Ben Day| Four|  55|  male|
| 31|      Chris Ball| Four|  88|  male|
| 34|   Garry Richard|Three|  69|  male|
+---+----------------+-----+----+------+



10)	Add new column
- New column name: corrected mark 
- It has values mark+3

In [224]:
sparkdf.withColumn('correct_mark',(sparkdf.mark +3)).show()

+---+----------------+-----+----+------+------------+
| id|            name|class|mark|gender|correct_mark|
+---+----------------+-----+----+------+------------+
|  1|    Emily Hardie| Four|  75|female|        78.0|
|  2|       John Star|Three|  85|  male|        88.0|
|  3|  Arnold Walker |Three|  55|  male|        58.0|
|  4|      Reana Talu| Four|  60|female|        63.0|
|  5| Sidona Williams| Four|  60|female|        63.0|
|  6|       Alex John| Four|  55|  male|        58.0|
|  7|    Robert John |Three|  78|  male|        81.0|
|  8|       Lee Malva| Four|  85|  male|        88.0|
|  9|    Wookie Davey|  Two|  78|  male|        81.0|
| 10|      Diane Rose|  Two|  55|female|        58.0|
| 11|    Holly Daives|  Two|  89|female|        92.0|
| 12|        Eva Cup |Three|  94|female|        97.0|
| 13| Victoria Mathew| Four|  88|female|        91.0|
| 14|       Iris Zhao|  Two|  88|female|        91.0|
| 15|       Scott Row| Four|  88|  male|        91.0|
| 16|     Daniel Page| Four|

In [225]:
df.withColumn('correct_mark',(df.mark +3)).show()

+----+----------------+-----+----+------+------------+
|  id|            name|class|mark|gender|correct_mark|
+----+----------------+-----+----+------+------------+
|null|            name|class|null|gender|        null|
|   1|    Emily Hardie| Four|  75|female|          78|
|   2|       John Star|Three|  85|  male|          88|
|   3|  Arnold Walker |Three|  55|  male|          58|
|   4|      Reana Talu| Four|  60|female|          63|
|   5| Sidona Williams| Four|  60|female|          63|
|   6|       Alex John| Four|  55|  male|          58|
|   7|    Robert John |Three|  78|  male|          81|
|   8|       Lee Malva| Four|  85|  male|          88|
|   9|    Wookie Davey|  Two|  78|  male|          81|
|  10|      Diane Rose|  Two|  55|female|          58|
|  11|    Holly Daives|  Two|  89|female|          92|
|  12|        Eva Cup |Three|  94|female|          97|
|  13| Victoria Mathew| Four|  88|female|          91|
|  14|       Iris Zhao|  Two|  88|female|          91|
|  15|    

11)	Groupby gender
- Calculate the average mark for each gender 
- Max 
- Min 


In [226]:
df.groupby('gender').avg('mark').show()

+------+-----------------+
|gender|        avg(mark)|
+------+-----------------+
|female|77.52941176470588|
|  male|73.61111111111111|
|gender|             null|
+------+-----------------+



In [227]:
df.groupby('gender').min('mark').show()

+------+---------+
|gender|min(mark)|
+------+---------+
|female|       55|
|  male|       48|
|gender|     null|
+------+---------+



In [228]:
df.groupby('gender').max('mark').show()

+------+---------+
|gender|max(mark)|
+------+---------+
|female|       96|
|  male|       88|
|gender|     null|
+------+---------+



In [229]:
df.groupBy('gender').agg((round(avg('mark'),2).alias('Avg')),
max('mark').alias('Max'),
min('mark').alias('Min')).show()

+------+-----+----+----+
|gender|  Avg| Max| Min|
+------+-----+----+----+
|female|77.53|  96|  55|
|  male|73.61|  88|  48|
|gender| null|null|null|
+------+-----+----+----+



12)	 Aggregation:
- Calculate the average mark of all students. 


In [230]:
df.groupby().avg('mark').show()

+-----------------+
|        avg(mark)|
+-----------------+
|75.51428571428572|
+-----------------+



13)	Order by 
- “class”
- “mark”
- “mark” and descending order.  

In [231]:
df.orderBy('class').show()

+---+----------------+-----+----+------+
| id|            name|class|mark|gender|
+---+----------------+-----+----+------+
|  5| Sidona Williams| Four|  60|female|
| 13| Victoria Mathew| Four|  88|female|
| 15|       Scott Row| Four|  88|  male|
|  6|       Alex John| Four|  55|  male|
|  1|    Emily Hardie| Four|  75|female|
| 25|  William Taylor| Four|  88|  male|
| 28|  Emily Thompson| Four|  86|female|
| 29|         Ben Day| Four|  55|  male|
| 31|      Chris Ball| Four|  88|  male|
| 32|        Ela Love| Four|  90|female|
| 33|   Elisa Richard| Four|  96|female|
| 16|     Daniel Page| Four|  88|  male|
| 18|Martin Johnston | Four|  75|  male|
|  4|      Reana Talu| Four|  60|female|
| 19|     John Smith | Four|  48|  male|
|  8|       Lee Malva| Four|  85|  male|
|  2|       John Star|Three|  85|  male|
|  3|  Arnold Walker |Three|  55|  male|
|  7|    Robert John |Three|  78|  male|
| 12|        Eva Cup |Three|  94|female|
+---+----------------+-----+----+------+
only showing top

In [232]:
df.orderBy('mark').show()

+----+----------------+-----+----+------+
|  id|            name|class|mark|gender|
+----+----------------+-----+----+------+
|null|            name|class|null|gender|
|  19|     John Smith | Four|  48|  male|
|  17|  James Williams|Three|  54|  male|
|   3|  Arnold Walker |Three|  55|  male|
|  10|      Diane Rose|  Two|  55|female|
|  22|    Katie Connon|  Two|  55|female|
|   6|       Alex John| Four|  55|  male|
|  29|         Ben Day| Four|  55|  male|
|   4|      Reana Talu| Four|  60|female|
|   5| Sidona Williams| Four|  60|female|
|  20|       Lenny Lee|Three|  65|female|
|  21|    Ava Williams|  Two|  69|female|
|  34|   Garry Richard|Three|  69|  male|
|  18|Martin Johnston | Four|  75|  male|
|   1|    Emily Hardie| Four|  75|female|
|   9|    Wookie Davey|  Two|  78|  male|
|  24|   Nova Prescott|  Two|  78|  male|
|   7|    Robert John |Three|  78|  male|
|  23|       Sam Adan |Three|  79|  male|
|  26|   Laurin Wilson|Three|  79|  male|
+----+----------------+-----+----+

In [233]:
# descending MARK 1st method
df.orderBy(desc('mark')).show()

+---+---------------+-----+----+------+
| id|           name|class|mark|gender|
+---+---------------+-----+----+------+
| 33|  Elisa Richard| Four|  96|female|
| 12|       Eva Cup |Three|  94|female|
| 32|       Ela Love| Four|  90|female|
| 11|   Holly Daives|  Two|  89|female|
| 13|Victoria Mathew| Four|  88|female|
| 16|    Daniel Page| Four|  88|  male|
| 35|     Ria Wright|  Two|  88|female|
| 14|      Iris Zhao|  Two|  88|female|
| 15|      Scott Row| Four|  88|  male|
| 25| William Taylor| Four|  88|  male|
| 31|     Chris Ball| Four|  88|  male|
| 28| Emily Thompson| Four|  86|female|
|  2|      John Star|Three|  85|  male|
|  8|      Lee Malva| Four|  85|  male|
| 27|Fatemah Abraham|Three|  81|female|
| 26|  Laurin Wilson|Three|  79|  male|
| 30|    Rabiya Khan|Three|  79|female|
| 23|      Sam Adan |Three|  79|  male|
|  7|   Robert John |Three|  78|  male|
|  9|   Wookie Davey|  Two|  78|  male|
+---+---------------+-----+----+------+
only showing top 20 rows



In [234]:
# descending MARK 2nd method
df.orderBy('mark', ascending=False).show()

+---+---------------+-----+----+------+
| id|           name|class|mark|gender|
+---+---------------+-----+----+------+
| 33|  Elisa Richard| Four|  96|female|
| 12|       Eva Cup |Three|  94|female|
| 32|       Ela Love| Four|  90|female|
| 11|   Holly Daives|  Two|  89|female|
| 13|Victoria Mathew| Four|  88|female|
| 16|    Daniel Page| Four|  88|  male|
| 35|     Ria Wright|  Two|  88|female|
| 14|      Iris Zhao|  Two|  88|female|
| 15|      Scott Row| Four|  88|  male|
| 25| William Taylor| Four|  88|  male|
| 31|     Chris Ball| Four|  88|  male|
| 28| Emily Thompson| Four|  86|female|
|  2|      John Star|Three|  85|  male|
|  8|      Lee Malva| Four|  85|  male|
| 27|Fatemah Abraham|Three|  81|female|
| 26|  Laurin Wilson|Three|  79|  male|
| 30|    Rabiya Khan|Three|  79|female|
| 23|      Sam Adan |Three|  79|  male|
|  7|   Robert John |Three|  78|  male|
|  9|   Wookie Davey|  Two|  78|  male|
+---+---------------+-----+----+------+
only showing top 20 rows



14)	Access to specific row( hint: collect() method)  and then convert it to dictionary. 

In [235]:
sparkdf.collect()

[Row(id='1', name='Emily Hardie', class='Four', mark='75', gender='female'),
 Row(id='2', name='John Star', class='Three', mark='85', gender='male'),
 Row(id='3', name='Arnold Walker ', class='Three', mark='55', gender='male'),
 Row(id='4', name='Reana Talu', class='Four', mark='60', gender='female'),
 Row(id='5', name='Sidona Williams', class='Four', mark='60', gender='female'),
 Row(id='6', name='Alex John', class='Four', mark='55', gender='male'),
 Row(id='7', name='Robert John ', class='Three', mark='78', gender='male'),
 Row(id='8', name='Lee Malva', class='Four', mark='85', gender='male'),
 Row(id='9', name='Wookie Davey', class='Two', mark='78', gender='male'),
 Row(id='10', name='Diane Rose', class='Two', mark='55', gender='female'),
 Row(id='11', name='Holly Daives', class='Two', mark='89', gender='female'),
 Row(id='12', name='Eva Cup ', class='Three', mark='94', gender='female'),
 Row(id='13', name='Victoria Mathew', class='Four', mark='88', gender='female'),
 Row(id='14', n

In [236]:
# The use of display(), show does not work
rows = sparkdf.collect()
for row in rows:
    display(row.asDict())

{'id': '1',
 'name': 'Emily Hardie',
 'class': 'Four',
 'mark': '75',
 'gender': 'female'}

{'id': '2',
 'name': 'John Star',
 'class': 'Three',
 'mark': '85',
 'gender': 'male'}

{'id': '3',
 'name': 'Arnold Walker ',
 'class': 'Three',
 'mark': '55',
 'gender': 'male'}

{'id': '4',
 'name': 'Reana Talu',
 'class': 'Four',
 'mark': '60',
 'gender': 'female'}

{'id': '5',
 'name': 'Sidona Williams',
 'class': 'Four',
 'mark': '60',
 'gender': 'female'}

{'id': '6',
 'name': 'Alex John',
 'class': 'Four',
 'mark': '55',
 'gender': 'male'}

{'id': '7',
 'name': 'Robert John ',
 'class': 'Three',
 'mark': '78',
 'gender': 'male'}

{'id': '8',
 'name': 'Lee Malva',
 'class': 'Four',
 'mark': '85',
 'gender': 'male'}

{'id': '9',
 'name': 'Wookie Davey',
 'class': 'Two',
 'mark': '78',
 'gender': 'male'}

{'id': '10',
 'name': 'Diane Rose',
 'class': 'Two',
 'mark': '55',
 'gender': 'female'}

{'id': '11',
 'name': 'Holly Daives',
 'class': 'Two',
 'mark': '89',
 'gender': 'female'}

{'id': '12',
 'name': 'Eva Cup ',
 'class': 'Three',
 'mark': '94',
 'gender': 'female'}

{'id': '13',
 'name': 'Victoria Mathew',
 'class': 'Four',
 'mark': '88',
 'gender': 'female'}

{'id': '14',
 'name': 'Iris Zhao',
 'class': 'Two',
 'mark': '88',
 'gender': 'female'}

{'id': '15',
 'name': 'Scott Row',
 'class': 'Four',
 'mark': '88',
 'gender': 'male'}

{'id': '16',
 'name': 'Daniel Page',
 'class': 'Four',
 'mark': '88',
 'gender': 'male'}

{'id': '17',
 'name': 'James Williams',
 'class': 'Three',
 'mark': '54',
 'gender': 'male'}

{'id': '18',
 'name': 'Martin Johnston ',
 'class': 'Four',
 'mark': '75',
 'gender': 'male'}

{'id': '19',
 'name': 'John Smith ',
 'class': 'Four',
 'mark': '48',
 'gender': 'male'}

{'id': '20',
 'name': 'Lenny Lee',
 'class': 'Three',
 'mark': '65',
 'gender': 'female'}

{'id': '21',
 'name': 'Ava Williams',
 'class': 'Two',
 'mark': '69',
 'gender': 'female'}

{'id': '22',
 'name': 'Katie Connon',
 'class': 'Two',
 'mark': '55',
 'gender': 'female'}

{'id': '23',
 'name': 'Sam Adan ',
 'class': 'Three',
 'mark': '79',
 'gender': 'male'}

{'id': '24',
 'name': 'Nova Prescott',
 'class': 'Two',
 'mark': '78',
 'gender': 'male'}

{'id': '25',
 'name': 'William Taylor',
 'class': 'Four',
 'mark': '88',
 'gender': 'male'}

{'id': '26',
 'name': 'Laurin Wilson',
 'class': 'Three',
 'mark': '79',
 'gender': 'male'}

{'id': '27',
 'name': 'Fatemah Abraham',
 'class': 'Three',
 'mark': '81',
 'gender': 'female'}

{'id': '28',
 'name': 'Emily Thompson',
 'class': 'Four',
 'mark': '86',
 'gender': 'female'}

{'id': '29',
 'name': 'Ben Day',
 'class': 'Four',
 'mark': '55',
 'gender': 'male'}

{'id': '30',
 'name': 'Rabiya Khan',
 'class': 'Three',
 'mark': '79',
 'gender': 'female'}

{'id': '31',
 'name': 'Chris Ball',
 'class': 'Four',
 'mark': '88',
 'gender': 'male'}

{'id': '32',
 'name': 'Ela Love',
 'class': 'Four',
 'mark': '90',
 'gender': 'female'}

{'id': '33',
 'name': 'Elisa Richard',
 'class': 'Four',
 'mark': '96',
 'gender': 'female'}

{'id': '34',
 'name': 'Garry Richard',
 'class': 'Three',
 'mark': '69',
 'gender': 'male'}

{'id': '35',
 'name': 'Ria Wright',
 'class': 'Two',
 'mark': '88',
 'gender': 'female'}

In [237]:
# Filtering and converting into a dict
rows = df.filter(df.mark >90).collect()
for row in rows:
    display(row.asDict())

{'id': 12,
 'name': 'Eva Cup ',
 'class': 'Three',
 'mark': 94,
 'gender': 'female'}

{'id': 33,
 'name': 'Elisa Richard',
 'class': 'Four',
 'mark': 96,
 'gender': 'female'}

15. Create a view from dataframe and filter it using SQL syntax

In [238]:
# Create a temporary view of the DataFrame
sparkdf.createOrReplaceTempView("students")

# Filter the temporary view using SQL syntax
filtered_df = spark.sql("SELECT * FROM students WHERE gender = 'female' AND mark > 50")

# Show the filtered DataFrame
filtered_df.show()

+---+---------------+-----+----+------+
| id|           name|class|mark|gender|
+---+---------------+-----+----+------+
|  1|   Emily Hardie| Four|  75|female|
|  4|     Reana Talu| Four|  60|female|
|  5|Sidona Williams| Four|  60|female|
| 10|     Diane Rose|  Two|  55|female|
| 11|   Holly Daives|  Two|  89|female|
| 12|       Eva Cup |Three|  94|female|
| 13|Victoria Mathew| Four|  88|female|
| 14|      Iris Zhao|  Two|  88|female|
| 20|      Lenny Lee|Three|  65|female|
| 21|   Ava Williams|  Two|  69|female|
| 22|   Katie Connon|  Two|  55|female|
| 27|Fatemah Abraham|Three|  81|female|
| 28| Emily Thompson| Four|  86|female|
| 30|    Rabiya Khan|Three|  79|female|
| 32|       Ela Love| Four|  90|female|
| 33|  Elisa Richard| Four|  96|female|
| 35|     Ria Wright|  Two|  88|female|
+---+---------------+-----+----+------+



In [239]:
# select name and mark coloumn
spark.sql("SELECT name, mark FROM students WHERE gender = 'female' AND mark > 50").show()


+---------------+----+
|           name|mark|
+---------------+----+
|   Emily Hardie|  75|
|     Reana Talu|  60|
|Sidona Williams|  60|
|     Diane Rose|  55|
|   Holly Daives|  89|
|       Eva Cup |  94|
|Victoria Mathew|  88|
|      Iris Zhao|  88|
|      Lenny Lee|  65|
|   Ava Williams|  69|
|   Katie Connon|  55|
|Fatemah Abraham|  81|
| Emily Thompson|  86|
|    Rabiya Khan|  79|
|       Ela Love|  90|
|  Elisa Richard|  96|
|     Ria Wright|  88|
+---------------+----+

