In [0]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Create the Spark session used by every cell below; getOrCreate() reuses
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Data Frames")
    .getOrCreate()
)

In [0]:
# Read the student data in: the first line of the CSV is a header row,
# column types are inferred from the data, and fields are comma-separated.
df = (
    spark.read
    .options(header="true", inferSchema="true", delimiter=",")
    .csv('/FileStore/tables/StudentData.csv')
)

In [0]:
# Print the schema (column names and inferred types), then preview the
# first 20 rows of the data.
df.printSchema()
df.show(20)

root
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- name: string (nullable = true)
 |-- course: string (nullable = true)
 |-- roll: integer (nullable = true)
 |-- marks: integer (nullable = true)
 |-- email: string (nullable = true)

+---+------+----------------+------+------+-----+--------------------+
|age|gender|            name|course|  roll|marks|               email|
+---+------+----------------+------+------+-----+--------------------+
| 28|Female| Hubert Oliveras|    DB|  2984|   59|Annika Hoffman_Na...|
| 29|Female|Toshiko Hillyard| Cloud| 12899|   62|Margene Moores_Ma...|
| 28|  Male|  Celeste Lollis|    PF| 21267|   45|Jeannetta Golden_...|
| 29|Female|    Elenore Choy|    DB| 32877|   29|Billi Clore_Mitzi...|
| 28|  Male|  Sheryll Towler|   DSA| 41487|   41|Claude Panos_Judi...|
| 28|  Male|  Margene Moores|   MVC| 52771|   32|Toshiko Hillyard_...|
| 28|  Male|     Neda Briski|   OOP| 61973|   69|Alberta Freund_El...|
| 28|Female|    Claude P

In [0]:
from pyspark.sql.functions import lit,  col

# Add a "total marks" column holding the same literal value (120) for
# every student, then preview the result.
total_marks = lit(120)
df = df.withColumn("total marks", total_marks)
df.show()

+---+------+----------------+------+------+-----+--------------------+-----------+
|age|gender|            name|course|  roll|marks|               email|total marks|
+---+------+----------------+------+------+-----+--------------------+-----------+
| 28|Female| Hubert Oliveras|    DB|  2984|   59|Annika Hoffman_Na...|        120|
| 29|Female|Toshiko Hillyard| Cloud| 12899|   62|Margene Moores_Ma...|        120|
| 28|  Male|  Celeste Lollis|    PF| 21267|   45|Jeannetta Golden_...|        120|
| 29|Female|    Elenore Choy|    DB| 32877|   29|Billi Clore_Mitzi...|        120|
| 28|  Male|  Sheryll Towler|   DSA| 41487|   41|Claude Panos_Judi...|        120|
| 28|  Male|  Margene Moores|   MVC| 52771|   32|Toshiko Hillyard_...|        120|
| 28|  Male|     Neda Briski|   OOP| 61973|   69|Alberta Freund_El...|        120|
| 28|Female|    Claude Panos| Cloud| 72409|   85|Sheryll Towler_Al...|        120|
| 28|  Male|  Celeste Lollis|   MVC| 81492|   64|Nicole Harwood_Cl...|        120|
| 29

In [0]:
# Add a new "average" column: each student's marks expressed as a
# percentage of the total marks.
percentage = col("marks") / col("total marks") * 100
df = df.withColumn("average", percentage)
df.show()

+---+------+----------------+------+------+-----+--------------------+-----------+------------------+
|age|gender|            name|course|  roll|marks|               email|total marks|           average|
+---+------+----------------+------+------+-----+--------------------+-----------+------------------+
| 28|Female| Hubert Oliveras|    DB|  2984|   59|Annika Hoffman_Na...|        120|49.166666666666664|
| 29|Female|Toshiko Hillyard| Cloud| 12899|   62|Margene Moores_Ma...|        120| 51.66666666666667|
| 28|  Male|  Celeste Lollis|    PF| 21267|   45|Jeannetta Golden_...|        120|              37.5|
| 29|Female|    Elenore Choy|    DB| 32877|   29|Billi Clore_Mitzi...|        120|24.166666666666668|
| 28|  Male|  Sheryll Towler|   DSA| 41487|   41|Claude Panos_Judi...|        120|34.166666666666664|
| 28|  Male|  Margene Moores|   MVC| 52771|   32|Toshiko Hillyard_...|        120|26.666666666666668|
| 28|  Male|     Neda Briski|   OOP| 61973|   69|Alberta Freund_El...|        120|

In [0]:
# Filter students on the OOP course who have achieved an average above 80%.
is_oop = df["course"] == "OOP"
is_above_80 = df["average"] > 80
df_above = df.filter(is_oop & is_above_80)
df_above.show()

+---+------+------------------+------+-------+-----+--------------------+-----------+-----------------+
|age|gender|              name|course|   roll|marks|               email|total marks|          average|
+---+------+------------------+------+-------+-----+--------------------+-----------+-----------------+
| 28|  Male|    Jenna Montague|   OOP|3331161|   98|Leontine Phillips...|        120|81.66666666666667|
| 29|Female|Priscila Tavernier|   OOP|3902993|   99|Celeste Lollis_Bi...|        120|             82.5|
| 28|Female|      Judie Chipps|   OOP|5451977|   99|Tamera Blakley_Mi...|        120|             82.5|
| 29|  Male|    Margene Moores|   OOP|5621072|   97|Sheryll Towler_Ma...|        120|80.83333333333333|
| 29|  Male|      Jc Andrepont|   OOP|8022618|   97|Cordie Harnois_Ja...|        120|80.83333333333333|
| 28|  Male|    Loris Crossett|   OOP|8172914|   98|Paris Hutton_Pari...|        120|81.66666666666667|
| 28|  Male|    Loris Crossett|   OOP|9692316|   99|Judie Chipps

In [0]:
# Print the name and marks of students (only these two columns are kept).
name_and_marks = df.select("name", "marks")
name_and_marks.show()

+----------------+-----+
|            name|marks|
+----------------+-----+
| Hubert Oliveras|   59|
|Toshiko Hillyard|   62|
|  Celeste Lollis|   45|
|    Elenore Choy|   29|
|  Sheryll Towler|   41|
|  Margene Moores|   32|
|     Neda Briski|   69|
|    Claude Panos|   85|
|  Celeste Lollis|   64|
|  Cordie Harnois|   51|
|       Kena Wild|   35|
| Ernest Rossbach|   53|
|  Latia Vanhoose|   27|
|  Latia Vanhoose|   55|
|     Neda Briski|   42|
|  Latia Vanhoose|   27|
|  Loris Crossett|   36|
|  Annika Hoffman|   22|
|   Santa Kerfien|   56|
|Mickey Cortright|   62|
+----------------+-----+
only showing top 20 rows

