In [10]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('practice_second').getOrCreate()

In [2]:
spark.version

'3.3.2'

# Missing Data

In [3]:
df = spark.read.option('header', True).csv('students.csv')
df.show()

                                                                                

+-------+-------+------+----+
|roll_no|   name|gender| age|
+-------+-------+------+----+
|      1| Aditya|  Male|  23|
|      2| Ananya|Female|  19|
|      3|  Arjun|  null|  21|
|      4|Amrutha|Female|  18|
|      5|   null|Female|  24|
|      6|   Ajay|  Male|  20|
|      7|   null|  null|null|
|   null|   null|  null|  36|
+-------+-------+------+----+



In [4]:
# remove all rows which contain atleast one null value
df.na.drop().show()

+-------+-------+------+---+
|roll_no|   name|gender|age|
+-------+-------+------+---+
|      1| Aditya|  Male| 23|
|      2| Ananya|Female| 19|
|      4|Amrutha|Female| 18|
|      6|   Ajay|  Male| 20|
+-------+-------+------+---+



In [6]:
# remove all rows with 2 null values
df.na.drop(thresh=2).show()

+-------+-------+------+---+
|roll_no|   name|gender|age|
+-------+-------+------+---+
|      1| Aditya|  Male| 23|
|      2| Ananya|Female| 19|
|      3|  Arjun|  null| 21|
|      4|Amrutha|Female| 18|
|      5|   null|Female| 24|
|      6|   Ajay|  Male| 20|
+-------+-------+------+---+



In [7]:
# remove null values by column
df.na.drop(subset=['gender']).show()

+-------+-------+------+---+
|roll_no|   name|gender|age|
+-------+-------+------+---+
|      1| Aditya|  Male| 23|
|      2| Ananya|Female| 19|
|      4|Amrutha|Female| 18|
|      5|   null|Female| 24|
|      6|   Ajay|  Male| 20|
+-------+-------+------+---+



In [8]:
# fill null values with some value
df.na.fill('Fill Value').show()

+----------+----------+----------+----------+
|   roll_no|      name|    gender|       age|
+----------+----------+----------+----------+
|         1|    Aditya|      Male|        23|
|         2|    Ananya|    Female|        19|
|         3|     Arjun|Fill Value|        21|
|         4|   Amrutha|    Female|        18|
|         5|Fill Value|    Female|        24|
|         6|      Ajay|      Male|        20|
|         7|Fill Value|Fill Value|Fill Value|
|Fill Value|Fill Value|Fill Value|        36|
+----------+----------+----------+----------+



In [9]:
df.na.fill(0).show()

+-------+-------+------+----+
|roll_no|   name|gender| age|
+-------+-------+------+----+
|      1| Aditya|  Male|  23|
|      2| Ananya|Female|  19|
|      3|  Arjun|  null|  21|
|      4|Amrutha|Female|  18|
|      5|   null|Female|  24|
|      6|   Ajay|  Male|  20|
|      7|   null|  null|null|
|   null|   null|  null|  36|
+-------+-------+------+----+



# Dataframes Dates and Timestamps

In [12]:
from pyspark.sql import Row
from datetime import datetime, date
df = spark.createDataFrame([
    Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
    Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
    Row(a=4, b=5., c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0))
])
df.show()

[Stage 8:>                                                          (0 + 1) / 1]

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  4|5.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+



                                                                                

In [14]:
from pyspark.sql.functions import hour,month,year,dayofmonth,dayofweek,dayofyear
df.select(year(df.e)).show()

+-------+
|year(e)|
+-------+
|   2000|
|   2000|
|   2000|
+-------+



In [15]:
df.select(month(df.e)).show()

+--------+
|month(e)|
+--------+
|       1|
|       1|
|       1|
+--------+

