## Import Statements

In [0]:
from pyspark.sql.functions import col

## Create a dataframe

In [0]:
df = spark.read.format("csv").option("header", True).load("/FileStore/tables/store_sales/test.csv")

In [0]:
df.show(10)

+-------+----------+---------+------------+-----------+
|     id|      date|store_nbr|      family|onpromotion|
+-------+----------+---------+------------+-----------+
|3000888|2017-08-16|        1|  AUTOMOTIVE|          0|
|3000889|2017-08-16|        1|   BABY CARE|          0|
|3000890|2017-08-16|        1|      BEAUTY|          2|
|3000891|2017-08-16|        1|   BEVERAGES|         20|
|3000892|2017-08-16|        1|       BOOKS|          0|
|3000893|2017-08-16|        1|BREAD/BAKERY|         12|
|3000894|2017-08-16|        1| CELEBRATION|          0|
|3000895|2017-08-16|        1|    CLEANING|         25|
|3000896|2017-08-16|        1|       DAIRY|         45|
|3000897|2017-08-16|        1|        DELI|         18|
+-------+----------+---------+------------+-----------+
only showing top 10 rows



In [0]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- store_nbr: string (nullable = true)
 |-- family: string (nullable = true)
 |-- onpromotion: string (nullable = true)



## Dropping a single column
Important thing to notice here is, when a column name is not present in the dataframe then the functions ignore it. So no action is taken in such case.

In [0]:
df.drop("store_nbr").printSchema()

root
 |-- id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- family: string (nullable = true)
 |-- onpromotion: string (nullable = true)



In [0]:
df.drop(col("store_nbr")).printSchema()

root
 |-- id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- family: string (nullable = true)
 |-- onpromotion: string (nullable = true)



In [0]:
df.drop(df.store_nbr).printSchema()

root
 |-- id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- family: string (nullable = true)
 |-- onpromotion: string (nullable = true)



In [0]:
df.drop(df["store_nbr"]).printSchema()

root
 |-- id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- family: string (nullable = true)
 |-- onpromotion: string (nullable = true)



In [0]:
# if column name is not present then the schema remains the same.                  
df.drop(col("my_col")).printSchema()

root
 |-- id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- store_nbr: string (nullable = true)
 |-- family: string (nullable = true)
 |-- onpromotion: string (nullable = true)



## Dropping multiple columns

In [0]:
df.show(5)

+-------+----------+---------+----------+-----------+
|     id|      date|store_nbr|    family|onpromotion|
+-------+----------+---------+----------+-----------+
|3000888|2017-08-16|        1|AUTOMOTIVE|          0|
|3000889|2017-08-16|        1| BABY CARE|          0|
|3000890|2017-08-16|        1|    BEAUTY|          2|
|3000891|2017-08-16|        1| BEVERAGES|         20|
|3000892|2017-08-16|        1|     BOOKS|          0|
+-------+----------+---------+----------+-----------+
only showing top 5 rows



In [0]:
df.drop("date", "store_nbr").printSchema()

root
 |-- id: string (nullable = true)
 |-- family: string (nullable = true)
 |-- onpromotion: string (nullable = true)



In [0]:
# Column object does not work with multiple column deletion
df.drop(col("date"), col("store_num")).printSchema()

[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
[0;32m<command-1501412273574855>[0m in [0;36m<cell line: 2>[0;34m()[0m
[1;32m      1[0m [0;31m# Column object does not work with multiple column deletion[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 2[0;31m [0mdf[0m[0;34m.[0m[0mdrop[0m[0;34m([0m[0mcol[0m[0;34m([0m[0;34m"date"[0m[0;34m)[0m[0;34m,[0m [0mcol[0m[0;34m([0m[0;34m"store_num"[0m[0;34m)[0m[0;34m)[0m[0;34m.[0m[0mprintSchema[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;32m/databricks/spark/python/pyspark/instrumentation_utils.py[0m in [0;36mwrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m             [0mstart[0m [0;34m=[0m [0mtime[0m[0;34m.[0m[0mperf_counter[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m     47[0m             [0;32mtry[0m[0;34m:[0m[0;34m[

In [0]:
# Similar to single column, if column does not exist then no action is taken. 
df.drop("date", "store_nbr", "something").printSchema()

root
 |-- id: string (nullable = true)
 |-- family: string (nullable = true)
 |-- onpromotion: string (nullable = true)



## Dropping multiple column using *args

In [0]:
columns_to_drop = ["date", "store_num"]

In [0]:
df.drop(*columns_to_drop).printSchema()

root
 |-- id: string (nullable = true)
 |-- store_nbr: string (nullable = true)
 |-- family: string (nullable = true)
 |-- onpromotion: string (nullable = true)



## Dropping Duplicates

### Creating a dataframe with duplicate values

In [0]:
df1 = spark.createDataFrame(
    [
        (1, "val1"),  
        (2, "val2"),
        (3, "val3"),
        (4, "val4"),
        (2, "val2"),
        (5, "val5"),
        (6, "val6"),
        (6, "val7"),
    ],
    ["id", "value"] 
)

In [0]:
df1.show()

+---+-----+
| id|value|
+---+-----+
|  1| val1|
|  2| val2|
|  3| val3|
|  4| val4|
|  2| val2|
|  5| val5|
|  6| val6|
|  6| val7|
+---+-----+



### Finding unique records

In [0]:
help(df1.drop_duplicates)

Help on method dropDuplicates in module pyspark.sql.dataframe:

dropDuplicates(subset=None) method of pyspark.sql.dataframe.DataFrame instance
    :func:`drop_duplicates` is an alias for :func:`dropDuplicates`.
    
    .. versionadded:: 1.4



In [0]:
df1.distinct().count()

Out[25]: 7

In [0]:
# Duplicated removed
df1.drop_duplicates().show()

+---+-----+
| id|value|
+---+-----+
|  1| val1|
|  2| val2|
|  3| val3|
|  4| val4|
|  5| val5|
|  6| val6|
|  6| val7|
+---+-----+



In [0]:
# Dropping for the subset of specific column. 
df1.drop_duplicates(subset=["id"]).show()

+---+-----+
| id|value|
+---+-----+
|  1| val1|
|  2| val2|
|  3| val3|
|  4| val4|
|  5| val5|
|  6| val6|
+---+-----+



In [0]:
# Using multiple columns
df1.drop_duplicates(subset=["id", "value"]).show()

+---+-----+
| id|value|
+---+-----+
|  1| val1|
|  2| val2|
|  3| val3|
|  4| val4|
|  5| val5|
|  6| val6|
|  6| val7|
+---+-----+

