## Drop Operation On Rows and Columns

### Setting Environment Variables

In [1]:
import os
import sys

# Set both PySpark interpreter variables to the Python executable that is
# running this kernel, so PySpark is started with the same interpreter.
os.environ.update(
    {
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

### Create a DataFrame

In [2]:
# Imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

# Build the session that every later cell uses through the name `spark`.
# getOrCreate() hands back an already-running session when one exists
# instead of always starting a new one.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

In [3]:

# Sample employee records as (employee_name, department, salary) tuples.
# ("James", "Sales", 3000) is listed twice on purpose: it is the one exact
# duplicate row that the later distinct()/dropDuplicates() cells remove.
data = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]

# Column names, in the same order as the fields of each tuple above
columns = ["employee_name", "department", "salary"]

# Build the DataFrame from the records and the column names
df = spark.createDataFrame(data=data, schema=columns)

# Show the resulting schema, then all rows without truncating cell values
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



### Select distinct Rows

In [5]:
# distinct() returns a new DataFrame with duplicate rows removed; df itself
# is left unchanged.
distinct_rows = df.distinct()
distinct_rows.show()

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|        James|     Sales|  3000|
|      Michael|     Sales|  4600|
|       Robert|     Sales|  4100|
|        Maria|   Finance|  3000|
|        Scott|   Finance|  3300|
|          Jen|   Finance|  3900|
|         Jeff| Marketing|  3000|
|        Kumar| Marketing|  2000|
|         Saif|     Sales|  4100|
+-------------+----------+------+



In [6]:
# Compare the number of rows before and after removing duplicate rows.
total_rows = df.count()
rows_without_duplicates = df.distinct().count()

print("The count of the Rows in the given Data frame :", total_rows)
print("The count of rows without Duplicates :", rows_without_duplicates)

The count of the Rows in the given Data frame : 10
The count of rows without Duplicates : 9


### Dropping Duplicates

In [7]:
# Called without arguments, dropDuplicates() removes rows that are duplicated
# across all columns and returns the result as a new DataFrame.
df2 = df.dropDuplicates()

# Report the size of the de-duplicated frame, then display its rows
deduplicated_count = df2.count()
print("The Count of the new Data Frame : ", deduplicated_count)
df2.show()

The Count of the new Data Frame :  9
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|        James|     Sales|  3000|
|      Michael|     Sales|  4600|
|       Robert|     Sales|  4100|
|        Maria|   Finance|  3000|
|        Scott|   Finance|  3300|
|          Jen|   Finance|  3900|
|         Jeff| Marketing|  3000|
|        Kumar| Marketing|  2000|
|         Saif|     Sales|  4100|
+-------------+----------+------+



### Checking Distinct Count of Individual Columns

In [9]:
# For each column, select it on its own, drop repeated values, and report
# how many different values remain.
for column_name in ("employee_name", "department", "salary"):
    distinct_values = df.select(column_name).distinct().count()
    print(f"The distinct count of column '{column_name}'", distinct_values)

The distinct count of column 'employee_name' 9
The distinct count of column 'department' 3
The distinct count of column 'salary' 6


### Drop duplicates for selected columns

In [10]:
# Keep one row per distinct (department, salary) pair using dropDuplicates().
# The column names are spelled exactly as in the DataFrame schema, so the call
# does not depend on Spark's case-insensitive column resolution (the default)
# and keeps working when spark.sql.caseSensitive is enabled.
df_cols2 = df.dropDuplicates(['department', 'salary'])

# Note: count() reports the number of remaining rows, not columns.
print("The count of the Columns after dropping duplicates of 2 columns : ", df_cols2.count())

The count of the Columns after dropping duplicates of 2 columns :  8


### Drop Columns

In [11]:
# Person records as (firstname, middlename, lastname, id, location, salary).
# Empty strings stand in for a missing middle or last name.
simpleData = (
    ("James", "", "Smith", "36636", "NewYork", 3100),
    ("Michael", "Rose", "", "40288", "California", 4300),
    ("Robert", "", "Williams", "42114", "Florida", 1400),
    ("Maria", "Anne", "Jones", "39192", "Florida", 5500),
    ("Jen", "Mary", "Brown", "34561", "NewYork", 3000),
)

# Column names, in the same order as the fields of each record
columns = ["firstname", "middlename", "lastname", "id", "location", "salary"]

# NOTE: this rebinds `df` (and `columns`), replacing the employee DataFrame
# built earlier in the notebook; the column-drop cells below use this frame.
df = spark.createDataFrame(data=simpleData, schema=columns)

df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- location: string (nullable = true)
 |-- salary: long (nullable = true)



#### Dropping Single Column

In [12]:
# Dropping a Single Column
from pyspark.sql.functions import col

# drop() returns a new DataFrame without the given column; df itself is not
# modified, so each method below starts again from the full set of columns.

# Method 1 : pass the column name as a string
without_middlename = df.drop("middlename")
without_middlename.show()

# Method 2 : pass the Column object reached through the . operator
without_lastname = df.drop(df.lastname)
without_lastname.show()

# Method 3 : pass the Column object built with the col() function
without_location = df.drop(col("location"))
without_location.show()

+---------+--------+-----+----------+------+
|firstname|lastname|   id|  location|salary|
+---------+--------+-----+----------+------+
|    James|   Smith|36636|   NewYork|  3100|
|  Michael|        |40288|California|  4300|
|   Robert|Williams|42114|   Florida|  1400|
|    Maria|   Jones|39192|   Florida|  5500|
|      Jen|   Brown|34561|   NewYork|  3000|
+---------+--------+-----+----------+------+

+---------+----------+-----+----------+------+
|firstname|middlename|   id|  location|salary|
+---------+----------+-----+----------+------+
|    James|          |36636|   NewYork|  3100|
|  Michael|      Rose|40288|California|  4300|
|   Robert|          |42114|   Florida|  1400|
|    Maria|      Anne|39192|   Florida|  5500|
|      Jen|      Mary|34561|   NewYork|  3000|
+---------+----------+-----+----------+------+

+---------+----------+--------+-----+------+
|firstname|middlename|lastname|   id|salary|
+---------+----------+--------+-----+------+
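|    James|          |   Smith|36636|  3100|
|  Michael|      Rose|        |40288|  4300|
|   Robert|          |Williams|42114|  1400|
|    Maria|      Anne|   Jones|39192|  5500|
|      Jen|      Mary|   Brown|34561|  3000|
+---------+----------+--------+-----+------+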

#### Dropping Multiple Columns

In [18]:
# Dropping multiple columns: each call returns a new DataFrame and leaves df
# untouched, so every method starts from the full set of columns.

# Method 1 : pass several column names as strings
without_middlename_id = df.drop("middlename", "id")
without_middlename_id.show()

# Method 2 : pass several Column objects reached through the . operator
# NOTE(review): passing more than one Column object to drop() appears to need
# a recent PySpark release (3.4+) - confirm against the targeted version.
without_middlename_location_id = df.drop(df.middlename, df.location, df.id)
without_middlename_location_id.show()

# Method 3 : keep the names in a tuple and unpack it into the call
cols = ("middlename", "id", "salary")
without_tuple_columns = df.drop(*cols)
without_tuple_columns.show()

+---------+--------+----------+------+
|firstname|lastname|  location|salary|
+---------+--------+----------+------+
|    James|   Smith|   NewYork|  3100|
|  Michael|        |California|  4300|
|   Robert|Williams|   Florida|  1400|
|    Maria|   Jones|   Florida|  5500|
|      Jen|   Brown|   NewYork|  3000|
+---------+--------+----------+------+

+---------+--------+------+
|firstname|lastname|salary|
+---------+--------+------+
|    James|   Smith|  3100|
|  Michael|        |  4300|
|   Robert|Williams|  1400|
|    Maria|   Jones|  5500|
|      Jen|   Brown|  3000|
+---------+--------+------+

+---------+--------+----------+
|firstname|lastname|  location|
+---------+--------+----------+
|    James|   Smith|   NewYork|
|  Michael|        |California|
|   Robert|Williams|   Florida|
|    Maria|   Jones|   Florida|
|      Jen|   Brown|   NewYork|
+---------+--------+----------+

