### PySpark DataFrame Filtering


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

# Sample rows: (name tuple, list of languages, state code, gender).
# Some rows use an empty string for the middle or last name.
data = [
    (("Aarav", "Kumar", "Patel"), ["Python", "SQL", "JavaScript"], "CA", "M"),
    (("Diya", "Rani", ""), ["Java", "Scala", "C++"], "TX", "F"),
    (("Karan", "", "Mehta"), ["Go", "Rust"], "CA", "M"),
    (("Isha", "Vikram", "Nair"), ["Python", "R"], "WA", "F"),
    (("Rahul", "Dev", "Singh"), ["C#", "VB", "Python"], "TX", "M"),
    (("Priya", "", "Menon"), ["TypeScript", "HTML", "CSS"], "FL", "F"),
]

# Nested struct describing the three parts of a person's name.
name_struct = (
    StructType()
    .add("firstname", StringType(), True)
    .add("middlename", StringType(), True)
    .add("lastname", StringType(), True)
)

# Top-level schema: a nested name struct, an array of strings, and two
# plain string columns. Every field is nullable.
schema = (
    StructType()
    .add("name", name_struct)
    .add("languages", ArrayType(StringType()), True)
    .add("state", StringType(), True)
    .add("gender", StringType(), True)
)

# Build the DataFrame that the filtering examples below operate on.
df = spark.createDataFrame(data, schema)

# Inspect the resulting schema and the full, untruncated contents.
df.printSchema()
df.show(truncate=False)


In [0]:
# Equality filter: keep only the rows whose state is "TX".
(
    df.filter(df["state"] == "TX")
    .show(truncate=False)
)

In [0]:
# Inequality filter: keep every row whose state is not "TX".
(
    df.filter(df["state"] != "TX")
    .show(truncate=False)
)

# Same filter, written by negating the equality test with `~`.
(
    df.filter(~(df["state"] == "TX"))
    .show(truncate=False)
)

In [0]:
# Refer to the column by name through col() instead of through `df`.
from pyspark.sql.functions import col

(
    df.filter(col("state") == "CA")
    .show(truncate=False)
)

In [0]:
# filter() also accepts a SQL expression string.
# First show the rows where gender equals 'M', then the rows where it does not.
# (The not-equal test can alternatively be spelled "gender <> 'M'".)
for sql_predicate in ("gender == 'M'", "gender != 'M'"):
    df.filter(sql_predicate).show()

In [0]:
# Combine several column conditions in a single filter.
in_tx = df["state"] == "TX"
in_wa = df["state"] == "WA"
is_male = df["gender"] == "M"

# AND: rows that are both in TX and male.
df.filter(in_tx & is_male).show(truncate=False)

# OR: rows that are in WA, or male, or both.
df.filter(in_wa | is_male).show(truncate=False)

In [0]:
# Membership filter: keep rows whose state appears in the given list.
li = ["TX", "CA", "DE"]
(
    df.filter(df["state"].isin(li))
    .show()
)

In [0]:
# Prefix match: states beginning with "T".
df.filter(df["state"].startswith("T")).show()

# Suffix match: states ending with "A".
df.filter(df["state"].endswith("A")).show()

# Substring match on the nested struct field name.firstname.
df.filter(df["name"]["firstname"].contains("i")).show()

In [0]:
# SQL LIKE pattern on the nested first name; "%" matches any run of characters,
# so this keeps first names containing "iya".
(
    df.filter(df["name"]["firstname"].like("%iya%"))
    .show()
)