In [0]:
%run /Shared/PROD/core_function/core

In [0]:
data = [{"Category": 'A', "ID": 1, "Value": 121.44, "Truth": True},
        {"Category": 'B', "ID": 2, "Value": 300.01, "Truth": False},
        {"Category": 'C', "ID": 3, "Value": 10.99, "Truth": None},
        {"Category": 'E', "ID": 4, "Value": 33.87, "Truth": True}
        ]


In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [0]:
df = spark.createDataFrame(data)
df.show()

+--------+---+-----+------+
|Category| ID|Truth| Value|
+--------+---+-----+------+
|       A|  1| true|121.44|
|       B|  2|false|300.01|
|       C|  3| null| 10.99|
|       E|  4| true| 33.87|
+--------+---+-----+------+



In [0]:
data2 = [{"Category": 'A', "ID": 1, "Value": 121.44, "Truth": True},
        {"Category": 'B', "ID": 2, "Value": 300.01, "Truth": False},
        {"Category": 'C', "ID": 3, "Value": 10.99, "Truth": None},
        {"Category": 'E', "ID": 4, "Value": 33.87, "Truth": True}
        ]

In [0]:
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("projectName").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf)

In [0]:
rdd = sc.parallelize(data2)
type(rdd)

Out[12]: pyspark.rdd.RDD

In [0]:
df = rdd.toDF()
df.show()

+--------+---+-----+------+
|Category| ID|Truth| Value|
+--------+---+-----+------+
|       A|  1| true|121.44|
|       B|  2|false|300.01|
|       C|  3| null| 10.99|
|       E|  4| true| 33.87|
+--------+---+-----+------+



In [0]:
df = (spark.read
  .format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("/databricks-datasets/samples/population-vs-price/data_geo.csv")
)

In [0]:
df.show()

+---------+-------------+----------+----------+------------------------+-----------------------+
|2014 rank|         City|     State|State Code|2014 Population estimate|2015 median sales price|
+---------+-------------+----------+----------+------------------------+-----------------------+
|      101|   Birmingham|   Alabama|        AL|                  212247|                  162.9|
|      125|   Huntsville|   Alabama|        AL|                  188226|                  157.7|
|      122|       Mobile|   Alabama|        AL|                  194675|                  122.5|
|      114|   Montgomery|   Alabama|        AL|                  200481|                  129.0|
|       64|Anchorage[19]|    Alaska|        AK|                  301010|                   null|
|       78|     Chandler|   Arizona|        AZ|                  254276|                   null|
|       86|  Gilbert[20]|   Arizona|        AZ|                  239277|                   null|
|       88|     Glendale|   Ar

In [0]:
df.filter(df.State == "Arizona").select(df.City,df.State).show()

+-----------+-------+
|       City|  State|
+-----------+-------+
|   Chandler|Arizona|
|Gilbert[20]|Arizona|
|   Glendale|Arizona|
|       Mesa|Arizona|
|     Peoria|Arizona|
|    Phoenix|Arizona|
| Scottsdale|Arizona|
|   Surprise|Arizona|
|      Tempe|Arizona|
|     Tucson|Arizona|
+-----------+-------+



In [0]:
#Filter IS IN List values and is not in value
df.filter(df.State.isin("Arizona","Alaska","California")).select(df.City,df.State).show()

In [0]:
#Filter is not in value
df.filter(~df.State.isin("Arizona","Alaska","California")).select(df.City,df.State).show()

+----------------+-----------+
|            City|      State|
+----------------+-----------+
|      Birmingham|    Alabama|
|      Huntsville|    Alabama|
|          Mobile|    Alabama|
|      Montgomery|    Alabama|
|     Little Rock|   Arkansas|
|          Arvada|   Colorado|
|          Aurora|   Colorado|
|         Boulder|   Colorado|
|      Centennial|   Colorado|
|Colorado Springs|   Colorado|
|      Denver[12]|   Colorado|
|    Fort Collins|   Colorado|
|        Lakewood|   Colorado|
|          Pueblo|   Colorado|
|        Thornton|   Colorado|
|     Westminster|   Colorado|
|      Bridgeport|Connecticut|
|        Hartford|Connecticut|
|       New Haven|Connecticut|
|        Stamford|Connecticut|
+----------------+-----------+
only showing top 20 rows



In [0]:
df.filter(df.State.isin("Arizona","Alaska","California")==False).select(df.City,df.State).show()

In [0]:
# Using startswith
df.filter(df.City.startswith("A")).select("City","State","State Code","2014 Population estimate").show()

+--------------+------------+----------+------------------------+
|          City|       State|State Code|2014 Population estimate|
+--------------+------------+----------+------------------------+
| Anchorage[19]|      Alaska|        AK|                  301010|
|       Anaheim|  California|        CA|                  346997|
|       Antioch|  California|        CA|                  108930|
|        Arvada|    Colorado|        CO|                  113574|
|        Aurora|    Colorado|        CO|                  353108|
|    Athens[28]|     Georgia|        GA|                  119648|
|       Atlanta|     Georgia|        GA|                  456002|
|   Augusta[24]|     Georgia|        GA|                  196741|
|        Aurora|    Illinois|        IL|                  200456|
|     Ann Arbor|    Michigan|        MI|                  117770|
|   Albuquerque|  New Mexico|        NM|                  557169|
|         Akron|        Ohio|        OH|                  197859|
|     Alle

In [0]:
#using endswith
df.filter(df.City.endswith("e")).show()

+---------+------------+----------+----------+------------------------+-----------------------+
|2014 rank|        City|     State|State Code|2014 Population estimate|2015 median sales price|
+---------+------------+----------+----------+------------------------+-----------------------+
|      125|  Huntsville|   Alabama|        AL|                  188226|                  157.7|
|      122|      Mobile|   Alabama|        AL|                  194675|                  122.5|
|       88|    Glendale|   Arizona|        AZ|                  237517|                   null|
|       95|  Scottsdale|   Arizona|        AZ|                  230512|                   null|
|      215|    Surprise|   Arizona|        AZ|                  126275|                   null|
|      142|       Tempe|   Arizona|        AZ|                  172816|                   null|
|      234|    El Monte|California|        CA|                  116631|                   null|
|      151|   Elk Grove|California|     

In [0]:
#contains
df.filter(df.City.contains("e")).show()

+---------+-------------+----------+----------+------------------------+-----------------------+
|2014 rank|         City|     State|State Code|2014 Population estimate|2015 median sales price|
+---------+-------------+----------+----------+------------------------+-----------------------+
|      125|   Huntsville|   Alabama|        AL|                  188226|                  157.7|
|      122|       Mobile|   Alabama|        AL|                  194675|                  122.5|
|      114|   Montgomery|   Alabama|        AL|                  200481|                  129.0|
|       64|Anchorage[19]|    Alaska|        AK|                  301010|                   null|
|       78|     Chandler|   Arizona|        AZ|                  254276|                   null|
|       86|  Gilbert[20]|   Arizona|        AZ|                  239277|                   null|
|       88|     Glendale|   Arizona|        AZ|                  237517|                   null|
|       38|         Mesa|   Ar

In [0]:
#PySpark Filter like and rlike
df.filter(df.City.like("%le%")).show()

+---------+---------------+----------+----------+------------------------+-----------------------+
|2014 rank|           City|     State|State Code|2014 Population estimate|2015 median sales price|
+---------+---------------+----------+----------+------------------------+-----------------------+
|      125|     Huntsville|   Alabama|        AL|                  188226|                  157.7|
|      122|         Mobile|   Alabama|        AL|                  194675|                  122.5|
|       78|       Chandler|   Arizona|        AZ|                  254276|                   null|
|       88|       Glendale|   Arizona|        AZ|                  237517|                   null|
|       95|     Scottsdale|   Arizona|        AZ|                  230512|                   null|
|      119|    Little Rock|  Arkansas|        AR|                  197706|                  131.8|
|      227|       Berkeley|California|        CA|                  118853|                   null|
|      186

In [0]:
#This check case insensitive
df.filter(df.City.rlike("(?i)^*les$")).show()

+---------+-----------+----------+----------+------------------------+-----------------------+
|2014 rank|       City|     State|State Code|2014 Population estimate|2015 median sales price|
+---------+-----------+----------+----------+------------------------+-----------------------+
|        2|Los Angeles|California|        CA|                 3928864|                  434.7|
+---------+-----------+----------+----------+------------------------+-----------------------+



In [0]:
#Struct condition
df.filter(df.City.lastname == "Williams") \
    .show(truncate=False) 

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-567115584125272>:2[0m
[1;32m      1[0m [38;5;66;03m#Struct condition[39;00m
[0;32m----> 2[0m [43mdf[49m[38;5;241m.[39mfilter(df[38;5;241m.[39mCity[38;5;241m.[39mlastname [38;5;241m==[39m [38;5;124m"[39m[38;5;124mWilliams[39m[38;5;124m"[39m) \
[1;32m      3[0m     [38;5;241m.[39mshow(truncate[38;5;241m=[39m[38;5;28;01mFalse[39;00m)

[0;31mNameError[0m: name 'df' is not defined