## Creating A Data Frame

In [1]:
import os
import sys

# PySpark reads these two variables to choose the Python executable for the
# workers and the driver; both are pointed at the interpreter running this kernel.
os.environ.update(
    dict.fromkeys(("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"), sys.executable)
)

In [2]:
from pyspark.sql import SparkSession
# Reuse the active session if one exists, otherwise start one named "PythonCollect".
spark = (
    SparkSession.builder
    .appName("PythonCollect")
    .getOrCreate()
)

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 60085)
Traceback (most recent call last):
  File "C:\Users\Yateesh Chandra\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "C:\Users\Yateesh Chandra\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "C:\Users\Yateesh Chandra\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "C:\Users\Yateesh Chandra\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 755, in __init__
    self.handle()
  File "C:\spark\spark-3.4.2-bin-hadoop3\python\pyspark\accumulators.py", line 281, in handle
    poll(accum_updates)
  File "C:\spark\spark-3.4.2-bin-hadoop3\python\pyspark\acc

In [3]:
# Department rows as (dept_name, dept_id) tuples.
# No backslashes are needed: lines continue implicitly inside the brackets.
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]

# Column names, in the same order as the tuple fields
deptColumns = ["dept_name", "dept_id"]

# Build the DataFrame from the row tuples, naming the columns from deptColumns
deptDF = spark.createDataFrame(data=dept, schema=deptColumns)

In [4]:
# Print the rows as a text table; truncate=False keeps long values from being cut off
deptDF.show(truncate=False)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+



## Collect() Function

In [5]:
# collect() returns every row of deptDF as a Python list of Row objects
# (the printed output below shows the list)
dataCollect = deptDF.collect()
print(dataCollect)

[Row(dept_name='Finance', dept_id=10), Row(dept_name='Marketing', dept_id=20), Row(dept_name='Sales', dept_id=30), Row(dept_name='IT', dept_id=40)]


In [6]:
# Iterate over the collected rows; each Row's fields can be read by column name
for row in dataCollect:
    print(row['dept_id'], row['dept_name'])

10 Finance
20 Marketing
30 Sales
40 IT


In [7]:
# NOTE: [:] on a list is a full-slice copy, so [:][:] just copies the collected
# list twice. It is not row/column indexing, and the result equals deptDF.collect().
deptDF.collect()[:][:]

[Row(dept_name='Finance', dept_id=10),
 Row(dept_name='Marketing', dept_id=20),
 Row(dept_name='Sales', dept_id=30),
 Row(dept_name='IT', dept_id=40)]

In [8]:
# As the cell's last expression, the collected list of Row objects is rendered as the cell output
deptDF.collect()

[Row(dept_name='Finance', dept_id=10),
 Row(dept_name='Marketing', dept_id=20),
 Row(dept_name='Sales', dept_id=30),
 Row(dept_name='IT', dept_id=40)]

In [9]:
# select("*") keeps every column, so the collected rows match deptDF.collect()
dataCollect2 = deptDF.select("*").collect()
print(dataCollect2)

[Row(dept_name='Finance', dept_id=10), Row(dept_name='Marketing', dept_id=20), Row(dept_name='Sales', dept_id=30), Row(dept_name='IT', dept_id=40)]


## Filter() Function

In [10]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType

# Sample rows: ((firstname, middlename, lastname), languages, state, gender)
data = [
    (("James", "", "Smith"), ["Java", "Scala", "C++"], "OH", "M"),
    (("Anna", "Rose", ""), ["Spark", "Java", "C++"], "NY", "F"),
    (("Julia", "", "Williams"), ["CSharp", "VB"], "OH", "F"),
    (("Maria", "Anne", "Jones"), ["CSharp", "VB"], "NY", "M"),
    (("Jen", "Mary", "Brown"), ["CSharp", "VB"], "NY", "M"),
    (("Mike", "Mary", "Williams"), ["Python", "VB"], "OH", "M"),
]

# Explicit schema matching the row layout of `data`:
#   name      -> struct of three nullable strings
#   languages -> array of strings
#   state, gender -> nullable strings
schema = StructType([
    StructField('name', StructType([
        StructField(part, StringType(), True)
        for part in ('firstname', 'middlename', 'lastname')
    ])),
    StructField('languages', ArrayType(StringType()), True),
    StructField('state', StringType(), True),
    StructField('gender', StringType(), True),
])

# Build the DataFrame from the sample rows using the explicit schema
df = spark.createDataFrame(data, schema)

In [11]:
# Print the column tree, including the fields nested inside `name` and the array element type
df.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)



In [12]:
# Show all rows; truncate=False prints the struct and array values in full
df.show(truncate=False)

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|{James, , Smith}      |[Java, Scala, C++]|OH   |M     |
|{Anna, Rose, }        |[Spark, Java, C++]|NY   |F     |
|{Julia, , Williams}   |[CSharp, VB]      |OH   |F     |
|{Maria, Anne, Jones}  |[CSharp, VB]      |NY   |M     |
|{Jen, Mary, Brown}    |[CSharp, VB]      |NY   |M     |
|{Mike, Mary, Williams}|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+



### Equals Condition

In [13]:
# Equality filter, referring to the column as an attribute (df.state)
df.filter(df.state == 'OH').show(truncate=False)

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|{James, , Smith}      |[Java, Scala, C++]|OH   |M     |
|{Julia, , Williams}   |[CSharp, VB]      |OH   |F     |
|{Mike, Mary, Williams}|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+



### Equals (Bracket Syntax) and Not Equals Conditions

In [14]:
# Same equality filter, referring to the column by key (df['state'])
df.filter(df['state'] == 'OH').show(truncate=False)

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|{James, , Smith}      |[Java, Scala, C++]|OH   |M     |
|{Julia, , Williams}   |[CSharp, VB]      |OH   |F     |
|{Mike, Mary, Williams}|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+



In [15]:
# Not-equals, form 1: the != operator on the column.
# show() without truncate=False right-aligns values and shortens long ones.
df.filter(df.state != 'OH').show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
+--------------------+------------------+-----+------+



In [16]:
# Not-equals, form 2: negate the equality condition with ~ (same rows as df.state != 'OH')
df.filter(~(df.state == 'OH')).show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
+--------------------+------------------+-----+------+



In [17]:
from pyspark.sql.functions import col
# col('state') names the column by string instead of going through the df variable
df.filter(col('state') == 'OH').show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
| {Julia, , Williams}|      [CSharp, VB]|   OH|     F|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



In [18]:
# col() with a dotted path ('name.lastname') reaches a field nested inside the `name` struct.
# col is imported again here; it is already in scope if the previous cell has run.
from pyspark.sql.functions import col
df.filter(col('name.lastname') == 'Williams').show()

+--------------------+------------+-----+------+
|                name|   languages|state|gender|
+--------------------+------------+-----+------+
| {Julia, , Williams}|[CSharp, VB]|   OH|     F|
|{Mike, Mary, Will...|[Python, VB]|   OH|     M|
+--------------------+------------+-----+------+



### DataFrame filter using sql Operations

In [19]:
# filter() also accepts the condition as a SQL expression string
df.filter("gender == 'M'").show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



In [20]:
# <> is the SQL not-equal operator: keeps the rows whose gender is not 'M'
df.filter("gender <> 'M'").show()

+-------------------+------------------+-----+------+
|               name|         languages|state|gender|
+-------------------+------------------+-----+------+
|     {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Julia, , Williams}|      [CSharp, VB]|   OH|     F|
+-------------------+------------------+-----+------+



### Select and Filter together

In [21]:
# Project two nested name fields plus `languages`, then keep only the OH rows.
# The filter references df['state'] even though `state` is not among the
# selected columns; the displayed result contains only the OH rows.
(
    df.select("name.firstname", "name.lastname", "languages")
    .filter(df['state'] == 'OH')
    .show(truncate=False)
)

+---------+--------+------------------+
|firstname|lastname|languages         |
+---------+--------+------------------+
|James    |Smith   |[Java, Scala, C++]|
|Julia    |Williams|[CSharp, VB]      |
|Mike     |Williams|[Python, VB]      |
+---------+--------+------------------+



### Filter with multiple conditions

In [22]:
# Combine conditions with & (AND). Each comparison needs its own parentheses
# because & binds more tightly than == in Python.
df.filter((df.state == 'OH') & (df.gender == 'M')).show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



### Filter based on list values

In [23]:
# Candidate state codes; only "OH" actually occurs in df
li = ["OA", "LI", "OH"]

# isin(li) keeps the rows whose state equals any value in the list
df.filter(df.state.isin(li)).show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
| {Julia, , Williams}|      [CSharp, VB]|   OH|     F|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



In [24]:
# ~ negates isin(): keeps the rows whose state is not one of the values in li
df.filter(~(df.state.isin(li))).show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
+--------------------+------------------+-----+------+



### Filters based on startswith, endswith, and contains

In [25]:
# startswith('O'): keeps the rows whose state begins with 'O'
df.filter(df.state.startswith('O')).show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
| {Julia, , Williams}|      [CSharp, VB]|   OH|     F|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



In [26]:
# endswith('Y'): keeps the rows whose state ends with 'Y'
df.filter(df.state.endswith('Y')).show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
+--------------------+------------------+-----+------+



In [27]:
# contains('n'): keeps the rows whose nested firstname field has an 'n' anywhere in it
df.filter(df.name.firstname.contains('n')).show()

+------------------+------------------+-----+------+
|              name|         languages|state|gender|
+------------------+------------------+-----+------+
|    {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
+------------------+------------------+-----+------+



### Filters using like, ilike, and rlike

In [28]:
# Column names for the small (id, name) table.
# NOTE: this rebinds `schema`, which held the StructType defined for `df` earlier.
schema = ["id", "name"]

# Sample rows in the same (id, name) order
data2 = [
    (2, 'Michael Raj'),
    (3, 'Santhanam'),
    (4, 'Rolex'),
    (5, 'Leo'),
]

# Build the DataFrame; with only column names given, the column types are inferred from the values
df2 = spark.createDataFrame(data=data2, schema=schema)

In [29]:
# Confirm the inferred types: id comes out as long and name as string
df2.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)



In [30]:
# like() does case-sensitive SQL LIKE matching; 'M%' matches names that start with an uppercase 'M'
df2.filter(df2.name.like('M%')).show()

+---+-----------+
| id|       name|
+---+-----------+
|  2|Michael Raj|
+---+-----------+



In [31]:
# ilike() is the case-insensitive variant of like(); '%m%' matches any name containing 'm' or 'M'
df2.filter(df2.name.ilike('%m%')).show()

+---+-----------+
| id|       name|
+---+-----------+
|  2|Michael Raj|
|  3|  Santhanam|
+---+-----------+



In [32]:
# rlike() matches against a regular expression; "x$" keeps the names that end with 'x'
df2.filter(df2.name.rlike("x$")).show()

+---+-----+
| id| name|
+---+-----+
|  4|Rolex|
+---+-----+



### Filter on array Type Column

In [33]:
# array_contains(column, value) is true for rows whose array column holds the given value
from pyspark.sql.functions import array_contains
df.filter(array_contains(df.languages, 'C++')).show()

# To require several elements at once, use one array_contains call per element and join them with &
df.filter(array_contains(df.languages, 'C++') & array_contains(df.languages, 'Java')).show()

+----------------+------------------+-----+------+
|            name|         languages|state|gender|
+----------------+------------------+-----+------+
|{James, , Smith}|[Java, Scala, C++]|   OH|     M|
|  {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
+----------------+------------------+-----+------+

