# PySpark Data Manipulation Guide

## 1. Changing Data Types (Schema Transformation)
Learn how to modify column data types using `cast()` with `withColumn()`, or a SQL `CAST` expression with `selectExpr()`:


In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain the Spark session: getOrCreate() returns the already-active session
# if there is one, otherwise it builds a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

StatementMeta(, 161e0612-7219-44cc-b2cd-8b08e1150e56, 28, Finished, Available, Finished)

In [27]:
# Create a sample DataFrame.
# The duplicated rows and the missing Age for "Bob" are kept on purpose:
# the later sections use them to demonstrate de-duplication and null filtering.
data = [
    ("John", 35, "IT", 50000),
    ("Alice", 40, "IT", 60000),
    ("Bob", None, "HR", 45000),
    ("John", 35, "IT", 50000),
    ("Alice", 40, "IT", 60000),
    ("Charlie", 30, "HR", 55000),
    ("David", 45, "Finance", 70000),
    ("Eve", 28, "Finance", 65000),
]
# Named `columns` (not `col`) so the `col` function imported from
# pyspark.sql.functions is not overwritten by a list.
columns = ["Name", "Age", "Department", "Salary"]

# Keep the un-cast frame under its own name so both casting approaches below
# start from the same inferred schema (Age/Salary are inferred as long).
raw_df = spark.createDataFrame(data, columns)
raw_df.printSchema()

# Approach 1: Column.cast() combined with withColumn().
# `df` is the frame used by the rest of the notebook
# (Age -> double, Salary -> string).
df = (
    raw_df
    .withColumn("Age", raw_df.Age.cast("double"))
    .withColumn("Salary", raw_df.Salary.cast("string"))
)
df.printSchema()

# Approach 2: SQL CAST expressions through selectExpr().
# Applied to raw_df (not to the already-cast df) so it is an independent
# demonstration that yields the same schema as Approach 1.
df_sql = raw_df.selectExpr(
    "Name",
    "CAST(Age AS DOUBLE) AS Age",
    "CAST(Department AS String) AS Department",
    "CAST(Salary AS String) AS Salary",
)
df_sql.printSchema()

StatementMeta(, 161e0612-7219-44cc-b2cd-8b08e1150e56, 29, Finished, Available, Finished)

root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)

root
 |-- Name: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: string (nullable = true)

root
 |-- Name: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: string (nullable = true)



## 2. Filtering Data
Examples of basic filtering operations:

In [28]:
# Basic filtering.
# Salary was cast to string in section 1, so cast it back to a numeric type
# for the comparison instead of relying on Spark's implicit string-to-number
# coercion.
filtered_df = df.filter(df.Salary.cast("long") > 50000)
filtered_df.show()

# Equivalent: where() is an alias for filter()
filtered_df2 = df.where(df.Salary.cast("long") > 50000)
filtered_df2.show()

# Keep only rows where Name is not null
filtered_notnull_df = df.filter(df["Name"].isNotNull())
filtered_notnull_df.show()

# Keep only rows where Age is null
filtered_null_df = df.filter(df["Age"].isNull())
filtered_null_df.show()

StatementMeta(, 161e0612-7219-44cc-b2cd-8b08e1150e56, 30, Finished, Available, Finished)

+-------+----+----------+------+
|   Name| Age|Department|Salary|
+-------+----+----------+------+
|  Alice|40.0|        IT| 60000|
|  Alice|40.0|        IT| 60000|
|Charlie|30.0|        HR| 55000|
|  David|45.0|   Finance| 70000|
|    Eve|28.0|   Finance| 65000|
+-------+----+----------+------+

+-------+----+----------+------+
|   Name| Age|Department|Salary|
+-------+----+----------+------+
|  Alice|40.0|        IT| 60000|
|  Alice|40.0|        IT| 60000|
|Charlie|30.0|        HR| 55000|
|  David|45.0|   Finance| 70000|
|    Eve|28.0|   Finance| 65000|
+-------+----+----------+------+

+-------+----+----------+------+
|   Name| Age|Department|Salary|
+-------+----+----------+------+
|   John|35.0|        IT| 50000|
|  Alice|40.0|        IT| 60000|
|    Bob|NULL|        HR| 45000|
|   John|35.0|        IT| 50000|
|  Alice|40.0|        IT| 60000|
|Charlie|30.0|        HR| 55000|
|  David|45.0|   Finance| 70000|
|    Eve|28.0|   Finance| 65000|
+-------+----+----------+------+

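+----+----+----------+------+
|Name| Age|Department|Salary|
+----+----+----------+------+
| Bob|NULL|        HR| 45000|
+----+----+----------+------+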

## 3. Multiple Filters
Combining multiple conditions with `&` (wrap each condition in parentheses):

In [29]:
# Multiple conditions combined with `&` (logical AND).
# Each condition needs its own parentheses because `&` binds more tightly
# than the comparison operators in Python.
# Salary is a string column (cast in section 1), so it is cast to a numeric
# type explicitly rather than relying on implicit coercion in the comparison.
filtered_df = df.filter(
    (df.Age == 35)
    & (df.Salary.cast("long") >= 50000)
)
display(filtered_df)


StatementMeta(, 161e0612-7219-44cc-b2cd-8b08e1150e56, 31, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, e8243d7f-23a2-478f-92f5-01dd2c24976c)

## 4. Working with Distinct Values
Handling duplicate data:

In [30]:
# Whole-row de-duplication: rows that are identical in every column collapse to one
distinct_df = df.distinct()

# Unique values of a single column
distinct_names = df.select("Name").distinct()

# De-duplication keyed on a subset of columns ("Name" and "Department" as an example)
unique_df = df.dropDuplicates(["Name", "Department"])

# Display the three results in the order they were defined
distinct_df.show()
distinct_names.show()
unique_df.show()

StatementMeta(, 161e0612-7219-44cc-b2cd-8b08e1150e56, 32, Finished, Available, Finished)

+-------+----+----------+------+
|   Name| Age|Department|Salary|
+-------+----+----------+------+
|   John|35.0|        IT| 50000|
|  Alice|40.0|        IT| 60000|
|Charlie|30.0|        HR| 55000|
|  David|45.0|   Finance| 70000|
|    Eve|28.0|   Finance| 65000|
|    Bob|NULL|        HR| 45000|
+-------+----+----------+------+

+-------+
|   Name|
+-------+
|   John|
|  Alice|
|    Bob|
|Charlie|
|  David|
|    Eve|
+-------+

+-------+----+----------+------+
|   Name| Age|Department|Salary|
+-------+----+----------+------+
|  Alice|40.0|        IT| 60000|
|    Bob|NULL|        HR| 45000|
|Charlie|30.0|        HR| 55000|
|  David|45.0|   Finance| 70000|
|    Eve|28.0|   Finance| 65000|
|   John|35.0|        IT| 50000|
+-------+----+----------+------+



## 5. Counting Distinct Values
Analyzing unique value counts:

In [31]:
# Number of unique values in the Name column
distinct_name_count = (
    df.select("Name")
    .distinct()
    .count()
)

# Number of unique (Name, Department) pairs
distinct_combo_count = (
    df.select("Name", "Department")
    .distinct()
    .count()
)

print(f"Distinct names: {distinct_name_count}")
print(f"Distinct name-department combinations: {distinct_combo_count}")

StatementMeta(, 161e0612-7219-44cc-b2cd-8b08e1150e56, 33, Finished, Available, Finished)

Distinct names: 6
Distinct name-department combinations: 6
