# PySpark


## Load from files - example
## Best place to start with PySpark

-- Right click

In [None]:
# Read the Lego sets CSV into a Spark DataFrame; the "header" option makes the
# file's first row supply the column names.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("Files/Legos/sets.csv")
)
display(df)

In [None]:
import pandas as pd
# Load the same sets CSV into a pandas DataFrame. pandas is given the absolute
# "/lakehouse/default/..." path here, whereas the Spark reader in this notebook
# is given the relative "Files/..." path.
# NOTE: `df` is also assigned by the Spark cell above; once this cell has run,
# `df` refers to the pandas DataFrame.
df = pd.read_csv("/lakehouse/default/Files/Legos/sets.csv")
display(df)


## 2. Load Pandas

### Pandas on Spark

### Distributed versus single node

### File this away for later

In [1]:
import pyspark.pandas as ps


# Load "Files/Legos/sets.csv" through the pandas API on Spark (pyspark.pandas),
# using the "set_num" column as the index. The result is a pandas-on-Spark
# DataFrame: pandas-style syntax, but not a plain single-node pandas object.
spark_df = ps.read_csv("Files/Legos/sets.csv", index_col="set_num")
spark_df.head(2)


StatementMeta(, 523fddb9-603d-444d-9429-d2cd79e7b940, 3, Finished, Available, Finished)



Unnamed: 0_level_0,name,year,theme_id,num_parts,img_url
set_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0003977811-1,Ninjago: Book of Adventures,2022,761,1,https://cdn.rebrickable.com/media/sets/0003977...
001-1,Gears,1965,756,43,https://cdn.rebrickable.com/media/sets/001-1.jpg


### DataFrame = Table


## Load CSV

In [1]:
# Read the Lego themes CSV into a Spark DataFrame, taking column names from the
# file's first row. No schema is supplied, and the DataFrame repr shown later in
# this notebook reports every column as string.
legothemes = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("Files/Legos/themes.csv")
)

# show() prints the rows as a text table; here limited to 5 rows.
legothemes.show(5)

# Same 5 rows, but strings longer than 10 characters are cut short
# (e.g. "Competition" is printed as "Competi...").
legothemes.show(5, truncate=10)

StatementMeta(, 2f307f39-f47e-4af5-af0e-281f80f37599, 3, Finished, Available, Finished)

+---+--------------+---------+
| id|          name|parent_id|
+---+--------------+---------+
|  1|       Technic|     NULL|
|  3|   Competition|        1|
|  4|Expert Builder|        1|
| 16|    RoboRiders|        1|
| 17|Speed Slammers|        1|
+---+--------------+---------+
only showing top 5 rows

+---+----------+---------+
| id|      name|parent_id|
+---+----------+---------+
|  1|   Technic|     NULL|
|  3|Competi...|        1|
|  4|Expert ...|        1|
| 16|RoboRiders|        1|
| 17|Speed S...|        1|
+---+----------+---------+
only showing top 5 rows



## Action will "execute" the code
## Displaying the data is one of those actions
.
## <u>**Two Ways to display**</u>
- show() method (shown above)
    - also can show(5, truncate=25)
- display() function (shown below)

In [5]:
# Evaluating the DataFrame on its own does not fetch any rows: the cell output
# is only its schema (column names and types).

legothemes

StatementMeta(, e12bd849-8e08-4116-b11a-85121909521b, 7, Finished, Available, Finished)

DataFrame[id: string, name: string, parent_id: string]

## Notice it gives info about Dataframe
## but not the data itself - so <u>**lazy**</u>

## Select

In [None]:
# Keep only the "name" column and cap the result at 3 rows.
onlynames = (
    legothemes
    .select(legothemes.name)
    .limit(3)
)

display(onlynames)

## There are multiple ways to refer to columns
## - sets.select(sets.name)
## - sets.select(sets["name"])
## - sets.select("name")
## - sets.select(col("name"))
##      - requires importing col
##          - from pyspark.sql.functions import col

## Filtering

In [10]:
# Keep only the theme rows whose name is exactly "Star Wars".
# NOTE(review): "stawarslegos" looks like a typo for "starwarslegos"; the name is
# kept unchanged so any other cell that refers to it still works.
stawarslegos = (
    legothemes
    .filter(legothemes.name == "Star Wars")
)
display(stawarslegos)

StatementMeta(, 0d01c6a6-53a8-4565-92c7-d1282ab39256, 12, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 06050ec2-ddd3-415a-a6b1-4463ee98c77f)

## Counting

In [5]:
# count() is an action: it runs the query and returns the number of rows.
legothemes.count()

StatementMeta(, e59774d7-742d-4ff2-af52-60d7aed5d19c, 17, Finished, Available, Finished)

480

## Rename column - alias

In [12]:
from pyspark.sql.functions import col

# alias() is applied to the "name" column expression (not to the select as a
# whole), so the output has columns "id" and "Theme Name".
(
    legothemes
    .select(
        col("id"),
        col("name").alias("Theme Name"),
    )
    .show(10)
)

StatementMeta(, 2f307f39-f47e-4af5-af0e-281f80f37599, 14, Finished, Available, Finished)

+---+--------------------+
| id|          Theme Name|
+---+--------------------+
|  1|             Technic|
|  3|         Competition|
|  4|      Expert Builder|
| 16|          RoboRiders|
| 17|      Speed Slammers|
| 18|           Star Wars|
| 19|        Supplemental|
| 20|     Throwbot Slizer|
| 21|Universal Buildin...|
| 22|             Creator|
+---+--------------------+
only showing top 10 rows



## GroupBy

In [20]:
from pyspark.sql.functions import count

# Read the Lego sets CSV, taking column names from the first row.
legosets = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("Files/Legos/sets.csv")
)

# One output row per distinct set name. count("theme_id") tallies the rows in
# each group that have a non-null theme_id, i.e. how many sets share that name;
# it is not a count of distinct themes, despite the "Theme Count" label.
(
    legosets
    .groupBy("name")
    .agg(count("theme_id").alias("Theme Count"))
    .show(10)
)


StatementMeta(, 2f307f39-f47e-4af5-af0e-281f80f37599, 22, Finished, Available, Finished)

+--------------------+-----------+
|                name|Theme Count|
+--------------------+-----------+
| Castle Mini Figures|          4|
|Spider-Man Action...|          1|
|Friends Hearts Pe...|          3|
|Winnie the Pooh's...|          1|
|          Basic Pack|          1|
|    My First Tractor|          1|
|Mickey Mouse & Do...|          1|
|     Passenger Coach|          2|
|              Flower|          5|
|Island Xtreme Stu...|          1|
+--------------------+-----------+
only showing top 10 rows



## Joins

In [31]:
from pyspark.sql.functions import col

# Left outer join: every set row is kept and matched to the theme row whose
# "id" equals the set's "theme_id". Both inputs have a "name" column, so the
# result contains two columns called "name" (set name, then theme name).
legosets.join(
    legothemes,
    on=col("id") == col("theme_id"),
    how="left_outer",
).show(10)

StatementMeta(, 2f307f39-f47e-4af5-af0e-281f80f37599, 36, Finished, Available, Finished)

+------------+--------------------+----+--------+---------+--------------------+---+--------------------+---------+
|     set_num|                name|year|theme_id|num_parts|             img_url| id|                name|parent_id|
+------------+--------------------+----+--------+---------+--------------------+---+--------------------+---------+
|0003977811-1|Ninjago: Book of ...|2022|     761|        1|https://cdn.rebri...|761|Activity Books wi...|      497|
|       001-1|               Gears|1965|     756|       43|https://cdn.rebri...|756|           Samsonite|      365|
|      0011-2|   Town Mini-Figures|1979|      67|       12|https://cdn.rebri...| 67|        Classic Town|       50|
|      0011-3|Castle 2 for 1 Bo...|1987|     199|        0|https://cdn.rebri...|199|        Lion Knights|      186|
|      0012-1|  Space Mini-Figures|1979|     143|       12|https://cdn.rebri...|143|        Supplemental|      126|
|      0013-1|  Space Mini-Figures|1979|     143|       12|https://cdn.r

## Data Wrangler

**Great way to learn Syntax**

**Works with Python Notebook**

 ### Start with:
- ### existing DataFrame 
- ### start with sample data
    - ### titanic
 ### 2. Make transformations

In [None]:
import pandas as pd

# Data Wrangler's Titanic sample data, read straight from a URL into a pandas
# DataFrame (so this cell needs outbound network access).
wrangler_sample_df = pd.read_csv("https://aka.ms/wrangler/titanic.csv")
display(wrangler_sample_df)

In [None]:
# Read "Files/Legos/sets.csv" into the Spark DataFrame `legosetsdf`, taking
# column names from the file's first row.
legosetsdf = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("Files/Legos/sets.csv")
)

display(legosetsdf)

### Code Generated from Data Wrangler

In [None]:
# Code generated by Data Wrangler for PySpark DataFrame

from pyspark.sql import types as T

def clean_data(legosetsdf):
    """Return a renamed, re-typed and filtered copy of the Lego sets DataFrame.

    Steps, in order:
      1. rename 'set_num' to 'Set Number' and 'name' to 'Set Name';
      2. cast 'year' to a 64-bit integer (LongType);
      3. keep only rows whose 'year' is 2025 or later.

    The argument is not modified; each step produces a new DataFrame.
    """
    renamed = (
        legosetsdf
        .withColumnRenamed('set_num', 'Set Number')
        .withColumnRenamed('name', 'Set Name')
    )
    # The cast comes before the filter so the comparison is made on the
    # LongType column.
    typed = renamed.withColumn('year', renamed['year'].cast(T.LongType()))
    return typed.filter(typed['year'] >= 2025)

# Apply the generated cleaning steps. The result is bound to a new name, so
# `legosetsdf` still refers to the unmodified DataFrame.
legosetsdf_clean = clean_data(legosetsdf)
display(legosetsdf_clean)

### Load from Table in Lakehouse

In [21]:
# Query the table LakeBeach.legos.sets with Spark SQL, returning at most 10 rows.
# NOTE: `df` is reused from earlier cells; after this cell it refers to this query result.
df = spark.sql("SELECT * FROM LakeBeach.legos.sets LIMIT 10")
display(df)

StatementMeta(, 044181c2-e26b-4018-818b-683c6a2b5dbc, 31, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 94eea791-bc2f-416c-969e-2408b6db9c78)

## Add to Pipeline

#

## Magic Commands

## Set language at the cell level

In [None]:
%%spark