In [None]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=bf578e04d9234f70baf1e4859d57d82d1f710da6883cb02724f21e048a9507a4
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from pyspark.sql import SparkSession

# Start a Spark session for the notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("SparkSQLExample")
    .getOrCreate()
)


In [None]:
# Full refresh: re-read the entire source file and rebuild the partitioned output.
df_sales = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/content/sales_data.csv")
)

# Transformation: add the revenue of each row as quantity * price.
df_transformed = df_sales.withColumn(
    "total_sales",
    df_sales.quantity * df_sales.price,
)

# Write partitioned by 'date'; "overwrite" replaces whatever already exists at the target.
output_path = "/content/sample_data/partitioned_data"
(
    df_transformed.write
    .mode("overwrite")
    .partitionBy("date")
    .parquet(output_path)
)

# Read the freshly written dataset back and display it as a sanity check.
partitioned_df = spark.read.parquet(output_path)
partitioned_df.show()

+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+
|transaction_id|customer_id| product|quantity|price|         updated_at|total_sales|      date|
+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+
|             1|        101|  Laptop|       1| 1000|2024-09-01 08:00:00|       1000|2024-09-01|
|             2|        102|   Phone|       2|  500|2024-09-01 09:00:00|       1000|2024-09-01|
|             5|        105|Keyboard|       1|   50|2024-09-03 12:00:00|         50|2024-09-03|
|             6|        106|   Mouse|       3|   30|2024-09-03 13:00:00|         90|2024-09-03|
|             3|        103|  Tablet|       1|  300|2024-09-02 10:00:00|        300|2024-09-02|
|             4|        104| Monitor|       2|  200|2024-09-02 11:00:00|        400|2024-09-02|
+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Initialize Spark session (getOrCreate returns the already-running session when one exists)
spark = SparkSession.builder.appName("IncrementalLoad").getOrCreate()

# Watermark of the last ETL run: only rows with a strictly later `updated_at` are loaded
last_etl_run = '2024-09-01 00:00:00'

# Load only new or updated records since the last ETL run.
# The watermark is converted to a timestamp explicitly so the comparison is
# timestamp-vs-timestamp instead of relying on implicit string coercion.
# NOTE(review): this reads /content/sample_data/sales_data.csv, whereas the full-refresh
# cell reads /content/sales_data.csv — confirm the different source path is intended.
df_incremental = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/content/sample_data/sales_data.csv")
    .filter(F.col("updated_at") > F.to_timestamp(F.lit(last_etl_run)))
)

# Apply transformations (if necessary)
df_transformed_incremental = df_incremental.withColumn(
    "total_sales", F.col("quantity") * F.col("price")
)

# Incremental load: append the selected rows to the partitioned dataset at output_path.
# NOTE(review): the full-refresh cell writes to /content/sample_data/partitioned_data, not
# to this directory — confirm which dataset the increment is meant to extend.
# NOTE(review): "append" adds the rows again on every run of this cell; re-running with
# the same watermark duplicates data.
output_path = "/content/sample_data/partitioned_sales_data"
(
    df_transformed_incremental.write
    .partitionBy("date")
    .mode("append")
    .parquet(output_path)
)

# Verify partitioned data after incremental load
partitioned_df = spark.read.parquet(output_path)
partitioned_df.show()


+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+
|transaction_id|customer_id| product|quantity|price|         updated_at|total_sales|      date|
+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+
|             1|        101|  Laptop|       1| 1000|2024-09-01 08:00:00|       1000|2024-09-01|
|             2|        102|   Phone|       2|  500|2024-09-01 09:00:00|       1000|2024-09-01|
|             5|        105|Keyboard|       1|   50|2024-09-03 12:00:00|         50|2024-09-03|
|             6|        106|   Mouse|       3|   30|2024-09-03 13:00:00|         90|2024-09-03|
|             3|        103|  Tablet|       1|  300|2024-09-02 10:00:00|        300|2024-09-02|
|             4|        104| Monitor|       2|  200|2024-09-02 11:00:00|        400|2024-09-02|
+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+



In [None]:
! pip install ipywidgets



In [None]:
from pyspark.sql import SparkSession
import ipywidgets as widgets
from IPython.display import display

# Step 1: Start a Spark session, or reuse the one that is already running
spark = SparkSession.builder.appName("PySpark with Widgets Example").getOrCreate()

# Step 2: Build a small in-memory DataFrame to filter interactively
data = [
    ("John", 28, "Male", 60000),
    ("Jane", 32, "Female", 72000),
    ("Mike", 45, "Male", 84000),
    ("Emily", 23, "Female", 52000),
    ("Alex", 36, "Male", 67000),
]
df = spark.createDataFrame(data, schema=["name", "age", "gender", "salary"])
df.show()

# Step 3: Create the controls
# Dropdown: which numeric column the filter is applied to
column_dropdown = widgets.Dropdown(
    description="Filter By:",
    options=["age", "salary"],
    value="age",
)

# Slider: threshold the selected column is compared against
# NOTE(review): the 20-100 range fits "age" only; every salary in `data` is far above
# 100, so with "salary" selected the filter keeps all rows.
slider = widgets.IntSlider(
    description="Threshold:",
    min=20,
    max=100,
    step=5,
    value=30,
    continuous_update=False,
)

# Button that triggers the filtering, and the area its results are rendered into
button = widgets.Button(description="Apply Filter")
output = widgets.Output()

# Render all controls below this cell
display(column_dropdown, slider, button, output)
# Step 4: Define the function to apply filtering based on widget inputs
def apply_filter(b):
  """Click handler: show the rows of `df` whose selected column exceeds the threshold.

  Reads the column name from `column_dropdown` and the threshold from `slider`,
  then prints a summary line and the filtered rows inside `output`.

  Args:
    b: argument supplied by the click event; not used.
  """
  column = column_dropdown.value
  threshold = slider.value
  # Clear previous output
  output.clear_output()
  # Filter the DataFrame based on widget values
  df_filtered = df.filter(df[column] > threshold)
  # Show the filtered DataFrame
  with output:
    # Curly braces are required for f-string interpolation; the previous
    # "(column) > (threshold)" printed those words literally.
    print(f"Filtering by {column} > {threshold}")
    df_filtered.show()
# Step 5: Register apply_filter as the handler for clicks on the "Apply Filter" button
button.on_click(apply_filter)

+-----+---+------+------+
| name|age|gender|salary|
+-----+---+------+------+
| John| 28|  Male| 60000|
| Jane| 32|Female| 72000|
| Mike| 45|  Male| 84000|
|Emily| 23|Female| 52000|
| Alex| 36|  Male| 67000|
+-----+---+------+------+



Dropdown(description='Filter By:', options=('age', 'salary'), value='age')

IntSlider(value=30, continuous_update=False, description='Threshold:', min=20, step=5)

Button(description='Apply Filter', style=ButtonStyle())

Output()