<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark_streaming/example1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Usecase 1
## Reading CSV data from input folder
## Checking results from query in memory

In [26]:
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/lucprosa/dataeng-basic-course/main/data/squirrel-data/squirrel-data.csv", sep = ",", on_bad_lines='skip')
df.head()

Unnamed: 0,Area Name,Area ID,Park Name,Park ID,Squirrel ID,Primary Fur Color,Highlights in Fur Color,Color Notes,Location,Above Ground (Height in Feet),Specific Location,Activities,Interactions with Humans,Other Notes or Observations,Squirrel Latitude (DD.DDDDDD),Squirrel Longitude (-DD.DDDDDD)
0,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-01,Gray,White,,Ground Plane,,,Foraging,Indifferent,,40.85941,-73.933936
1,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-02,Gray,White,,Ground Plane,,,Foraging,Indifferent,Looks skinny,40.859436,-73.933937
2,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-03,Gray,White,,Ground Plane,,,"Eating, Digging something",Indifferent,,40.859416,-73.933894
3,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-04,Gray,White,,Ground Plane,,,Running,Indifferent,,40.859418,-73.933895
4,UPPER MANHATTAN,A,Fort Tryon Park,1,A-01-05,Gray,Cinnamon,,Ground Plane,,,"Running, Eating",Indifferent,She left food,40.859493,-73.93359


# Setting up PySpark

In [9]:
%pip install pyspark



In [10]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').appName('Test streaming').config('spark.ui.port', '4050').getOrCreate()

In [37]:
from pyspark import SparkFiles
url = "https://raw.githubusercontent.com/lucprosa/dataeng-basic-course/main/data/squirrel-data/squirrel-data.csv"

from pyspark.sql.types import *
schema = StructType([
StructField('Area Name',StringType(),True),
StructField('Area ID',StringType(),True),
StructField('Park Name',StringType(),True),
StructField('Park ID', StringType(), True),
StructField('Squirrel ID', StringType(), True),
StructField('Primary Fur Color', StringType(), True),
StructField('Highlights in Fur Color', StringType(), True),
StructField('Color Notes', StringType(), True),
StructField('Location', StringType(), True),
StructField('Above Ground (Height in Feet)', StringType(), True),
StructField('Specific Location', StringType(), True),
StructField('Activities', StringType(), True),
StructField('Interactions with Humans', StringType(), True),
StructField('Squirrel Latitude (DD.DDDDDD)', StringType(), True),
StructField('Squirrel Longitude (-DD.DDDDDD)', StringType(), True)
])

spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("squirrel-data.csv"), header=True, schema=schema)

df.show()

+---------------+-------+-------------------+-------+-----------+-----------------+-----------------------+-----------+------------+-----------------------------+-----------------+--------------------+------------------------+-----------------------------+-------------------------------+
|      Area Name|Area ID|          Park Name|Park ID|Squirrel ID|Primary Fur Color|Highlights in Fur Color|Color Notes|    Location|Above Ground (Height in Feet)|Specific Location|          Activities|Interactions with Humans|Squirrel Latitude (DD.DDDDDD)|Squirrel Longitude (-DD.DDDDDD)|
+---------------+-------+-------------------+-------+-----------+-----------------+-----------------------+-----------+------------+-----------------------------+-----------------+--------------------+------------------------+-----------------------------+-------------------------------+
|UPPER MANHATTAN|      A|    Fort Tryon Park|     01|    A-01-01|             Gray|                  White|       NULL|Ground Plane| 

In [108]:
from pyspark.sql import DataFrame
import time
import asyncio

async def splitDf(df: DataFrame, weight: float):
  weights = [weight for i in range(files)]
  dfs = df.randomSplit(weights)
  return dfs

async def writeFile(dfs: list[DataFrame], path: str, seconds_per_file: int):
  for i in range(len(dfs)):
    df = dfs[i]
    print(f"Writing file {path}file_{i}.csv with {df.count()} lines")
    df.write.mode("overwrite").format("csv").save(f"{path}file_{i}.csv")
    time.sleep(seconds_per_file)

files = 10
seconds_per_file = 0

df = df.cache()
rows = df.count() # 433 rows

dfs = splitDf(df, 1.0)
#await writeFile(dfs, "/content/input/", seconds_per_file)

asyncio.get_event_loop().run_until_complete(writeFile(dfs, "/content/input/", seconds_per_file))

RuntimeError: This event loop is already running

# Read CSVs as streaming

In [None]:
from pyspark.sql.types import *
schema = StructType([
StructField('Area Name',StringType(),True),
StructField('Area ID',StringType(),True),
StructField('Park Name',StringType(),True),
StructField('Park ID', StringType(), True),
StructField('Squirrel ID', StringType(), True),
StructField('Primary Fur Color', StringType(), True),
StructField('Highlights in Fur Color', StringType(), True),
StructField('Color Notes', StringType(), True),
StructField('Location', StringType(), True),
StructField('Above Ground (Height in Feet)', StringType(), True),
StructField('Specific Location', StringType(), True),
StructField('Activities', StringType(), True),
StructField('Interactions with Humans', StringType(), True),
StructField('Squirrel Latitude (DD.DDDDDD)', StringType(), True),
StructField('Squirrel Longitude (-DD.DDDDDD)', StringType(), True)
])

In [102]:
! rm -rf /content/input

In [91]:
stream1 = spark.readStream.format('csv').schema(schema).option('header', True).load('/content/input')

In [82]:
print(stream1.isStreaming)

True


In [92]:
query = (stream1.writeStream
.format('memory')
.queryName('my_query')
.outputMode('append')
.start()
)

In [87]:
spark.sql("select * from my_query").show()

+---------+-------+---------+-------+-----------+-----------------+-----------------------+-----------+--------+-----------------------------+-----------------+----------+------------------------+-----------------------------+-------------------------------+
|Area Name|Area ID|Park Name|Park ID|Squirrel ID|Primary Fur Color|Highlights in Fur Color|Color Notes|Location|Above Ground (Height in Feet)|Specific Location|Activities|Interactions with Humans|Squirrel Latitude (DD.DDDDDD)|Squirrel Longitude (-DD.DDDDDD)|
+---------+-------+---------+-------+-----------+-----------------+-----------------------+-----------+--------+-----------------------------+-----------------+----------+------------------------+-----------------------------+-------------------------------+
+---------+-------+---------+-------+-----------+-----------------+-----------------------+-----------+--------+-----------------------------+-----------------+----------+------------------------+---------------------------

In [90]:
query.stop()

In [None]:
# salvar em parquet , particionando por janela de 10 segundos


