<a href="https://colab.research.google.com/github/telmavcosta/data_processing/blob/main/spark_streaming/examples/coinbase_consumer_telma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Consumer Coinbase

- Intended to run in Google Colab
- Authenticate with Google Cloud (gcloud)
- Read data from GCS as a stream
- Analyze the data

In [None]:
# Authenticate the current Colab user with Google Cloud so that later cells
# can reach GCS (an interactive prompt is expected -- confirm in Colab).
from google.colab import auth
auth.authenticate_user()

# GCP project used by the gcloud CLI for the rest of the session.
project_id = 'data-eng-dev-437916'
# IPython shell escape: {project_id} is interpolated from the Python variable.
!gcloud config set project {project_id}

Updated property [core/project].


In [None]:
# Install OpenJDK 11 (presumably the JVM runtime PySpark needs -- confirm the
# Java version against the Spark release that pip resolves).
!apt-get install openjdk-11-jdk -y
# Install PySpark and gcsfs. NOTE(review): versions are not pinned, so a fresh
# runtime may pick up a different release than the one used for these outputs.
!pip install pyspark gcsfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libxt-dev libxtst6 libxxf86dga1 openjdk-11-jre
  x11-utils
Suggested packages:
  libxt-doc openjdk-11-demo openjdk-11-source visualvm mesa-utils
The following NEW packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libxt-dev libxtst6 libxxf86dga1 openjdk-11-jdk
  openjdk-11-jre x11-utils
0 upgraded, 10 newly installed, 0 to remove and 35 not upgraded.
Need to get 6,920 kB of archives.
After this operation, 16.9 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 fonts-dejavu-core all 2.37-2build1 [1,041 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 fonts-dejavu-extra all 2.37-2build1 [2,041 kB]
Get:3 http://archive.ubuntu.com/ubuntu jam

In [None]:
from pyspark.sql import SparkSession

# Shaded GCS connector jar (Hadoop 3 build, v2.2.9) fetched from Maven Central.
GCS_JAR = "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.9/gcs-connector-hadoop3-2.2.9-shaded.jar"

# Settings that register the connector jar and map the gs:// scheme to the
# connector's file system classes.
spark_conf = {
    "spark.jars": GCS_JAR,
    "spark.hadoop.fs.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
    "spark.hadoop.fs.AbstractFileSystem.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
}

builder = SparkSession.builder.appName("GCSStreamingDemo")
for key, value in spark_conf.items():
    builder = builder.config(key, value)

# Reuses an existing session if one is already running in this kernel.
spark = builder.getOrCreate()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Explicit schema for the JSON files in the landing folder; price and time
# are read as raw strings.
schema = "type STRING, sequence LONG, product_id STRING, price STRING, time STRING"

# Streaming DataFrame over the landing folder.
df = (
    spark.readStream
    .schema(schema)
    .json("gs://edit-data-eng-dev/datalake/landing/btc/")
)

# Keep the three columns used by the analysis, with price cast to double.
ticker_columns = df.select("time", "product_id", col("price").cast("double"))

# Append the stream into an in-memory table that can be queried with
# spark.sql under the name "btc_price_stream".
stream = (
    ticker_columns.writeStream
    .outputMode("append")
    .queryName("btc_price_stream")
    .format("memory")
    .start()
)

In [None]:
# Take a static snapshot of whatever the streaming query has written to the
# in-memory table so far; re-run this cell to pick up newer rows.
snapshot_query = "select * from btc_price_stream"
df = spark.sql(snapshot_query)
df.show()

+--------------------+----------+---------+
|                time|product_id|    price|
+--------------------+----------+---------+
|2025-07-05T10:20:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:20:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:20:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108156.85|
|2025-07-05T10:20:...|   BTC-USD|108154.98|
|2025-07-05T10:21:...|   BTC-USD|108163.07|
|2025-07-05T10:20:...|   BTC-USD|108163.07|
|2025-07-05T10:20:...|   BTC-USD|108157.71|
|2025-07-05T10:21:...|   BTC-USD|108156.86|
|2025-07-05T10:21:...|   BTC-USD|108156.86|
|2025-07-05T10:21:...|   BTC-USD|108156.86|
|2025-07-05T10:20:...|   BTC-USD

In [None]:
# Analysis

# Latest Bitcoin price
# Calculate average BTC price per minute
# Calculate standard deviation of price over time
# How many price ticks per minute?
# Find anomalies (null prices or implausible values)

In [None]:
from pyspark.sql.functions import *


In [None]:
# Row count and mean price for each product_id in the snapshot
from pyspark.sql.functions import *

product_stats = df.groupBy("product_id").agg(
    count(lit("1")).alias("count"),
    avg("price").alias("avg_price"),
)
product_stats.show()

+----------+-----+-------------------+
|product_id|count|          avg_price|
+----------+-----+-------------------+
|  DOGE-USD|   15|0.16447533333333333|
|   ETH-USD|   58| 2521.2998275862074|
|   BTC-USD|   78| 108158.83153846153|
+----------+-----+-------------------+



In [None]:
# Latest Bitcoin price
#
# Sort the BTC-USD ticks by time, newest first, and keep the top row.
# `time` is a string column here, so the ordering is lexicographic; this
# matches chronological order only if every value uses the same ISO-8601
# layout (assumed from the sample output -- TODO confirm).
#
# The shared `df` snapshot is deliberately NOT reassigned: the cells below
# aggregate over all products and minutes and need the full snapshot.
df_latest_btc = (
    df.filter(col("product_id") == "BTC-USD")
    .orderBy(desc("time"))
    .limit(1)
)

# truncate=False so the full timestamp is visible, not "2025-07-05T10:20:...".
df_latest_btc.show(truncate=False)

+--------------------+----------+---------+
|                time|product_id|    price|
+--------------------+----------+---------+
|2025-07-05T10:20:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:20:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:20:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108163.06|
|2025-07-05T10:21:...|   BTC-USD|108156.85|
|2025-07-05T10:20:...|   BTC-USD|108154.98|
|2025-07-05T10:21:...|   BTC-USD|108163.07|
|2025-07-05T10:20:...|   BTC-USD|108163.07|
|2025-07-05T10:20:...|   BTC-USD|108157.71|
|2025-07-05T10:21:...|   BTC-USD|108156.86|
|2025-07-05T10:21:...|   BTC-USD|108156.86|
|2025-07-05T10:21:...|   BTC-USD|108156.86|
|2025-07-05T10:20:...|   BTC-USD

In [None]:
# Calculate average BTC price per minute
# Bucket the BTC-USD rows into 1-minute windows on `time` and average the
# price inside each bucket, listing the windows in order.
btc_ticks = df.filter("product_id=='BTC-USD'")
avg_price_per_minute = (
    btc_ticks
    .groupBy(window("time", "1 minute"))
    .agg(avg("price").alias("avg_price"))
    .orderBy("window")
)
avg_price_per_minute.show(10, False)




+------------------------------------------+------------------+
|window                                    |avg_price         |
+------------------------------------------+------------------+
|{2025-07-05 10:20:00, 2025-07-05 10:21:00}|108158.73717948717|
|{2025-07-05 10:21:00, 2025-07-05 10:22:00}|108159.21160919541|
|{2025-07-05 10:22:00, 2025-07-05 10:23:00}|108166.21611111108|
|{2025-07-05 10:23:00, 2025-07-05 10:24:00}|108172.72325581392|
+------------------------------------------+------------------+



In [None]:
# Calculate standard deviation of price over time
# Same 1-minute windows as the average, but reporting the standard deviation
# of the BTC-USD price inside each window.
btc_rows = df.filter("product_id=='BTC-USD'")
stddev_per_minute = (
    btc_rows
    .groupBy(window("time", "1 minute"))
    .agg(stddev("price").alias("stddev_price"))
    .orderBy("window")
)
stddev_per_minute.show(10, False)


+------------------------------------------+-----------------+
|window                                    |stddev_price     |
+------------------------------------------+-----------------+
|{2025-07-05 10:20:00, 2025-07-05 10:21:00}|2.268814840997924|
|{2025-07-05 10:21:00, 2025-07-05 10:22:00}|3.028489866043643|
|{2025-07-05 10:22:00, 2025-07-05 10:23:00}|4.370803958317434|
|{2025-07-05 10:23:00, 2025-07-05 10:24:00}|4.573369892672111|
+------------------------------------------+-----------------+



In [None]:
# How many price ticks per minute?
# Count rows per (1-minute window, product_id) pair, ordered by window and
# then by product.
minute_window = window("time", "1 minute")
ticks_per_minute = (
    df.groupBy(minute_window, "product_id")
    .agg(count(lit("1")).alias("count"))
    .orderBy("window", "product_id")
)
ticks_per_minute.show(10, False)


+------------------------------------------+----------+-----+
|window                                    |product_id|count|
+------------------------------------------+----------+-----+
|{2025-07-05 10:20:00, 2025-07-05 10:21:00}|BTC-USD   |39   |
|{2025-07-05 10:20:00, 2025-07-05 10:21:00}|DOGE-USD  |15   |
|{2025-07-05 10:20:00, 2025-07-05 10:21:00}|ETH-USD   |24   |
|{2025-07-05 10:21:00, 2025-07-05 10:22:00}|BTC-USD   |87   |
|{2025-07-05 10:21:00, 2025-07-05 10:22:00}|DOGE-USD  |10   |
|{2025-07-05 10:21:00, 2025-07-05 10:22:00}|ETH-USD   |131  |
|{2025-07-05 10:22:00, 2025-07-05 10:23:00}|BTC-USD   |207  |
|{2025-07-05 10:22:00, 2025-07-05 10:23:00}|DOGE-USD  |19   |
|{2025-07-05 10:22:00, 2025-07-05 10:23:00}|ETH-USD   |84   |
|{2025-07-05 10:23:00, 2025-07-05 10:24:00}|BTC-USD   |70   |
+------------------------------------------+----------+-----+
only showing top 10 rows



In [None]:
# Find anomalies (null prices or implausible values)

from pyspark.sql.functions import *

# Plausibility bounds for a BTC-USD price. A null price means the source value
# was missing or could not be cast to double. The previous cut-off of 100000
# sat below the prices seen in this dataset (~108k), so it would have flagged
# every normal tick. NOTE(review): the upper bound is a generous placeholder --
# tune both bounds to the market range you consider plausible.
BTC_PRICE_MIN = 0.0
BTC_PRICE_MAX = 1_000_000.0

# The product id uses a hyphen ("BTC-USD"); an underscore matches no rows.
btc_rows = df.filter(col("product_id") == "BTC-USD")

btc_anomalies = btc_rows.filter(
    col("price").isNull()
    | (col("price") <= lit(BTC_PRICE_MIN))
    | (col("price") >= lit(BTC_PRICE_MAX))
)

btc_anomalies.show(truncate=False)

TypeError: 'Column' object is not callable

In [None]:
# Stop the streaming query held in `stream` before starting another one.
stream.stop()

In [None]:
# Write Parquet
#
# Stream the landing JSON files into the bronze layer as Parquet. The
# checkpoint folder stores the query's progress so a restarted query can
# resume from where it stopped.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Explicit schema for the JSON files; price and time are read as raw strings.
schema = "type STRING, sequence LONG, product_id STRING, price STRING, time STRING"

df = spark.readStream.schema(schema).json("gs://edit-data-eng-dev/datalake/landing/btc/")

# The sink format must be "parquet"; the previous "parquer" is not a known
# data source. The option key uses the documented spelling "checkpointLocation".
stream = df.select("time", "product_id", col("price").cast("double")) \
  .writeStream \
  .outputMode("append") \
  .format("parquet") \
  .option("path", "gs://edit-data-eng-dev/datalake/bronze_telma/btc/") \
  .option("checkpointLocation", "gs://edit-data-eng-dev/datalake/bronze_telma/checkpoint/") \
  .start()