In [1]:
from pyspark.sql import SparkSession, SQLContext

In [2]:
spark = SparkSession \
  .builder \
  .master('yarn') \
  .appName('spark-bigquery-demo') \
  .getOrCreate()

In [3]:
# Load data from BigQuery.
fin = spark.read.format('bigquery') \
  .option('table', 'proud-sweep-342309:yahoofinance.bitcoin') \
  .load()

In [4]:
BUCKET="proud-sweep-342309"

from pyspark.sql import SparkSession
import pandas
from pyspark.sql.functions import pandas_udf, window, month, weekofyear

In [5]:
groups = fin.groupby(window('Datetime','5 minutes')).agg({
    'Open':'last','High':'last','Low':'last','Close':'last', 'Volume':'sum'
})

In [6]:
groups.select('window')

DataFrame[window: struct<start:timestamp,end:timestamp>]

In [7]:
monthdf = groups.withColumn('MonthYear', month(groups.window.end))
monthweekdf = monthdf.withColumn('Week_Number', weekofyear(groups.window.end))

In [8]:
monthweekdf.createOrReplaceTempView('df')

# Perform word count.
five_min_agg = spark.sql('SELECT * from df LIMIT 10')


In [9]:
five_min_agg.show()



+--------------------+-----------+--------------+--------------+--------------+--------------+---------+-----------+
|              window|sum(Volume)|    last(High)|   last(Close)|    last(Open)|     last(Low)|MonthYear|Week_Number|
+--------------------+-----------+--------------+--------------+--------------+--------------+---------+-----------+
|{2022-02-28 07:30...|  463765504|      38217.75|      38217.75|      38217.75|      38217.75|        2|          9|
|{2022-02-27 19:30...|   80285696| 38629.4453125| 38629.4453125| 38629.4453125| 38629.4453125|        2|          8|
|{2022-03-02 12:05...|   44113920|44147.53515625|44147.53515625|44147.53515625|44147.53515625|        3|          9|
|{2022-03-04 10:15...|  107704320|  41632.578125|  41632.578125|  41632.578125|  41632.578125|        3|          9|
|{2022-02-28 22:45...| 2153625600| 43659.6796875| 43659.6796875| 43659.6796875| 43659.6796875|        2|          9|
|{2022-03-01 11:55...|  394383360|43514.03515625|43514.03515625|

                                                                                

In [10]:
five_min_agg.printSchema()

root
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- sum(Volume): long (nullable = true)
 |-- last(High): double (nullable = true)
 |-- last(Close): double (nullable = true)
 |-- last(Open): double (nullable = true)
 |-- last(Low): double (nullable = true)
 |-- MonthYear: integer (nullable = true)
 |-- Week_Number: integer (nullable = true)



In [11]:
BUCKET="proud-sweep-342309"

# Use the Cloud Storage bucket for temporary BigQuery export data used
# by the connector.
bucket = BUCKET
spark.conf.set('temporaryGcsBucket', bucket)

In [14]:
colnames = ["Window","Volume","High","Close","Open","Low","Month","WeekNum"]
new_df = monthweekdf.toDF(*colnames)
new_df.show()

[Stage 6:>                                                          (0 + 1) / 1]

+--------------------+----------+--------------+--------------+--------------+--------------+-----+-------+
|              Window|    Volume|          High|         Close|          Open|           Low|Month|WeekNum|
+--------------------+----------+--------------+--------------+--------------+--------------+-----+-------+
|{2022-02-28 07:30...| 463765504|      38217.75|      38217.75|      38217.75|      38217.75|    2|      9|
|{2022-02-27 19:30...|  80285696| 38629.4453125| 38629.4453125| 38629.4453125| 38629.4453125|    2|      8|
|{2022-03-02 12:05...|  44113920|44147.53515625|44147.53515625|44147.53515625|44147.53515625|    3|      9|
|{2022-03-04 10:15...| 107704320|  41632.578125|  41632.578125|  41632.578125|  41632.578125|    3|      9|
|{2022-02-28 22:45...|2153625600| 43659.6796875| 43659.6796875| 43659.6796875| 43659.6796875|    2|      9|
|{2022-03-01 11:55...| 394383360|43514.03515625|43514.03515625|43514.03515625|43514.03515625|    3|      9|
|{2022-03-04 23:50...| 55802

                                                                                

In [None]:
# Saving the data to BigQuery
new_df.write.format('bigquery') \
  .option('table', 'proud-sweep-342309:yahoofinance.bitcoin_5_min') \
  .save()

                                                                                