In [None]:
# Build (or reuse) the SparkSession that every later cell runs against.
from pyspark.sql import SparkSession

# NOTE(review): the connector jar below is the unversioned "latest" build, so it
# is not pinned; confirm it matches the cluster's Spark/Scala build.
spark = (
    SparkSession.builder
    .appName('1.1. BigQuery Storage & Spark DataFrames - Python')
    .config('spark.jars', 'gs://spark-lib/bigquery/spark-bigquery-latest.jar')
    # Eager evaluation makes a bare DataFrame expression at the end of a cell
    # render its first rows as a table instead of only the column/type summary.
    .config('spark.sql.repl.eagerEval.enabled', 'true')
    .getOrCreate()
)

In [None]:
# Load the public Wikipedia page-view table through the "bigquery" data source.
# The "filter" option hands a row predicate to the connector; here it limits
# the read to datehour values from 2020-03-01 up to (not including) 2020-03-02.
table = "bigquery-public-data.wikipedia.pageviews_2020"
df_wiki_pageviews = (
    spark.read
    .format("bigquery")
    .option("table", table)
    .option("filter", "datehour >= '2020-03-01' AND datehour < '2020-03-02'")
    .load()
)

# Print the column names and types of the loaded DataFrame.
df_wiki_pageviews.printSchema()

In [None]:
# DataFrame operations: keep three columns, keep only rows for the wiki codes
# 'en' and 'en.m' with more than 1000 views, and mark the result for caching
# so later steps can reuse it rather than recompute it.
df_wiki_en = (
    df_wiki_pageviews
    .select("title", "wiki", "views")
    .where("views > 1000 AND wiki in ('en', 'en.m')")
    .cache()
)

# Bare expression: the notebook displays the DataFrame's representation.
df_wiki_en

In [None]:
import pyspark.sql.functions as F

# Aggregate the cached DataFrame: group by title and sum the views, naming the
# summed column total_views. Aggregate functions come from the F module.
df_wiki_en_totals = (
    df_wiki_en
    .groupBy("title")
    .agg(F.sum('views').alias('total_views'))
)

# Display the totals ordered from the largest total_views to the smallest.
df_wiki_en_totals.orderBy('total_views', ascending=False)

In [None]:
# Writing a Spark DataFrame to BigQuery needs three things: a GCS bucket, a
# BigQuery dataset, and a table name.
gcs_bucket = 'dataproc-bucket-name'  # passed as the temporaryGcsBucket option
bq_dataset = 'dataset_name'
bq_table = 'wiki_total_pageviews'

# Destination in "<dataset>.<table>" form.
bq_destination = ".".join((bq_dataset, bq_table))

(
    df_wiki_en_totals.write
    .format("bigquery")
    .option("table", bq_destination)
    .option("temporaryGcsBucket", gcs_bucket)
    .mode('overwrite')  # save mode: overwrite whatever is already at the destination
    .save()
)

In [None]:
%%bigquery
-- Query the written table straight from the Jupyter notebook with the BigQuery
-- cell magic. The %%bigquery line has to be the first line of the cell, so
-- this note lives inside the SQL body rather than above the magic.
SELECT title, total_views
FROM dataset_name.wiki_total_pageviews
ORDER BY total_views DESC
LIMIT 10